#### `Importing the Required Libraries`

---

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.types as tp

In [2]:
# Reuse the active SparkSession if one exists; otherwise create one (no extra configuration is set here)
spark = SparkSession.builder.getOrCreate()
# Bare last expression: the notebook displays the session object as the cell output
spark

---
Read healthcare analytics data

---

In [3]:
# Explicit schema for the healthcare CSV, built one column at a time.
# StructType.add() appends a nullable StructField and returns the schema, so the calls chain.
my_schema = (
    tp.StructType()
    .add("case_id", tp.IntegerType())
    .add("hospital_code", tp.IntegerType())
    .add("hospital_type_code", tp.StringType())
    .add("city_code_hospital", tp.IntegerType())
    .add("hospital_region_code", tp.StringType())
    .add("extra_room_available", tp.IntegerType())
    .add("department", tp.StringType())
    .add("ward_type", tp.StringType())
    .add("ward_facility_code", tp.StringType())
    .add("bed_grade", tp.IntegerType())
    .add("patient_id", tp.IntegerType())
    .add("city_code_patient", tp.IntegerType())
    .add("admission_type", tp.StringType())
    .add("severity_of_illness", tp.StringType())
    .add("visitors_with_patient", tp.IntegerType())
    .add("age", tp.StringType())  # age is a bucket label such as "51-60", not a number
    .add("admission_deposit", tp.FloatType())
    .add("stay", tp.StringType())
)

In [4]:
# Load the training CSV: apply the explicit schema and treat the first line as column names
healthcare_data = (
    spark.read
    .schema(my_schema)
    .option("header", True)
    .csv('data/module_8_train.csv')
)

In [5]:
# Narrow the dataframe to the columns used in the examples below
sample_data = healthcare_data.select([
    "case_id",
    "hospital_code",
    "department",
    "ward_type",
    "patient_id",
    "age",
    "visitors_with_patient",
])

# Print the first rows of the selection
sample_data.show()

+-------+-------------+------------+---------+----------+-----+---------------------+
|case_id|hospital_code|  department|ward_type|patient_id|  age|visitors_with_patient|
+-------+-------------+------------+---------+----------+-----+---------------------+
|      1|            8|radiotherapy|        R|     31397|51-60|                    2|
|      2|            2|radiotherapy|        S|     31397|51-60|                    2|
|      3|           10|  anesthesia|        S|     31397|51-60|                    2|
|      4|           26|radiotherapy|        R|     31397|51-60|                    2|
|      5|           26|radiotherapy|        S|     31397|51-60|                    2|
|      6|           23|  anesthesia|        S|     31397|51-60|                    2|
|      7|           32|radiotherapy|        S|     31397|51-60|                    2|
|      8|           23|radiotherapy|        Q|     31397|51-60|                    2|
|      9|            1|  gynecology|        R|     313

---
---


#### `Sorting`


Sort the data using the [orderBy](https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.orderBy) function. 

Pass the parameter on which you want to sort the data. 

We will sort the dataframe on `hospital_code`. By default, it will sort in ascending order.


---

In [6]:
# Order rows by hospital_code (ascending is the default direction)
sorted_df = sample_data.orderBy("hospital_code")

In [7]:
# Print the first rows of the dataframe sorted by hospital_code in ascending order
sorted_df.show()

+-------+-------------+------------+---------+----------+-----+---------------------+
|case_id|hospital_code|  department|ward_type|patient_id|  age|visitors_with_patient|
+-------+-------------+------------+---------+----------+-----+---------------------+
| 299145|            1|  gynecology|        S|     35157|61-70|                    2|
| 300930|            1|  gynecology|        Q|     45144|21-30|                    4|
| 299248|            1|  gynecology|        R|     39870|51-60|                    2|
| 297888|            1|  gynecology|        Q|     81050|11-20|                    4|
| 299278|            1|  gynecology|        Q|     56989|41-50|                    4|
| 298197|            1|  gynecology|        R|     18486|71-80|                    3|
| 299687|            1|  gynecology|        S|     91688|11-20|                    2|
| 298190|            1|  gynecology|        S|      8795|21-30|                    2|
| 299887|            1|radiotherapy|        R|    1127

---


To sort in descending order, pass the argument `ascending = False` to the `orderBy` function.

---

In [8]:
# Same sort key as before, but with the largest hospital_code first
sorted_df_descending = sample_data.orderBy("hospital_code", ascending=False)

In [9]:
# Print the first rows of the dataframe sorted by hospital_code in descending order
sorted_df_descending.show()

+-------+-------------+------------------+---------+----------+-----+---------------------+
|case_id|hospital_code|        department|ward_type|patient_id|  age|visitors_with_patient|
+-------+-------------+------------------+---------+----------+-----+---------------------+
| 298297|           32|        gynecology|        S|     25523|71-80|                    4|
| 299043|           32|        gynecology|        S|     14249|31-40|                    4|
| 298402|           32|        gynecology|        S|     34261|41-50|                    4|
| 298268|           32|        anesthesia|        S|     34494|41-50|                    2|
| 298419|           32|        gynecology|        S|     82148|11-20|                    2|
| 297760|           32|      radiotherapy|        S|     83845|71-80|                    2|
| 298456|           32|        gynecology|        S|     31408|31-40|                    4|
| 297862|           32|        gynecology|        S|     19562|71-80|           

---

To sort on multiple columns, pass a list of column names to the `orderBy` function. To control the direction of each column, also pass a list of `True`/`False` values to the `ascending` parameter, one per column and in the same order as the column names.

---

In [10]:
# Two-key sort; the ascending list is matched to the column list by position:
#   hospital_code          -> ascending  (True)
#   visitors_with_patient  -> descending (False)
sorted_df_multiple = sample_data.orderBy(
    ["hospital_code", "visitors_with_patient"],
    ascending=[True, False],
)

In [11]:
# Print the first rows of the two-key sort (hospital_code ascending, visitors_with_patient descending)
sorted_df_multiple.show()

+-------+-------------+----------+---------+----------+-----+---------------------+
|case_id|hospital_code|department|ward_type|patient_id|  age|visitors_with_patient|
+-------+-------------+----------+---------+----------+-----+---------------------+
|  66407|            1|gynecology|        S|     29947|61-70|                   23|
| 239067|            1|gynecology|        S|     16740|61-70|                   16|
| 239066|            1|gynecology|        S|     16740|61-70|                   16|
| 133483|            1|gynecology|        S|     36014|81-90|                   16|
| 101959|            1|gynecology|        T|     81590|31-40|                   15|
|  48238|            1|anesthesia|        Q|    117410|31-40|                   15|
|  97085|            1|gynecology|        R|      3191|11-20|                   15|
|  45583|            1|gynecology|        Q|     74432|51-60|                   15|
|   4150|            1|gynecology|        R|     25950|61-70|               

---
---
#### `Aggregation & Group By`

You can get the aggregated results on the dataframe using the [groupBy](https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.groupBy) function.

First group the data using the `groupBy` function, then use the function you want to apply on the grouped data.

We will find out the average number of `visitors_with_patient` for each `hospital_code`.


---

In [12]:
# Import functions
from pyspark.sql.functions import countDistinct, avg

In [13]:
# Group rows by hospital_code; the aggregate is applied to this grouping in the next cell
grouped_data = sample_data.groupBy("hospital_code")

In [14]:
# Mean of visitors_with_patient within each hospital_code group;
# without an alias the result column gets the generated name avg(visitors_with_patient)
grouped_data_average_visitors = grouped_data.agg(avg("visitors_with_patient"))

In [15]:
# Print the per-hospital averages (rows are not ordered by hospital_code here)
grouped_data_average_visitors.show()

+-------------+--------------------------+
|hospital_code|avg(visitors_with_patient)|
+-------------+--------------------------+
|           31|        2.9238719435341567|
|           28|         3.272218007819338|
|           26|         3.333504655943887|
|           27|         3.098989048020219|
|           12|        3.3176064441887227|
|           22|          3.11620294599018|
|            1|         3.345018098685464|
|           13|        2.9574102368220014|
|           16|        2.9997275946608553|
|            6|        3.5002692778457773|
|            3|         3.188026981450253|
|           20|        2.9430604982206408|
|            5|        3.3221820946588103|
|           19|         3.621424195296668|
|           15|         3.307334989737496|
|            9|        3.3440486533449176|
|           17|        3.0563533902926743|
|            4|        2.6298387096774194|
|            8|        3.2082992082992083|
|           23|        3.4961605059098093|
+----------

In [16]:
# Print the per-hospital averages ordered by hospital_code (ascending by default)
grouped_data_average_visitors.orderBy('hospital_code').show()

+-------------+--------------------------+
|hospital_code|avg(visitors_with_patient)|
+-------------+--------------------------+
|            1|         3.345018098685464|
|            2|         3.202469619756958|
|            3|         3.188026981450253|
|            4|        2.6298387096774194|
|            5|        3.3221820946588103|
|            6|        3.5002692778457773|
|            7|         2.768759571209801|
|            8|        3.2082992082992083|
|            9|        3.3440486533449176|
|           10|        3.1571807101218865|
|           11|        3.4263619575253923|
|           12|        3.3176064441887227|
|           13|        2.9574102368220014|
|           14|        3.3149238227146816|
|           15|         3.307334989737496|
|           16|        2.9997275946608553|
|           17|        3.0563533902926743|
|           18|         3.474104683195592|
|           19|         3.621424195296668|
|           20|        2.9430604982206408|
+----------

---

Now, we can see that in the above result, the name of the second column is `avg(visitors_with_patient)`. In case we want to change it, we can use [alias](https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#pyspark.sql.Column.alias).

See the below example where we will calculate the number of unique `patient_id` in each department. We will pass the name of the column as `unique_patients` in the `alias` function.


---

In [17]:
# Group rows by department so that aggregates can be computed per department
grouped_data_department = sample_data.groupBy("department")

In [18]:
# Count the distinct patient_id values within each department;
# alias() names the result column "unique_patients" instead of the generated default
unique_patients_count = countDistinct("patient_id").alias("unique_patients")
grouped_data_department_unique_patient = grouped_data_department.agg(unique_patients_count)

In [19]:
# Print the number of distinct patients for each department
grouped_data_department_unique_patient.show()

+------------------+---------------+
|        department|unique_patients|
+------------------+---------------+
|      radiotherapy|          14524|
|        anesthesia|          18204|
|TB & Chest disease|           6610|
|        gynecology|          83951|
|           surgery|            572|
+------------------+---------------+



----

Now, if you want several aggregated results as separate columns, you can pass multiple aggregate expressions to `agg` as follows.

----

In [20]:
# Two aggregates per department, each given an explicit column name:
#   unique_patient           -> number of distinct patient_id values
#   average_patient_visitors -> mean of visitors_with_patient
grouped_data_multiple_columns = grouped_data_department.agg(
    countDistinct("patient_id").alias("unique_patient"),
    avg("visitors_with_patient").alias("average_patient_visitors"),
)

In [21]:
# Print the per-department aggregates, ordered by unique_patient (smallest first)
grouped_data_multiple_columns.orderBy(['unique_patient']).show()

+------------------+--------------+------------------------+
|        department|unique_patient|average_patient_visitors|
+------------------+--------------+------------------------+
|           surgery|           572|      3.7577019150707742|
|TB & Chest disease|          6610|       3.350093886918423|
|      radiotherapy|         14524|       3.376595595455183|
|        anesthesia|         18204|      3.0917062970083307|
|        gynecology|         83951|      3.2915754791852048|
+------------------+--------------+------------------------+



---
---
#### `Read case url data`

Read the file containing the case URL for each patient case.

---

In [22]:
# Schema for the case URL CSV: an integer case_id and its URL as a string
case_schema = (
    tp.StructType()
    .add("case_id", tp.IntegerType())
    .add("case_url", tp.StringType())
)

In [23]:
# Load the case URL CSV with the explicit schema; the first line holds the column names
case_data = (
    spark.read
    .schema(case_schema)
    .option("header", True)
    .csv('data/module_8_case_url.csv')
)

In [24]:
# Print the first 20 rows of the case URL dataframe (long URLs are truncated in the printed table)
case_data.show()

+-------+--------------------+
|case_id|            case_url|
+-------+--------------------+
|      0|https://www.healt...|
|      1|https://www.healt...|
|      2|https://www.healt...|
|      3|https://www.healt...|
|      4|https://www.healt...|
|      5|https://www.healt...|
|      6|https://www.healt...|
|      7|https://www.healt...|
|      8|https://www.healt...|
|      9|https://www.healt...|
|     10|https://www.healt...|
|     11|https://www.healt...|
|     12|https://www.healt...|
|     13|https://www.healt...|
|     14|https://www.healt...|
|     15|https://www.healt...|
|     16|https://www.healt...|
|     17|https://www.healt...|
|     18|https://www.healt...|
|     19|https://www.healt...|
+-------+--------------------+
only showing top 20 rows



----
----
#### `Joins`

[Join](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=join#pyspark.sql.DataFrame.join) healthcare analytics data with case url data.

---

In [25]:
# Left side of the join: the healthcare analytics sample, which carries a case_id column
sample_data.show()

+-------+-------------+------------+---------+----------+-----+---------------------+
|case_id|hospital_code|  department|ward_type|patient_id|  age|visitors_with_patient|
+-------+-------------+------------+---------+----------+-----+---------------------+
|      1|            8|radiotherapy|        R|     31397|51-60|                    2|
|      2|            2|radiotherapy|        S|     31397|51-60|                    2|
|      3|           10|  anesthesia|        S|     31397|51-60|                    2|
|      4|           26|radiotherapy|        R|     31397|51-60|                    2|
|      5|           26|radiotherapy|        S|     31397|51-60|                    2|
|      6|           23|  anesthesia|        S|     31397|51-60|                    2|
|      7|           32|radiotherapy|        S|     31397|51-60|                    2|
|      8|           23|radiotherapy|        Q|     31397|51-60|                    2|
|      9|            1|  gynecology|        R|     313

In [26]:
# Right side of the join: the case URL data, which also carries a case_id column
case_data.show()

+-------+--------------------+
|case_id|            case_url|
+-------+--------------------+
|      0|https://www.healt...|
|      1|https://www.healt...|
|      2|https://www.healt...|
|      3|https://www.healt...|
|      4|https://www.healt...|
|      5|https://www.healt...|
|      6|https://www.healt...|
|      7|https://www.healt...|
|      8|https://www.healt...|
|      9|https://www.healt...|
|     10|https://www.healt...|
|     11|https://www.healt...|
|     12|https://www.healt...|
|     13|https://www.healt...|
|     14|https://www.healt...|
|     15|https://www.healt...|
|     16|https://www.healt...|
|     17|https://www.healt...|
|     18|https://www.healt...|
|     19|https://www.healt...|
+-------+--------------------+
only showing top 20 rows



---
<center>Inner join</center>

<center><img src= "images/inner_join.png"></center>

---

In [27]:
# Inner join on the shared case_id column: keeps only case_ids present in both dataframes
inner_join_df = sample_data.join(case_data, on='case_id', how='inner')

In [28]:
# Print the first rows of the inner join result (sample_data columns plus case_url)
inner_join_df.show()

+-------+-------------+------------+---------+----------+-----+---------------------+--------------------+
|case_id|hospital_code|  department|ward_type|patient_id|  age|visitors_with_patient|            case_url|
+-------+-------------+------------+---------+----------+-----+---------------------+--------------------+
|      1|            8|radiotherapy|        R|     31397|51-60|                    2|https://www.healt...|
|      2|            2|radiotherapy|        S|     31397|51-60|                    2|https://www.healt...|
|      3|           10|  anesthesia|        S|     31397|51-60|                    2|https://www.healt...|
|      4|           26|radiotherapy|        R|     31397|51-60|                    2|https://www.healt...|
|      5|           26|radiotherapy|        S|     31397|51-60|                    2|https://www.healt...|
|      6|           23|  anesthesia|        S|     31397|51-60|                    2|https://www.healt...|
|      7|           32|radiotherapy| 

---
<center>Right Outer join</center>

<center><img src= "images/right_outer_join.png"></center>

---

In [29]:
# Right outer join on case_id: keeps every row of case_data;
# sample_data columns are null where no matching case_id exists (e.g. case_id 0 in the output)
right_outer_join_df = sample_data.join(case_data, on='case_id', how='rightouter')

In [30]:
# Print the first rows of the right outer join result
right_outer_join_df.show()

+-------+-------------+------------+---------+----------+-----+---------------------+--------------------+
|case_id|hospital_code|  department|ward_type|patient_id|  age|visitors_with_patient|            case_url|
+-------+-------------+------------+---------+----------+-----+---------------------+--------------------+
|      0|         null|        null|     null|      null| null|                 null|https://www.healt...|
|      1|            8|radiotherapy|        R|     31397|51-60|                    2|https://www.healt...|
|      2|            2|radiotherapy|        S|     31397|51-60|                    2|https://www.healt...|
|      3|           10|  anesthesia|        S|     31397|51-60|                    2|https://www.healt...|
|      4|           26|radiotherapy|        R|     31397|51-60|                    2|https://www.healt...|
|      5|           26|radiotherapy|        S|     31397|51-60|                    2|https://www.healt...|
|      6|           23|  anesthesia| 

#### `Exercise`

* Left Outer Join
* Full Outer Join