#### `Importing the Required Libraries`

---

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.types as tp

In [2]:
# Reuse the active SparkSession if one exists; otherwise create a new one
spark = SparkSession.builder.getOrCreate()
# Bare last expression so the notebook renders the session object as the cell output
spark

In [3]:
# Schema applied when reading the CSV: one (column name, Spark type) entry per column.
# Every field keeps the default nullable=True.
my_schema = (
    tp.StructType()
    .add("case_id",               tp.IntegerType())
    .add("hospital_code",         tp.IntegerType())
    .add("hospital_type_code",    tp.StringType())
    .add("city_code_hospital",    tp.IntegerType())
    .add("hospital_region_code",  tp.StringType())
    .add("extra_room_available",  tp.IntegerType())
    .add("department",            tp.StringType())
    .add("ward_type",             tp.StringType())
    .add("ward_facility_code",    tp.StringType())
    .add("bed_grade",             tp.IntegerType())
    .add("patient_id",            tp.IntegerType())
    .add("city_code_patient",     tp.IntegerType())
    .add("admission_type",        tp.StringType())
    .add("severity_of_illness",   tp.StringType())
    .add("visitors_with_patient", tp.IntegerType())
    # age is kept as a string: the data holds ranges such as "51-60"
    .add("age",                   tp.StringType())
    .add("admission_deposit",     tp.FloatType())
    .add("stay",                  tp.StringType())
)

In [4]:
# Load the training CSV using the explicit schema; header=True marks the first line as column names
healthcare_data = (
    spark.read
    .schema(my_schema)
    .option("header", True)
    .csv('data/module_8_train.csv')
)

In [5]:
# Keep only the columns used in the examples that follow
sample_columns = [
    "hospital_code",
    "department",
    "ward_type",
    "patient_id",
    "age",
    "visitors_with_patient",
]
sample_data = healthcare_data.select(*sample_columns)

# Preview the selected columns
sample_data.show()

+-------------+------------+---------+----------+-----+---------------------+
|hospital_code|  department|ward_type|patient_id|  age|visitors_with_patient|
+-------------+------------+---------+----------+-----+---------------------+
|            8|radiotherapy|        R|     31397|51-60|                    2|
|            2|radiotherapy|        S|     31397|51-60|                    2|
|           10|  anesthesia|        S|     31397|51-60|                    2|
|           26|radiotherapy|        R|     31397|51-60|                    2|
|           26|radiotherapy|        S|     31397|51-60|                    2|
|           23|  anesthesia|        S|     31397|51-60|                    2|
|           32|radiotherapy|        S|     31397|51-60|                    2|
|           23|radiotherapy|        Q|     31397|51-60|                    2|
|            1|  gynecology|        R|     31397|51-60|                    2|
|           10|  gynecology|        S|     31397|51-60|         

---
---
### `Add New Column`

* **1. Using Expression**
* **2. Using when & otherwise**
* **3. Using User Defined Function**



---





We will have to import some functions from the [pyspark sql module](https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#pyspark-sql-module).

---

In [6]:
from pyspark.sql.functions import expr, when

---
---

#### `Using Expression`


Suppose we want to create a new column `country`.

All the values in this column have to be `India`.

First, we define the value as a SQL expression using the [expr](https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#pyspark.sql.functions.expr) function.

---

In [7]:
# Define expression
# The string is parsed as SQL, so the inner single quotes make 'India' a string
# literal; without them, India would be read as a column name.
country_value = expr("'India'")


---

Now, we will use the [withColumn](https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.withColumn) function and then pass the name of the new column as `country` along with the expression that we defined above.

---

In [8]:
# Add new column to dataframe
# withColumn returns a new DataFrame; sample_data itself is left unchanged
updated_sample_data = sample_data.withColumn("country", country_value)

In [9]:
# Updated dataframe: the new `country` column appears as the last column
updated_sample_data.show()

+-------------+------------+---------+----------+-----+---------------------+-------+
|hospital_code|  department|ward_type|patient_id|  age|visitors_with_patient|country|
+-------------+------------+---------+----------+-----+---------------------+-------+
|            8|radiotherapy|        R|     31397|51-60|                    2|  India|
|            2|radiotherapy|        S|     31397|51-60|                    2|  India|
|           10|  anesthesia|        S|     31397|51-60|                    2|  India|
|           26|radiotherapy|        R|     31397|51-60|                    2|  India|
|           26|radiotherapy|        S|     31397|51-60|                    2|  India|
|           23|  anesthesia|        S|     31397|51-60|                    2|  India|
|           32|radiotherapy|        S|     31397|51-60|                    2|  India|
|           23|radiotherapy|        Q|     31397|51-60|                    2|  India|
|            1|  gynecology|        R|     31397|51-60

---

Suppose we want to create a new column, based on the column `ward_type`. 

If the `ward_type` is `Q`, then the column value is 2; otherwise it is 0.

---

In [10]:
# Filter condition
# SQL IF(condition, value_if_true, value_if_false): 2 where ward_type is 'Q', else 0
filter_expression = expr("IF(ward_type = 'Q', 2, 0)")

In [11]:
# Add new column to the data
# Built from sample_data again, so the result does not carry the earlier `country` column;
# the name updated_sample_data is rebound to this new DataFrame
updated_sample_data = sample_data.withColumn("ward_number_using_expr", filter_expression)

In [12]:
# Display the data with the new `ward_number_using_expr` column
updated_sample_data.show()

+-------------+------------+---------+----------+-----+---------------------+----------------------+
|hospital_code|  department|ward_type|patient_id|  age|visitors_with_patient|ward_number_using_expr|
+-------------+------------+---------+----------+-----+---------------------+----------------------+
|            8|radiotherapy|        R|     31397|51-60|                    2|                     0|
|            2|radiotherapy|        S|     31397|51-60|                    2|                     0|
|           10|  anesthesia|        S|     31397|51-60|                    2|                     0|
|           26|radiotherapy|        R|     31397|51-60|                    2|                     0|
|           26|radiotherapy|        S|     31397|51-60|                    2|                     0|
|           23|  anesthesia|        S|     31397|51-60|                    2|                     0|
|           32|radiotherapy|        S|     31397|51-60|                    2|              

---
---

#### `Using when & otherwise`

- Add a new column using the [when](https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#pyspark.sql.functions.when) & [otherwise](https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#pyspark.sql.Column.otherwise) functions.
- Next, use the `withColumn` function and then pass the name of the new column as `ward_number_using_when` along with the filter condition we defined.


---

In [13]:
# Define the condition
# Row-wise test: is this row's ward_type equal to 'Q'?
is_ward_q = sample_data["ward_type"] == 'Q'
# 2 where the test holds, 0 for every other row
filter_condition = when(is_ward_q, 2).otherwise(0)

In [14]:
# Add new column `ward_number_using_when`, computed per row from filter_condition
updated_sample_data = sample_data.withColumn("ward_number_using_when", filter_condition)

In [15]:
# Show the data with the new `ward_number_using_when` column
updated_sample_data.show()

+-------------+------------+---------+----------+-----+---------------------+----------------------+
|hospital_code|  department|ward_type|patient_id|  age|visitors_with_patient|ward_number_using_when|
+-------------+------------+---------+----------+-----+---------------------+----------------------+
|            8|radiotherapy|        R|     31397|51-60|                    2|                     0|
|            2|radiotherapy|        S|     31397|51-60|                    2|                     0|
|           10|  anesthesia|        S|     31397|51-60|                    2|                     0|
|           26|radiotherapy|        R|     31397|51-60|                    2|                     0|
|           26|radiotherapy|        S|     31397|51-60|                    2|                     0|
|           23|  anesthesia|        S|     31397|51-60|                    2|                     0|
|           32|radiotherapy|        S|     31397|51-60|                    2|              

---

We can also chain several `when` calls to apply conditions on multiple columns.

For example, if `ward_type` == `Q` then map value to 2. 

If it is not `Q`, then check `visitors_with_patient`. 

If `visitors_with_patient` is less than 4, then map value to 1, and for rest of the cases, map value to 0.

---

In [16]:
# Define condition
# Branches are evaluated in order; the first one that matches supplies the value
filter_condition = (
    when(sample_data["ward_type"] == 'Q', 2)
    .when(sample_data["visitors_with_patient"] < 4, 1)
    .otherwise(0)
)

In [17]:
# Add new column `new_column_using_when`, computed per row from the chained condition
updated_sample_data = sample_data.withColumn("new_column_using_when", filter_condition)

In [18]:
# Show the data with the new `new_column_using_when` column
updated_sample_data.show()

+-------------+------------+---------+----------+-----+---------------------+---------------------+
|hospital_code|  department|ward_type|patient_id|  age|visitors_with_patient|new_column_using_when|
+-------------+------------+---------+----------+-----+---------------------+---------------------+
|            8|radiotherapy|        R|     31397|51-60|                    2|                    1|
|            2|radiotherapy|        S|     31397|51-60|                    2|                    1|
|           10|  anesthesia|        S|     31397|51-60|                    2|                    1|
|           26|radiotherapy|        R|     31397|51-60|                    2|                    1|
|           26|radiotherapy|        S|     31397|51-60|                    2|                    1|
|           23|  anesthesia|        S|     31397|51-60|                    2|                    1|
|           32|radiotherapy|        S|     31397|51-60|                    2|                    1|


---
---

#### `Using User Defined Functions`



Creating columns using [udf](https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#pyspark.sql.functions.udf) function.


---

In [19]:
from pyspark.sql.functions import udf

---


Define the Python function `encode_ward_visitor`, which takes the parameters *ward* and *visitor*. 

If the ward is `Q` then return 2.

If it is not `Q`, then check `visitors_with_patient`. 

If `visitors_with_patient` is less than 4, then return 1. Else return 0.

---

In [20]:
# Define the function to encode ward_type together with the visitor count
def encode_ward_visitor(ward, visitor) -> int:
    """Encode a row's ward type and visitor count as 2, 1 or 0.

    Args:
        ward: Ward type code (e.g. "Q"); may be None when the value is null.
        visitor: Number of visitors with the patient; may be None when the
            value is null.

    Returns:
        2 if ``ward`` is "Q"; otherwise 1 if ``visitor`` is known and less
        than 4; otherwise 0.
    """
    if ward == "Q":
        return 2
    # Spark hands nulls to a Python UDF as None, and `None < 4` raises
    # TypeError. Treat a missing count as "not less than 4" so the row falls
    # through to 0, matching the when/otherwise version of this rule.
    if visitor is not None and visitor < 4:
        return 1
    return 0

---

Now we need to convert this function into a `udf`. 

In the `udf` function we pass the function `encode_ward_visitor` that we defined and we also define the return type.

---

In [21]:
# Wrap the plain Python function as a Spark UDF that returns an integer column
function_with_udf = udf(encode_ward_visitor, tp.IntegerType())

---

Use the `withColumn` function and then pass the name of the new column as `new_column_using_udf` along with the udf function applied to its input columns.

---

In [22]:
# Create new column
# Apply the UDF row by row to the ward type and visitor count columns
encoded_ward_visitor = function_with_udf(
    sample_data["ward_type"],
    sample_data["visitors_with_patient"],
)
updated_sample_data = sample_data.withColumn("new_column_using_udf", encoded_ward_visitor)

In [23]:
# Show the data with the new `new_column_using_udf` column
updated_sample_data.show()

+-------------+------------+---------+----------+-----+---------------------+--------------------+
|hospital_code|  department|ward_type|patient_id|  age|visitors_with_patient|new_column_using_udf|
+-------------+------------+---------+----------+-----+---------------------+--------------------+
|            8|radiotherapy|        R|     31397|51-60|                    2|                   1|
|            2|radiotherapy|        S|     31397|51-60|                    2|                   1|
|           10|  anesthesia|        S|     31397|51-60|                    2|                   1|
|           26|radiotherapy|        R|     31397|51-60|                    2|                   1|
|           26|radiotherapy|        S|     31397|51-60|                    2|                   1|
|           23|  anesthesia|        S|     31397|51-60|                    2|                   1|
|           32|radiotherapy|        S|     31397|51-60|                    2|                   1|
|         