In [1]:
from pyspark.sql import SparkSession
import getpass

username = getpass.getuser()

spark = SparkSession.builder \
    .appName("Shubham-Malai") \
    .master("yarn") \
    .config("spark.ui.port", "0") \
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse") \
    .enableHiveSupport() \
    .getOrCreate()

In [2]:
spark

In [3]:
list = [("Spring", 12.3),
("Summer", 10.5),
("Autumn", 8.2),
("Winter", 15.1)]

In [4]:
df_schema = ['season','windspeed']

In [5]:
weather_df = spark.createDataFrame(list, df_schema)

In [6]:
weather_df.show()

+------+---------+
|season|windspeed|
+------+---------+
|Spring|     12.3|
|Summer|     10.5|
|Autumn|      8.2|
|Winter|     15.1|
+------+---------+



In [7]:
weather_df.describe()

summary,season,windspeed
count,4,4.0
mean,,11.525
stddev,,2.9147612823923224
min,Autumn,8.2
max,Winter,15.1


In [8]:
weather_df.printSchema()

root
 |-- season: string (nullable = true)
 |-- windspeed: double (nullable = true)



In [9]:
from pyspark.sql.types import *

In [10]:
library_schema = StructType([
    StructField("books",StringType()),
    StructField("library_name",StringType()),
    StructField("location",StringType()),
    StructField("members",StringType())

    
])

In [11]:
library_df  = spark.read \
.format("json") \
.schema(library_schema) \
.load("/public/trendytech/datasets/library_data.json")


In [12]:
library_df.show()

+--------------------+-----------------+-----------+--------------------+
|               books|     library_name|   location|             members|
+--------------------+-----------------+-----------+--------------------+
|[{"book_id":"B001...|  Central Library|City Center|[{"member_id":"M0...|
|[{"book_id":"B003...|Community Library|     Suburb|[{"member_id":"M0...|
+--------------------+-----------------+-----------+--------------------+



In [13]:
train_df = spark.read \
.format("csv") \
.option("header","true") \
.load("/public/trendytech/datasets/train.csv")

In [14]:
train_df.show()

+------------+----------+---------------+--------------+---+-------------+-----------+
|train_number|train_name|seats_available|passenger_name|age|ticket_number|seat_number|
+------------+----------+---------------+--------------+---+-------------+-----------+
|         123|   Express|            100|          John| 25|         T123|         A1|
|         123|   Express|            100|          Emma| 30|         T124|         B2|
|         456| Superfast|            150|       Michael| 35|         T125|         C3|
|         456| Superfast|            150|        Sophia| 40|         T126|         D4|
|         789|     Local|             50|       William| 28|         T127|         E5|
|         789|     Local|             50|        Sophia| 32|         T128|         F6|
|         789|     Local|             50|        Oliver| 45|         T129|         G7|
+------------+----------+---------------+--------------+---+-------------+-----------+



In [15]:
train_df.printSchema()

root
 |-- train_number: string (nullable = true)
 |-- train_name: string (nullable = true)
 |-- seats_available: string (nullable = true)
 |-- passenger_name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- ticket_number: string (nullable = true)
 |-- seat_number: string (nullable = true)



a) Drop the columns passenger_name and age from the dataset.

In [16]:
train_df2 = train_df.drop("passenger_name","age")

In [17]:
train_df2.show()

+------------+----------+---------------+-------------+-----------+
|train_number|train_name|seats_available|ticket_number|seat_number|
+------------+----------+---------------+-------------+-----------+
|         123|   Express|            100|         T123|         A1|
|         123|   Express|            100|         T124|         B2|
|         456| Superfast|            150|         T125|         C3|
|         456| Superfast|            150|         T126|         D4|
|         789|     Local|             50|         T127|         E5|
|         789|     Local|             50|         T128|         F6|
|         789|     Local|             50|         T129|         G7|
+------------+----------+---------------+-------------+-----------+



b) Count the number of rows after removing duplicates of columns
train_number and ticket_number.

In [18]:
train_df3 = train_df2.dropDuplicates(["train_number","ticket_number"])

In [19]:
train_df3.show()

+------------+----------+---------------+-------------+-----------+
|train_number|train_name|seats_available|ticket_number|seat_number|
+------------+----------+---------------+-------------+-----------+
|         789|     Local|             50|         T128|         F6|
|         456| Superfast|            150|         T125|         C3|
|         789|     Local|             50|         T129|         G7|
|         123|   Express|            100|         T124|         B2|
|         456| Superfast|            150|         T126|         D4|
|         789|     Local|             50|         T127|         E5|
|         123|   Express|            100|         T123|         A1|
+------------+----------+---------------+-------------+-----------+



In [20]:
train_df2.count()

7

In [21]:
train_df3.count()

7

c) Count the number of unique train names.

In [22]:
distinct_trains = train_df3.select("train_name").distinct()

In [23]:
distinct_trains.show()

+----------+
|train_name|
+----------+
|   Express|
|     Local|
| Superfast|
+----------+



1. Read the dataset using the "permissive" mode and count the number of
records read.


In [24]:
sales_df  = spark.read \
.format("json") \
.option("mode","permissive") \
.load("/public/trendytech/datasets/sales_data.json")

In [25]:
sales_df.show(22)

+--------------------+----------+--------+-------+--------+
|     _corrupt_record|   product|quantity|revenue|store_id|
+--------------------+----------+--------+-------+--------+
|                null|     Apple|      10|  100.0|       1|
|                null|    Banana|      15|   75.0|       2|
|                null|    Orange|      12|   90.0|       3|
|                null|     Mango|       8|  120.0|       4|
|                null|     Grape|      20|  150.0|       5|
|                null|Watermelon|       5|   50.0|       6|
|                null|Strawberry|      18|  108.0|       7|
|                null| Pineapple|      14|  140.0|       8|
|                null|    Cherry|       7|  105.0|       9|
|                null|      Pear|       9|   81.0|      10|
|                null| Blueberry|      11|   88.0|      11|
|                null|      Kiwi|      16|  128.0|      12|
|                null|     Peach|      13|   91.0|      13|
|                null|      Plum|       

In [26]:
sales_df.count()

22

Read the dataset using the "dropmalformed" mode and display the
number of malformed records.

In [27]:
sales_df2  = spark.read \
.format("json") \
.option("mode","dropmalformed") \
.load("/public/trendytech/datasets/sales_data.json")

In [28]:
sales_df2.count()

21

In [29]:
sales_df2.show(21)

+----------+--------+-------+--------+
|   product|quantity|revenue|store_id|
+----------+--------+-------+--------+
|     Apple|      10|  100.0|       1|
|    Banana|      15|   75.0|       2|
|    Orange|      12|   90.0|       3|
|     Mango|       8|  120.0|       4|
|     Grape|      20|  150.0|       5|
|Watermelon|       5|   50.0|       6|
|Strawberry|      18|  108.0|       7|
| Pineapple|      14|  140.0|       8|
|    Cherry|       7|  105.0|       9|
|      Pear|       9|   81.0|      10|
| Blueberry|      11|   88.0|      11|
|      Kiwi|      16|  128.0|      12|
|     Peach|      13|   91.0|      13|
|      Plum|       6|   54.0|      14|
|     Lemon|      10|   70.0|      15|
| Raspberry|      17|  136.0|      16|
|   Coconut|       4|   80.0|      17|
|   Avocado|      11|   99.0|      18|
|Blackberry|       8|   64.0|      19|
|         G| Invalid|    NaN|      20|
|Watermelon|       5|Invalid|      22|
+----------+--------+-------+--------+



Read the dataset using the "failfast" mode

In [30]:
# sales_df3  = spark.read \
# .format("json") \
# .option("mode","failfast") \
# .load("/public/trendytech/datasets/sales_data.json")

In [31]:
from pyspark.sql.functions import *

In [32]:
hospital_df = spark.read \
.format("csv") \
.option("header","true") \
.load("/public/trendytech/datasets/hospital.csv")

In [33]:
hospital_df.show(5)

+----------+--------------+--------------+-------------+---------+----------+
|patient_id|admission_date|discharge_date|    diagnosis|doctor_id|total_cost|
+----------+--------------+--------------+-------------+---------+----------+
|         1|    01-01-2022|    2022-01-10|    Pneumonia|      101|   5000.00|
|         2|    02-05-2022|    2022-02-09| Appendicitis|      102|   7000.00|
|         3|    03-12-2022|    2022-03-18|Fractured Arm|      103|   3500.00|
|         4|    04-02-2022|    2022-04-08| Heart Attack|      104|  15000.00|
|         5|    05-05-2022|    2022-05-07|    Influenza|      105|   2500.00|
+----------+--------------+--------------+-------------+---------+----------+
only showing top 5 rows



1. Drop the "doctor_id" column from the dataset.

In [34]:
hospital_df2 = hospital_df.drop("doctor_id")

In [35]:
hospital_df2.show(5)

+----------+--------------+--------------+-------------+----------+
|patient_id|admission_date|discharge_date|    diagnosis|total_cost|
+----------+--------------+--------------+-------------+----------+
|         1|    01-01-2022|    2022-01-10|    Pneumonia|   5000.00|
|         2|    02-05-2022|    2022-02-09| Appendicitis|   7000.00|
|         3|    03-12-2022|    2022-03-18|Fractured Arm|   3500.00|
|         4|    04-02-2022|    2022-04-08| Heart Attack|  15000.00|
|         5|    05-05-2022|    2022-05-07|    Influenza|   2500.00|
+----------+--------------+--------------+-------------+----------+
only showing top 5 rows



. Rename the "total_cost" column to "hospital_bill"

In [36]:
hospital_df3 = hospital_df2.withColumnRenamed("total_cost","hospital_bill")

In [37]:
hospital_df3.show(5)

+----------+--------------+--------------+-------------+-------------+
|patient_id|admission_date|discharge_date|    diagnosis|hospital_bill|
+----------+--------------+--------------+-------------+-------------+
|         1|    01-01-2022|    2022-01-10|    Pneumonia|      5000.00|
|         2|    02-05-2022|    2022-02-09| Appendicitis|      7000.00|
|         3|    03-12-2022|    2022-03-18|Fractured Arm|      3500.00|
|         4|    04-02-2022|    2022-04-08| Heart Attack|     15000.00|
|         5|    05-05-2022|    2022-05-07|    Influenza|      2500.00|
+----------+--------------+--------------+-------------+-------------+
only showing top 5 rows



In [38]:
hospital_df4 = hospital_df3.withColumn("admission_date", date_format(to_date("admission_date", "MM-dd-yyyy"),
        "yyyy-MM-dd"))

hospital_df4.show(5)

3. Add a new column called "duration_of_stay" that represents the number
of days a patient stayed in the hospital. (hint: The duration should be
calculated as the difference between the "discharge_date" and
"admission_date" columns.)

In [39]:
hospital_df5 = hospital_df4.withColumn("duration_of_stay", datediff(col("discharge_date"), col("admission_date")))

In [40]:
hospital_df5.show(5)

+----------+--------------+--------------+-------------+-------------+----------------+
|patient_id|admission_date|discharge_date|    diagnosis|hospital_bill|duration_of_stay|
+----------+--------------+--------------+-------------+-------------+----------------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|      5000.00|               9|
|         2|    2022-02-05|    2022-02-09| Appendicitis|      7000.00|               4|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|      3500.00|               6|
|         4|    2022-04-02|    2022-04-08| Heart Attack|     15000.00|               6|
|         5|    2022-05-05|    2022-05-07|    Influenza|      2500.00|               2|
+----------+--------------+--------------+-------------+-------------+----------------+
only showing top 5 rows



4. Create a new column called "adjusted_total_cost" that calculates the
adjusted total cost based on the diagnosis as follows:
If the diagnosis is "Heart Attack", multiply the hospital_bill by 1.5.
If the diagnosis is "Appendicitis", multiply the hospital_bill by 1.2.
For any other diagnosis, keep the hospital_bill as it is

In [42]:
hospital_df6 = hospital_df5.withColumn(
    "adjusted_total_cost",
    expr("""
        CASE
            WHEN diagnosis = "Heart Attack" THEN hospital_bill*1.5
            WHEN diagnosis = "Appendicitis" THEN hospital_bill*1.2
            ELSE hospital_bill
        END  
        """)
                                      )

In [43]:
hospital_df6.show(5)

+----------+--------------+--------------+-------------+-------------+----------------+-------------------+
|patient_id|admission_date|discharge_date|    diagnosis|hospital_bill|duration_of_stay|adjusted_total_cost|
+----------+--------------+--------------+-------------+-------------+----------------+-------------------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|      5000.00|               9|            5000.00|
|         2|    2022-02-05|    2022-02-09| Appendicitis|      7000.00|               4|             8400.0|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|      3500.00|               6|            3500.00|
|         4|    2022-04-02|    2022-04-08| Heart Attack|     15000.00|               6|            22500.0|
|         5|    2022-05-05|    2022-05-07|    Influenza|      2500.00|               2|            2500.00|
+----------+--------------+--------------+-------------+-------------+----------------+-------------------+
only showing top 5 rows



Select the "patient_id", "diagnosis", "hospital_bill", and
"adjusted_total_cost" columns.

In [None]:
hospital_final_df = hospital_df6.select("patient_id", "diagnosis", "hospital_bill",
"adjusted_total_cost")

In [None]:
hospital_final_df.show()