In [160]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
appName("Sneha Spark Session").\
config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [161]:
spark

In [162]:
hospital_df = spark.read.format('csv').option("header", "True").load('/public/trendytech/datasets/hospital.csv')

In [163]:
hospital_df.printSchema()

root
 |-- patient_id: string (nullable = true)
 |-- admission_date: string (nullable = true)
 |-- discharge_date: string (nullable = true)
 |-- diagnosis: string (nullable = true)
 |-- doctor_id: string (nullable = true)
 |-- total_cost: string (nullable = true)



In [164]:
from pyspark.sql.functions import *

In [165]:
hospital_new_df = hospital_df.withColumn("admission_date",to_date("admission_date","mm-dd-yyyy")).withColumn("discharge_date",to_date("discharge_date","yyyy-mm-dd"))

In [166]:
hospital_new_df.printSchema()

root
 |-- patient_id: string (nullable = true)
 |-- admission_date: date (nullable = true)
 |-- discharge_date: date (nullable = true)
 |-- diagnosis: string (nullable = true)
 |-- doctor_id: string (nullable = true)
 |-- total_cost: string (nullable = true)



In [167]:
hospital_new_df.createOrReplaceTempView("hospital")

In [168]:
spark.sql("select count(*) from hospital")

count(1)
25


In [169]:
spark.sql("select * from hospital order by diagnosis")

patient_id,admission_date,discharge_date,diagnosis,doctor_id,total_cost
10,2022-01-05,2022-01-10,Appendicitis,110,7500.0
2,2022-01-05,2022-01-09,Appendicitis,102,7000.0
14,2023-01-14,2023-01-18,Appendicitis,114,7200.0
6,2022-01-10,2022-01-15,Appendicitis,106,8000.0
20,2023-01-10,2023-01-16,Appendicitis,120,7800.0
3,2022-01-12,2022-01-18,Fractured Arm,103,3500.0
15,2023-01-20,2023-01-28,Fractured Arm,115,3800.0
24,2023-01-01,2023-01-07,Fractured Arm,124,4100.0
9,2022-01-15,2022-01-22,Fractured Leg,109,6000.0
19,2023-01-22,2023-01-27,Fractured Leg,119,6500.0


### Running Total of the total_cost based on diagnosis order by admission_date

In [170]:
from pyspark.sql import Window

In [171]:
mywindow = Window.partitionBy("diagnosis").orderBy("admission_date").rowsBetween(Window.unboundedPreceding,Window.currentRow)

In [172]:
result_running = hospital_new_df.withColumn("running_total",sum("total_cost").over(mywindow))

In [173]:
result_running.show()

+----------+--------------+--------------+-------------+---------+----------+-------------+
|patient_id|admission_date|discharge_date|    diagnosis|doctor_id|total_cost|running_total|
+----------+--------------+--------------+-------------+---------+----------+-------------+
|         4|    2022-01-02|    2022-01-08| Heart Attack|      104|  15000.00|      15000.0|
|         8|    2022-01-25|    2022-01-01| Heart Attack|      108|  20000.00|      35000.0|
|        13|    2023-01-02|    2023-01-09| Heart Attack|      113|  18000.00|      53000.0|
|        17|    2023-01-08|    2023-01-11| Heart Attack|      117|  16000.00|      69000.0|
|        22|    2023-01-12|    2023-01-19| Heart Attack|      122|  21000.00|      90000.0|
|         3|    2022-01-12|    2022-01-18|Fractured Arm|      103|   3500.00|       3500.0|
|        24|    2023-01-01|    2023-01-07|Fractured Arm|      124|   4100.00|       7600.0|
|        15|    2023-01-20|    2023-01-28|Fractured Arm|      115|   3800.00|   

In [174]:
spark.sql(""" select patient_id,admission_date,discharge_date,diagnosis,doctor_id,total_cost, 
sum(total_cost) over (partition by diagnosis order by admission_date) as running_total from hospital
""").show()

+----------+--------------+--------------+-------------+---------+----------+-------------+
|patient_id|admission_date|discharge_date|    diagnosis|doctor_id|total_cost|running_total|
+----------+--------------+--------------+-------------+---------+----------+-------------+
|         4|    2022-01-02|    2022-01-08| Heart Attack|      104|  15000.00|      15000.0|
|         8|    2022-01-25|    2022-01-01| Heart Attack|      108|  20000.00|      35000.0|
|        13|    2023-01-02|    2023-01-09| Heart Attack|      113|  18000.00|      53000.0|
|        17|    2023-01-08|    2023-01-11| Heart Attack|      117|  16000.00|      69000.0|
|        22|    2023-01-12|    2023-01-19| Heart Attack|      122|  21000.00|      90000.0|
|         3|    2022-01-12|    2022-01-18|Fractured Arm|      103|   3500.00|       3500.0|
|        24|    2023-01-01|    2023-01-07|Fractured Arm|      124|   4100.00|       7600.0|
|        15|    2023-01-20|    2023-01-28|Fractured Arm|      115|   3800.00|   

In [175]:
### total cost of each diagnosis

In [176]:
result_total_cost = hospital_new_df.groupBy("diagnosis").agg(sum("total_cost").alias("totaldiagnosiscost"))
result_total_cost.show()

+-------------+------------------+
|    diagnosis|totaldiagnosiscost|
+-------------+------------------+
| Heart Attack|           90000.0|
|Fractured Arm|           11400.0|
|Fractured Leg|           12500.0|
| Appendicitis|           37500.0|
|    Influenza|           14100.0|
|    Pneumonia|           26500.0|
+-------------+------------------+



In [177]:
spark.sql("select diagnosis,sum(total_cost)as totaldiagnosiscost from hospital group by diagnosis").show()

+-------------+------------------+
|    diagnosis|totaldiagnosiscost|
+-------------+------------------+
| Heart Attack|           90000.0|
|Fractured Arm|           11400.0|
|Fractured Leg|           12500.0|
| Appendicitis|           37500.0|
|    Influenza|           14100.0|
|    Pneumonia|           26500.0|
+-------------+------------------+



In [178]:
### 2nd highest total_cost in each diagnosis

In [179]:
from pyspark.sql import Window

In [180]:
mywindow = Window.partitionBy("diagnosis").orderBy(desc("total_cost"))


In [181]:
result_total_cost = hospital_new_df.withColumn("denserank",dense_rank().over(mywindow))
result_total_cost.filter("denserank ==2").show()

+----------+--------------+--------------+-------------+---------+----------+---------+
|patient_id|admission_date|discharge_date|    diagnosis|doctor_id|total_cost|denserank|
+----------+--------------+--------------+-------------+---------+----------+---------+
|         8|    2022-01-25|    2022-01-01| Heart Attack|      108|  20000.00|        2|
|        15|    2023-01-20|    2023-01-28|Fractured Arm|      115|   3800.00|        2|
|         9|    2022-01-15|    2022-01-22|Fractured Leg|      109|   6000.00|        2|
|        20|    2023-01-10|    2023-01-16| Appendicitis|      120|   7800.00|        2|
|        21|    2023-01-05|    2023-01-09|    Influenza|      121|   2900.00|        2|
|         7|    2022-01-20|    2022-01-25|    Pneumonia|      107|   5500.00|        2|
+----------+--------------+--------------+-------------+---------+----------+---------+



In [182]:
spark.sql(""" with cte as (select patient_id,admission_date,discharge_date,diagnosis,doctor_id,total_cost,
              dense_rank()over(partition by diagnosis order by total_cost desc) as denserank from hospital )
              select * from cte where denserank =2
""").show()

+----------+--------------+--------------+-------------+---------+----------+---------+
|patient_id|admission_date|discharge_date|    diagnosis|doctor_id|total_cost|denserank|
+----------+--------------+--------------+-------------+---------+----------+---------+
|         8|    2022-01-25|    2022-01-01| Heart Attack|      108|  20000.00|        2|
|        15|    2023-01-20|    2023-01-28|Fractured Arm|      115|   3800.00|        2|
|         9|    2022-01-15|    2022-01-22|Fractured Leg|      109|   6000.00|        2|
|        20|    2023-01-10|    2023-01-16| Appendicitis|      120|   7800.00|        2|
|        21|    2023-01-05|    2023-01-09|    Influenza|      121|   2900.00|        2|
|         7|    2022-01-20|    2022-01-25|    Pneumonia|      107|   5500.00|        2|
+----------+--------------+--------------+-------------+---------+----------+---------+



In [183]:
### Calculate the total_cost difference for each diagnosis based on the previous day

In [184]:
mywindow = Window.partitionBy("diagnosis").orderBy("admission_date")

In [185]:
result_total_cost = hospital_new_df.withColumn("next_cost",lead("total_cost").over(mywindow))

In [186]:
result_final = result_total_cost.withColumn("result_difference",expr("next_cost-total_cost"))
result_final.show()

+----------+--------------+--------------+-------------+---------+----------+---------+-----------------+
|patient_id|admission_date|discharge_date|    diagnosis|doctor_id|total_cost|next_cost|result_difference|
+----------+--------------+--------------+-------------+---------+----------+---------+-----------------+
|         4|    2022-01-02|    2022-01-08| Heart Attack|      104|  15000.00| 20000.00|           5000.0|
|         8|    2022-01-25|    2022-01-01| Heart Attack|      108|  20000.00| 18000.00|          -2000.0|
|        13|    2023-01-02|    2023-01-09| Heart Attack|      113|  18000.00| 16000.00|          -2000.0|
|        17|    2023-01-08|    2023-01-11| Heart Attack|      117|  16000.00| 21000.00|           5000.0|
|        22|    2023-01-12|    2023-01-19| Heart Attack|      122|  21000.00|     null|             null|
|         3|    2022-01-12|    2022-01-18|Fractured Arm|      103|   3500.00|  4100.00|            600.0|
|        24|    2023-01-01|    2023-01-07|Frac

In [187]:
spark.sql(""" with cte as (select patient_id,admission_date,discharge_date,diagnosis,doctor_id,total_cost,
              lead(total_cost)over(partition by diagnosis order by admission_date) as next_cost from hospital )
              select patient_id,admission_date,discharge_date,diagnosis,doctor_id,total_cost
              ,(next_cost - total_cost) as resultdifference from cte
""").show()

+----------+--------------+--------------+-------------+---------+----------+----------------+
|patient_id|admission_date|discharge_date|    diagnosis|doctor_id|total_cost|resultdifference|
+----------+--------------+--------------+-------------+---------+----------+----------------+
|         4|    2022-01-02|    2022-01-08| Heart Attack|      104|  15000.00|          5000.0|
|         8|    2022-01-25|    2022-01-01| Heart Attack|      108|  20000.00|         -2000.0|
|        13|    2023-01-02|    2023-01-09| Heart Attack|      113|  18000.00|         -2000.0|
|        17|    2023-01-08|    2023-01-11| Heart Attack|      117|  16000.00|          5000.0|
|        22|    2023-01-12|    2023-01-19| Heart Attack|      122|  21000.00|            null|
|         3|    2022-01-12|    2022-01-18|Fractured Arm|      103|   3500.00|           600.0|
|        24|    2023-01-01|    2023-01-07|Fractured Arm|      124|   4100.00|          -300.0|
|        15|    2023-01-20|    2023-01-28|Fracture

In [188]:
spark.sql("select count(distinct(doctor_id)) from hospital").show()

+-------------------------+
|count(DISTINCT doctor_id)|
+-------------------------+
|                       25|
+-------------------------+



In [189]:
### Pivot the data based on admission_year and diagnosis and count of patient

In [190]:
from pyspark.sql.functions import year

In [205]:
hospital_new1_df = hospital_new_df.select("diagnosis", year("admission_date").alias("year"))

# Group by diagnosis and pivot on year, count the number of entries, and fill null values with 0
hospital_new1_df.groupBy("diagnosis") \
    .pivot("year") \
    .count() \
    .na.fill(0) \
    .show()

+-------------+----+----+----+
|    diagnosis|2022|2023|2024|
+-------------+----+----+----+
| Heart Attack|   2|   3|   0|
|Fractured Arm|   1|   2|   0|
|Fractured Leg|   1|   1|   0|
| Appendicitis|   3|   2|   0|
|    Influenza|   2|   2|   1|
|    Pneumonia|   3|   2|   0|
+-------------+----+----+----+



In [212]:
### Pivot with spark sql is not working

In [209]:
spark.sql(""" with cte as (SELECT diagnosis,
                   YEAR(admission_date) AS year,
                   COUNT(*) AS count
                   FROM hospital
                   GROUP BY diagnosis, YEAR(admission_date))
                 select diagnosis,pivot(year),count from cte
          """).show()
spark.sql("""
    SELECT diagnosis, 
           PIVOT(
           COALESCE(count, 0) FOR year IN('2022','2023','2024')
    FROM hospital_summary
""").show()


In [215]:
spark.sql("""select diagnosis,'2022','2023','2024' 
            from hospital
              PIVOT(
                  count(patient_id) FOR year(admission_date) IN ('2022','2023','2024')
                  )
         """).show()


ParseException: 
missing 'IN' at '('(line 4, pos 44)

== SQL ==
select diagnosis,'2022','2023','2024' 
            from hospital
              PIVOT(
                  count(patient_id) FOR year(admission_date) IN ('2022','2023','2024')
--------------------------------------------^^^
                  )
         
