In [56]:
from pyspark.sql import SparkSession
import getpass
# Current OS user; used to build a per-user Hive warehouse directory.
username = getpass.getuser()

# Create (or reuse) a Hive-enabled SparkSession that runs on YARN.
# NOTE(review): spark.ui.port='0' presumably lets Spark pick any free UI port — confirm.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [57]:
# Show the SparkSession object as the cell result to confirm it was created.
spark

In [3]:
# In-memory sample rows, one (season, windspeed) tuple per season.
Data = [
    ("Spring", 12.3),
    ("Summer", 10.5),
    ("Autumn", 8.2),
    ("Winter", 15.1),
]

In [4]:
# Build a DataFrame from the in-memory rows. Only column names are supplied,
# so the column types come from the Python values (string / double).
df1 = spark.createDataFrame(
    Data,
    schema=["season", "windspeed"],
)

In [5]:
# Print the rows of df1 as a text table.
df1.show()

+------+---------+
|season|windspeed|
+------+---------+
|Spring|     12.3|
|Summer|     10.5|
|Autumn|      8.2|
|Winter|     15.1|
+------+---------+



In [6]:
# Print the column names, types and nullability of df1.
df1.printSchema()

root
 |-- season: string (nullable = true)
 |-- windspeed: double (nullable = true)



In [7]:
# Dump the raw library JSON from HDFS to inspect its nested structure before writing a schema.
! hadoop fs -cat /public/trendytech/datasets/library_data.json 

{"library_name": "Central Library","location": "City Center","books": [{"book_id": "B001","book_name": "The Great Gatsby","author": "F. Scott Fitzgerald","copies_available": 5},{"book_id": "B002","book_name": "To Kill a Mockingbird","author": "Harper Lee","copies_available": 3}],"members": [{"member_id": "M001","member_name": "John Smith","age": 28,"books_borrowed": ["B001"]},{"member_id": "M002","member_name": "Emma Johnson","age": 35,"books_borrowed": []}]},
{"library_name": "Community Library","location": "Suburb","books": [{"book_id": "B003","book_name": "1984","author": "George Orwell","copies_available": 2},{"book_id": "B004","book_name": "Pride and Prejudice","author": "Jane Austen","copies_available": 4}],"members": [{"member_id": "M003","member_name": "Michael Brown","age": 42,"books_borrowed": ["B003","B004"]},{"member_id": "M004","member_name": "Sophia Davis","age": 31,"books_borrowed": ["B004"]}]}


In [8]:
from pyspark.sql.types import *

In [9]:
# Explicit schema for library_data.json.
# Each library record carries two arrays of structs: `books` and `members`.

# Element type of the `books` array.
book_struct = StructType([
    StructField("book_id", StringType()),
    StructField("book_name", StringType()),
    StructField("author", StringType()),
    StructField("copies_available", IntegerType()),
])

# Element type of the `members` array; `books_borrowed` is itself an array of strings.
member_struct = StructType([
    StructField("member_id", StringType()),
    StructField("member_name", StringType()),
    StructField("age", IntegerType()),
    StructField("books_borrowed", ArrayType(StringType())),
])

# Top-level record layout.
schema = StructType([
    StructField("library_name", StringType()),
    StructField("location", StringType()),
    StructField("books", ArrayType(book_struct)),
    StructField("members", ArrayType(member_struct)),
])

In [10]:
# Read the library JSON using the explicit nested schema instead of schema inference.
library_df = (
    spark.read
    .schema(schema)
    .json("/public/trendytech/datasets/library_data.json")
)

In [11]:
# Print the library rows; the nested array columns are truncated in the text table.
library_df.show()

+-----------------+-----------+--------------------+--------------------+
|     library_name|   location|               books|             members|
+-----------------+-----------+--------------------+--------------------+
|  Central Library|City Center|[{B001, The Great...|[{M001, John Smit...|
|Community Library|     Suburb|[{B003, 1984, Geo...|[{M003, Michael B...|
+-----------------+-----------+--------------------+--------------------+



In [12]:
# Print the nested schema to verify it matches the explicit StructType definition.
library_df.printSchema()

root
 |-- library_name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- books: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- book_id: string (nullable = true)
 |    |    |-- book_name: string (nullable = true)
 |    |    |-- author: string (nullable = true)
 |    |    |-- copies_available: integer (nullable = true)
 |-- members: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- member_id: string (nullable = true)
 |    |    |-- member_name: string (nullable = true)
 |    |    |-- age: integer (nullable = true)
 |    |    |-- books_borrowed: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)



In [13]:
# Peek at the first lines of train.csv on HDFS (header row plus sample records).
! hadoop fs -cat /public/trendytech/datasets/train.csv |head

train_number,train_name,seats_available,passenger_name,age,ticket_number,seat_number
123,Express,100,John,25,T123,A1
123,Express,100,Emma,30,T124,B2
456,Superfast,150,Michael,35,T125,C3
456,Superfast,150,Sophia,40,T126,D4
789,Local,50,William,28,T127,E5
789,Local,50,Sophia,32,T128,F6
789,Local,50,Oliver,45,T129,G7


In [14]:
# Load train.csv: the first line supplies the column names and Spark infers the column types.
train_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("/public/trendytech/datasets/train.csv")
)

In [15]:
# Print the loaded train rows as a text table.
train_df.show()

+------------+----------+---------------+--------------+---+-------------+-----------+
|train_number|train_name|seats_available|passenger_name|age|ticket_number|seat_number|
+------------+----------+---------------+--------------+---+-------------+-----------+
|         123|   Express|            100|          John| 25|         T123|         A1|
|         123|   Express|            100|          Emma| 30|         T124|         B2|
|         456| Superfast|            150|       Michael| 35|         T125|         C3|
|         456| Superfast|            150|        Sophia| 40|         T126|         D4|
|         789|     Local|             50|       William| 28|         T127|         E5|
|         789|     Local|             50|        Sophia| 32|         T128|         F6|
|         789|     Local|             50|        Oliver| 45|         T129|         G7|
+------------+----------+---------------+--------------+---+-------------+-----------+



In [16]:
# Remove the passenger_name and age columns; train_df itself is left unchanged.
dropped_df = train_df.drop(
    "passenger_name",
    "age",
)

In [17]:
# Print the train rows without the dropped passenger columns.
dropped_df.show()

+------------+----------+---------------+-------------+-----------+
|train_number|train_name|seats_available|ticket_number|seat_number|
+------------+----------+---------------+-------------+-----------+
|         123|   Express|            100|         T123|         A1|
|         123|   Express|            100|         T124|         B2|
|         456| Superfast|            150|         T125|         C3|
|         456| Superfast|            150|         T126|         D4|
|         789|     Local|             50|         T127|         E5|
|         789|     Local|             50|         T128|         F6|
|         789|     Local|             50|         T129|         G7|
+------------+----------+---------------+-------------+-----------+



In [18]:
# Count the rows that remain after de-duplicating on (train_number, ticket_number).
# NOTE: despite its name, duplicates_df holds an integer row count, not a DataFrame,
# and it counts the rows kept rather than the duplicates removed.
duplicates_df = (
    train_df
    .dropDuplicates(["train_number", "ticket_number"])
    .count()
)

In [19]:
# Print the row count computed after de-duplication (an integer).
print(duplicates_df)

7


In [20]:
# Count the distinct train_name values.
# NOTE: despite its name, unique_df holds an integer count, not a DataFrame.
unique_df = (
    train_df
    .select("train_name")
    .distinct()
    .count()
)

In [21]:
# Print the number of distinct train names (an integer).
print(unique_df)

3


In [22]:
# Peek at the first lines of sales_data.json on HDFS (one JSON object per line).
! hadoop fs -cat /public/trendytech/datasets/sales_data.json |head

{"store_id": 1, "product": "Apple", "quantity": 10, "revenue": 100.0}
{"store_id": 2, "product": "Banana", "quantity": 15, "revenue": 75.0}
{"store_id": 3, "product": "Orange", "quantity": 12, "revenue": 90.0}
{"store_id": 4, "product": "Mango", "quantity": 8, "revenue": 120.0}
{"store_id": 5, "product": "Grape", "quantity": 20, "revenue": 150.0}
{"store_id": 6, "product": "Watermelon", "quantity": 5, "revenue": 50.0}
{"store_id": 7, "product": "Strawberry", "quantity": 18, "revenue": 108.0}
{"store_id": 8, "product": "Pineapple", "quantity": 14, "revenue": 140.0}
{"store_id": 9, "product": "Cherry", "quantity": 7, "revenue": 105.0}
{"store_id": 10, "product": "Pear", "quantity": 9, "revenue": 81.0}


In [23]:
# DDL-style schema string for sales_data.json.
# NOTE: this rebinds `schema`, which previously held the library StructType.
schema = (
    "store_id integer,"
    "product string,"
    "quantity integer,"
    "revenue double"
)

In [30]:
# Read the sales JSON with the DDL schema in "permissive" parse mode.
# NOTE(review): per the Spark docs, permissive mode keeps malformed records
# (with null fields) instead of dropping them or failing — confirm.
sales_df = (
    spark.read
    .format("json")
    .schema(schema)
    .option("mode", "permissive")
    .load("/public/trendytech/datasets/sales_data.json")
)

In [31]:
# Row count of the permissive-mode read.
sales_df.count()

22

In [49]:
# Read the same sales JSON in "dropmalformed" parse mode.
# NOTE(review): per the Spark docs, this mode discards records that do not fit
# the schema — confirm.
sales_df1 = (
    spark.read
    .format("json")
    .option("mode", "dropmalformed")
    .schema(schema)
    .load("/public/trendytech/datasets/sales_data.json")
)

In [50]:
# Row count of the dropmalformed-mode read, for comparison with the permissive-mode count.
sales_df1.count()

21

In [None]:
# Read the same sales JSON in "failfast" parse mode; this rebinds sales_df1.
# NOTE(review): Spark reads are lazy, so a malformed record presumably raises
# only once an action (e.g. count/show) runs on this DataFrame — confirm.
sales_df1 = (
    spark.read
    .format("json")
    .option("mode", "failfast")
    .schema(schema)
    .load("/public/trendytech/datasets/sales_data.json")
)

In [51]:
# Peek at the first lines of hospital.csv on HDFS; note the two date columns use different formats.
! hadoop fs -cat /public/trendytech/datasets/hospital.csv |head

patient_id,admission_date,discharge_date,diagnosis,doctor_id,total_cost
1,01-01-2022,2022-01-10,Pneumonia,101,5000.00
2,02-05-2022,2022-02-09,Appendicitis,102,7000.00
3,03-12-2022,2022-03-18,Fractured Arm,103,3500.00
4,04-02-2022,2022-04-08,Heart Attack,104,15000.00
5,05-05-2022,2022-05-07,Influenza,105,2500.00
6,06-10-2022,2022-06-15,Appendicitis,106,8000.00
7,07-20-2022,2022-07-25,Pneumonia,107,5500.00
8,08-25-2022,2022-09-01,Heart Attack,108,20000.00
9,09-15-2022,2022-09-22,Fractured Leg,109,6000.00


In [58]:
# DDL-style schema string for hospital.csv.
# NOTE: this rebinds `schema`, which previously held the sales schema.
schema = (
    "patient_id integer, "
    "admission_date date, "
    "discharge_date date, "
    "diagnosis string, "
    "doctor_id integer,"
    "total_cost float"
)

In [64]:
# Load hospital.csv with the explicit DDL schema.
# dateFormat "MM-dd-yyyy" matches admission_date as stored in the file.
# discharge_date is stored as yyyy-MM-dd, yet the displayed rows show it parsed as well.
# NOTE(review): this presumably relies on Spark's fallback date parsing —
# confirm on the target Spark version.
hosp_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(schema)
    .option("dateFormat", "MM-dd-yyyy")
    .load("/public/trendytech/datasets/hospital.csv")
)

In [65]:
# Print the hospital rows to check that both date columns were parsed.
hosp_df.show()

+----------+--------------+--------------+-------------+---------+----------+
|patient_id|admission_date|discharge_date|    diagnosis|doctor_id|total_cost|
+----------+--------------+--------------+-------------+---------+----------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|      101|    5000.0|
|         2|    2022-02-05|    2022-02-09| Appendicitis|      102|    7000.0|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|      103|    3500.0|
|         4|    2022-04-02|    2022-04-08| Heart Attack|      104|   15000.0|
|         5|    2022-05-05|    2022-05-07|    Influenza|      105|    2500.0|
|         6|    2022-06-10|    2022-06-15| Appendicitis|      106|    8000.0|
|         7|    2022-07-20|    2022-07-25|    Pneumonia|      107|    5500.0|
|         8|    2022-08-25|    2022-09-01| Heart Attack|      108|   20000.0|
|         9|    2022-09-15|    2022-09-22|Fractured Leg|      109|    6000.0|
|        10|    2022-10-05|    2022-10-10| Appendicitis|      11

In [66]:
# Remove the doctor_id column from the hospital data.
# NOTE: this rebinds dropped_df, which an earlier cell used for the train data.
dropped_df = hosp_df.drop("doctor_id")

In [67]:
# Print the hospital rows without the doctor_id column.
dropped_df.show()

+----------+--------------+--------------+-------------+----------+
|patient_id|admission_date|discharge_date|    diagnosis|total_cost|
+----------+--------------+--------------+-------------+----------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|    5000.0|
|         2|    2022-02-05|    2022-02-09| Appendicitis|    7000.0|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|    3500.0|
|         4|    2022-04-02|    2022-04-08| Heart Attack|   15000.0|
|         5|    2022-05-05|    2022-05-07|    Influenza|    2500.0|
|         6|    2022-06-10|    2022-06-15| Appendicitis|    8000.0|
|         7|    2022-07-20|    2022-07-25|    Pneumonia|    5500.0|
|         8|    2022-08-25|    2022-09-01| Heart Attack|   20000.0|
|         9|    2022-09-15|    2022-09-22|Fractured Leg|    6000.0|
|        10|    2022-10-05|    2022-10-10| Appendicitis|    7500.0|
|        11|    2022-11-02|    2022-11-05|    Influenza|    2800.0|
|        12|    2022-12-10|    2022-12-18|    Pn

In [69]:
# Rename total_cost to hospital_bill; every other column is kept as-is.
renamed_df = hosp_df.withColumnRenamed(
    "total_cost",
    "hospital_bill",
)

In [70]:
# Print the hospital rows with the renamed hospital_bill column.
renamed_df.show()

+----------+--------------+--------------+-------------+---------+-------------+
|patient_id|admission_date|discharge_date|    diagnosis|doctor_id|hospital_bill|
+----------+--------------+--------------+-------------+---------+-------------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|      101|       5000.0|
|         2|    2022-02-05|    2022-02-09| Appendicitis|      102|       7000.0|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|      103|       3500.0|
|         4|    2022-04-02|    2022-04-08| Heart Attack|      104|      15000.0|
|         5|    2022-05-05|    2022-05-07|    Influenza|      105|       2500.0|
|         6|    2022-06-10|    2022-06-15| Appendicitis|      106|       8000.0|
|         7|    2022-07-20|    2022-07-25|    Pneumonia|      107|       5500.0|
|         8|    2022-08-25|    2022-09-01| Heart Attack|      108|      20000.0|
|         9|    2022-09-15|    2022-09-22|Fractured Leg|      109|       6000.0|
|        10|    2022-10-05| 

In [75]:
from pyspark.sql.functions import *

In [76]:
# Add Duration_of_Stay: the number of days from admission_date to discharge_date,
# computed with the SQL datediff function.
discharge_df = hosp_df.withColumn(
    "Duration_of_Stay",
    expr("datediff(discharge_date,admission_date)"),
)

In [77]:
# Print the hospital rows together with the derived Duration_of_Stay column.
discharge_df.show()

+----------+--------------+--------------+-------------+---------+----------+----------------+
|patient_id|admission_date|discharge_date|    diagnosis|doctor_id|total_cost|Duration_of_Stay|
+----------+--------------+--------------+-------------+---------+----------+----------------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|      101|    5000.0|               9|
|         2|    2022-02-05|    2022-02-09| Appendicitis|      102|    7000.0|               4|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|      103|    3500.0|               6|
|         4|    2022-04-02|    2022-04-08| Heart Attack|      104|   15000.0|               6|
|         5|    2022-05-05|    2022-05-07|    Influenza|      105|    2500.0|               2|
|         6|    2022-06-10|    2022-06-15| Appendicitis|      106|    8000.0|               5|
|         7|    2022-07-20|    2022-07-25|    Pneumonia|      107|    5500.0|               5|
|         8|    2022-08-25|    2022-09-01| Heart A

In [79]:
# Derive adjusted_total_cost from hospital_bill:
#   - 'Heart Attack' bills are multiplied by 1.5
#   - 'Appendicitis' bills are multiplied by 1.2
#   - every other diagnosis keeps its bill unchanged
# The diagnosis checks use `=` instead of LIKE: the patterns contain no
# wildcards, so LIKE was only ever an exact match and `=` states that directly.
adjusted_df = renamed_df.withColumn(
    "adjusted_total_cost",
    expr(
        "CASE "
        "WHEN diagnosis = 'Heart Attack' THEN hospital_bill * 1.5 "
        "WHEN diagnosis = 'Appendicitis' THEN hospital_bill * 1.2 "
        "ELSE hospital_bill "
        "END"
    ),
)

In [80]:
# Print the hospital rows with the diagnosis-based adjusted_total_cost column.
adjusted_df.show()

+----------+--------------+--------------+-------------+---------+-------------+-------------------+
|patient_id|admission_date|discharge_date|    diagnosis|doctor_id|hospital_bill|adjusted_total_cost|
+----------+--------------+--------------+-------------+---------+-------------+-------------------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|      101|       5000.0|             5000.0|
|         2|    2022-02-05|    2022-02-09| Appendicitis|      102|       7000.0|             8400.0|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|      103|       3500.0|             3500.0|
|         4|    2022-04-02|    2022-04-08| Heart Attack|      104|      15000.0|            22500.0|
|         5|    2022-05-05|    2022-05-07|    Influenza|      105|       2500.0|             2500.0|
|         6|    2022-06-10|    2022-06-15| Appendicitis|      106|       8000.0|             9600.0|
|         7|    2022-07-20|    2022-07-25|    Pneumonia|      107|       5500.0|           

In [81]:
# Keep only the columns needed for the final report, in display order.
hospital_final_df = adjusted_df.select(
    "patient_id",
    "diagnosis",
    "hospital_bill",
    "adjusted_total_cost",
)

In [82]:
# Print the final per-patient view: diagnosis, original bill and adjusted cost.
hospital_final_df.show()

+----------+-------------+-------------+-------------------+
|patient_id|    diagnosis|hospital_bill|adjusted_total_cost|
+----------+-------------+-------------+-------------------+
|         1|    Pneumonia|       5000.0|             5000.0|
|         2| Appendicitis|       7000.0|             8400.0|
|         3|Fractured Arm|       3500.0|             3500.0|
|         4| Heart Attack|      15000.0|            22500.0|
|         5|    Influenza|       2500.0|             2500.0|
|         6| Appendicitis|       8000.0|             9600.0|
|         7|    Pneumonia|       5500.0|             5500.0|
|         8| Heart Attack|      20000.0|            30000.0|
|         9|Fractured Leg|       6000.0|             6000.0|
|        10| Appendicitis|       7500.0|             9000.0|
|        11|    Influenza|       2800.0|             2800.0|
|        12|    Pneumonia|       6000.0|             6000.0|
|        13| Heart Attack|      18000.0|            27000.0|
|        14| Appendiciti