## Working with Different Types of Data


### Step 1: Initialize PySpark Session


In [80]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when,avg,regexp_extract,coalesce,regexp_replace,struct,create_map,explode,to_json,from_json

from pyspark.sql.types import StructType, StructField, StringType, IntegerType


# Create a Spark session
spark = SparkSession.builder.appName("day3").getOrCreate()


### Step 2: Load the Dataset


In [24]:
# Load the Chipotle dataset into a Spark DataFrame
data_path = "titanic.csv"  # Replace with the actual path
titanic_df = spark.read.csv(data_path, header=True, inferSchema=True)

# Load the Chipotle dataset into a Spark DataFrame
data_path = 'chipotle.csv' # Replace with the actual path
chipotle_df = spark.read.csv(data_path, header=True, inferSchema=True)

# Load the Chipotle dataset into a Spark DataFrame
data_path = 'kalimati_tarkari_dataset.csv' # Replace with the actual path
kalimati_df = spark.read.csv(data_path, header=True, inferSchema=True)


In [25]:
print(titanic_df.printSchema(),chipotle_df.printSchema(),kalimati_df.printSchema())

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)

root
 |-- _c0: integer (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- item_name: string (nullable = true)
 |-- choice_description: string (nullable = true)
 |-- item_price: string (nullable = true)

root
 |-- SN: integer (nullable = true)
 |-- Commodity: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Unit: string (nullable = true)
 |-- Minimum: double (nullable = true)
 |-- Maximum: double (nullable = true)
 |-- Average: double (nullable = true)

None None N

### Converting to Spark Types:

Question: Load the "titanic" dataset and convert the "Fare" column from double to integer.




In [26]:
# Convert the "fare" column from double to integer
titanic_df = titanic_df.withColumn("fare", col("fare").cast("int"))
titanic_df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- fare: integer (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



### Working with Booleans:

Question: Load the "titanic" dataset and add a new column "IsAdult" that indicates whether a passenger is an adult (age >= 18) or not.

In [27]:
titanic_df_age_gt_18=titanic_df.withColumn("IsAdult", when(col("age")>=18,True).otherwise(False))
titanic_df_age_gt_18.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+-------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|fare|Cabin|Embarked|IsAdult|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+-------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7| null|       S|   true|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|  71|  C85|       C|   true|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|   7| null|       S|   true|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|  53| C123|       S|   true|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8| null|       S|   true|
|          6|       0|     3|    Moran, 

### Working with Numbers:

Question: Load the "titanic" dataset and calculate the average age of male and female passengers separately.

In [28]:
titanic_df_gender=titanic_df.groupBy("Sex").agg(avg("age")).alias("Avg")
titanic_df_gender.show()

+------+------------------+
|   Sex|          avg(age)|
+------+------------------+
|female|27.915708812260537|
|  male| 30.72664459161148|
+------+------------------+



### Working with Strings:

Question: Load the "chipotle" dataset and find the item names containing the word "Chicken."

In [33]:
# chipotle_df.select(regexp_extract (col("item_name"),"Chicken",1),"*").show()


# Filter for item names containing the word "Chicken"
chicken_items = chipotle_df.filter(col("item_name").like("%Chicken%"))
chicken_items.show()

+---+--------+--------+--------------------+--------------------+----------+
|_c0|order_id|quantity|           item_name|  choice_description|item_price|
+---+--------+--------+--------------------+--------------------+----------+
|  4|       2|       2|        Chicken Bowl|[Tomatillo-Red Ch...|   $16.98 |
|  5|       3|       1|        Chicken Bowl|[Fresh Tomato Sal...|   $10.98 |
| 11|       6|       1|Chicken Crispy Tacos|[Roasted Chili Co...|    $8.75 |
| 12|       6|       1|  Chicken Soft Tacos|[Roasted Chili Co...|    $8.75 |
| 13|       7|       1|        Chicken Bowl|[Fresh Tomato Sal...|   $11.25 |
| 16|       8|       1|     Chicken Burrito|[Tomatillo-Green ...|    $8.49 |
| 17|       9|       1|     Chicken Burrito|[Fresh Tomato Sal...|    $8.49 |
| 19|      10|       1|        Chicken Bowl|[Tomatillo Red Ch...|    $8.75 |
| 23|      12|       1|     Chicken Burrito|[[Tomatillo-Green...|   $10.98 |
| 26|      13|       1|        Chicken Bowl|[Roasted Chili Co...|    $8.49 |

23/08/31 16:51:59 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , order_id, quantity, item_name, choice_description, item_price
 Schema: _c0, order_id, quantity, item_name, choice_description, item_price
Expected: _c0 but found: 
CSV file: file:///home/ubuntu/Desktop/Spark_assignment/DAY-3/chipotle.csv


### Regular Expressions:

Question: Load the "chipotle" dataset and find the items with names that start with "Ch" followed by any character.



In [35]:

# Filter for item names that starts with "Ch"
chicken_items = chipotle_df.filter(col("item_name").like("Ch%"))
chicken_items.show()

+---+--------+--------+--------------------+--------------------+----------+
|_c0|order_id|quantity|           item_name|  choice_description|item_price|
+---+--------+--------+--------------------+--------------------+----------+
|  0|       1|       1|Chips and Fresh T...|                null|    $2.39 |
|  3|       1|       1|Chips and Tomatil...|                null|    $2.39 |
|  4|       2|       2|        Chicken Bowl|[Tomatillo-Red Ch...|   $16.98 |
|  5|       3|       1|        Chicken Bowl|[Fresh Tomato Sal...|   $10.98 |
| 10|       5|       1| Chips and Guacamole|                null|    $4.45 |
| 11|       6|       1|Chicken Crispy Tacos|[Roasted Chili Co...|    $8.75 |
| 12|       6|       1|  Chicken Soft Tacos|[Roasted Chili Co...|    $8.75 |
| 13|       7|       1|        Chicken Bowl|[Fresh Tomato Sal...|   $11.25 |
| 14|       7|       1| Chips and Guacamole|                null|    $4.45 |
| 15|       8|       1|Chips and Tomatil...|                null|    $2.39 |

23/08/31 16:52:48 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , order_id, quantity, item_name, choice_description, item_price
 Schema: _c0, order_id, quantity, item_name, choice_description, item_price
Expected: _c0 but found: 
CSV file: file:///home/ubuntu/Desktop/Spark_assignment/DAY-3/chipotle.csv


### Working with Nulls in Data:

Question: Load the "titanic" dataset and count the number of passengers with missing age information.



In [36]:
# Count the number of passengers with missing age information
missing_age_count = titanic_df.filter(col("age").isNull()).count()

# Show the count
print("Number of passengers with missing age information:", missing_age_count)

Number of passengers with missing age information: 177


### Coalesce
Question: Utilizing the Chipotle dataset, use the coalesce function to combine the "item_name" and "choice_description" columns into a new column named "OrderDetails." Display the first 5 rows of the resulting DataFrame.

In [42]:
# Combine the "item_name" and "choice_description" columns using coalesce

combined_df = chipotle_df.withColumn("OrderDetails", coalesce(col("item_name"), col("choice_description")))
combined_df.show()

+---+--------+--------+--------------------+--------------------+----------+--------------------+
|_c0|order_id|quantity|           item_name|  choice_description|item_price|        OrderDetails|
+---+--------+--------+--------------------+--------------------+----------+--------------------+
|  0|       1|       1|Chips and Fresh T...|                null|    $2.39 |Chips and Fresh T...|
|  1|       1|       1|                Izze|        [Clementine]|    $3.39 |                Izze|
|  2|       1|       1|    Nantucket Nectar|             [Apple]|    $3.39 |    Nantucket Nectar|
|  3|       1|       1|Chips and Tomatil...|                null|    $2.39 |Chips and Tomatil...|
|  4|       2|       2|        Chicken Bowl|[Tomatillo-Red Ch...|   $16.98 |        Chicken Bowl|
|  5|       3|       1|        Chicken Bowl|[Fresh Tomato Sal...|   $10.98 |        Chicken Bowl|
|  6|       3|       1|       Side of Chips|                null|    $1.69 |       Side of Chips|
|  7|       4|      

23/08/31 16:59:48 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , order_id, quantity, item_name, choice_description, item_price
 Schema: _c0, order_id, quantity, item_name, choice_description, item_price
Expected: _c0 but found: 
CSV file: file:///home/ubuntu/Desktop/Spark_assignment/DAY-3/chipotle.csv


### ifnull, nullIf, nvl, and nvl2

Question: Replace the null values in the "Age" column of the Titanic dataset with the average age.

In [50]:
avg_age=titanic_df.agg(avg(col("age"))).first()[0]
# avg_age

# Replace null values in the "age" column with the average age using ifnull
# titanic_df_ifnull = titanic_df.withColumn("age", ifnull(col("age"), average_age))
# titanic_df_ifnull.show()

# Replace null values in the "age" column with the average age using when
titanic_df_replace_null = titanic_df.withColumn("age", when(col("age").isNull(), avg_age).otherwise(col("age")))
titanic_df_replace_null.show()

+-----------+--------+------+--------------------+------+-----------------+-----+-----+----------------+----+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|              age|SibSp|Parch|          Ticket|fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+-----------------+-----+-----+----------------+----+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|             22.0|    1|    0|       A/5 21171|   7| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|             38.0|    1|    0|        PC 17599|  71|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|             26.0|    0|    0|STON/O2. 3101282|   7| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|             35.0|    1|    0|          113803|  53| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|             35.0|    0|    0|          373450|   8| null|       S|


### drop

Question: Remove the "Cabin" column from the Titanic dataset.


In [51]:
titanic_df.drop(col("Cabin")).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|fare|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|  71|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|   7|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|  53|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|          330877|   8|       Q|
|          7|       0|     1|McCarthy, Mr. Tim

### fill

Question: Fill the null values in the "Age" column of the Titanic dataset with a default age of 30.

In [52]:
# Fill null values in the "age" column with a default age of 30 using .na.fill

titanic_df.na.fill(30, subset=["age"]).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+----+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|  71|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|   7| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|  53| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|30.0|    0|    0|          330877|   8| null|  

###  replace

Question: Replace the gender "male" with "M" and "female" with "F" in the "Sex" column of the Titanic dataset.

In [79]:
# Replace "male" with "M" and "female" with "F" in the "Sex" column using regexp_replace
titanic_df.na.replace({"male":"m","female":"f"},"sex").show()

+-----------+--------+------+--------------------+---+----+-----+-----+----------------+----+-----+--------+
|PassengerId|Survived|Pclass|                Name|Sex| Age|SibSp|Parch|          Ticket|fare|Cabin|Embarked|
+-----------+--------+------+--------------------+---+----+-----+-----+----------------+----+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  m|22.0|    1|    0|       A/5 21171|   7| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|  f|38.0|    1|    0|        PC 17599|  71|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|  f|26.0|    0|    0|STON/O2. 3101282|   7| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|  f|35.0|    1|    0|          113803|  53| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  m|35.0|    0|    0|          373450|   8| null|       S|
|          6|       0|     3|    Moran, Mr. James|  m|null|    0|    0|          330877|   8| null|       Q|
|          7|      



### 6. Working with Complex Types: Structs

Question: Create a new DataFrame from the Kalimati Tarkari dataset, including a new column "PriceRange" that is a struct containing "Minimum" and "Maximum" prices for each commodity.

In [66]:
#struct containing "Minimum" and "Maximum" prices for each commodity
kalimati_df.select(struct("Minimum","Maximum").alias("PriceRange"),"*").show()

+------------+---+--------------------+----------+----+-------+-------+-------+
|  PriceRange| SN|           Commodity|      Date|Unit|Minimum|Maximum|Average|
+------------+---+--------------------+----------+----+-------+-------+-------+
|{35.0, 40.0}|  0|  Tomato Big(Nepali)|2013-06-16|  Kg|   35.0|   40.0|   37.5|
|{26.0, 32.0}|  1| Tomato Small(Local)|2013-06-16|  Kg|   26.0|   32.0|   29.0|
|{20.0, 21.0}|  2|          Potato Red|2013-06-16|  Kg|   20.0|   21.0|   20.5|
|{15.0, 16.0}|  3|        Potato White|2013-06-16|  Kg|   15.0|   16.0|   15.5|
|{28.0, 30.0}|  4|  Onion Dry (Indian)|2013-06-16|  Kg|   28.0|   30.0|   29.0|
|{30.0, 35.0}|  5|       Carrot(Local)|2013-06-16|  Kg|   30.0|   35.0|   32.5|
| {6.0, 10.0}|  6|      Cabbage(Local)|2013-06-16|  Kg|    6.0|   10.0|    8.0|
|{30.0, 35.0}|  7|         Cauli Local|2013-06-16|  Kg|   30.0|   35.0|   32.5|
|{35.0, 40.0}|  8|         Raddish Red|2013-06-16|  Kg|   35.0|   40.0|   37.5|
|{25.0, 30.0}|  9|Raddish White(Local)|2

### Working with Complex Types: Arrays
Question: Create a new DataFrame from the Kalimati Tarkari dataset, including a new column "CommodityList" that is an array of all the commodities.


In [74]:
#new DataFrame from the Kalimati Tarkari dataset, including a new column "CommodityList" that is an array of all the commodities

complex_array_df = kalimati_df.selectExpr("*","array(Commodity) as CommodityList")
complex_array_df.show()

+---+--------------------+----------+----+-------+-------+-------+--------------------+
| SN|           Commodity|      Date|Unit|Minimum|Maximum|Average|       CommodityList|
+---+--------------------+----------+----+-------+-------+-------+--------------------+
|  0|  Tomato Big(Nepali)|2013-06-16|  Kg|   35.0|   40.0|   37.5|[Tomato Big(Nepali)]|
|  1| Tomato Small(Local)|2013-06-16|  Kg|   26.0|   32.0|   29.0|[Tomato Small(Loc...|
|  2|          Potato Red|2013-06-16|  Kg|   20.0|   21.0|   20.5|        [Potato Red]|
|  3|        Potato White|2013-06-16|  Kg|   15.0|   16.0|   15.5|      [Potato White]|
|  4|  Onion Dry (Indian)|2013-06-16|  Kg|   28.0|   30.0|   29.0|[Onion Dry (Indian)]|
|  5|       Carrot(Local)|2013-06-16|  Kg|   30.0|   35.0|   32.5|     [Carrot(Local)]|
|  6|      Cabbage(Local)|2013-06-16|  Kg|    6.0|   10.0|    8.0|    [Cabbage(Local)]|
|  7|         Cauli Local|2013-06-16|  Kg|   30.0|   35.0|   32.5|       [Cauli Local]|
|  8|         Raddish Red|2013-0

### Working with Complex Types: explode

Question: Explode the "CommodityList" array column from the previous step to generate a new row for each commodity in the list.

In [77]:
#explode commoditylist array using explode method
complex_array_df.withColumn("exploded",explode("CommodityList")).show(5)


+---+-------------------+----------+----+-------+-------+-------+--------------------+-------------------+
| SN|          Commodity|      Date|Unit|Minimum|Maximum|Average|       CommodityList|           exploded|
+---+-------------------+----------+----+-------+-------+-------+--------------------+-------------------+
|  0| Tomato Big(Nepali)|2013-06-16|  Kg|   35.0|   40.0|   37.5|[Tomato Big(Nepali)]| Tomato Big(Nepali)|
|  1|Tomato Small(Local)|2013-06-16|  Kg|   26.0|   32.0|   29.0|[Tomato Small(Loc...|Tomato Small(Local)|
|  2|         Potato Red|2013-06-16|  Kg|   20.0|   21.0|   20.5|        [Potato Red]|         Potato Red|
|  3|       Potato White|2013-06-16|  Kg|   15.0|   16.0|   15.5|      [Potato White]|       Potato White|
|  4| Onion Dry (Indian)|2013-06-16|  Kg|   28.0|   30.0|   29.0|[Onion Dry (Indian)]| Onion Dry (Indian)|
+---+-------------------+----------+----+-------+-------+-------+--------------------+-------------------+
only showing top 5 rows



### Working with Complex Types: Maps

Question: .Create a new DataFrame from the Kalimati Tarkari dataset, including a new column "PriceMap" that is a map with "Commodity" as the key and "Average" price as the value
Answer:

In [70]:
#using map to map commodity as key and average as value
kalimati_df.select(create_map(col("Commodity"),col("Average")).alias("PriceMap"),"*").show()

+--------------------+---+--------------------+----------+----+-------+-------+-------+
|            PriceMap| SN|           Commodity|      Date|Unit|Minimum|Maximum|Average|
+--------------------+---+--------------------+----------+----+-------+-------+-------+
|{Tomato Big(Nepal...|  0|  Tomato Big(Nepali)|2013-06-16|  Kg|   35.0|   40.0|   37.5|
|{Tomato Small(Loc...|  1| Tomato Small(Local)|2013-06-16|  Kg|   26.0|   32.0|   29.0|
|{Potato Red -> 20.5}|  2|          Potato Red|2013-06-16|  Kg|   20.0|   21.0|   20.5|
|{Potato White -> ...|  3|        Potato White|2013-06-16|  Kg|   15.0|   16.0|   15.5|
|{Onion Dry (India...|  4|  Onion Dry (Indian)|2013-06-16|  Kg|   28.0|   30.0|   29.0|
|{Carrot(Local) ->...|  5|       Carrot(Local)|2013-06-16|  Kg|   30.0|   35.0|   32.5|
|{Cabbage(Local) -...|  6|      Cabbage(Local)|2013-06-16|  Kg|    6.0|   10.0|    8.0|
|{Cauli Local -> 3...|  7|         Cauli Local|2013-06-16|  Kg|   30.0|   35.0|   32.5|
|{Raddish Red -> 3...|  8|      

### Working with JSON

Question: Convert the "kalimati_df" DataFrame to JSON format and write it to a JSON file.

In [82]:
kalimatiJSON = kalimati_df.selectExpr("(Sn,Commodity,date,unit,minimum,maximum,average) as KalimatiJson").select(to_json(col("KalimatiJson")))
# jsonKalimati.show()
kalimatiJSON.write.json("kalimatiJSON.json")

AnalysisException: Invalid usage of '*' in expression 'alias'.