In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [3]:
sprk = SparkSession.builder.appName("Excercise_2").getOrCreate()

In [11]:
file_path = "/content/sample_data/restaurant-orders.csv"
df = sprk.read.csv(file_path, header=True, inferSchema=True)
df.show()

+--------+----------------+--------------------+--------+-------------+--------------+
|Order ID|      Order Date|           Item Name|Quantity|Product Price|Total products|
+--------+----------------+--------------------+--------+-------------+--------------+
|   25583|03/08/2019 21:58|Tandoori Mixed Grill|       1|        11.95|            12|
|   25583|03/08/2019 21:58|        Madras Sauce|       1|         3.95|            12|
|   25583|03/08/2019 21:58|       Mushroom Rice|       2|         3.95|            12|
|   25583|03/08/2019 21:58|         Garlic Naan|       1|         2.95|            12|
|   25583|03/08/2019 21:58|             Paratha|       1|         2.95|            12|
|   25583|03/08/2019 21:58|          Plain Rice|       1|         2.95|            12|
|   25583|03/08/2019 21:58|         Prawn Puree|       1|         4.95|            12|
|   25583|03/08/2019 21:58|       Plain Papadum|       1|          0.8|            12|
|   25583|03/08/2019 21:58|       Mango Chu

In [12]:
df.printSchema()

root
 |-- Order ID: integer (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Item Name: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Product Price: double (nullable = true)
 |-- Total products: integer (nullable = true)



In [21]:
transformed_df = df.withColumn("Order Date", to_timestamp(col("Order Date"), "MM/dd/yyyy HH:mm"))
transformed_df.show()

+--------+-------------------+--------------------+--------+-------------+--------------+
|Order ID|         Order Date|           Item Name|Quantity|Product Price|Total products|
+--------+-------------------+--------------------+--------+-------------+--------------+
|   25583|2019-03-08 21:58:00|Tandoori Mixed Grill|       1|        11.95|            12|
|   25583|2019-03-08 21:58:00|        Madras Sauce|       1|         3.95|            12|
|   25583|2019-03-08 21:58:00|       Mushroom Rice|       2|         3.95|            12|
|   25583|2019-03-08 21:58:00|         Garlic Naan|       1|         2.95|            12|
|   25583|2019-03-08 21:58:00|             Paratha|       1|         2.95|            12|
|   25583|2019-03-08 21:58:00|          Plain Rice|       1|         2.95|            12|
|   25583|2019-03-08 21:58:00|         Prawn Puree|       1|         4.95|            12|
|   25583|2019-03-08 21:58:00|       Plain Papadum|       1|          0.8|            12|
|   25583|

In [22]:
transformed_df.printSchema()

root
 |-- Order ID: integer (nullable = true)
 |-- Order Date: timestamp (nullable = true)
 |-- Item Name: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Product Price: double (nullable = true)
 |-- Total products: integer (nullable = true)



In [24]:
data = transformed_df.select("Order ID", "Order Date", "Item Name")

In [25]:
data = transformed_df.drop("Order Date")
data.show()

+--------+--------------------+--------+-------------+--------------+
|Order ID|           Item Name|Quantity|Product Price|Total products|
+--------+--------------------+--------+-------------+--------------+
|   25583|Tandoori Mixed Grill|       1|        11.95|            12|
|   25583|        Madras Sauce|       1|         3.95|            12|
|   25583|       Mushroom Rice|       2|         3.95|            12|
|   25583|         Garlic Naan|       1|         2.95|            12|
|   25583|             Paratha|       1|         2.95|            12|
|   25583|          Plain Rice|       1|         2.95|            12|
|   25583|         Prawn Puree|       1|         4.95|            12|
|   25583|       Plain Papadum|       1|          0.8|            12|
|   25583|       Mango Chutney|       2|          0.5|            12|
|   25583|       Onion Chutney|       1|          0.5|            12|
|   25583|          Mint Sauce|       1|          0.5|            12|
|   25583|Chicken Ti

In [28]:
data.write.csv("/content/sample_data/restaurant-orders_update.csv", header=True)

In [29]:
data.write.parquet("/content/sample_data/restaurant-orders-update_(parquet)")

In [30]:
data.createTempView("orders_tmp")

In [32]:
sprk.sql("SELECT * FROM orders_tmp").show(5)

+--------+--------------------+--------+-------------+--------------+
|Order ID|           Item Name|Quantity|Product Price|Total products|
+--------+--------------------+--------+-------------+--------------+
|   25583|Tandoori Mixed Grill|       1|        11.95|            12|
|   25583|        Madras Sauce|       1|         3.95|            12|
|   25583|       Mushroom Rice|       2|         3.95|            12|
|   25583|         Garlic Naan|       1|         2.95|            12|
|   25583|             Paratha|       1|         2.95|            12|
+--------+--------------------+--------+-------------+--------------+
only showing top 5 rows



In [33]:
data.count()

119183

In [34]:
data.groupBy("Item Name").count().show()

+--------------------+-----+
|           Item Name|count|
+--------------------+-----+
|          Mint Sauce| 2498|
|      Dupiaza - Lamb|   65|
|    Lamb Tikka Balti|   84|
|   Vegetable Biryani|  224|
|Bhuna - chicken-t...|   10|
|     Mushroom Bhajee|  576|
|     Bhuna - Chicken|  234|
|           Saag Aloo| 2039|
|Malaya - Chicken ...|   12|
|             Dhansak|  470|
|        Madras Sauce|  221|
|            Vindaloo|  379|
|         Prawn Puree|  754|
|      Grapes Special|  357|
|             Dupiaza|  244|
|     Chicken Biryani|  938|
|   Lamb Tikka (Main)|   85|
|       Mushroom Rice| 2963|
|Cylon - chicken-t...|    1|
|Korma - Chicken T...|   80|
+--------------------+-----+
only showing top 20 rows



In [37]:
sprk.sql("SELECT `Item Name`, COUNT(*) FROM orders_tmp GROUP BY `Item Name`").show(5)

+--------------------+--------+
|           Item Name|count(1)|
+--------------------+--------+
|          Mint Sauce|    2498|
|      Dupiaza - Lamb|      65|
|    Lamb Tikka Balti|      84|
|   Vegetable Biryani|     224|
|Bhuna - chicken-t...|      10|
+--------------------+--------+
only showing top 5 rows



In [36]:
data.groupBy("Item Name").count().sort("count", ascending=False).limit(20).show()

+--------------------+-----+
|           Item Name|count|
+--------------------+-----+
|          Pilau Rice| 8372|
|                Naan| 6681|
|       Plain Papadum| 6432|
|         Bombay Aloo| 4128|
|         Garlic Naan| 3921|
|         Onion Bhaji| 3593|
|Chicken Tikka Masala| 3488|
|       Mango Chutney| 3435|
|       Mushroom Rice| 2963|
|          Plain Rice| 2801|
|          Mint Sauce| 2498|
|          Keema Naan| 2465|
|       Peshwari Naan| 2173|
|           Saag Aloo| 2039|
|       Onion Chutney| 1815|
|              Madras| 1729|
|          Mini Bhaji| 1719|
|               Korma| 1691|
|      Butter Chicken| 1516|
|           Red Sauce| 1453|
+--------------------+-----+



In [38]:
data.select(col("Item Name")).distinct().count()

337

In [39]:
sprk.sql("SELECT COUNT(DISTINCT `Item Name`) as total_products FROM orders_tmp").show()

+--------------+
|total_products|
+--------------+
|           337|
+--------------+



In [40]:
data.select("Item Name", "Quantity").groupBy("Item Name").sum("Quantity").show()

+--------------------+-------------+
|           Item Name|sum(Quantity)|
+--------------------+-------------+
|          Mint Sauce|         3208|
|      Dupiaza - Lamb|           68|
|    Lamb Tikka Balti|           95|
|   Vegetable Biryani|          234|
|Bhuna - chicken-t...|           10|
|     Mushroom Bhajee|          584|
|     Bhuna - Chicken|          239|
|           Saag Aloo|         2089|
|Malaya - Chicken ...|           16|
|             Dhansak|          512|
|        Madras Sauce|          233|
|            Vindaloo|          387|
|         Prawn Puree|          844|
|      Grapes Special|          368|
|             Dupiaza|          261|
|     Chicken Biryani|         1071|
|   Lamb Tikka (Main)|           88|
|       Mushroom Rice|         3424|
|Cylon - chicken-t...|            1|
|Korma - Chicken T...|           80|
+--------------------+-------------+
only showing top 20 rows



In [41]:
sprk.sql("SELECT `Item Name`, SUM(Quantity) as Total FROM orders_tmp GROUP BY `Item Name`").show()

+--------------------+-----+
|           Item Name|Total|
+--------------------+-----+
|          Mint Sauce| 3208|
|      Dupiaza - Lamb|   68|
|    Lamb Tikka Balti|   95|
|   Vegetable Biryani|  234|
|Bhuna - chicken-t...|   10|
|     Mushroom Bhajee|  584|
|     Bhuna - Chicken|  239|
|           Saag Aloo| 2089|
|Malaya - Chicken ...|   16|
|             Dhansak|  512|
|        Madras Sauce|  233|
|            Vindaloo|  387|
|         Prawn Puree|  844|
|      Grapes Special|  368|
|             Dupiaza|  261|
|     Chicken Biryani| 1071|
|   Lamb Tikka (Main)|   88|
|       Mushroom Rice| 3424|
|Cylon - chicken-t...|    1|
|Korma - Chicken T...|   80|
+--------------------+-----+
only showing top 20 rows



In [43]:
data.select("Item Name", "Quantity").groupBy("Item Name").sum("Quantity").sort("sum(Quantity)", ascending=False).limit(1).show()

+-------------+-------------+
|    Item Name|sum(Quantity)|
+-------------+-------------+
|Plain Papadum|        18056|
+-------------+-------------+



In [44]:
sprk.sql("SELECT `Item Name`, SUM(Quantity) as Total FROM orders_tmp GROUP BY `Item Name` ORDER BY Total DESC LIMIT 1").show()

+-------------+-----+
|    Item Name|Total|
+-------------+-----+
|Plain Papadum|18056|
+-------------+-----+



In [45]:
data.select("Item Name", "Quantity").where("`Product Price` < 8.0").groupBy("Item Name").sum("Quantity").sort(sum("Quantity")).show()

+------------------+-------------+
|         Item Name|sum(Quantity)|
+------------------+-------------+
|       Kurma Sauce|            2|
|      Paner Pakora|            5|
|Perrier Water 75cl|           11|
|   Vindaloo - lamb|           12|
| Bottle Water 75cl|           18|
|   Vegetable Bhuna|           24|
|      Curry - lamb|           25|
|Vindaloo - chicken|           28|
|   Vegetable Samba|           39|
|       Egg Paratha|           42|
|  Vegetable Dansak|           44|
|     Madras - lamb|           49|
|  Vegetable Masala|           52|
|        Aloo Mithy|           54|
| Diet Coke 1.5 ltr|           58|
|              Dahi|           59|
|Vegetable Jalfrezi|           60|
|  Bottle Diet Coke|           64|
|   Vindaloo - Lamb|           76|
| Bangon Hari Mirch|           78|
+------------------+-------------+
only showing top 20 rows



In [50]:
sprk.sql("""
      SELECT `Item Name`, SUM(Quantity) as Total
      FROM orders_tmp WHERE `Product Price` < 8.0 GROUP BY `Item Name` ORDER BY Total desc""").show(5)

+-------------+-----+
|    Item Name|Total|
+-------------+-----+
|Plain Papadum|18056|
|   Pilau Rice|11754|
|         Naan| 8730|
|  Garlic Naan| 4809|
|  Bombay Aloo| 4336|
+-------------+-----+
only showing top 5 rows



In [48]:
data.withColumn("Total Amount", col("Quantity") * col("Product Price")).show(5)

+--------+--------------------+--------+-------------+--------------+------------+
|Order ID|           Item Name|Quantity|Product Price|Total products|Total Amount|
+--------+--------------------+--------+-------------+--------------+------------+
|   25583|Tandoori Mixed Grill|       1|        11.95|            12|       11.95|
|   25583|        Madras Sauce|       1|         3.95|            12|        3.95|
|   25583|       Mushroom Rice|       2|         3.95|            12|         7.9|
|   25583|         Garlic Naan|       1|         2.95|            12|        2.95|
|   25583|             Paratha|       1|         2.95|            12|        2.95|
+--------+--------------------+--------+-------------+--------------+------------+
only showing top 5 rows



In [53]:
data.select("*", expr("Quantity * `Product Price` as `Total Amount`")).show(5)

+--------+--------------------+--------+-------------+--------------+------------+
|Order ID|           Item Name|Quantity|Product Price|Total products|Total Amount|
+--------+--------------------+--------+-------------+--------------+------------+
|   25583|Tandoori Mixed Grill|       1|        11.95|            12|       11.95|
|   25583|        Madras Sauce|       1|         3.95|            12|        3.95|
|   25583|       Mushroom Rice|       2|         3.95|            12|         7.9|
|   25583|         Garlic Naan|       1|         2.95|            12|        2.95|
|   25583|             Paratha|       1|         2.95|            12|        2.95|
+--------+--------------------+--------+-------------+--------------+------------+
only showing top 5 rows



In [55]:
data.selectExpr("*", "Quantity * `Product Price` as `Total Amount`").show(5)

+--------+--------------------+--------+-------------+--------------+------------+
|Order ID|           Item Name|Quantity|Product Price|Total products|Total Amount|
+--------+--------------------+--------+-------------+--------------+------------+
|   25583|Tandoori Mixed Grill|       1|        11.95|            12|       11.95|
|   25583|        Madras Sauce|       1|         3.95|            12|        3.95|
|   25583|       Mushroom Rice|       2|         3.95|            12|         7.9|
|   25583|         Garlic Naan|       1|         2.95|            12|        2.95|
|   25583|             Paratha|       1|         2.95|            12|        2.95|
+--------+--------------------+--------+-------------+--------------+------------+
only showing top 5 rows



In [58]:
data.withColumn("Product Price", when(col('Item Name').contains('Madras Sauce'), round(col("Product Price") * 1.10, 2))
                                .when(col('Item Name').contains('Garlic Naan'), round(col("Product Price") * 1.12, 2))
                                .otherwise(col("Product Price"))).show()

+--------+--------------------+--------+-------------+--------------+
|Order ID|           Item Name|Quantity|Product Price|Total products|
+--------+--------------------+--------+-------------+--------------+
|   25583|Tandoori Mixed Grill|       1|        11.95|            12|
|   25583|        Madras Sauce|       1|         4.35|            12|
|   25583|       Mushroom Rice|       2|         3.95|            12|
|   25583|         Garlic Naan|       1|          3.3|            12|
|   25583|             Paratha|       1|         2.95|            12|
|   25583|          Plain Rice|       1|         2.95|            12|
|   25583|         Prawn Puree|       1|         4.95|            12|
|   25583|       Plain Papadum|       1|          0.8|            12|
|   25583|       Mango Chutney|       2|          0.5|            12|
|   25583|       Onion Chutney|       1|          0.5|            12|
|   25583|          Mint Sauce|       1|          0.5|            12|
|   25583|Chicken Ti

In [59]:
user_input = input("Enter the product name: ")

regex_pattern = f"^{user_input}"

data.withColumn("Product Pricw", when(col("Item Name").rlike(regex_pattern), round(col("Product Price") * 1.10, 2))
                                .when(col("Item Name").like("Garlic Naan"), round(col("Product Price") * 1.12, 2))
                                .otherwise(col("Product Price"))).show()

Enter the product name: Garlic Naan
+--------+--------------------+--------+-------------+--------------+-------------+
|Order ID|           Item Name|Quantity|Product Price|Total products|Product Pricw|
+--------+--------------------+--------+-------------+--------------+-------------+
|   25583|Tandoori Mixed Grill|       1|        11.95|            12|        11.95|
|   25583|        Madras Sauce|       1|         3.95|            12|         3.95|
|   25583|       Mushroom Rice|       2|         3.95|            12|         3.95|
|   25583|         Garlic Naan|       1|         2.95|            12|         3.25|
|   25583|             Paratha|       1|         2.95|            12|         2.95|
|   25583|          Plain Rice|       1|         2.95|            12|         2.95|
|   25583|         Prawn Puree|       1|         4.95|            12|         4.95|
|   25583|       Plain Papadum|       1|          0.8|            12|          0.8|
|   25583|       Mango Chutney|       2|

In [60]:
data.selectExpr("*", """
              CASE
                  WHEN `Item Name` LIKE 'Madras Sauce' THEN ROUND(`Product Price` * 1.10, 2)
                  WHEN `Item Name` LIKE 'Garlic Naan' THEN ROUND(`Product Price` * 1.12, 2)
              ELSE
                  `Product Price`
              END `Product Price (new)`
""").show()

+--------+--------------------+--------+-------------+--------------+-------------------+
|Order ID|           Item Name|Quantity|Product Price|Total products|Product Price (new)|
+--------+--------------------+--------+-------------+--------------+-------------------+
|   25583|Tandoori Mixed Grill|       1|        11.95|            12|              11.95|
|   25583|        Madras Sauce|       1|         3.95|            12|               4.35|
|   25583|       Mushroom Rice|       2|         3.95|            12|               3.95|
|   25583|         Garlic Naan|       1|         2.95|            12|                3.3|
|   25583|             Paratha|       1|         2.95|            12|               2.95|
|   25583|          Plain Rice|       1|         2.95|            12|               2.95|
|   25583|         Prawn Puree|       1|         4.95|            12|               4.95|
|   25583|       Plain Papadum|       1|          0.8|            12|                0.8|
|   25583|

In [61]:
user_input = input("Enter the product name: ")
data.selectExpr("*", f"""
              CASE
                  WHEN `Item Name` LIKE '%{user_input}%' THEN round(`Product Price` * 1.10, 2)
                  WHEN `Item Name` LIKE 'Garlic Naan' THEN ROUND(`Product Price` * 1.12, 2)
              ELSE
                  `Product Price`
              END `Product Price (new)`
""").show()

Enter the product name: Mango
+--------+--------------------+--------+-------------+--------------+-------------------+
|Order ID|           Item Name|Quantity|Product Price|Total products|Product Price (new)|
+--------+--------------------+--------+-------------+--------------+-------------------+
|   25583|Tandoori Mixed Grill|       1|        11.95|            12|              11.95|
|   25583|        Madras Sauce|       1|         3.95|            12|               3.95|
|   25583|       Mushroom Rice|       2|         3.95|            12|               3.95|
|   25583|         Garlic Naan|       1|         2.95|            12|                3.3|
|   25583|             Paratha|       1|         2.95|            12|               2.95|
|   25583|          Plain Rice|       1|         2.95|            12|               2.95|
|   25583|         Prawn Puree|       1|         4.95|            12|               4.95|
|   25583|       Plain Papadum|       1|          0.8|            12| 

In [62]:
sprk.stop()