<a href="https://colab.research.google.com/github/simantinip04/Data-Engineering/blob/main/PySpark/4June_Task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql import SparkSession
import pandas as pd
from io import StringIO

# Create the Spark session used by every cell below; getOrCreate() hands back
# an already-running session if one exists, so re-running this cell is harmless.
spark = (
    SparkSession.builder
    .appName("PySpark_SQL")
    .getOrCreate()
)

# Bare expression: the notebook renders the session object as the cell output.
spark

In [None]:
#Step 1: Data Preparation
from pyspark.sql.functions import expr
from datetime import datetime

# Customers: (CustomerID, Name, Email, City, SignupDate as 'yyyy-MM-dd' text)
customers_data = [
    (101, "Ali", "ali@gmail.com", "Mumbai", "2022-05-10"),
    (102, "Neha", "neha@yahoo.com", "Delhi", "2023-01-15"),
    (103, "Ravi", "ravi@hotmail.com", "Bangalore", "2021-11-01"),
    (104, "Sneha", "sneha@outlook.com", "Hyderabad", "2020-07-22"),
    (105, "Amit", "amit@gmail.com", "Chennai", "2023-03-10"),
]

# Orders: (OrderID, CustomerID, Product, Category, Quantity, Price,
#          OrderDate as 'yyyy-MM-dd' text)
orders_data = [
    (1, 101, "Laptop", "Electronics", 2, 50000.0, "2024-01-10"),
    (2, 101, "Mouse", "Electronics", 1, 1200.0, "2024-01-15"),
    (3, 102, "Tablet", "Electronics", 1, 20000.0, "2024-02-01"),
    (4, 103, "Bookshelf", "Furniture", 1, 3500.0, "2024-02-10"),
    (5, 104, "Mixer", "Appliances", 1, 5000.0, "2024-02-15"),
    (6, 105, "Notebook", "Stationery", 5, 500.0, "2024-03-01"),
    (7, 102, "Phone", "Electronics", 1, 30000.0, "2024-03-02"),
]

# Only column names are supplied, so Spark infers each column's type from the
# Python values; the two date columns therefore stay plain strings.
customers_df = spark.createDataFrame(
    customers_data,
    schema=["CustomerID", "Name", "Email", "City", "SignupDate"],
)
orders_df = spark.createDataFrame(
    orders_data,
    schema=["OrderID", "CustomerID", "Product", "Category", "Quantity", "Price", "OrderDate"],
)

In [None]:
# SECTION A: PySpark DataFrame Tasks
# 1. Add column TotalAmount = Price * Quantity
from pyspark.sql.functions import col

# Line total for each order: Price multiplied by the Quantity ordered.
orders_df = orders_df.withColumn(
    "TotalAmount",
    col("Price") * col("Quantity"),
)
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+---------+-----------+--------+-------+----------+-----------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+---------+-----------+--------+-------+----------+-----------+



In [None]:
# 2. Keep only the orders whose TotalAmount exceeds 10000
orders_df.filter(orders_df["TotalAmount"] > 10000).show()

+-------+----------+-------+-----------+--------+-------+----------+-----------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|
+-------+----------+-------+-----------+--------+-------+----------+-----------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|
|      3|       102| Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|
|      7|       102|  Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|
+-------+----------+-------+-----------+--------+-------+----------+-----------+



In [None]:
# 3. Standardize the City field to lowercase
from pyspark.sql.functions import lower

# City is overwritten in place, so later comparisons against it must use
# lowercase literals.
customers_df = customers_df.withColumn(
    "City",
    lower(col("City")),
)
customers_df.show()

+----------+-----+-----------------+---------+----------+
|CustomerID| Name|            Email|     City|SignupDate|
+----------+-----+-----------------+---------+----------+
|       101|  Ali|    ali@gmail.com|   mumbai|2022-05-10|
|       102| Neha|   neha@yahoo.com|    delhi|2023-01-15|
|       103| Ravi| ravi@hotmail.com|bangalore|2021-11-01|
|       104|Sneha|sneha@outlook.com|hyderabad|2020-07-22|
|       105| Amit|   amit@gmail.com|  chennai|2023-03-10|
+----------+-----+-----------------+---------+----------+



In [None]:
# 4. Extract the year from OrderDate
from pyspark.sql.functions import year, to_date

# OrderDate was loaded as 'yyyy-MM-dd' text, so it is parsed to a date first
# and the year is taken from the parsed value.
orders_df = orders_df.withColumn(
    "OrderYear",
    year(to_date("OrderDate")),
)
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|     2024|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|     2024|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+

In [None]:
# 5. Fill null values in a column
# Category is used to demonstrate the call: any null there becomes 'Unknown'.
# The filled frame is only displayed; orders_df itself is not reassigned.
orders_df.fillna("Unknown", subset=["Category"]).show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|     2024|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|     2024|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+

In [None]:
# 6. Categorize orders by TotalAmount
from pyspark.sql.functions import when

# Buckets: below 5000 -> Low; 5000 to 20000 (both ends inclusive) -> Medium;
# everything that matches neither branch -> High.
orders_df = orders_df.withColumn(
    "AmountCategory",
    when(col("TotalAmount") < 5000, "Low")
    .when(col("TotalAmount").between(5000, 20000), "Medium")
    .otherwise("High"),
)
orders_df.show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+--------------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|AmountCategory|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+--------------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|          High|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|           Low|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|        Medium|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|           Low|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|     2024|        Medium|
|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|     2024|           Low|
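|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|          High|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+--------------+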

In [None]:
# SECTION B: Spark SQL Tasks
# Register both DataFrames as temp views so the SQL cells below can refer to
# them as `customers` and `orders`.
for view_name, frame in (("customers", customers_df), ("orders", orders_df)):
    frame.createOrReplaceTempView(view_name)

In [None]:
# 7. All orders placed by "Ali"
# The subquery resolves the name to CustomerID(s) in `customers`; the outer
# query then returns every matching row of `orders`.
ali_orders = spark.sql("""
SELECT * FROM orders
WHERE CustomerID IN (SELECT CustomerID FROM customers WHERE Name = 'Ali')
""")
ali_orders.show()

+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+--------------+
|OrderID|CustomerID|Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|AmountCategory|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+--------------+
|      1|       101| Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|          High|
|      2|       101|  Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|           Low|
+-------+----------+-------+-----------+--------+-------+----------+-----------+---------+--------------+



In [None]:
# 8. Total spending per customer
# Sums TotalAmount over each customer's orders, one output row per CustomerID.
spend_per_customer = spark.sql("""
SELECT CustomerID, SUM(TotalAmount) AS TotalSpent
FROM orders
GROUP BY CustomerID
""")
spend_per_customer.show()

+----------+----------+
|CustomerID|TotalSpent|
+----------+----------+
|       101|  101200.0|
|       102|   50000.0|
|       103|    3500.0|
|       104|    5000.0|
|       105|    2500.0|
+----------+----------+



In [None]:
# 9. Category with the highest revenue
# Revenue per category, sorted descending; LIMIT 1 keeps only the top row.
top_category = spark.sql("""
SELECT Category, SUM(TotalAmount) AS Revenue
FROM orders
GROUP BY Category
ORDER BY Revenue DESC
LIMIT 1
""")
top_category.show()

+-----------+--------+
|   Category| Revenue|
+-----------+--------+
|Electronics|151200.0|
+-----------+--------+



In [None]:
# 10. Create view: customer_orders
# The view joins customers to their orders and exposes only CustomerName,
# Product and TotalAmount (no IDs and no OrderDate).
create_customer_orders_sql = """
CREATE OR REPLACE TEMP VIEW customer_orders AS
SELECT c.Name AS CustomerName, o.Product, o.TotalAmount
FROM customers c
JOIN orders o ON c.CustomerID = o.CustomerID
"""

# Left as the cell's final expression so the (empty) result is still displayed.
spark.sql(create_customer_orders_sql)

DataFrame[]

In [None]:
# 11. Query view: Products ordered after Feb 2024
# customer_orders exposes only CustomerName / Product / TotalAmount, so the
# order date has to be looked up in `orders`. The previous join predicate
# `c.CustomerName = c.CustomerName` compared a column with itself, which left
# Product as the only real join key: two customers ordering the same product
# would have been matched to each other's orders. The view rows are now tied
# to `orders` through `customers` (name -> CustomerID) as well as Product.
# Limitation: the view carries no order key, so a customer who orders the same
# product more than once will still produce repeated rows.
# NOTE(review): the cutoff keeps every order dated after 2024-02-01, which
# includes the rest of February. If "after Feb 2024" should mean March onward,
# change the filter to `o.OrderDate >= '2024-03-01'` - confirm the intent.
spark.sql("""
    SELECT co.CustomerName, o.Product, o.TotalAmount
    FROM customer_orders co
    JOIN customers cu ON cu.Name = co.CustomerName
    JOIN orders o ON o.CustomerID = cu.CustomerID AND o.Product = co.Product
    WHERE o.OrderDate > '2024-02-01'
""").show()

+------------+---------+-----------+
|CustomerName|  Product|TotalAmount|
+------------+---------+-----------+
|        Neha|    Phone|    30000.0|
|        Ravi|Bookshelf|     3500.0|
|        Amit| Notebook|     2500.0|
|       Sneha|    Mixer|     5000.0|
+------------+---------+-----------+



In [None]:
# 12. Global temp view and query
# The view is registered as `customers` and queried through the `global_temp`
# qualifier. City was lowercased earlier, hence the lowercase literal.
customers_df.createOrReplaceGlobalTempView("customers")
mumbai_customers = spark.sql("SELECT * FROM global_temp.customers WHERE City = 'mumbai'")
mumbai_customers.show()

+----------+----+-------------+------+----------+
|CustomerID|Name|        Email|  City|SignupDate|
+----------+----+-------------+------+----------+
|       101| Ali|ali@gmail.com|mumbai|2022-05-10|
+----------+----+-------------+------+----------+



In [None]:
# 13. Save the transformed orders_df as Parquet
# "overwrite" replaces whatever already exists at the path, so the cell can be
# re-run without failing on existing output.
orders_df.write.parquet("/content/orders_parquet", mode="overwrite")

In [None]:
# 14. Read the Parquet output back and count its rows
parquet_reader = spark.read
read_df = parquet_reader.parquet("/content/orders_parquet")

# Final expression: the row count is shown as the cell output.
read_df.count()

7

In [None]:
# SECTION D: UDF + Built-in Functions
#15. UDF: Mask emails
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def mask_email(email):
    """Mask the local part of an e-mail address, keeping its first character.

    Example: ``"ali@gmail.com"`` becomes ``"a***@gmail.com"``.

    Args:
        email: The address to mask, or ``None``. A SQL NULL reaches a Python
            UDF as ``None``, so that case must not raise.

    Returns:
        ``None`` when ``email`` is ``None``.
        ``"***"`` when the value contains no ``@`` (nothing can safely be
        kept, so the whole value is hidden).
        Otherwise the first character of the local part (if any), followed by
        ``"***@"`` and the domain, where the domain is the text after the
        last ``@``.
    """
    if email is None:
        return None

    # rpartition never raises: it copes with zero or several '@' characters,
    # whereas unpacking split('@') fails unless there is exactly one.
    name, separator, domain = email.rpartition('@')
    if not separator:
        return "***"

    # name[:1] is '' for an empty local part, where name[0] would raise.
    return name[:1] + "***@" + domain

# Wrap the plain-Python masker as a Spark UDF that yields a string column.
mask_email_udf = udf(mask_email, StringType())

# Display the customers with the masked address next to the original one.
customers_df.withColumn(
    "MaskedEmail",
    mask_email_udf("Email"),
).show()

+----------+-----+-----------------+---------+----------+----------------+
|CustomerID| Name|            Email|     City|SignupDate|     MaskedEmail|
+----------+-----+-----------------+---------+----------+----------------+
|       101|  Ali|    ali@gmail.com|   mumbai|2022-05-10|  a***@gmail.com|
|       102| Neha|   neha@yahoo.com|    delhi|2023-01-15|  n***@yahoo.com|
|       103| Ravi| ravi@hotmail.com|bangalore|2021-11-01|r***@hotmail.com|
|       104|Sneha|sneha@outlook.com|hyderabad|2020-07-22|s***@outlook.com|
|       105| Amit|   amit@gmail.com|  chennai|2023-03-10|  a***@gmail.com|
+----------+-----+-----------------+---------+----------+----------------+



In [None]:
# 16. Build a "<Name> from <City>" label with concat_ws()
from pyspark.sql.functions import concat_ws

# " from " is the separator placed between the Name and City values.
customers_df.withColumn(
    "Label",
    concat_ws(" from ", "Name", "City"),
).show()

+----------+-----+-----------------+---------+----------+--------------------+
|CustomerID| Name|            Email|     City|SignupDate|               Label|
+----------+-----+-----------------+---------+----------+--------------------+
|       101|  Ali|    ali@gmail.com|   mumbai|2022-05-10|     Ali from mumbai|
|       102| Neha|   neha@yahoo.com|    delhi|2023-01-15|     Neha from delhi|
|       103| Ravi| ravi@hotmail.com|bangalore|2021-11-01| Ravi from bangalore|
|       104|Sneha|sneha@outlook.com|hyderabad|2020-07-22|Sneha from hyderabad|
|       105| Amit|   amit@gmail.com|  chennai|2023-03-10|   Amit from chennai|
+----------+-----+-----------------+---------+----------+--------------------+



In [None]:
# 17. Clean product names with regexp_replace()
from pyspark.sql.functions import regexp_replace

# The character class matches anything that is not an ASCII letter or digit
# (spaces and punctuation included); each match is replaced with nothing.
orders_df.withColumn(
    "CleanProduct",
    regexp_replace("Product", "[^a-zA-Z0-9]", ""),
).show()

+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+--------------+------------+
|OrderID|CustomerID|  Product|   Category|Quantity|  Price| OrderDate|TotalAmount|OrderYear|AmountCategory|CleanProduct|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+--------------+------------+
|      1|       101|   Laptop|Electronics|       2|50000.0|2024-01-10|   100000.0|     2024|          High|      Laptop|
|      2|       101|    Mouse|Electronics|       1| 1200.0|2024-01-15|     1200.0|     2024|           Low|       Mouse|
|      3|       102|   Tablet|Electronics|       1|20000.0|2024-02-01|    20000.0|     2024|        Medium|      Tablet|
|      4|       103|Bookshelf|  Furniture|       1| 3500.0|2024-02-10|     3500.0|     2024|           Low|   Bookshelf|
|      5|       104|    Mixer| Appliances|       1| 5000.0|2024-02-15|     5000.0|     2024|        Medium|       Mixer|
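|      6|       105| Notebook| Stationery|       5|  500.0|2024-03-01|     2500.0|     2024|           Low|    Notebook|
|      7|       102|    Phone|Electronics|       1|30000.0|2024-03-02|    30000.0|     2024|          High|       Phone|
+-------+----------+---------+-----------+--------+-------+----------+-----------+---------+--------------+------------+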

In [None]:
# 18. Days elapsed since SignupDate, via datediff()
from pyspark.sql.functions import to_date, current_date, datediff

# Measured against current_date(), so the figures depend on the day the cell
# is executed.
customers_df.withColumn(
    "SignupDaysAgo",
    datediff(current_date(), to_date("SignupDate")),
).show()

+----------+-----+-----------------+---------+----------+-------------+
|CustomerID| Name|            Email|     City|SignupDate|SignupDaysAgo|
+----------+-----+-----------------+---------+----------+-------------+
|       101|  Ali|    ali@gmail.com|   mumbai|2022-05-10|         1121|
|       102| Neha|   neha@yahoo.com|    delhi|2023-01-15|          871|
|       103| Ravi| ravi@hotmail.com|bangalore|2021-11-01|         1311|
|       104|Sneha|sneha@outlook.com|hyderabad|2020-07-22|         1778|
|       105| Amit|   amit@gmail.com|  chennai|2023-03-10|          817|
+----------+-----+-----------------+---------+----------+-------------+

