In [0]:
from pyspark.sql import SparkSession

# Obtain the active Spark session, creating one named "RetailTransactions"
# if none exists yet.
spark = (
    SparkSession.builder
    .appName("RetailTransactions")
    .getOrCreate()
)

In [0]:
# Basics
# 1. Load retail_data.csv into a PySpark DataFrame and display its schema.
#    The first row is the header; column types are inferred by Spark.
df = spark.read.csv(
    "/Volumes/workspace/ecommerce/csv_data/retail_data.csv",
    header=True,
    inferSchema=True,
)
df.show()
df.printSchema()

# 2. Read again with schema inference disabled (every column arrives as a
#    string), then cast the numeric columns manually.
df_raw = spark.read.csv(
    "/Volumes/workspace/ecommerce/csv_data/retail_data.csv",
    header=True,
    inferSchema=False,
)

from pyspark.sql.functions import col

# Pass-through columns are selected by name; only the three numeric
# columns are cast to int. A cast keeps the original column name.
df_casted = df_raw.select(
    "TransactionID",
    "Customer",
    "City",
    "Product",
    "Category",
    col("Quantity").cast("int"),
    col("UnitPrice").cast("int"),
    col("TotalPrice").cast("int"),
    "TransactionDate",
    "PaymentMode",
)
df_casted.printSchema()

+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|Net Banking|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|       Card|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|     1000|      3000|     2024-02

In [0]:
# Data Exploration & Filtering
# 3. Filter transactions where TotalPrice > 40000, then show the full
#    DataFrame again for comparison.
high_value_condition = df["TotalPrice"] > 40000
df.filter(high_value_condition).show()
df.show()

# 4. Get unique cities from the dataset.
df.select(df["City"]).distinct().show()

# 5. Find all transactions from "Delhi" using .filter() and .where().
#    Both methods accept the same condition and return the same rows.
delhi_condition = df["City"] == "Delhi"
df.filter(delhi_condition).show()
df.where(delhi_condition).show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50000|     2024-02-15|       Card|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+

+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|Transaction

In [0]:
# Data Manipulation
# 6. Add a column DiscountedPrice = TotalPrice minus 10%.
from pyspark.sql.functions import col

discounted_price = col("TotalPrice") * 0.9
df = df.withColumn("DiscountedPrice", discounted_price)
df.select(["TransactionID", "TotalPrice", "DiscountedPrice"]).show()

# 7. Rename TransactionDate to TxnDate.
df = df.withColumnRenamed(existing="TransactionDate", new="TxnDate")
df.show()

# 8. Drop the column UnitPrice.
#    From here on `df` no longer has UnitPrice.
df = df.drop("UnitPrice")
df.show()

+-------------+----------+---------------+
|TransactionID|TotalPrice|DiscountedPrice|
+-------------+----------+---------------+
|        T1001|     70000|        63000.0|
|        T1002|     60000|        54000.0|
|        T1003|     15000|        13500.0|
|        T1004|     20000|        18000.0|
|        T1005|     50000|        45000.0|
|        T1006|      3000|         2700.0|
+-------------+----------+---------------+

+-------------+--------+---------+-------+-----------+--------+----------+----------+-----------+---------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|TotalPrice|   TxnDate|PaymentMode|DiscountedPrice|
+-------------+--------+---------+-------+-----------+--------+----------+----------+-----------+---------------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|     70000|2024-01-15|       Card|        63000.0|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|     60000|2024-01-20|        UPI|        54000.0|

In [0]:
# Aggregations
# 9. Get total sales by city.
(
    df.groupBy("City")
    .agg({"TotalPrice": "sum"})  # result column is named "sum(TotalPrice)"
    .withColumnRenamed("sum(TotalPrice)", "TotalSales")
    .show()
)

# 10. Get average unit price by category.
#     Uses df_casted because UnitPrice was dropped from df in step 8.
(
    df_casted.groupBy("Category")
    .agg({"UnitPrice": "avg"})  # result column is named "avg(UnitPrice)"
    .withColumnRenamed("avg(UnitPrice)", "AvgUnitPrice")
    .show()
)

# 11. Count of transactions grouped by PaymentMode.
payment_mode_counts = df.groupBy("PaymentMode").count()
payment_mode_counts.show()

+---------+----------+
|     City|TotalSales|
+---------+----------+
|Bangalore|     60000|
|    Delhi|     23000|
|   Mumbai|    120000|
|Hyderabad|     15000|
+---------+----------+

+-----------+------------+
|   Category|AvgUnitPrice|
+-----------+------------+
|  Furniture|     10000.0|
|Electronics|     37750.0|
+-----------+------------+

+-----------+-----+
|PaymentMode|count|
+-----------+-----+
|Net Banking|    1|
|       Cash|    1|
|       Card|    3|
|        UPI|    1|
+-----------+-----+



In [0]:
# Window Functions
# 12. Use a window partitioned by City to rank transactions by TotalPrice.
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

# Largest TotalPrice first within each city, so Rank 1 is the city's
# highest-value transaction. rank() gives tied amounts the same rank.
windowSpec = Window.partitionBy("City").orderBy(df["TotalPrice"].desc())
df.withColumn("Rank", rank().over(windowSpec)).select("TransactionID", "City", "TotalPrice", "Rank").show()

# 13. Use lag function to get previous transaction amount per city.
from pyspark.sql.functions import lag

# "Previous transaction" is a chronological notion, so this step needs its
# own window ordered by TxnDate. Reusing the price-ordered windowSpec would
# return the next-larger amount in the city, not the earlier transaction.
# TransactionID is a tie-breaker so same-day transactions order deterministically.
txn_order_window = Window.partitionBy("City").orderBy("TxnDate", "TransactionID")
df.withColumn("PrevAmount", lag("TotalPrice").over(txn_order_window)).select("TransactionID", "City", "TotalPrice", "PrevAmount").show()

+-------------+---------+----------+----+
|TransactionID|     City|TotalPrice|Rank|
+-------------+---------+----------+----+
|        T1002|Bangalore|     60000|   1|
|        T1004|    Delhi|     20000|   1|
|        T1006|    Delhi|      3000|   2|
|        T1003|Hyderabad|     15000|   1|
|        T1001|   Mumbai|     70000|   1|
|        T1005|   Mumbai|     50000|   2|
+-------------+---------+----------+----+

+-------------+---------+----------+----------+
|TransactionID|     City|TotalPrice|PrevAmount|
+-------------+---------+----------+----------+
|        T1002|Bangalore|     60000|      NULL|
|        T1004|    Delhi|     20000|      NULL|
|        T1006|    Delhi|      3000|     20000|
|        T1003|Hyderabad|     15000|      NULL|
|        T1001|   Mumbai|     70000|      NULL|
|        T1005|   Mumbai|     50000|     70000|
+-------------+---------+----------+----------+



In [0]:
# Joins
# 14. Create a second DataFrame city_region:
# City,Region
# Mumbai,West
# Delhi,North
# Bangalore,South
# Hyderabad,South

data_region = [
    ("Mumbai", "West"),
    ("Delhi", "North"),
    ("Bangalore", "South"),
    ("Hyderabad", "South"),
]
columns_region = ["City", "Region"]

df_region = spark.createDataFrame(data_region, schema=columns_region)
df_region.show()

# 15. Join with main DataFrame and group total sales by Region.
#     Left join: every transaction is kept; a city missing from df_region
#     would get a null Region.
df_joined = df.join(df_region, "City", "left")
(
    df_joined.groupBy("Region")
    .agg({"TotalPrice": "sum"})  # result column is named "sum(TotalPrice)"
    .withColumnRenamed("sum(TotalPrice)", "TotalSales")
    .show()
)


+---------+------+
|     City|Region|
+---------+------+
|   Mumbai|  West|
|    Delhi| North|
|Bangalore| South|
|Hyderabad| South|
+---------+------+

+------+----------+
|Region|TotalSales|
+------+----------+
|  West|    120000|
| North|     23000|
| South|     75000|
+------+----------+



In [0]:
# Nulls and Data Cleaning
# 16. Introduce some nulls and replace them with default values.
from pyspark.sql.functions import lit

# NOTE: this overwrites Quantity with a typed null for EVERY row, so after
# the fill all quantities are 1, and the dropna in step 17 returns no rows.
null_quantity = lit(None).cast("int")
df_null = df.withColumn("Quantity", null_quantity)
df_filled = df_null.fillna(1, subset=["Quantity"])
df_filled.show()

# 17. Drop rows where Quantity is null.
df_null.dropna(how="any", subset=["Quantity"]).show()

# 18. Fill null PaymentMode with "Unknown".
#     Applied to df (not df_null); only null PaymentMode values are replaced.
df.fillna("Unknown", subset=["PaymentMode"]).show()

+-------------+--------+---------+-------+-----------+--------+----------+----------+-----------+---------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|TotalPrice|   TxnDate|PaymentMode|DiscountedPrice|
+-------------+--------+---------+-------+-----------+--------+----------+----------+-----------+---------------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|     70000|2024-01-15|       Card|        63000.0|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       1|     60000|2024-01-20|        UPI|        54000.0|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|     15000|2024-02-10|Net Banking|        13500.0|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       1|     20000|2024-02-12|       Card|        18000.0|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|     50000|2024-02-15|       Card|        45000.0|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       1|      3000|2024-02-18|   

In [0]:
# Custom Functions
# 19. Write a UDF to label orders:
# def label_order(amount):
# if amount > 50000: return "High"
# elif amount >= 30000: return "Medium"
# else: return "Low"
# Apply this to classify TotalPrice .
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def label_order(amount):
    """Classify an order amount into a size label.

    Args:
        amount: Numeric order total, or None when the source value is null.

    Returns:
        "High" if amount > 50000, "Medium" if 30000 <= amount <= 50000,
        "Low" for anything smaller, or None when amount is None.
    """
    # A null column value reaches a Python UDF as None. Comparing None with
    # an int raises TypeError, so propagate the null instead of crashing.
    if amount is None:
        return None
    if amount > 50000:
        return "High"
    if amount >= 30000:
        return "Medium"
    return "Low"

# Wrap label_order as a Spark UDF that returns a string column, then use it
# to derive OrderLabel from TotalPrice.
label_udf = udf(label_order, returnType=StringType())
order_label = label_udf(col("TotalPrice"))
df = df.withColumn("OrderLabel", order_label)
df.select(["TransactionID", "TotalPrice", "OrderLabel"]).show()


+-------------+----------+----------+
|TransactionID|TotalPrice|OrderLabel|
+-------------+----------+----------+
|        T1001|     70000|      High|
|        T1002|     60000|      High|
|        T1003|     15000|       Low|
|        T1004|     20000|       Low|
|        T1005|     50000|    Medium|
|        T1006|      3000|       Low|
+-------------+----------+----------+



In [0]:
# Date & Time
# 20. Extract year, month, and day from TxnDate.
from pyspark.sql.functions import to_date, year, month, dayofmonth

# Parse TxnDate with an explicit yyyy-MM-dd pattern before taking it apart.
df = df.withColumn("TxnDate", to_date("TxnDate", format="yyyy-MM-dd"))
df = (
    df.withColumn("Year", year("TxnDate"))
    .withColumn("Month", month("TxnDate"))
    .withColumn("Day", dayofmonth("TxnDate"))
)
df.select(["TxnDate", "Year", "Month", "Day"]).show()

# 21. Filter transactions that happened in February (month number 2).
is_february = month("TxnDate") == 2
df.filter(is_february).show()

+----------+----+-----+---+
|   TxnDate|Year|Month|Day|
+----------+----+-----+---+
|2024-01-15|2024|    1| 15|
|2024-01-20|2024|    1| 20|
|2024-02-10|2024|    2| 10|
|2024-02-12|2024|    2| 12|
|2024-02-15|2024|    2| 15|
|2024-02-18|2024|    2| 18|
+----------+----+-----+---+

+-------------+--------+---------+-------+-----------+--------+----------+----------+-----------+---------------+----------+----+-----+---+
|TransactionID|Customer|     City|Product|   Category|Quantity|TotalPrice|   TxnDate|PaymentMode|DiscountedPrice|OrderLabel|Year|Month|Day|
+-------------+--------+---------+-------+-----------+--------+----------+----------+-----------+---------------+----------+----+-----+---+
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|     15000|2024-02-10|Net Banking|        13500.0|       Low|2024|    2| 10|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     20000|2024-02-12|       Card|        18000.0|       Low|2024|    2| 12|
|        T1005|   K

In [0]:
# Union & Duplicate Handling
# 22. Duplicate the DataFrame using union() and remove duplicates.
#     union() appends rows without deduplicating, so df_duped holds every
#     row twice; distinct() then keeps one copy of each identical row.
df_duped = df.union(df)
df_unique = df_duped.distinct()
df_unique.show()

+-------------+--------+---------+-------+-----------+--------+----------+----------+-----------+---------------+----------+----+-----+---+
|TransactionID|Customer|     City|Product|   Category|Quantity|TotalPrice|   TxnDate|PaymentMode|DiscountedPrice|OrderLabel|Year|Month|Day|
+-------------+--------+---------+-------+-----------+--------+----------+----------+-----------+---------------+----------+----+-----+---+
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|      3000|2024-02-18|       Cash|         2700.0|       Low|2024|    2| 18|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     20000|2024-02-12|       Card|        18000.0|       Low|2024|    2| 12|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|     60000|2024-01-20|        UPI|        54000.0|      High|2024|    1| 20|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|     15000|2024-02-10|Net Banking|        13500.0|       Low|2024|    2| 10|
|        T1001|     