Load via Pandas

In [0]:
import pandas as pd

pdf = pd.read_csv(
    "https://raw.githubusercontent.com/Drushti2706/Ecommerce-sales-dataset/main/ecommerce_sales_data.csv"
)

pdf.head()


Unnamed: 0,Order ID,Customer ID,Gender,Age,Product Category,Product Name,Quantity,Price,Order Date,Payment Method,City,Rating
0,ORD0001,CUST9376,Female,43,Home,Lamp,1,1368.69,07-06-2025,Cash on Delivery,Hyderabad,3
1,ORD0002,CUST3289,Male,57,Toys,Lego Set,5,782.44,11-12-2024,Cash on Delivery,Chennai,5
2,ORD0003,CUST6409,Female,53,Clothing,Jacket,1,3676.18,05-05-2025,Credit Card,Bangalore,4
3,ORD0004,CUST8815,Female,51,Beauty,Perfume,2,4836.37,25-06-2025,Cash on Delivery,Mumbai,5
4,ORD0005,CUST1018,Female,39,Electronics,Smartphone,4,3580.24,25-12-2024,UPI,Kolkata,3


Convert Pandas DataFrame → Spark DataFrame

In [0]:
df = spark.createDataFrame(pdf)

df.printSchema()
df.display()


root
 |-- Order ID: string (nullable = true)
 |-- Customer ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Product Category: string (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Quantity: long (nullable = true)
 |-- Price: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Payment Method: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Rating: long (nullable = true)



Order ID,Customer ID,Gender,Age,Product Category,Product Name,Quantity,Price,Order Date,Payment Method,City,Rating
ORD0001,CUST9376,Female,43,Home,Lamp,1,1368.69,07-06-2025,Cash on Delivery,Hyderabad,3
ORD0002,CUST3289,Male,57,Toys,Lego Set,5,782.44,11-12-2024,Cash on Delivery,Chennai,5
ORD0003,CUST6409,Female,53,Clothing,Jacket,1,3676.18,05-05-2025,Credit Card,Bangalore,4
ORD0004,CUST8815,Female,51,Beauty,Perfume,2,4836.37,25-06-2025,Cash on Delivery,Mumbai,5
ORD0005,CUST1018,Female,39,Electronics,Smartphone,4,3580.24,25-12-2024,UPI,Kolkata,3
ORD0006,CUST4632,Male,27,Books,Fiction,2,4593.39,30-07-2024,Netbanking,Mumbai,1
ORD0007,CUST5769,Male,53,Beauty,Perfume,4,634.38,23-10-2024,Cash on Delivery,Chennai,2
ORD0008,CUST2501,Female,38,Toys,Action Figure,5,1509.52,29-01-2025,UPI,Bangalore,2
ORD0009,CUST5260,Female,22,Clothing,Jeans,1,3102.59,19-01-2025,Credit Card,Kolkata,2
ORD0010,CUST9857,Female,51,Clothing,T-Shirt,1,4501.05,19-12-2024,Cash on Delivery,Chennai,2


PySpark vs Pandas (Comparison)

In [0]:
# Pandas (single machine)
pdf.head()

# PySpark (distributed)
df.show(5)


+--------+-----------+------+---+----------------+------------+--------+-------+----------+----------------+---------+------+
|Order ID|Customer ID|Gender|Age|Product Category|Product Name|Quantity|  Price|Order Date|  Payment Method|     City|Rating|
+--------+-----------+------+---+----------------+------------+--------+-------+----------+----------------+---------+------+
| ORD0001|   CUST9376|Female| 43|            Home|        Lamp|       1|1368.69|07-06-2025|Cash on Delivery|Hyderabad|     3|
| ORD0002|   CUST3289|  Male| 57|            Toys|    Lego Set|       5| 782.44|11-12-2024|Cash on Delivery|  Chennai|     5|
| ORD0003|   CUST6409|Female| 53|        Clothing|      Jacket|       1|3676.18|05-05-2025|     Credit Card|Bangalore|     4|
| ORD0004|   CUST8815|Female| 51|          Beauty|     Perfume|       2|4836.37|25-06-2025|Cash on Delivery|   Mumbai|     5|
| ORD0005|   CUST1018|Female| 39|     Electronics|  Smartphone|       4|3580.24|25-12-2024|             UPI|  Kolkata|

Create Derived Feature – Revenue

In [0]:
df = df.withColumn(
    "Revenue",
    F.col("Quantity") * F.col("Price")
)

df.select(
    "Order ID", "Product Name", "Quantity", "Price", "Revenue"
).display()


Order ID,Product Name,Quantity,Price,Revenue
ORD0001,Lamp,1,1368.69,1368.69
ORD0002,Lego Set,5,782.44,3912.2
ORD0003,Jacket,1,3676.18,3676.18
ORD0004,Perfume,2,4836.37,9672.74
ORD0005,Smartphone,4,3580.24,14320.96
ORD0006,Fiction,2,4593.39,9186.78
ORD0007,Perfume,4,634.38,2537.52
ORD0008,Action Figure,5,1509.52,7547.6
ORD0009,Jeans,1,3102.59,3102.59
ORD0010,T-Shirt,1,4501.05,4501.05


Perform Joins (Inner, Left, Right, Outer)

Create a Customer Dimension Table

In [0]:
customers = df.select(
    "Customer ID", "Gender", "Age"
).dropDuplicates()


Inner Join

In [0]:
inner_join_df = df.join(customers, "Customer ID", "inner")
inner_join_df.display()

Customer ID,Order ID,Gender,Age,Product Category,Product Name,Quantity,Price,Order Date,Payment Method,City,Rating,Revenue,Gender.1,Age.1
CUST9376,ORD0001,Female,43,Home,Lamp,1,1368.69,07-06-2025,Cash on Delivery,Hyderabad,3,1368.69,Female,43
CUST3289,ORD0002,Male,57,Toys,Lego Set,5,782.44,11-12-2024,Cash on Delivery,Chennai,5,3912.2,Male,57
CUST6409,ORD0003,Female,53,Clothing,Jacket,1,3676.18,05-05-2025,Credit Card,Bangalore,4,3676.18,Female,28
CUST8815,ORD0004,Female,51,Beauty,Perfume,2,4836.37,25-06-2025,Cash on Delivery,Mumbai,5,9672.74,Female,51
CUST1018,ORD0005,Female,39,Electronics,Smartphone,4,3580.24,25-12-2024,UPI,Kolkata,3,14320.96,Female,39
CUST4632,ORD0006,Male,27,Books,Fiction,2,4593.39,30-07-2024,Netbanking,Mumbai,1,9186.78,Male,27
CUST5769,ORD0007,Male,53,Beauty,Perfume,4,634.38,23-10-2024,Cash on Delivery,Chennai,2,2537.52,Male,53
CUST2501,ORD0008,Female,38,Toys,Action Figure,5,1509.52,29-01-2025,UPI,Bangalore,2,7547.6,Female,38
CUST5260,ORD0009,Female,22,Clothing,Jeans,1,3102.59,19-01-2025,Credit Card,Kolkata,2,3102.59,Female,22
CUST9857,ORD0010,Female,51,Clothing,T-Shirt,1,4501.05,19-12-2024,Cash on Delivery,Chennai,2,4501.05,Female,51


Left Join

In [0]:
left_join_df = df.join(customers, "Customer ID", "left")
left_join_df.display()


Right Join

In [0]:
right_join_df = df.join(customers, "Customer ID", "right")
right_join_df.display()


Full Outer Join

In [0]:
outer_join_df = df.join(customers, "Customer ID", "outer")

outer_join_df.display()


Customer ID,Order ID,Gender,Age,Product Category,Product Name,Quantity,Price,Order Date,Payment Method,City,Rating,Revenue,Gender.1,Age.1
CUST3289,ORD0002,Male,57,Toys,Lego Set,5,782.44,11-12-2024,Cash on Delivery,Chennai,5,3912.2,Male,57
CUST9376,ORD0001,Female,43,Home,Lamp,1,1368.69,07-06-2025,Cash on Delivery,Hyderabad,3,1368.69,Female,43
CUST4981,ORD0012,Male,35,Home,Curtains,3,4240.88,15-05-2025,Credit Card,Kolkata,2,12722.64,Male,35
CUST9857,ORD0010,Female,51,Clothing,T-Shirt,1,4501.05,19-12-2024,Cash on Delivery,Chennai,2,4501.05,Female,51
CUST2501,ORD0008,Female,38,Toys,Action Figure,5,1509.52,29-01-2025,UPI,Bangalore,2,7547.6,Female,38
CUST8815,ORD0004,Female,51,Beauty,Perfume,2,4836.37,25-06-2025,Cash on Delivery,Mumbai,5,9672.74,Female,51
CUST5260,ORD0009,Female,22,Clothing,Jeans,1,3102.59,19-01-2025,Credit Card,Kolkata,2,3102.59,Female,22
CUST4632,ORD0006,Male,27,Books,Fiction,2,4593.39,30-07-2024,Netbanking,Mumbai,1,9186.78,Male,27
CUST1018,ORD0005,Female,39,Electronics,Smartphone,4,3580.24,25-12-2024,UPI,Kolkata,3,14320.96,Female,39
CUST9071,ORD0011,Female,23,Clothing,Dress,5,1448.53,06-07-2025,Cash on Delivery,Chennai,1,7242.65,Female,23


Complex Join (Aggregated Data + Fact Table)

In [0]:
product_revenue = df.groupBy(
    "Product Category", "Product Name"
).agg(
    F.sum("Revenue").alias("Total_Product_Revenue")
)

df_complex_join = df.join(
    product_revenue,
    ["Product Category", "Product Name"],
    "left"
)

df_complex_join.display()


Product Category,Product Name,Order ID,Customer ID,Gender,Age,Quantity,Price,Order Date,Payment Method,City,Rating,Revenue,Total_Product_Revenue
Home,Lamp,ORD0001,CUST9376,Female,43,1,1368.69,07-06-2025,Cash on Delivery,Hyderabad,3,1368.69,41530.600000000006
Toys,Lego Set,ORD0002,CUST3289,Male,57,5,782.44,11-12-2024,Cash on Delivery,Chennai,5,3912.2,36345.05
Clothing,Jacket,ORD0003,CUST6409,Female,53,1,3676.18,05-05-2025,Credit Card,Bangalore,4,3676.18,10985.43
Beauty,Perfume,ORD0004,CUST8815,Female,51,2,4836.37,25-06-2025,Cash on Delivery,Mumbai,5,9672.74,37875.850000000006
Electronics,Smartphone,ORD0005,CUST1018,Female,39,4,3580.24,25-12-2024,UPI,Kolkata,3,14320.96,58226.560000000005
Books,Fiction,ORD0006,CUST4632,Male,27,2,4593.39,30-07-2024,Netbanking,Mumbai,1,9186.78,35729.68
Beauty,Perfume,ORD0007,CUST5769,Male,53,4,634.38,23-10-2024,Cash on Delivery,Chennai,2,2537.52,37875.850000000006
Toys,Action Figure,ORD0008,CUST2501,Female,38,5,1509.52,29-01-2025,UPI,Bangalore,2,7547.6,21130.34
Clothing,Jeans,ORD0009,CUST5260,Female,22,1,3102.59,19-01-2025,Credit Card,Kolkata,2,3102.59,32370.84
Clothing,T-Shirt,ORD0010,CUST9857,Female,51,1,4501.05,19-12-2024,Cash on Delivery,Chennai,2,4501.05,46895.37


Window Function – Running Total per Customer

In [0]:
window_spec = Window.partitionBy("Customer ID") \
    .orderBy("Order ID") \
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)

df_running_total = df.withColumn(
    "Running_Revenue",
    F.sum("Revenue").over(window_spec)
)

df_running_total.select(
    "Customer ID", "Order ID", "Revenue", "Running_Revenue"
).display()


Customer ID,Order ID,Revenue,Running_Revenue
CUST1018,ORD0005,14320.96,14320.96
CUST1105,ORD0099,13232.88,13232.88
CUST1138,ORD0068,1944.52,1944.52
CUST1271,ORD0098,4306.4400000000005,4306.4400000000005
CUST1335,ORD0049,8542.9,8542.9
CUST1354,ORD0073,9591.36,9591.36
CUST1357,ORD0029,12701.72,12701.72
CUST1437,ORD0014,17174.45,17174.45
CUST1481,ORD0060,694.75,694.75
CUST1516,ORD0024,21926.15,21926.15


Window Function – Ranking Products by Revenue

In [0]:
product_rank = product_revenue.withColumn(
    "Rank",
    F.rank().over(Window.orderBy(F.desc("Total_Product_Revenue")))
)

product_rank.display()


UDF – Create Derived Feature (Age Group)

In [0]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

def age_group(age):
    if age < 25:
        return "Young"
    elif age < 40:
        return "Adult"
    else:
        return "Senior"

age_udf = udf(age_group, StringType())

df = df.withColumn(
    "Age_Group",
    age_udf(F.col("Age"))
)

df.select("Age", "Age_Group").display()


Age,Age_Group
43,Senior
57,Senior
53,Senior
51,Senior
39,Adult
27,Adult
53,Senior
38,Adult
22,Young
51,Senior
