In [26]:
import os
import pandas as pd
from faker import Faker
import random
from pyspark.sql.functions import round as spark_round,col,when,to_date,lit,avg
from datetime import datetime,timedelta
from sqlalchemy import create_engine
from pyspark.sql import SparkSession,Window
import numpy as np

In [27]:
out_dir = "./data"
os.makedirs(out_dir, exist_ok=True)
fake = Faker()

In [36]:
customers = []
for i in range(1,71):
    name=fake.name()
    email_name=name.lower().replace(" ","")
    email=f"{email_name}@gmail.com"
    country=fake.country()
    if i%10==0:
      email=None
    if i%15==0:
      email=None
    if i%20==0:
        name="##INVALID##"

    customers.append({
        "customer_id": i,
        "name": name,
        "email": email,
        "country":country
    })
pd.DataFrame(customers).to_csv(f"{out_dir }/customers.csv", index=False)
customers_df=pd.DataFrame(customers)
print(customers_df.head(30))

    customer_id                 name                         email  \
0             1       Tiffany Wilson       tiffanywilson@gmail.com   
1             2        Samuel Mendez        samuelmendez@gmail.com   
2             3     Melissa Gray PhD      melissagrayphd@gmail.com   
3             4  Kimberly Washington  kimberlywashington@gmail.com   
4             5      Jennifer Duncan      jenniferduncan@gmail.com   
5             6          Tyler Allen          tylerallen@gmail.com   
6             7            Erik Frey            erikfrey@gmail.com   
7             8      Sydney Robinson      sydneyrobinson@gmail.com   
8             9            Todd Hall            toddhall@gmail.com   
9            10            Dana Soto                          None   
10           11       Melissa Harris       melissaharris@gmail.com   
11           12        Carl Jennings        carljennings@gmail.com   
12           13        Robert Harris        robertharris@gmail.com   
13           14     

In [37]:
statuses = ["completed", "pending", "cancelled", "returned"]
orders = []

for i in range(1, 100):
    order_date = fake.date_between(start_date='-6m', end_date='today').strftime("%Y-%m-%d")
    if i % 17 == 0:
        order_date = None
    amount = __builtins__.round(random.uniform(-50, 2000), 2)
    if i % 12 == 0:
        amount = -abs(amount)
    status = random.choice(statuses)
    orders.append({
        "order_id": i,
        "customer_id": random.randint(1, 70),
        "product_id": random.randint(1, 50),
        "order_date": order_date,
        "amount": amount,
        "status": status
    })
orders_df = pd.DataFrame(orders)
orders_df.to_csv(f"{out_dir}/orders.csv", index=False)
print(orders_df.head())

   order_id  customer_id  product_id  order_date   amount     status
0         1           26          22  2025-09-10   872.62   returned
1         2           51          16  2025-09-10   212.68    pending
2         3           24          22  2025-09-10   241.38   returned
3         4           33          19  2025-09-10   237.49    pending
4         5           51          12  2025-09-10  1210.65  completed


In [38]:
products_catalog = [
    ("iPhone 14", "Electronics"), ("Samsung Galaxy S23", "Electronics"),
    ("MacBook Pro 14", "Electronics"), ("Sony WH-1000XM5 Headphones", "Electronics"),
    ("Apple Watch Series 9", "Wearables"), ("Fitbit Charge 5", "Wearables"),
    ("Dell XPS 13 Laptop", "Electronics"), ("iPad Air", "Tablets"),
    ("Bose QuietComfort Earbuds", "Audio"), ("Samsung Galaxy Buds", "Audio"),
    ("Canon EOS Rebel Camera", "Photography"), ("GoPro HERO10", "Photography"),
    ("Nike Air Max Shoes", "Footwear"), ("Puma Running Shoes", "Footwear"),
    ("Levi's 501 Jeans", "Apparel"), ("Adidas Hoodie", "Apparel"),
    ("Zara T-Shirt", "Apparel"), ("H&M Casual Dress", "Apparel"),
    ("Gucci Leather Belt", "Accessories"), ("Under Armour Shorts", "Sportswear"),
    ("Harry Potter Book Set", "Books"), ("The Lord of the Rings Trilogy", "Books"),
    ("Atomic Habits", "Books"), ("Rich Dad Poor Dad", "Books"),
    ("Python Crash Course", "Books"), ("Clean Code", "Books"),
    ("Game of Thrones Box Set", "Books"), ("The Lean Startup", "Books"),
    ("To Kill a Mockingbird", "Books"), ("1984", "Books"),
    ("Ikea Dining Table", "Furniture"), ("Sealy Memory Foam Mattress", "Furniture"),
    ("Ikea Chair Set", "Furniture"), ("Philips Air Fryer", "Kitchen Appliances"),
    ("NutriBullet Blender", "Kitchen Appliances"), ("Instant Pot Cooker", "Kitchen Appliances"),
    ("Hamilton Beach Toaster", "Kitchen Appliances"), ("Keurig Coffee Maker", "Kitchen Appliances"),
    ("Philips LED Desk Lamp", "Lighting"), ("Dyson V15 Vacuum Cleaner", "Home Appliances"),
    ("Wilson Tennis Racket", "Sports Equipment"), ("Adidas Football", "Sports Equipment"),
    ("Spalding Basketball", "Sports Equipment"), ("Yonex Badminton Racket", "Sports Equipment"),
    ("Nike Yoga Mat", "Fitness"), ("Reebok Jump Rope", "Fitness"),
    ("Speedo Swimming Goggles", "Sports Accessories"), ("Titleist Golf Balls", "Sports Equipment"),
    ("LEGO Star Wars Set", "Toys & Games"), ("Barbie Dreamhouse", "Dolls"),
    ("Hot Wheels Track Set", "Toys & Games"), ("Monopoly Board Game", "Board Games"),
    ("Rubik's Cube", "Puzzles"), ("Nerf Elite Blaster", "Outdoor Toys"),
    ("Play-Doh Fun Pack", "Arts & Crafts"), ("Fisher-Price Baby Gym", "Baby Toys"),
    ("Disney Princess Doll", "Dolls"), ("LEGO Technic Car", "Toys & Games")
]
products = []
for i, (name, category) in enumerate(products_catalog, start=1):
    products.append({
        "product_id": i,
        "name": name,
        "category": category,
        "price": round(random.uniform(10, 1200), 2)
    })
pd.DataFrame(products).to_csv(f"{out_dir}/products.csv", index=False)
products_df=pd.DataFrame(products)
print(products_df.head())

   product_id                        name     category    price
0           1                   iPhone 14  Electronics   237.28
1           2          Samsung Galaxy S23  Electronics  1015.56
2           3              MacBook Pro 14  Electronics    70.02
3           4  Sony WH-1000XM5 Headphones  Electronics   795.36
4           5        Apple Watch Series 9    Wearables  1197.66


In [31]:
jdbc_jar = r"C:\spark\spark-3.5.6-bin-hadoop3\jars\postgresql-42.7.7.jar"

spark = SparkSession.builder \
    .appName("DataEngineeringPipeline") \
    .config("spark.driver.extraClassPath", jdbc_jar) \
    .getOrCreate()

In [39]:
customers_df = spark.read.csv(f"{out_dir}/customers.csv", header=True, inferSchema=True)
products_df= spark.read.csv(f"{out_dir}/products.csv", header=True, inferSchema=True)
orders_df = spark.read.csv(f"{out_dir}/orders.csv", header=True, inferSchema=True)

In [40]:
customers_df = customers_df.toDF(*[c.lower().strip() for c in customers_df.columns])
products_df = products_df.toDF(*[c.lower().strip() for c in products_df.columns])
orders_df   = orders_df.toDF(*[c.lower().strip() for c in orders_df.columns])

In [41]:
customers_df=customers_df.withColumn("name",when((col("name")=="##INVALID##")| (col("name").isNull()),None).otherwise(col("name")))
customers_df = customers_df.withColumn("email",when((col("email")=="None") | (col("email").isNull()), None).otherwise(col("email")))
customers_df = customers_df.dropDuplicates(["customer_id"])

In [49]:
orders_df = orders_df.withColumn("order_date",to_date("order_date", "yyyy-MM-dd"))
orders_df = orders_df.withColumn("amount",when(col("amount") <= 0, 0).otherwise(col("amount")))

In [50]:
category_avg = products_df.groupBy("category").agg(avg("price").alias("avg_price"))
products_df = products_df.join(category_avg, on="category", how="left")
products_df = products_df.withColumn("price",when(col("price") <= 0, col("avg_price")).otherwise(col("price")))
products_df = products_df.drop("avg_price")

In [51]:
customers_df = customers_df.withColumnRenamed("name", "customer_name")
products_df  = products_df.withColumnRenamed("name", "product_name")

In [52]:
jdbc_url = "jdbc:postgresql://localhost:5432/de_pipeline"

In [53]:
connection_props = {
    "user": "postgres",
    "password": "root",
    "driver": "org.postgresql.Driver"
}

In [73]:
customers_df.write.jdbc(url=jdbc_url, table="customers", mode="append", properties=connection_props)

In [75]:
products_df.write.jdbc(url=jdbc_url, table="products", mode="append", properties=connection_props)

In [76]:
orders_df.write.jdbc(url=jdbc_url, table="orders", mode="append", properties=connection_props)

In [54]:
order_details = (
    orders_df
    .join(customers_df, "customer_id", "inner")
    .join(products_df, "product_id", "inner")
    .select(
        "order_id",
        customers_df["customer_name"],
        "country",
        products_df["product_name"],
        "category",
        "amount",
        "status",
        "order_date"
    )
)

In [61]:
order_details.show(20)

+--------+-------------------+--------------+--------------------+------------------+-------+---------+----------+
|order_id|      customer_name|       country|        product_name|          category| amount|   status|order_date|
+--------+-------------------+--------------+--------------------+------------------+-------+---------+----------+
|      93|     Tiffany Wilson|    Madagascar|To Kill a Mocking...|             Books| 369.56| returned|2025-09-10|
|      45|     Tiffany Wilson|    Madagascar|            iPad Air|           Tablets| 446.23| returned|2025-09-10|
|      33|     Tiffany Wilson|    Madagascar|  Nike Air Max Shoes|          Footwear|1249.39|  pending|2025-09-10|
|      23|      Samuel Mendez|      Colombia|      MacBook Pro 14|       Electronics|  634.8|cancelled|2025-09-10|
|      78|   Melissa Gray PhD|         Korea| Titleist Golf Balls|  Sports Equipment|1237.29| returned|2025-09-10|
|      16|   Melissa Gray PhD|         Korea|Hamilton Beach To...|Kitchen Applia

In [77]:
order_details.write.jdbc(url=jdbc_url, table="order_details", mode="append", properties=connection_props)

In [56]:
order_details.createOrReplaceTempView("order_details")

In [57]:
top_customers_df = spark.sql("""
    SELECT customer_name, ROUND(SUM(amount),2)AS total_spend
    FROM order_details
    where status ="completed"
    GROUP BY customer_name
    ORDER BY total_spend DESC
    LIMIT 5
""")
top_customers_df.show()

+--------------+-----------+
| customer_name|total_spend|
+--------------+-----------+
|  James Garner|    3691.17|
|Maureen Turner|     2842.9|
|Melissa Harris|    2529.28|
| Karen Bernard|    2480.26|
|    Terry Ward|    2344.79|
+--------------+-----------+



In [58]:
monthly_revenue_df = spark.sql("""
    SELECT DATE_FORMAT(order_date, 'yyyy-MM') AS month,
           ROUND(SUM(amount),2) AS revenue
    FROM order_details
    WHERE order_date >= add_months(current_date(), -6)
    GROUP BY DATE_FORMAT(order_date, 'yyyy-MM')
    ORDER BY month
""")
monthly_revenue_df.show()

+-------+--------+
|  month| revenue|
+-------+--------+
|2025-09|79842.69|
+-------+--------+



In [59]:
popular_category_df = spark.sql("""
    SELECT category, COUNT(order_id) AS total_orders
    FROM order_details
    GROUP BY category
    ORDER BY total_orders DESC
    LIMIT 1
""")
popular_category_df.show()

+--------+------------+
|category|total_orders|
+--------+------------+
|   Books|          16|
+--------+------------+



In [62]:
order_status_df = spark.sql("""
    SELECT 
        LOWER(TRIM(status)) AS order_status,
        COUNT(*) AS count
    FROM order_details
    WHERE LOWER(TRIM(status)) IN ('completed', 'cancelled')
    GROUP BY LOWER(TRIM(status))
""")
order_status_df.show()

+------------+-----+
|order_status|count|
+------------+-----+
|   completed|   25|
|   cancelled|   21|
+------------+-----+



In [65]:
top_customers_df.write.jdbc(url=jdbc_url, table="top_customers", mode="overwrite", properties=connection_props)

In [66]:
monthly_revenue_df.write.jdbc(url=jdbc_url, table="monthly_revenue", mode="overwrite", properties=connection_props)

In [68]:
popular_category_df.write.jdbc(url=jdbc_url, table="popular_category", mode="overwrite", properties=connection_props)

In [69]:
order_status_df.write.jdbc(url=jdbc_url, table="order_status", mode="overwrite", properties=connection_props)