In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, sum, desc

# Step 1: Create Spark Session
spark = SparkSession.builder.appName("ECommerce-Capstone").getOrCreate()

# Customers DataFrame
customers_data = [
    (1, "Rahul Sharma", "Bangalore", 28),
    (2, "Priya Singh", "Delhi", 32),
    (3, "Aman Kumar", "Hyderabad", 25),
    (4, "Sneha Reddy", "Chennai", 35),
    (5, "Arjun Mehta", "Mumbai", 30),
    (6, "Divya Nair", "Delhi", 29)
]
customers_cols = ["customer_id", "name", "city", "age"]
customers_df = spark.createDataFrame(customers_data, customers_cols)

# Products DataFrame
products_data = [
    (101, "Laptop", "Electronics", 55000),
    (102, "Mobile", "Electronics", 25000),
    (103, "Headphones", "Electronics", 3000),
    (104, "Chair", "Furniture", 5000),
    (105, "Book", "Stationery", 700),
    (106, "Shoes", "Fashion", 2500)
]
products_cols = ["product_id", "product_name", "category", "price"]
products_df = spark.createDataFrame(products_data, products_cols)

# Orders DataFrame
orders_data = [
    (1001, 1, 101, 1),
    (1002, 2, 102, 2),
    (1003, 1, 103, 3),
    (1004, 3, 104, 1),
    (1005, 5, 105, 5),
    (1006, 6, 106, 2),
    (1007, 7, 101, 1) # Non-existent customer
]
orders_cols = ["order_id", "customer_id", "product_id", "quantity"]
orders_df = spark.createDataFrame(orders_data, orders_cols)

In [2]:
customers_df.select("name", "city").show()

products_df.select("category").distinct().show()

customers_df.filter(col("age") > 30).show()

+------------+---------+
|        name|     city|
+------------+---------+
|Rahul Sharma|Bangalore|
| Priya Singh|    Delhi|
|  Aman Kumar|Hyderabad|
| Sneha Reddy|  Chennai|
| Arjun Mehta|   Mumbai|
|  Divya Nair|    Delhi|
+------------+---------+

+-----------+
|   category|
+-----------+
|Electronics|
| Stationery|
|    Fashion|
|  Furniture|
+-----------+

+-----------+-----------+-------+---+
|customer_id|       name|   city|age|
+-----------+-----------+-------+---+
|          2|Priya Singh|  Delhi| 32|
|          4|Sneha Reddy|Chennai| 35|
+-----------+-----------+-------+---+



In [3]:
orders_df.groupBy("customer_id").agg(count("order_id").alias("total_orders")).show()

customers_df.groupBy("city").agg(avg("age").alias("avg_age")).show()

orders_df.join(products_df, "product_id") \
    .withColumn("revenue", col("quantity") * col("price")) \
    .groupBy("product_name").agg(sum("revenue").alias("total_revenue")).show()

+-----------+------------+
|customer_id|total_orders|
+-----------+------------+
|          1|           2|
|          2|           1|
|          7|           1|
|          6|           1|
|          5|           1|
|          3|           1|
+-----------+------------+

+---------+-------+
|     city|avg_age|
+---------+-------+
|Bangalore|   28.0|
|    Delhi|   30.5|
|Hyderabad|   25.0|
|  Chennai|   35.0|
|   Mumbai|   30.0|
+---------+-------+

+------------+-------------+
|product_name|total_revenue|
+------------+-------------+
|       Chair|         5000|
|        Book|         3500|
|      Laptop|       110000|
|       Shoes|         5000|
|      Mobile|        50000|
|  Headphones|         9000|
+------------+-------------+



In [4]:
customers_df.join(orders_df, "customer_id", "inner") \
    .select("name", "order_id", "product_id").show()

orders_df.join(products_df, "product_id") \
    .select("order_id", "product_name", "price", "quantity").show()

customers_df.join(orders_df, "customer_id", "left_anti").show()

products_df.join(orders_df, "product_id", "left_anti").show()


+------------+--------+----------+
|        name|order_id|product_id|
+------------+--------+----------+
|Rahul Sharma|    1001|       101|
|Rahul Sharma|    1003|       103|
| Priya Singh|    1002|       102|
|  Aman Kumar|    1004|       104|
| Arjun Mehta|    1005|       105|
|  Divya Nair|    1006|       106|
+------------+--------+----------+

+--------+------------+-----+--------+
|order_id|product_name|price|quantity|
+--------+------------+-----+--------+
|    1001|      Laptop|55000|       1|
|    1007|      Laptop|55000|       1|
|    1002|      Mobile|25000|       2|
|    1003|  Headphones| 3000|       3|
|    1004|       Chair| 5000|       1|
|    1005|        Book|  700|       5|
|    1006|       Shoes| 2500|       2|
+--------+------------+-----+--------+

+-----------+-----------+-------+---+
|customer_id|       name|   city|age|
+-----------+-----------+-------+---+
|          4|Sneha Reddy|Chennai| 35|
+-----------+-----------+-------+---+

+----------+------------+---

In [5]:
orders_df.join(products_df, "product_id") \
    .orderBy(desc("price")).select("product_name", "price").distinct().show(3)

orders_df.join(products_df, "product_id") \
    .withColumn("revenue", col("quantity") * col("price")) \
    .groupBy("category").agg(sum("revenue").alias("total_revenue")).show()

orders_df.join(products_df, "product_id") \
    .join(customers_df, "customer_id") \
    .withColumn("spend", col("quantity") * col("price")) \
    .groupBy("name").agg(sum("spend").alias("total_spent")) \
    .orderBy(desc("total_spent")).show()

+------------+-----+
|product_name|price|
+------------+-----+
|        Book|  700|
|      Mobile|25000|
|       Chair| 5000|
+------------+-----+
only showing top 3 rows

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
| Stationery|         3500|
|    Fashion|         5000|
|Electronics|       169000|
|  Furniture|         5000|
+-----------+-------------+

+------------+-----------+
|        name|total_spent|
+------------+-----------+
|Rahul Sharma|      64000|
| Priya Singh|      50000|
|  Divya Nair|       5000|
|  Aman Kumar|       5000|
| Arjun Mehta|       3500|
+------------+-----------+



In [6]:
customers_df.createOrReplaceTempView("customers")
products_df.createOrReplaceTempView("products")
orders_df.createOrReplaceTempView("orders")

spark.sql("""
    SELECT c.city, SUM(p.price * o.quantity) AS total_revenue
    FROM orders o
    JOIN customers c ON o.customer_id = c.customer_id
    JOIN products p ON o.product_id = p.product_id
    GROUP BY c.city
    ORDER BY total_revenue DESC
    LIMIT 2
""").show()

spark.sql("""
    SELECT c.name, SUM(p.price * o.quantity) AS total_spent
    FROM orders o
    JOIN customers c ON o.customer_id = c.customer_id
    JOIN products p ON o.product_id = p.product_id
    GROUP BY c.name
    HAVING SUM(p.price * o.quantity) > 50000
""").show()

spark.sql("""
    SELECT p.category, SUM(p.price * o.quantity) AS total_revenue
    FROM orders o
    JOIN products p ON o.product_id = p.product_id
    GROUP BY p.category
    ORDER BY total_revenue DESC
    LIMIT 1
""").show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|        64000|
|    Delhi|        55000|
+---------+-------------+

+------------+-----------+
|        name|total_spent|
+------------+-----------+
|Rahul Sharma|      64000|
+------------+-----------+

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|Electronics|       169000|
+-----------+-------------+

