In [19]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Ecommerce_data").getOrCreate()
from pyspark.sql.functions import col, avg, sum, desc

customers_data = [
(1, "Rahul Sharma", "Bangalore", 28),
(2, "Priya Singh", "Delhi", 32),
(3, "Aman Kumar", "Hyderabad", 25),
(4, "Sneha Reddy", "Chennai", 35),
(5, "Arjun Mehta", "Mumbai", 30),
(6, "Divya Nair", "Delhi", 29)
]
customers_cols = ["customer_id", "name", "city", "age"]
customers_df = spark.createDataFrame(customers_data, customers_cols)
customers_df.show()

products_data = [
(101, "Laptop", "Electronics", 55000),
(102, "Mobile", "Electronics", 25000),
(103, "Headphones", "Electronics", 3000),
(104, "Chair", "Furniture", 5000),
(105, "Book", "Stationery", 700),
(106, "Shoes", "Fashion", 2500)
]
products_cols = ["product_id", "product_name", "category", "price"]
products_df = spark.createDataFrame(products_data, products_cols)
products_df.show()

orders_data = [
(1001, 1, 101, 1),
(1002, 2, 102, 2),
(1003, 1, 103, 3),
(1004, 3, 104, 1),
(1005, 5, 105, 5),
(1006, 6, 106, 2),
(1007, 7, 101, 1)
]
orders_cols = ["order_id", "customer_id", "product_id", "quantity"]
orders_df = spark.createDataFrame(orders_data, orders_cols)
orders_df.show()


+-----------+------------+---------+---+
|customer_id|        name|     city|age|
+-----------+------------+---------+---+
|          1|Rahul Sharma|Bangalore| 28|
|          2| Priya Singh|    Delhi| 32|
|          3|  Aman Kumar|Hyderabad| 25|
|          4| Sneha Reddy|  Chennai| 35|
|          5| Arjun Mehta|   Mumbai| 30|
|          6|  Divya Nair|    Delhi| 29|
+-----------+------------+---------+---+

+----------+------------+-----------+-----+
|product_id|product_name|   category|price|
+----------+------------+-----------+-----+
|       101|      Laptop|Electronics|55000|
|       102|      Mobile|Electronics|25000|
|       103|  Headphones|Electronics| 3000|
|       104|       Chair|  Furniture| 5000|
|       105|        Book| Stationery|  700|
|       106|       Shoes|    Fashion| 2500|
+----------+------------+-----------+-----+

+--------+-----------+----------+--------+
|order_id|customer_id|product_id|quantity|
+--------+-----------+----------+--------+
|    1001|         

In [20]:
customers_df.select("name", "city").show()

+------------+---------+
|        name|     city|
+------------+---------+
|Rahul Sharma|Bangalore|
| Priya Singh|    Delhi|
|  Aman Kumar|Hyderabad|
| Sneha Reddy|  Chennai|
| Arjun Mehta|   Mumbai|
|  Divya Nair|    Delhi|
+------------+---------+



In [21]:
products_df.select("category").distinct().show()

+-----------+
|   category|
+-----------+
|Electronics|
| Stationery|
|    Fashion|
|  Furniture|
+-----------+



In [10]:
customers_df.filter(customers_df.age > 30).show()

+-----------+-----------+-------+---+
|customer_id|       name|   city|age|
+-----------+-----------+-------+---+
|          2|Priya Singh|  Delhi| 32|
|          4|Sneha Reddy|Chennai| 35|
+-----------+-----------+-------+---+



In [14]:
orders_df.groupBy("customer_id").count().show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|          1|    2|
|          2|    1|
|          7|    1|
|          6|    1|
|          5|    1|
|          3|    1|
+-----------+-----+



In [17]:
customers_df.groupBy("city").agg(avg("age").alias("avg_age")).show()

+---------+-------+
|     city|avg_age|
+---------+-------+
|Bangalore|   28.0|
|    Delhi|   30.5|
|Hyderabad|   25.0|
|  Chennai|   35.0|
|   Mumbai|   30.0|
+---------+-------+



In [23]:
orders_df.join(products_df, "product_id") \
    .withColumn("revenue", col("quantity") * col("price")) \
    .groupBy("product_id", "product_name") \
    .agg(sum("revenue").alias("total_revenue")) \
    .show()

+----------+------------+-------------+
|product_id|product_name|total_revenue|
+----------+------------+-------------+
|       101|      Laptop|       110000|
|       102|      Mobile|        50000|
|       103|  Headphones|         9000|
|       104|       Chair|         5000|
|       105|        Book|         3500|
|       106|       Shoes|         5000|
+----------+------------+-------------+



In [25]:
customers_df.join(orders_df,"customer_id", "inner")\
  .select("name", "order_id", "product_id", "quantity").show()

+------------+--------+----------+--------+
|        name|order_id|product_id|quantity|
+------------+--------+----------+--------+
|Rahul Sharma|    1001|       101|       1|
|Rahul Sharma|    1003|       103|       3|
| Priya Singh|    1002|       102|       2|
|  Aman Kumar|    1004|       104|       1|
| Arjun Mehta|    1005|       105|       5|
|  Divya Nair|    1006|       106|       2|
+------------+--------+----------+--------+



In [28]:
orders_df.join(products_df, "product_id", "inner")\
  .select("order_id", "product_name", "price", "quantity").show()

+--------+------------+-----+--------+
|order_id|product_name|price|quantity|
+--------+------------+-----+--------+
|    1001|      Laptop|55000|       1|
|    1007|      Laptop|55000|       1|
|    1002|      Mobile|25000|       2|
|    1003|  Headphones| 3000|       3|
|    1004|       Chair| 5000|       1|
|    1005|        Book|  700|       5|
|    1006|       Shoes| 2500|       2|
+--------+------------+-----+--------+



In [30]:
customers_df.join(orders_df, "customer_id", "left_anti").show()

+-----------+-----------+-------+---+
|customer_id|       name|   city|age|
+-----------+-----------+-------+---+
|          4|Sneha Reddy|Chennai| 35|
+-----------+-----------+-------+---+



In [33]:
products_df.join(orders_df, "product_id", "left_anti").show()

+----------+------------+--------+-----+
|product_id|product_name|category|price|
+----------+------------+--------+-----+
+----------+------------+--------+-----+



In [34]:
orders_df.join(products_df, "product_id") \
    .select("product_name", "price") \
    .distinct() \
    .orderBy(desc("price")) \
    .limit(3).show()

+------------+-----+
|product_name|price|
+------------+-----+
|      Laptop|55000|
|      Mobile|25000|
|       Chair| 5000|
+------------+-----+



In [36]:
orders_df.join(products_df, "product_id") \
    .withColumn("revenue", col("quantity") * col("price")) \
    .groupBy("category") \
    .agg(sum("revenue").alias("total_revenue")) \
    .show()

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
| Stationery|         3500|
|    Fashion|         5000|
|Electronics|       169000|
|  Furniture|         5000|
+-----------+-------------+



In [37]:
customers_df.join(orders_df, "customer_id") \
    .join(products_df, "product_id") \
    .withColumn("spend", col("quantity") * col("price")) \
    .groupBy("customer_id", "name") \
    .agg(sum("spend").alias("total_spent")) \
    .orderBy(desc("total_spent")).show()

+-----------+------------+-----------+
|customer_id|        name|total_spent|
+-----------+------------+-----------+
|          1|Rahul Sharma|      64000|
|          2| Priya Singh|      50000|
|          6|  Divya Nair|       5000|
|          3|  Aman Kumar|       5000|
|          5| Arjun Mehta|       3500|
+-----------+------------+-----------+



In [39]:
customers_df.createOrReplaceTempView("customers")
products_df.createOrReplaceTempView("products")
orders_df.createOrReplaceTempView("orders")

In [41]:
spark.sql("""
    SELECT c.city, SUM(o.quantity * p.price) AS total_revenue
    FROM customers c
    JOIN orders o ON c.customer_id = o.customer_id
    JOIN products p ON o.product_id = p.product_id
    GROUP BY c.city
    ORDER BY total_revenue DESC
    LIMIT 2
""").show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|        64000|
|    Delhi|        55000|
+---------+-------------+



In [43]:
spark.sql("""
    SELECT c.name, SUM(o.quantity * p.price) AS total_spent
    FROM customers c
    JOIN orders o ON c.customer_id = o.customer_id
    JOIN products p ON o.product_id = p.product_id
    GROUP BY c.name
    HAVING total_spent > 50000
""").show()

+------------+-----------+
|        name|total_spent|
+------------+-----------+
|Rahul Sharma|      64000|
+------------+-----------+



In [45]:
spark.sql("""
    SELECT p.category, SUM(o.quantity * p.price) AS total_revenue
    FROM orders o
    JOIN products p ON o.product_id = p.product_id
    GROUP BY p.category
    ORDER BY total_revenue DESC
    LIMIT 1
""").show()

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|Electronics|       169000|
+-----------+-------------+

