In [2]:
!pip install pyspark



In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, max, min, count, sum

# Obtain the notebook's Spark session: getOrCreate() returns the session that is
# already active, or starts a new one with this application name.
spark = (
    SparkSession.builder
    .appName("Customers-Orders-Exercises")
    .getOrCreate()
)

# Customer records as (customer_id, name, city, age) tuples. Only the column names
# are supplied; the column types are left for Spark to infer from the values.
customers_cols = ["customer_id", "name", "city", "age"]
customers_data = [
    (1, "Rahul Sharma", "Bangalore", 28),
    (2, "Priya Singh", "Delhi", 32),
    (3, "Aman Kumar", "Hyderabad", 25),
    (4, "Sneha Reddy", "Chennai", 35),
    (5, "Arjun Mehta", "Mumbai", 30),
    (6, "Divya Nair", "Delhi", 29),
]
customers_df = spark.createDataFrame(customers_data, schema=customers_cols)

# Order records as (order_id, customer_id, product, amount) tuples. Only the column
# names are supplied; the column types are left for Spark to infer from the values.
orders_cols = ["order_id", "customer_id", "product", "amount"]
orders_data = [
    (101, 1, "Laptop", 55000),
    (102, 2, "Mobile", 25000),
    (103, 1, "Headphones", 3000),
    (104, 3, "Chair", 5000),
    (105, 5, "Book", 700),
    (106, 2, "Tablet", 20000),
    (107, 6, "Shoes", 2500),
    (108, 7, "Camera", 30000),  # customer_id 7 has no row in customers_data
]
orders_df = spark.createDataFrame(orders_data, schema=orders_cols)

# Print both input tables (customers first, then orders) so later results can be
# checked against them by eye.
for frame in (customers_df, orders_df):
    frame.show()

+-----------+------------+---------+---+
|customer_id|        name|     city|age|
+-----------+------------+---------+---+
|          1|Rahul Sharma|Bangalore| 28|
|          2| Priya Singh|    Delhi| 32|
|          3|  Aman Kumar|Hyderabad| 25|
|          4| Sneha Reddy|  Chennai| 35|
|          5| Arjun Mehta|   Mumbai| 30|
|          6|  Divya Nair|    Delhi| 29|
+-----------+------------+---------+---+

+--------+-----------+----------+------+
|order_id|customer_id|   product|amount|
+--------+-----------+----------+------+
|     101|          1|    Laptop| 55000|
|     102|          2|    Mobile| 25000|
|     103|          1|Headphones|  3000|
|     104|          3|     Chair|  5000|
|     105|          5|      Book|   700|
|     106|          2|    Tablet| 20000|
|     107|          6|     Shoes|  2500|
|     108|          7|    Camera| 30000|
+--------+-----------+----------+------+



In [8]:
# Project the customers table down to just the name and city columns.
customers_df.select(col("name"), col("city")).show()

+------------+---------+
|        name|     city|
+------------+---------+
|Rahul Sharma|Bangalore|
| Priya Singh|    Delhi|
|  Aman Kumar|Hyderabad|
| Sneha Reddy|  Chennai|
| Arjun Mehta|   Mumbai|
|  Divya Nair|    Delhi|
+------------+---------+



In [10]:
# Customers strictly older than 30 (a customer aged exactly 30 is excluded).
customers_df.where(customers_df["age"] > 30).show()

+-----------+-----------+-------+---+
|customer_id|       name|   city|age|
+-----------+-----------+-------+---+
|          2|Priya Singh|  Delhi| 32|
|          4|Sneha Reddy|Chennai| 35|
+-----------+-----------+-------+---+



In [12]:
# Number of customers whose city is exactly "Delhi".
delhi_customers = customers_df.where(col("city") == "Delhi")
print(delhi_customers.count())

2


In [13]:
# Unique city values across all customers.
city_column = customers_df.select(col("city"))
city_column.distinct().show()

+---------+
|     city|
+---------+
|Bangalore|
|    Delhi|
|Hyderabad|
|  Chennai|
|   Mumbai|
+---------+



In [14]:
# Mean age over every customer row, shown as a one-row, one-column result.
customers_df.select(avg(col("age"))).show()


+------------------+
|          avg(age)|
+------------------+
|29.833333333333332|
+------------------+



In [15]:
# Largest and smallest order amount, computed in a single aggregation.
# Note: max and min here are the pyspark.sql.functions imports, which shadow the
# Python builtins of the same name for the rest of the notebook.
orders_df.agg(max(col("amount")), min(col("amount"))).show()

+-----------+-----------+
|max(amount)|min(amount)|
+-----------+-----------+
|      55000|        700|
+-----------+-----------+



In [16]:
# Number of orders for each customer_id that appears in the orders table
# (no join, so ids without a customer record are counted too).
orders_per_customer = orders_df.groupBy("customer_id").agg(
    count("*").alias("order_count")
)
orders_per_customer.show()

+-----------+-----------+
|customer_id|order_count|
+-----------+-----------+
|          1|          2|
|          3|          1|
|          2|          2|
|          7|          1|
|          6|          1|
|          5|          1|
+-----------+-----------+



In [17]:
# Total order amount for each customer_id that appears in the orders table.
# sum is the pyspark.sql.functions import, not the Python builtin.
spend_per_customer = orders_df.groupBy("customer_id").agg(
    sum("amount").alias("total_spend")
)
spend_per_customer.show()

+-----------+-----------+
|customer_id|total_spend|
+-----------+-----------+
|          1|      58000|
|          3|       5000|
|          2|      45000|
|          7|      30000|
|          6|       2500|
|          5|        700|
+-----------+-----------+



In [18]:
# Inner join on customer_id: keeps only customer/order pairs that match on both
# sides, so customers without orders and orders without a customer are dropped.
customers_df.join(orders_df, on="customer_id", how="inner").show()

+-----------+------------+---------+---+--------+----------+------+
|customer_id|        name|     city|age|order_id|   product|amount|
+-----------+------------+---------+---+--------+----------+------+
|          1|Rahul Sharma|Bangalore| 28|     101|    Laptop| 55000|
|          1|Rahul Sharma|Bangalore| 28|     103|Headphones|  3000|
|          2| Priya Singh|    Delhi| 32|     102|    Mobile| 25000|
|          2| Priya Singh|    Delhi| 32|     106|    Tablet| 20000|
|          3|  Aman Kumar|Hyderabad| 25|     104|     Chair|  5000|
|          5| Arjun Mehta|   Mumbai| 30|     105|      Book|   700|
|          6|  Divya Nair|    Delhi| 29|     107|     Shoes|  2500|
+-----------+------------+---------+---+--------+----------+------+



In [19]:
# Left join on customer_id: every customer is kept; a customer with no orders
# appears once with NULL in the order columns.
customers_df.join(orders_df, on="customer_id", how="left").show()

+-----------+------------+---------+---+--------+----------+------+
|customer_id|        name|     city|age|order_id|   product|amount|
+-----------+------------+---------+---+--------+----------+------+
|          1|Rahul Sharma|Bangalore| 28|     103|Headphones|  3000|
|          1|Rahul Sharma|Bangalore| 28|     101|    Laptop| 55000|
|          3|  Aman Kumar|Hyderabad| 25|     104|     Chair|  5000|
|          2| Priya Singh|    Delhi| 32|     106|    Tablet| 20000|
|          2| Priya Singh|    Delhi| 32|     102|    Mobile| 25000|
|          6|  Divya Nair|    Delhi| 29|     107|     Shoes|  2500|
|          5| Arjun Mehta|   Mumbai| 30|     105|      Book|   700|
|          4| Sneha Reddy|  Chennai| 35|    NULL|      NULL|  NULL|
+-----------+------------+---------+---+--------+----------+------+



In [20]:
# Customers with no orders: after the left join, order_id is NULL on the customer
# rows that found no matching order (every order in this dataset has an order_id).
customers_left_orders = customers_df.join(orders_df, on="customer_id", how="left")
customers_left_orders.where(orders_df["order_id"].isNull()).show()

+-----------+-----------+-------+---+--------+-------+------+
|customer_id|       name|   city|age|order_id|product|amount|
+-----------+-----------+-------+---+--------+-------+------+
|          4|Sneha Reddy|Chennai| 35|    NULL|   NULL|  NULL|
+-----------+-----------+-------+---+--------+-------+------+



In [21]:
# Orders whose customer_id has no customer record. The filter deliberately refers to
# the customers-side key: the visible customer_id column of the joined frame comes
# from orders_df and is populated on every row, so testing that column for NULL
# would match nothing.
orders_left_customers = orders_df.join(customers_df, on="customer_id", how="left")
orders_left_customers.where(customers_df["customer_id"].isNull()).show()


+-----------+--------+-------+------+----+----+----+
|customer_id|order_id|product|amount|name|city| age|
+-----------+--------+-------+------+----+----+----+
|          7|     108| Camera| 30000|NULL|NULL|NULL|
+-----------+--------+-------+------+----+----+----+



In [22]:
# All customers, oldest first.
customers_df.sort(col("age"), ascending=False).show()

+-----------+------------+---------+---+
|customer_id|        name|     city|age|
+-----------+------------+---------+---+
|          4| Sneha Reddy|  Chennai| 35|
|          2| Priya Singh|    Delhi| 32|
|          5| Arjun Mehta|   Mumbai| 30|
|          6|  Divya Nair|    Delhi| 29|
|          1|Rahul Sharma|Bangalore| 28|
|          3|  Aman Kumar|Hyderabad| 25|
+-----------+------------+---------+---+



In [23]:
# The three highest-value orders: sort by amount descending, then keep three rows.
orders_by_amount_desc = orders_df.sort(col("amount"), ascending=False)
orders_by_amount_desc.limit(3).show()

+--------+-----------+-------+------+
|order_id|customer_id|product|amount|
+--------+-----------+-------+------+
|     101|          1| Laptop| 55000|
|     108|          7| Camera| 30000|
|     102|          2| Mobile| 25000|
+--------+-----------+-------+------+



In [24]:
# Mean customer age within each city.
avg_age_by_city = customers_df.groupBy("city").agg(
    avg("age").alias("avg_age")
)
avg_age_by_city.show()

+---------+-------+
|     city|avg_age|
+---------+-------+
|Bangalore|   28.0|
|    Delhi|   30.5|
|Hyderabad|   25.0|
|  Chennai|   35.0|
|   Mumbai|   30.0|
+---------+-------+



In [25]:
# Total order amount for each product.
# sum is the pyspark.sql.functions import, not the Python builtin.
sales_by_product = orders_df.groupBy("product").agg(
    sum("amount").alias("total_sales")
)
sales_by_product.show()

+----------+-----------+
|   product|total_sales|
+----------+-----------+
|     Chair|       5000|
|    Laptop|      55000|
|    Mobile|      25000|
|Headphones|       3000|
|      Book|        700|
|    Camera|      30000|
|     Shoes|       2500|
|    Tablet|      20000|
+----------+-----------+



In [29]:
# Expose both DataFrames to spark.sql() under the table names used by the queries
# below. createOrReplaceTempView overwrites any existing view of the same name, so
# re-running this cell is safe.
for view_name, frame in (("customers", customers_df), ("orders", orders_df)):
    frame.createOrReplaceTempView(view_name)

In [30]:
# Total order revenue per customer city. The JOIN is an inner join, so orders whose
# customer_id has no customer row and cities whose customers have no orders do not
# contribute to (or appear in) the result.
city_revenue_sql = """
    SELECT c.city, SUM(o.amount) as total_revenue
    FROM customers c
    JOIN orders o ON c.customer_id = o.customer_id
    GROUP BY c.city
"""
spark.sql(city_revenue_sql).show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|        58000|
|   Mumbai|          700|
|    Delhi|        47500|
|Hyderabad|         5000|
+---------+-------------+



In [31]:
# Top two customers by total order value (inner join: only customers that have at
# least one order, and only orders that belong to a known customer, are considered).
# Grouping includes customer_id so that two different customers who happen to share
# a name are not merged into a single total. customer_id is also the tie-breaker in
# ORDER BY, so LIMIT 2 selects the same rows on every run when totals are equal.
spark.sql("""
    SELECT c.name, SUM(o.amount) as total_spend
    FROM customers c
    JOIN orders o ON c.customer_id = o.customer_id
    GROUP BY c.customer_id, c.name
    ORDER BY total_spend DESC, c.customer_id
    LIMIT 2
""").show()

+------------+-----------+
|        name|total_spend|
+------------+-----------+
|Rahul Sharma|      58000|
| Priya Singh|      45000|
+------------+-----------+



In [32]:
# Customers whose total order value is strictly greater than 20000 (inner join: only
# orders that belong to a known customer are summed).
# Grouping includes customer_id so that two different customers who happen to share
# a name are totalled separately instead of being merged, which could otherwise push
# a combined total over the threshold.
spark.sql("""
    SELECT c.name, SUM(o.amount) as total_spend
    FROM customers c
    JOIN orders o ON c.customer_id = o.customer_id
    GROUP BY c.customer_id, c.name
    HAVING SUM(o.amount) > 20000
""").show()

+------------+-----------+
|        name|total_spend|
+------------+-----------+
|Rahul Sharma|      58000|
| Priya Singh|      45000|
+------------+-----------+

