In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, max, min, count, sum, desc

# Obtain the Spark session for this notebook (getOrCreate reuses an active one).
spark = (
    SparkSession.builder
    .appName("DataFrame-Exercises")
    .getOrCreate()
)

# Customer records, one tuple per customer in the column order given below.
# Note: customer 4 has no matching row in the orders data.
customers_cols = ["customer_id", "name", "city", "age"]
customers_data = [
    (1, "Rahul Sharma", "Bangalore", 28),
    (2, "Priya Singh", "Delhi", 32),
    (3, "Aman Kumar", "Hyderabad", 25),
    (4, "Sneha Reddy", "Chennai", 35),
    (5, "Arjun Mehta", "Mumbai", 30),
    (6, "Divya Nair", "Delhi", 29),
]
customers_df = spark.createDataFrame(customers_data, schema=customers_cols)

# Order records, one tuple per order in the column order given below.
# Note: order 108 references customer_id 7, which is absent from the customer data.
orders_cols = ["order_id", "customer_id", "product", "amount"]
orders_data = [
    (101, 1, "Laptop", 55000),
    (102, 2, "Mobile", 25000),
    (103, 1, "Headphones", 3000),
    (104, 3, "Chair", 5000),
    (105, 5, "Book", 700),
    (106, 2, "Tablet", 20000),
    (107, 6, "Shoes", 2500),
    (108, 7, "Camera", 30000),
]
orders_df = spark.createDataFrame(orders_data, schema=orders_cols)

# Project only the name and city columns.
customers_df.select("name", "city").show()

# Customers strictly older than 30.
customers_df.filter(col("age") > 30).show()

# count() returns a plain Python int rather than printing anything. A notebook
# only auto-displays the last expression of a cell, so the value must be
# printed explicitly or it is silently discarded.
print("Customers in Delhi:", customers_df.filter(col("city") == "Delhi").count())

# Unique cities that appear in the customer data.
customers_df.select("city").distinct().show()

+------------+---------+
|        name|     city|
+------------+---------+
|Rahul Sharma|Bangalore|
| Priya Singh|    Delhi|
|  Aman Kumar|Hyderabad|
| Sneha Reddy|  Chennai|
| Arjun Mehta|   Mumbai|
|  Divya Nair|    Delhi|
+------------+---------+

+-----------+-----------+-------+---+
|customer_id|       name|   city|age|
+-----------+-----------+-------+---+
|          2|Priya Singh|  Delhi| 32|
|          4|Sneha Reddy|Chennai| 35|
+-----------+-----------+-------+---+

+---------+
|     city|
+---------+
|Bangalore|
|    Delhi|
|Hyderabad|
|  Chennai|
|   Mumbai|
+---------+



In [2]:
# NOTE: avg / max / min / sum / count below are the pyspark.sql.functions
# column aggregates imported at the top of the notebook, not Python builtins.

# Mean age across all customers.
(
    customers_df
    .agg(avg(col("age")).alias("avg_age"))
    .show()
)

# Largest and smallest order amount over the whole orders table.
(
    orders_df
    .agg(
        max(col("amount")).alias("max_amount"),
        min(col("amount")).alias("min_amount"),
    )
    .show()
)

# Number of orders placed per customer_id.
(
    orders_df
    .groupBy("customer_id")
    .agg(count("*").alias("order_count"))
    .show()
)

# Total amount spent per customer_id.
(
    orders_df
    .groupBy("customer_id")
    .agg(sum(col("amount")).alias("total_spending"))
    .show()
)

+------------------+
|           avg_age|
+------------------+
|29.833333333333332|
+------------------+

+----------+----------+
|max_amount|min_amount|
+----------+----------+
|     55000|       700|
+----------+----------+

+-----------+-----------+
|customer_id|order_count|
+-----------+-----------+
|          1|          2|
|          3|          1|
|          2|          2|
|          7|          1|
|          6|          1|
|          5|          1|
+-----------+-----------+

+-----------+--------------+
|customer_id|total_spending|
+-----------+--------------+
|          1|         58000|
|          3|          5000|
|          2|         45000|
|          7|         30000|
|          6|          2500|
|          5|           700|
+-----------+--------------+



In [3]:
# Inner join: only customers that have at least one order, one row per order.
customers_df.join(orders_df, on="customer_id", how="inner").show()

# Left join: every customer is kept; order columns are null when there is no order.
customers_df.join(orders_df, on="customer_id", how="left").show()

# Left anti join: customers that have no order at all.
customers_df.join(orders_df, on="customer_id", how="left_anti").show()

# Left anti join the other way round: orders whose customer_id matches no customer.
orders_df.join(customers_df, on="customer_id", how="left_anti").show()

+-----------+------------+---------+---+--------+----------+------+
|customer_id|        name|     city|age|order_id|   product|amount|
+-----------+------------+---------+---+--------+----------+------+
|          1|Rahul Sharma|Bangalore| 28|     101|    Laptop| 55000|
|          1|Rahul Sharma|Bangalore| 28|     103|Headphones|  3000|
|          2| Priya Singh|    Delhi| 32|     102|    Mobile| 25000|
|          2| Priya Singh|    Delhi| 32|     106|    Tablet| 20000|
|          3|  Aman Kumar|Hyderabad| 25|     104|     Chair|  5000|
|          5| Arjun Mehta|   Mumbai| 30|     105|      Book|   700|
|          6|  Divya Nair|    Delhi| 29|     107|     Shoes|  2500|
+-----------+------------+---------+---+--------+----------+------+

+-----------+------------+---------+---+--------+----------+------+
|customer_id|        name|     city|age|order_id|   product|amount|
+-----------+------------+---------+---+--------+----------+------+
|          1|Rahul Sharma|Bangalore| 28|     10

In [4]:
# Customers from oldest to youngest.
customers_df.orderBy(desc("age")).show()

# The three orders with the highest amount.
(
    orders_df
    .orderBy(desc("amount"))
    .limit(3)
    .show()
)

# Average customer age per city.
(
    customers_df
    .groupBy("city")
    .agg(avg(col("age")).alias("avg_age"))
    .show()
)

# Total sales amount per product (sum here is pyspark.sql.functions.sum).
(
    orders_df
    .groupBy("product")
    .agg(sum(col("amount")).alias("total_sales"))
    .show()
)


+-----------+------------+---------+---+
|customer_id|        name|     city|age|
+-----------+------------+---------+---+
|          4| Sneha Reddy|  Chennai| 35|
|          2| Priya Singh|    Delhi| 32|
|          5| Arjun Mehta|   Mumbai| 30|
|          6|  Divya Nair|    Delhi| 29|
|          1|Rahul Sharma|Bangalore| 28|
|          3|  Aman Kumar|Hyderabad| 25|
+-----------+------------+---------+---+

+--------+-----------+-------+------+
|order_id|customer_id|product|amount|
+--------+-----------+-------+------+
|     101|          1| Laptop| 55000|
|     108|          7| Camera| 30000|
|     102|          2| Mobile| 25000|
+--------+-----------+-------+------+

+---------+-------+
|     city|avg_age|
+---------+-------+
|Bangalore|   28.0|
|    Delhi|   30.5|
|Hyderabad|   25.0|
|  Chennai|   35.0|
|   Mumbai|   30.0|
+---------+-------+

+----------+-----------+
|   product|total_sales|
+----------+-----------+
|     Chair|       5000|
|    Laptop|      55000|
|    Mobile|    

In [5]:
# Expose both DataFrames to Spark SQL under stable view names.
customers_df.createOrReplaceTempView("customers")
orders_df.createOrReplaceTempView("orders")

# Total revenue per city. The inner join drops orders whose customer_id has
# no customer row, because such orders have no city to attribute revenue to.
spark.sql("""
    SELECT c.city, SUM(o.amount) AS total_revenue
    FROM customers c
    JOIN orders o ON c.customer_id = o.customer_id
    GROUP BY c.city
""").show()

# Top 2 customers by total spend. Group by customer_id as well as name:
# grouping by name alone would merge two different customers who happen to
# share a name into a single, inflated total.
spark.sql("""
    SELECT c.name, SUM(o.amount) AS total_spend
    FROM customers c
    JOIN orders o ON c.customer_id = o.customer_id
    GROUP BY c.customer_id, c.name
    ORDER BY total_spend DESC
    LIMIT 2
""").show()

# Names of customers whose total spend exceeds 20000. DISTINCT means that
# several qualifying customers with the same name are listed once.
spark.sql("""
    SELECT DISTINCT c.name
    FROM customers c
    JOIN orders o ON c.customer_id = o.customer_id
    GROUP BY c.customer_id, c.name
    HAVING SUM(o.amount) > 20000
""").show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|        58000|
|   Mumbai|          700|
|    Delhi|        47500|
|Hyderabad|         5000|
+---------+-------------+

+------------+-----------+
|        name|total_spend|
+------------+-----------+
|Rahul Sharma|      58000|
| Priya Singh|      45000|
+------------+-----------+

+------------+
|        name|
+------------+
|Rahul Sharma|
| Priya Singh|
+------------+

