In [1]:
# findspark.init() must run before any pyspark import in this notebook;
# presumably it locates the local Spark install and makes pyspark importable — confirm for your environment.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession (an already-running session is reused),
# registered under the application name 'SparkAssignment'.
spark = (
    SparkSession.builder
    .appName('SparkAssignment')
    .getOrCreate()
)

1. Basic DataFrame Operations

In [3]:
# Read sales.txt as CSV: the first line supplies the column names and
# Spark infers each column's type from the data.
sales_reader = spark.read.format("csv").options(header=True, inferSchema=True)
df_sales = sales_reader.load("sales.txt")

# Inspect the inferred schema, then the full (untruncated) contents.
df_sales.printSchema()
df_sales.show(truncate=False)

root
 |-- sales_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- sale_date: date (nullable = true)
 |-- region: string (nullable = true)

+--------+-----------+-------+------+----------+------+
|sales_id|customer_id|product|amount|sale_date |region|
+--------+-----------+-------+------+----------+------+
|1       |101        |Laptop |50000 |2023-01-15|North |
|2       |102        |Mobile |15000 |2023-02-10|South |
|3       |103        |Tablet |20000 |2023-03-05|West  |
|4       |104        |Laptop |55000 |2023-03-15|East  |
|5       |105        |Desktop|40000 |2023-04-20|North |
|6       |101        |Mobile |15000 |2023-05-10|South |
|7       |102        |Laptop |60000 |2023-06-15|East  |
|8       |103        |Tablet |20000 |2023-07-05|North |
|9       |104        |Desktop|45000 |2023-08-10|West  |
|10      |105        |Laptop |70000 |2023-09-25|North |
+--------+-----------+----

In [4]:
# Read customer.txt as CSV: the first line supplies the column names and
# Spark infers each column's type from the data.
customer_reader = spark.read.format("csv").options(header=True, inferSchema=True)
df_customer = customer_reader.load("customer.txt")

# Inspect the inferred schema, then the full (untruncated) contents.
df_customer.printSchema()
df_customer.show(truncate=False)

root
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)

+-----------+-------------+---------------------+---+---------+
|customer_id|customer_name|email                |age|city     |
+-----------+-------------+---------------------+---+---------+
|101        |Arun Sharma  |arun.sharma@email.com|28 |Delhi    |
|102        |Meena Verma  |meena.verma@email.com|34 |Mumbai   |
|103        |Rahul Yadav  |rahul.yadav@email.com|30 |Bangalore|
|104        |Priya Patel  |priya.patel@email.com|27 |Ahmedabad|
|105        |Sneha Reddy  |sneha.reddy@email.com|29 |Hyderabad|
|106        |Vikas Jain   |vikas.jain@email.com |31 |Chennai  |
|107        |Amit Roy     |amit.roy@email.com   |35 |Kolkata  |
+-----------+-------------+---------------------+---+---------+



In [52]:
# Register df_sales as the temporary view `sales` so the spark.sql(...) cells
# further down can query it by name.
df_sales.createOrReplaceTempView(name='sales')

In [5]:
# Preview the first five sales rows without truncating cell values.
df_sales.show(n=5, truncate=False)

+--------+-----------+-------+------+----------+------+
|sales_id|customer_id|product|amount|sale_date |region|
+--------+-----------+-------+------+----------+------+
|1       |101        |Laptop |50000 |2023-01-15|North |
|2       |102        |Mobile |15000 |2023-02-10|South |
|3       |103        |Tablet |20000 |2023-03-05|West  |
|4       |104        |Laptop |55000 |2023-03-15|East  |
|5       |105        |Desktop|40000 |2023-04-20|North |
+--------+-----------+-------+------+----------+------+
only showing top 5 rows



In [7]:
# Shape of the customer DataFrame: columns come from the schema,
# rows require a Spark count() action.
n_columns = len(df_customer.columns)
n_rows = df_customer.count()
print("number of columns:", n_columns)
print("number of rows:", n_rows)

number of columns: 5
number of rows: 7


2. Data Cleaning

In [8]:
# Two sales count as duplicates when every field except the sales_id key matches.
duplicate_key = ['customer_id','amount','product','sale_date','region']
df_sales.dropDuplicates(subset=duplicate_key).show(truncate=False)

+--------+-----------+-------+------+----------+------+
|sales_id|customer_id|product|amount|sale_date |region|
+--------+-----------+-------+------+----------+------+
|9       |104        |Desktop|45000 |2023-08-10|West  |
|3       |103        |Tablet |20000 |2023-03-05|West  |
|7       |102        |Laptop |60000 |2023-06-15|East  |
|6       |101        |Mobile |15000 |2023-05-10|South |
|1       |101        |Laptop |50000 |2023-01-15|North |
|10      |105        |Laptop |70000 |2023-09-25|North |
|4       |104        |Laptop |55000 |2023-03-15|East  |
|5       |105        |Desktop|40000 |2023-04-20|North |
|8       |103        |Tablet |20000 |2023-07-05|North |
|2       |102        |Mobile |15000 |2023-02-10|South |
+--------+-----------+-------+------+----------+------+



In [9]:
# Discard every sales row that has a null in any column.
df_sales.na.drop().show(truncate=False)

+--------+-----------+-------+------+----------+------+
|sales_id|customer_id|product|amount|sale_date |region|
+--------+-----------+-------+------+----------+------+
|1       |101        |Laptop |50000 |2023-01-15|North |
|2       |102        |Mobile |15000 |2023-02-10|South |
|3       |103        |Tablet |20000 |2023-03-05|West  |
|4       |104        |Laptop |55000 |2023-03-15|East  |
|5       |105        |Desktop|40000 |2023-04-20|North |
|6       |101        |Mobile |15000 |2023-05-10|South |
|7       |102        |Laptop |60000 |2023-06-15|East  |
|8       |103        |Tablet |20000 |2023-07-05|North |
|9       |104        |Desktop|45000 |2023-08-10|West  |
|10      |105        |Laptop |70000 |2023-09-25|North |
+--------+-----------+-------+------+----------+------+



In [10]:
# Replace missing amount values with 0; other columns are left untouched.
df_sales.fillna(0, subset=['amount']).show(truncate=False)

+--------+-----------+-------+------+----------+------+
|sales_id|customer_id|product|amount|sale_date |region|
+--------+-----------+-------+------+----------+------+
|1       |101        |Laptop |50000 |2023-01-15|North |
|2       |102        |Mobile |15000 |2023-02-10|South |
|3       |103        |Tablet |20000 |2023-03-05|West  |
|4       |104        |Laptop |55000 |2023-03-15|East  |
|5       |105        |Desktop|40000 |2023-04-20|North |
|6       |101        |Mobile |15000 |2023-05-10|South |
|7       |102        |Laptop |60000 |2023-06-15|East  |
|8       |103        |Tablet |20000 |2023-07-05|North |
|9       |104        |Desktop|45000 |2023-08-10|West  |
|10      |105        |Laptop |70000 |2023-09-25|North |
+--------+-----------+-------+------+----------+------+



In [11]:
# Replace missing email values with the placeholder "unknown"; other columns are left untouched.
df_customer.fillna("unknown", subset=['email']).show(truncate=False)

+-----------+-------------+---------------------+---+---------+
|customer_id|customer_name|email                |age|city     |
+-----------+-------------+---------------------+---+---------+
|101        |Arun Sharma  |arun.sharma@email.com|28 |Delhi    |
|102        |Meena Verma  |meena.verma@email.com|34 |Mumbai   |
|103        |Rahul Yadav  |rahul.yadav@email.com|30 |Bangalore|
|104        |Priya Patel  |priya.patel@email.com|27 |Ahmedabad|
|105        |Sneha Reddy  |sneha.reddy@email.com|29 |Hyderabad|
|106        |Vikas Jain   |vikas.jain@email.com |31 |Chennai  |
|107        |Amit Roy     |amit.roy@email.com   |35 |Kolkata  |
+-----------+-------------+---------------------+---+---------+



3. Column Manipulation

In [12]:
# 9. Add a new column discounted_amount to the sales DataFrame that applies a 10% discount on amount
from pyspark.sql.functions import expr

# Keep every existing column and append the discounted price (90% of amount).
discounted = expr("amount*0.9").alias("discounted_amount")
df_sales.select("*", discounted).show()

+--------+-----------+-------+------+----------+------+-----------------+
|sales_id|customer_id|product|amount| sale_date|region|discounted_amount|
+--------+-----------+-------+------+----------+------+-----------------+
|       1|        101| Laptop| 50000|2023-01-15| North|          45000.0|
|       2|        102| Mobile| 15000|2023-02-10| South|          13500.0|
|       3|        103| Tablet| 20000|2023-03-05|  West|          18000.0|
|       4|        104| Laptop| 55000|2023-03-15|  East|          49500.0|
|       5|        105|Desktop| 40000|2023-04-20| North|          36000.0|
|       6|        101| Mobile| 15000|2023-05-10| South|          13500.0|
|       7|        102| Laptop| 60000|2023-06-15|  East|          54000.0|
|       8|        103| Tablet| 20000|2023-07-05| North|          18000.0|
|       9|        104|Desktop| 45000|2023-08-10|  West|          40500.0|
|      10|        105| Laptop| 70000|2023-09-25| North|          63000.0|
+--------+-----------+-------+------+-

In [13]:
# 10.Rename the city column in the customer DataFrame to customer_city

# df_customer is rebound here, so every later cell sees `customer_city` instead of `city`.
df_customer = df_customer.withColumnRenamed(existing="city", new="customer_city")
df_customer.show()

+-----------+-------------+--------------------+---+-------------+
|customer_id|customer_name|               email|age|customer_city|
+-----------+-------------+--------------------+---+-------------+
|        101|  Arun Sharma|arun.sharma@email...| 28|        Delhi|
|        102|  Meena Verma|meena.verma@email...| 34|       Mumbai|
|        103|  Rahul Yadav|rahul.yadav@email...| 30|    Bangalore|
|        104|  Priya Patel|priya.patel@email...| 27|    Ahmedabad|
|        105|  Sneha Reddy|sneha.reddy@email...| 29|    Hyderabad|
|        106|   Vikas Jain|vikas.jain@email.com| 31|      Chennai|
|        107|     Amit Roy|  amit.roy@email.com| 35|      Kolkata|
+-----------+-------------+--------------------+---+-------------+



In [14]:
# 11. Drop the region column from the sales DataFrame.
# df_sales itself is not modified; only the displayed copy lacks `region`.
sales_without_region = df_sales.drop("region")
sales_without_region.show()


+--------+-----------+-------+------+----------+
|sales_id|customer_id|product|amount| sale_date|
+--------+-----------+-------+------+----------+
|       1|        101| Laptop| 50000|2023-01-15|
|       2|        102| Mobile| 15000|2023-02-10|
|       3|        103| Tablet| 20000|2023-03-05|
|       4|        104| Laptop| 55000|2023-03-15|
|       5|        105|Desktop| 40000|2023-04-20|
|       6|        101| Mobile| 15000|2023-05-10|
|       7|        102| Laptop| 60000|2023-06-15|
|       8|        103| Tablet| 20000|2023-07-05|
|       9|        104|Desktop| 45000|2023-08-10|
|      10|        105| Laptop| 70000|2023-09-25|
+--------+-----------+-------+------+----------+



In [15]:
# 12. Create a new column customer_age_category in the customer DataFrame based on age:
# a. "Youth" for age < 30
# b. "Adult" for 30 <= age < 50
# c. "Senior" for age >= 50
from pyspark.sql.functions import col ,when

age = col("age")
# The three branches are mutually exclusive; there is no otherwise(), so a row
# matching none of them (e.g. a null age) gets a null category.
age_category = (
    when(age < 30, "Youth")
    .when(age >= 50, "Senior")
    .when((age >= 30) & (age < 50), "Adult")
)
df_customer.withColumn("customer_age_category", age_category).show()

+-----------+-------------+--------------------+---+-------------+---------------------+
|customer_id|customer_name|               email|age|customer_city|customer_age_category|
+-----------+-------------+--------------------+---+-------------+---------------------+
|        101|  Arun Sharma|arun.sharma@email...| 28|        Delhi|                Youth|
|        102|  Meena Verma|meena.verma@email...| 34|       Mumbai|                Adult|
|        103|  Rahul Yadav|rahul.yadav@email...| 30|    Bangalore|                Adult|
|        104|  Priya Patel|priya.patel@email...| 27|    Ahmedabad|                Youth|
|        105|  Sneha Reddy|sneha.reddy@email...| 29|    Hyderabad|                Youth|
|        106|   Vikas Jain|vikas.jain@email.com| 31|      Chennai|                Adult|
|        107|     Amit Roy|  amit.roy@email.com| 35|      Kolkata|                Adult|
+-----------+-------------+--------------------+---+-------------+---------------------+



FILTERING


In [16]:
# 13. Filter the sales DataFrame to show only rows where amount is greater than 50,000.
# The predicate is a SQL expression string; strictly greater, so 50000 itself is excluded.
df_sales.where("amount>50000").show()

+--------+-----------+-------+------+----------+------+
|sales_id|customer_id|product|amount| sale_date|region|
+--------+-----------+-------+------+----------+------+
|       4|        104| Laptop| 55000|2023-03-15|  East|
|       7|        102| Laptop| 60000|2023-06-15|  East|
|      10|        105| Laptop| 70000|2023-09-25| North|
+--------+-----------+-------+------+----------+------+



In [17]:
# 14. Filter the customer DataFrame to show customers aged between 25 and 30.
# Both bounds are inclusive.
df_customer.where("age>=25 and age<=30").show()

+-----------+-------------+--------------------+---+-------------+
|customer_id|customer_name|               email|age|customer_city|
+-----------+-------------+--------------------+---+-------------+
|        101|  Arun Sharma|arun.sharma@email...| 28|        Delhi|
|        103|  Rahul Yadav|rahul.yadav@email...| 30|    Bangalore|
|        104|  Priya Patel|priya.patel@email...| 27|    Ahmedabad|
|        105|  Sneha Reddy|sneha.reddy@email...| 29|    Hyderabad|
+-----------+-------------+--------------------+---+-------------+



In [18]:
# 15. Identify all customers who have made purchases in more than one region.
from pyspark.sql.functions import count_distinct

# Count the distinct regions each customer bought in, then keep those with at least two.
regions_per_customer = df_sales.groupBy("customer_id").agg(
    count_distinct(col("region")).alias("new_count")
)
regions_per_customer.filter(col("new_count") > 1).show()

+-----------+---------+
|customer_id|new_count|
+-----------+---------+
|        101|        2|
|        103|        2|
|        102|        2|
|        104|        2|
+-----------+---------+



In [19]:
# 16. Filter the top 3 sales based on amount for each product

from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank

# Rank sales within each product, highest amount first.
window = Window.partitionBy("product").orderBy(col("amount").desc())

# dense_rank gives tied amounts the same rank, so a product can contribute
# more than three rows when amounts repeat.
ranked_sales = df_sales.withColumn("top_sales", dense_rank().over(window))
new_df_topsales = ranked_sales.filter(col("top_sales") <= 3).orderBy("product")
new_df_topsales.show()

+--------+-----------+-------+------+----------+------+---------+
|sales_id|customer_id|product|amount| sale_date|region|top_sales|
+--------+-----------+-------+------+----------+------+---------+
|       9|        104|Desktop| 45000|2023-08-10|  West|        1|
|       5|        105|Desktop| 40000|2023-04-20| North|        2|
|      10|        105| Laptop| 70000|2023-09-25| North|        1|
|       7|        102| Laptop| 60000|2023-06-15|  East|        2|
|       4|        104| Laptop| 55000|2023-03-15|  East|        3|
|       2|        102| Mobile| 15000|2023-02-10| South|        1|
|       6|        101| Mobile| 15000|2023-05-10| South|        1|
|       3|        103| Tablet| 20000|2023-03-05|  West|        1|
|       8|        103| Tablet| 20000|2023-07-05| North|        1|
+--------+-----------+-------+------+----------+------+---------+



JOINS

In [20]:
# 17. Perform an inner join between sales and customer DataFrames on customer_id.
# Joining on an expression keeps both customer_id columns in the result.
same_customer = df_sales.customer_id == df_customer.customer_id
df_sales.join(df_customer, on=same_customer, how="inner").show()

+--------+-----------+-------+------+----------+------+-----------+-------------+--------------------+---+-------------+
|sales_id|customer_id|product|amount| sale_date|region|customer_id|customer_name|               email|age|customer_city|
+--------+-----------+-------+------+----------+------+-----------+-------------+--------------------+---+-------------+
|       1|        101| Laptop| 50000|2023-01-15| North|        101|  Arun Sharma|arun.sharma@email...| 28|        Delhi|
|       2|        102| Mobile| 15000|2023-02-10| South|        102|  Meena Verma|meena.verma@email...| 34|       Mumbai|
|       3|        103| Tablet| 20000|2023-03-05|  West|        103|  Rahul Yadav|rahul.yadav@email...| 30|    Bangalore|
|       4|        104| Laptop| 55000|2023-03-15|  East|        104|  Priya Patel|priya.patel@email...| 27|    Ahmedabad|
|       5|        105|Desktop| 40000|2023-04-20| North|        105|  Sneha Reddy|sneha.reddy@email...| 29|    Hyderabad|
|       6|        101| Mobile| 1

In [21]:
# 18. Perform a left join to include all records from sales and matching records from customer.
# Joining on an expression keeps both customer_id columns in the result.
same_customer = df_sales.customer_id == df_customer.customer_id
df_sales.join(df_customer, on=same_customer, how="left").show()

+--------+-----------+-------+------+----------+------+-----------+-------------+--------------------+---+-------------+
|sales_id|customer_id|product|amount| sale_date|region|customer_id|customer_name|               email|age|customer_city|
+--------+-----------+-------+------+----------+------+-----------+-------------+--------------------+---+-------------+
|       1|        101| Laptop| 50000|2023-01-15| North|        101|  Arun Sharma|arun.sharma@email...| 28|        Delhi|
|       2|        102| Mobile| 15000|2023-02-10| South|        102|  Meena Verma|meena.verma@email...| 34|       Mumbai|
|       3|        103| Tablet| 20000|2023-03-05|  West|        103|  Rahul Yadav|rahul.yadav@email...| 30|    Bangalore|
|       4|        104| Laptop| 55000|2023-03-15|  East|        104|  Priya Patel|priya.patel@email...| 27|    Ahmedabad|
|       5|        105|Desktop| 40000|2023-04-20| North|        105|  Sneha Reddy|sneha.reddy@email...| 29|    Hyderabad|
|       6|        101| Mobile| 1

In [22]:
# 19. Perform a full outer join between sales and customer DataFrames.
# Joining on an expression keeps both customer_id columns in the result.
same_customer = df_sales.customer_id == df_customer.customer_id
df_sales.join(df_customer, on=same_customer, how="full").show()

+--------+-----------+-------+------+----------+------+-----------+-------------+--------------------+---+-------------+
|sales_id|customer_id|product|amount| sale_date|region|customer_id|customer_name|               email|age|customer_city|
+--------+-----------+-------+------+----------+------+-----------+-------------+--------------------+---+-------------+
|       1|        101| Laptop| 50000|2023-01-15| North|        101|  Arun Sharma|arun.sharma@email...| 28|        Delhi|
|       6|        101| Mobile| 15000|2023-05-10| South|        101|  Arun Sharma|arun.sharma@email...| 28|        Delhi|
|       2|        102| Mobile| 15000|2023-02-10| South|        102|  Meena Verma|meena.verma@email...| 34|       Mumbai|
|       7|        102| Laptop| 60000|2023-06-15|  East|        102|  Meena Verma|meena.verma@email...| 34|       Mumbai|
|       3|        103| Tablet| 20000|2023-03-05|  West|        103|  Rahul Yadav|rahul.yadav@email...| 30|    Bangalore|
|       8|        103| Tablet| 2

In [23]:
# 20. Identify customers who have not made any purchases by performing an anti-join.
# A left anti join returns only the customer rows that have no matching sale.
same_customer = df_sales.customer_id == df_customer.customer_id
df_customer.join(df_sales, on=same_customer, how="leftanti").show()

+-----------+-------------+--------------------+---+-------------+
|customer_id|customer_name|               email|age|customer_city|
+-----------+-------------+--------------------+---+-------------+
|        106|   Vikas Jain|vikas.jain@email.com| 31|      Chennai|
|        107|     Amit Roy|  amit.roy@email.com| 35|      Kolkata|
+-----------+-------------+--------------------+---+-------------+



AGGREGATIONS

In [24]:
# 21. Calculate the total sales amount for each product.
# NOTE(review): this wildcard import rebinds Python built-ins such as sum, max and min
# to the Spark column functions for the rest of the notebook. Later cells rely on names
# it provides (avg, count, year, months_between, current_date), so it is kept as is.
from pyspark.sql.functions import *

product_totals = df_sales.groupBy("product").agg(sum("amount").alias("total_sales"))
product_totals.show()

+-------+-----------+
|product|total_sales|
+-------+-----------+
| Laptop|     235000|
| Mobile|      30000|
| Tablet|      40000|
|Desktop|      85000|
+-------+-----------+



In [25]:
# 22. Find the average age of customers in the customer DataFrame
# Whole-frame aggregate: a single row holding the mean of `age`.
df_customer.agg(avg("age").alias("average_age")).show()

+------------------+
|       average_age|
+------------------+
|30.571428571428573|
+------------------+



In [26]:
# 23. Calculate the maximum and minimum sales amounts in the sales DataFrame.
# max/min here are the Spark column functions (from the wildcard import), not the Python built-ins.
df_sales.agg(
    max("amount").alias("maximum_amt"),
    min("amount").alias("minimum_amt"),
).show()

+-----------+-----------+
|maximum_amt|minimum_amt|
+-----------+-----------+
|      70000|      15000|
+-----------+-----------+



In [27]:
# 24. Group the customer DataFrame by customer_city and count the number of customers in each city
# count("customer_id") counts the non-null customer ids in each city.
customers_per_city = df_customer.groupBy("customer_city").agg(
    count("customer_id").alias("no. of customers")
)
customers_per_city.show()

+-------------+----------------+
|customer_city|no. of customers|
+-------------+----------------+
|    Bangalore|               1|
|      Chennai|               1|
|       Mumbai|               1|
|    Ahmedabad|               1|
|      Kolkata|               1|
|        Delhi|               1|
|    Hyderabad|               1|
+-------------+----------------+



SORTING


In [28]:
# 25. Sort the sales DataFrame by amount in descending order
# Largest sale first.
df_sales.sort("amount", ascending=False).show()

+--------+-----------+-------+------+----------+------+
|sales_id|customer_id|product|amount| sale_date|region|
+--------+-----------+-------+------+----------+------+
|      10|        105| Laptop| 70000|2023-09-25| North|
|       7|        102| Laptop| 60000|2023-06-15|  East|
|       4|        104| Laptop| 55000|2023-03-15|  East|
|       1|        101| Laptop| 50000|2023-01-15| North|
|       9|        104|Desktop| 45000|2023-08-10|  West|
|       5|        105|Desktop| 40000|2023-04-20| North|
|       3|        103| Tablet| 20000|2023-03-05|  West|
|       8|        103| Tablet| 20000|2023-07-05| North|
|       2|        102| Mobile| 15000|2023-02-10| South|
|       6|        101| Mobile| 15000|2023-05-10| South|
+--------+-----------+-------+------+----------+------+



In [29]:
# 26. Sort the customer DataFrame by age in ascending order.
# Youngest customer first.
df_customer.sort(col("age").asc()).show()

+-----------+-------------+--------------------+---+-------------+
|customer_id|customer_name|               email|age|customer_city|
+-----------+-------------+--------------------+---+-------------+
|        104|  Priya Patel|priya.patel@email...| 27|    Ahmedabad|
|        101|  Arun Sharma|arun.sharma@email...| 28|        Delhi|
|        105|  Sneha Reddy|sneha.reddy@email...| 29|    Hyderabad|
|        103|  Rahul Yadav|rahul.yadav@email...| 30|    Bangalore|
|        106|   Vikas Jain|vikas.jain@email.com| 31|      Chennai|
|        102|  Meena Verma|meena.verma@email...| 34|       Mumbai|
|        107|     Amit Roy|  amit.roy@email.com| 35|      Kolkata|
+-----------+-------------+--------------------+---+-------------+



In [30]:
# Re-check the customer schema before building matching data below:
# the earlier rename means the last column is now `customer_city`.
df_customer.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- customer_city: string (nullable = true)



In [33]:
import pandas as pd

# Additional customer records (ids 108-115) that are read back into Spark further down.
# Each row is [customer_id, customer_name, email, age, customer_city]; some names and
# emails repeat under different ids.
new_customer_data = [
    [108, 'Pooja Joshi', 'pooja.joshi@email.com', 33, 'Chennai'],
    [109, 'Pooja Joshi', 'pooja.joshi@email.com', 35, 'Kolkata'],
    [110, 'Vikram Chauhan', 'vikram.chauhan@email.com', 34, 'Pune'],
    [111, 'Neha Gupta', 'neha.gupta@email.com', 35, 'Kolkata'],
    [112, 'Sunita Rao', 'sunita.rao@email.com', 28, 'Lucknow'],
    [113, 'Sunita Rao', 'sunita.rao@email.com', 33, 'Lucknow'],
    [114, 'Rakesh Bansal', 'rakesh.bansal@email.com', 28, 'Lucknow'],
    [115, 'Sunita Rao', 'sunita.rao@email.com', 25, 'Chennai']
]

# Column names mirror the customer DataFrame's schema after the city -> customer_city rename.
customer_columns = ['customer_id','customer_name','email','age','customer_city']
df = pd.DataFrame(data=new_customer_data, columns=customer_columns)

# Write a CSV with a header row and no pandas index column.
df.to_csv('new_customer_data.csv', index=False)

print("Data has been successfully saved to new_customer_data.csv")


Data has been successfully saved to new_customer_data.csv


In [34]:
# Additional sales records (ids 11-20) for the customers with ids 108-115.
# Each row is [sales_id, customer_id, product, amount, sale_date, region].
new_sales_data = [
    [11, 114, 'Tablet', 70000, '2023-09-13','North'],
    [12, 115, 'Mobile', 50000, '2023-11-16','West'],
    [13, 112, 'Tablet', 60000, '2023-01-04','South'],
    [14, 109, 'Desktop', 90000, '2023-03-08','East'],
    [15, 108, 'Laptop', 80000, '2023-06-25','East'],
    [16, 113, 'Tablet', 50000, '2023-01-18','North'],
    [17, 115, 'Desktop', 80000, '2023-02-05','North'],
    [18, 112, 'Laptop', 60000, '2023-07-24','South'],
    [19, 110, 'Mobile', 40000, '2023-07-08','West'],
    [20, 108, 'Tablet', 70000, '2023-03-18','East']
]

# Column names mirror the sales DataFrame's schema.
sales_columns = ['sales_id','customer_id','product','amount','sale_date','region']
df = pd.DataFrame(data=new_sales_data, columns=sales_columns)

# Write a CSV with a header row and no pandas index column.
df.to_csv('new_sales_data.csv', index=False)

print("Data has been successfully saved to new_sales_data.csv")

Data has been successfully saved to new_sales_data.csv


UNION OPERATIONS

In [35]:
# Both generated CSVs are read the same way: header row present, column types inferred.
csv_options = {"header": "true", "inferSchema": "true"}

new_customer_df = spark.read.options(**csv_options).csv("new_customer_data.csv")
new_customer_df.show(truncate=False)

new_sales_df = spark.read.options(**csv_options).csv("new_sales_data.csv")
new_sales_df.show(truncate=False)

+-----------+--------------+------------------------+---+-------------+
|customer_id|customer_name |email                   |age|customer_city|
+-----------+--------------+------------------------+---+-------------+
|108        |Pooja Joshi   |pooja.joshi@email.com   |33 |Chennai      |
|109        |Pooja Joshi   |pooja.joshi@email.com   |35 |Kolkata      |
|110        |Vikram Chauhan|vikram.chauhan@email.com|34 |Pune         |
|111        |Neha Gupta    |neha.gupta@email.com    |35 |Kolkata      |
|112        |Sunita Rao    |sunita.rao@email.com    |28 |Lucknow      |
|113        |Sunita Rao    |sunita.rao@email.com    |33 |Lucknow      |
|114        |Rakesh Bansal |rakesh.bansal@email.com |28 |Lucknow      |
|115        |Sunita Rao    |sunita.rao@email.com    |25 |Chennai      |
+-----------+--------------+------------------------+---+-------------+

+--------+-----------+-------+------+----------+------+
|sales_id|customer_id|product|amount|sale_date |region|
+--------+-----------+-

In [36]:
# Append the newly loaded customers to the existing customer DataFrame.
# unionByName matches columns by name. Plain union() matches them by position and
# would silently mix up fields if the two frames ever listed their columns in a
# different order; with a name mismatch unionByName fails loudly instead.
# Like union(), it keeps duplicate rows.
union_df_customer = df_customer.unionByName(new_customer_df)
union_df_customer.show()

+-----------+--------------+--------------------+---+-------------+
|customer_id| customer_name|               email|age|customer_city|
+-----------+--------------+--------------------+---+-------------+
|        101|   Arun Sharma|arun.sharma@email...| 28|        Delhi|
|        102|   Meena Verma|meena.verma@email...| 34|       Mumbai|
|        103|   Rahul Yadav|rahul.yadav@email...| 30|    Bangalore|
|        104|   Priya Patel|priya.patel@email...| 27|    Ahmedabad|
|        105|   Sneha Reddy|sneha.reddy@email...| 29|    Hyderabad|
|        106|    Vikas Jain|vikas.jain@email.com| 31|      Chennai|
|        107|      Amit Roy|  amit.roy@email.com| 35|      Kolkata|
|        108|   Pooja Joshi|pooja.joshi@email...| 33|      Chennai|
|        109|   Pooja Joshi|pooja.joshi@email...| 35|      Kolkata|
|        110|Vikram Chauhan|vikram.chauhan@em...| 34|         Pune|
|        111|    Neha Gupta|neha.gupta@email.com| 35|      Kolkata|
|        112|    Sunita Rao|sunita.rao@email.com

WINDOW FUNCTIONS

In [37]:
# 29. Rank the sales records based on the amount column.
from pyspark.sql.window import Window

# NOTE(review): no partitionBy, so the ranking is global across all sales rows.
# Ascending order: the smallest amount gets rank 1, and equal amounts share a rank.
window = Window.orderBy(col('amount').asc())
df_sales.select("*", dense_rank().over(window).alias("sales_rank")).show()

+--------+-----------+-------+------+----------+------+----------+
|sales_id|customer_id|product|amount| sale_date|region|sales_rank|
+--------+-----------+-------+------+----------+------+----------+
|       2|        102| Mobile| 15000|2023-02-10| South|         1|
|       6|        101| Mobile| 15000|2023-05-10| South|         1|
|       3|        103| Tablet| 20000|2023-03-05|  West|         2|
|       8|        103| Tablet| 20000|2023-07-05| North|         2|
|       5|        105|Desktop| 40000|2023-04-20| North|         3|
|       9|        104|Desktop| 45000|2023-08-10|  West|         4|
|       1|        101| Laptop| 50000|2023-01-15| North|         5|
|       4|        104| Laptop| 55000|2023-03-15|  East|         6|
|       7|        102| Laptop| 60000|2023-06-15|  East|         7|
|      10|        105| Laptop| 70000|2023-09-25| North|         8|
+--------+-----------+-------+------+----------+------+----------+



In [38]:
# 30. Add a cumulative sum of amount for each product in the sales DataFrame
from pyspark.sql.window import Window

# Running total per product, accumulated from the smallest sale upwards.
# sales_id is a secondary sort key: with a row-based frame, sales that tie on
# amount would otherwise be accumulated in an arbitrary order, so the running
# total shown against each tied row could change between runs.
# Window.currentRow is the named constant for the frame end (instead of a bare 0).
window = (
    Window.partitionBy("product")
    .orderBy("amount", "sales_id")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
# sum is pyspark.sql.functions.sum, brought into scope by the wildcard import in an earlier cell.
df_sales.withColumn("sum_amt", sum("amount").over(window)).show()

+--------+-----------+-------+------+----------+------+-------+
|sales_id|customer_id|product|amount| sale_date|region|sum_amt|
+--------+-----------+-------+------+----------+------+-------+
|       5|        105|Desktop| 40000|2023-04-20| North|  40000|
|       9|        104|Desktop| 45000|2023-08-10|  West|  85000|
|       1|        101| Laptop| 50000|2023-01-15| North|  50000|
|       4|        104| Laptop| 55000|2023-03-15|  East| 105000|
|       7|        102| Laptop| 60000|2023-06-15|  East| 165000|
|      10|        105| Laptop| 70000|2023-09-25| North| 235000|
|       2|        102| Mobile| 15000|2023-02-10| South|  15000|
|       6|        101| Mobile| 15000|2023-05-10| South|  30000|
|       3|        103| Tablet| 20000|2023-03-05|  West|  20000|
|       8|        103| Tablet| 20000|2023-07-05| North|  40000|
+--------+-----------+-------+------+----------+------+-------+



In [39]:
# 31. Add a column that calculates the difference between each customer's amount and the average amount within their product group.
# A window with a partition but no ordering covers the whole partition by default,
# so avg() sees every sale of the product. The previous orderBy plus explicit
# unbounded frame produced the same averages but forced a needless sort.
window = Window.partitionBy("product")
# Positive difference = sale above the product's average, negative = below.
df_sales.withColumn("difference", col("amount") - avg("amount").over(window)).show()

+--------+-----------+-------+------+----------+------+----------+
|sales_id|customer_id|product|amount| sale_date|region|difference|
+--------+-----------+-------+------+----------+------+----------+
|       5|        105|Desktop| 40000|2023-04-20| North|   -2500.0|
|       9|        104|Desktop| 45000|2023-08-10|  West|    2500.0|
|       1|        101| Laptop| 50000|2023-01-15| North|   -8750.0|
|       4|        104| Laptop| 55000|2023-03-15|  East|   -3750.0|
|       7|        102| Laptop| 60000|2023-06-15|  East|    1250.0|
|      10|        105| Laptop| 70000|2023-09-25| North|   11250.0|
|       2|        102| Mobile| 15000|2023-02-10| South|       0.0|
|       6|        101| Mobile| 15000|2023-05-10| South|       0.0|
|       3|        103| Tablet| 20000|2023-03-05|  West|       0.0|
|       8|        103| Tablet| 20000|2023-07-05| North|       0.0|
+--------+-----------+-------+------+----------+------+----------+



PARTITIONING

In [40]:
# 32. Write the sales DataFrame to a partitioned Parquet file by region.
# mode("overwrite") replaces any existing output at sales.parquet, so the cell can be re-run.
(
    df_sales.write
    .partitionBy("region")
    .mode("overwrite")
    .parquet("sales.parquet")
)

In [41]:
# 33. Partition the customer DataFrame by customer_city and save it as a CSV file.
# mode("overwrite") replaces any existing output at customers.csv, so the cell can be re-run.
# NOTE(review): no header option is set on the writer, unlike the readers in this
# notebook which all expect a header row — confirm this is intended.
(
    df_customer.write
    .partitionBy("customer_city")
    .mode("overwrite")
    .csv("customers.csv")
)

REAL-WORLD SCENARIOS

In [44]:
# 34. Calculate the percentage contribution of each product to the total sales.
# Grand total is pulled to the driver as a plain number, then used as the denominator.
total_sales = df_sales.agg(sum('amount')).first()[0]

# Each product's share of the grand total, in percent.
share_of_total = (sum('amount') * 100 / total_sales).alias('contribution_perc')
df_sales.groupBy('product').agg(share_of_total).show()

+-------+------------------+
|product| contribution_perc|
+-------+------------------+
| Laptop|60.256410256410255|
| Mobile|7.6923076923076925|
| Tablet|10.256410256410257|
|Desktop|21.794871794871796|
+-------+------------------+



In [45]:
# Extract the year from sale_date and group by year to calculate total sales.
sale_year = year('sale_date').alias('year')
yearly_total = sum('amount').alias('total_sales')
df_sales.groupBy(sale_year).agg(yearly_total).show()

+----+-----------+
|year|total_sales|
+----+-----------+
|2023|     390000|
+----+-----------+



In [46]:
# Identify the most purchased product in each region.
# Step 1: number of sales rows per (region, product) pair.
sales_new = df_sales.groupBy('region','product').agg(count('product').alias('prod_count'))

# Step 2: rank products inside each region by that count, most frequent first.
window5 = Window.partitionBy('region').orderBy(col('prod_count').desc())
ranked_products = sales_new.withColumn('most_purchased', dense_rank().over(window5))

# Step 3: keep rank 1. dense_rank keeps ties, so a region can list several products.
(
    ranked_products
    .where(col('most_purchased') == 1)
    .select('region', 'product')
    .show()
)

+------+-------+
|region|product|
+------+-------+
|  East| Laptop|
| North| Laptop|
| South| Mobile|
|  West|Desktop|
|  West| Tablet|
+------+-------+



In [53]:
# Add a column to show the difference between the highest and lowest sales for each product
# Queries the `sales` temp view registered earlier.
# NOTE(review): this returns one aggregated row per product rather than adding a
# column to every sales row.
product_spread_query = '''
        select product,max(amount)-min(amount) as difference
        from sales
        group by product
'''
spark.sql(product_spread_query).show()

+-------+----------+
|product|difference|
+-------+----------+
| Laptop|     20000|
| Mobile|         0|
| Tablet|         0|
|Desktop|      5000|
+-------+----------+



In [49]:
# Write the result of the join between sales and customer to parquet file.
# Inner join on customer_id; only the sales-side copy of the key is selected so
# the written output has a single customer_id column.
same_customer = df_sales['customer_id'] == df_customer['customer_id']
output_columns = [
    'sales_id', 'product', 'amount', 'sale_date', 'region',
    df_sales['customer_id'],
    'customer_name', 'email', 'age', 'customer_city',
]
sale_customer_joined = df_sales.join(df_customer, on=same_customer, how='inner').select(*output_columns)
sale_customer_joined.show()

# mode('overwrite') replaces any previous output so the cell can be re-run.
sale_customer_joined.write.mode('overwrite').parquet('sales_customer_joined.parquet')

+--------+-------+------+----------+------+-----------+-------------+--------------------+---+-------------+
|sales_id|product|amount| sale_date|region|customer_id|customer_name|               email|age|customer_city|
+--------+-------+------+----------+------+-----------+-------------+--------------------+---+-------------+
|       1| Laptop| 50000|2023-01-15| North|        101|  Arun Sharma|arun.sharma@email...| 28|        Delhi|
|       2| Mobile| 15000|2023-02-10| South|        102|  Meena Verma|meena.verma@email...| 34|       Mumbai|
|       3| Tablet| 20000|2023-03-05|  West|        103|  Rahul Yadav|rahul.yadav@email...| 30|    Bangalore|
|       4| Laptop| 55000|2023-03-15|  East|        104|  Priya Patel|priya.patel@email...| 27|    Ahmedabad|
|       5|Desktop| 40000|2023-04-20| North|        105|  Sneha Reddy|sneha.reddy@email...| 29|    Hyderabad|
|       6| Mobile| 15000|2023-05-10| South|        101|  Arun Sharma|arun.sharma@email...| 28|        Delhi|
|       7| Laptop| 

In [50]:
# Identify products that were sold in the last 6 months.
# months_diff is the age of each sale in months relative to today's date.
# The lower bound of 0 excludes future-dated sales: their months_diff is negative,
# so the previous `<= 6` test alone would have let them through.
# The result depends on the run date; the sample data is all from 2023, so the
# output is empty once the notebook runs more than six months after the latest sale.
df_sales.withColumn('months_diff', months_between(current_date(), col('sale_date')))\
    .filter(col('months_diff').between(0, 6))\
    .show()

+--------+-----------+-------+------+---------+------+-----------+
|sales_id|customer_id|product|amount|sale_date|region|months_diff|
+--------+-----------+-------+------+---------+------+-----------+
+--------+-----------+-------+------+---------+------+-----------+



In [54]:
# Calculate the average sales amount per customer.
# Queries the `sales` temp view registered earlier; results are ordered by customer_id.
avg_sale_query = '''
    select customer_id,avg(amount) as avg_sale
    from sales
    group by customer_id
    order by customer_id
'''
spark.sql(avg_sale_query).show()

+-----------+--------+
|customer_id|avg_sale|
+-----------+--------+
|        101| 32500.0|
|        102| 37500.0|
|        103| 20000.0|
|        104| 50000.0|
|        105| 55000.0|
+-----------+--------+

