In [76]:
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql import SparkSession

In [77]:
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [78]:
salesDF = spark.read.format("csv") \
    .option("inferschema","true").option("header","true") \
    .load("sales.csv").withColumn("Sales Id", monotonically_increasing_id())

productsDF = spark.read.format("csv") \
    .option("inferschema","true").option("header","true").load("products.csv")

# Data Format:
salesDF.printSchema()
productsDF.printSchema()

root
 |-- Product Id: string (nullable = true)
 |-- Customer Id: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Sales Id: long (nullable = false)

root
 |-- Product Id: integer (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Product Price: double (nullable = true)



In [79]:
# Sneak peek:
salesDF.show(5)
productsDF.show()

# It's SQL'ing Time!
salesDF.createOrReplaceTempView("sales")
productsDF.createOrReplaceTempView("products")


joinQuery = """
select
    `Sales Id`,
    products.`Product Id`,
    `Product Name`,
    `Customer Id`,
    `Date`,
    `Location`,
    `Source`,
    `Quantity`,
    `Product Price`
from
    sales full outer join products on sales.`Product Id` = products.`Product Id`
"""
spark.sql(joinQuery).createOrReplaceTempView("completeTable")

+----------+-----------+-------------------+--------+----------+--------+--------+
|Product Id|Customer Id|               Date|Location|    Source|Quantity|Sales Id|
+----------+-----------+-------------------+--------+----------+--------+--------+
|         1|          A|2023-01-01 00:00:00|   India|    Swiggy|       1|       0|
|         2|          A|2022-01-01 00:00:00|   India|    Swiggy|       2|       1|
|         2|          A|2023-01-07 00:00:00|   India|    Swiggy|       3|       2|
|         3|          A|2023-01-10 00:00:00|   India|Restaurant|       1|       3|
|         3|          A|2022-01-11 00:00:00|   India|    Swiggy|       1|       4|
+----------+-----------+-------------------+--------+----------+--------+--------+
only showing top 5 rows

+----------+------------+-------------+
|Product Id|Product Name|Product Price|
+----------+------------+-------------+
|         1|       PIZZA|        100.0|
|         2|     Chowmin|        150.0|
|         3|    sandwich|   

In [80]:
# Anomalies:
query = """
select
    *
from
    completeTable
where
    `Sales Id` is null or
    `Product Id` is null or
    `Product Name` is null or
    `Customer Id` is null or
    `Date` is null or
    `Location` is null or
    `Source` is null or
    `Quantity` is null or
    `Product Price` is null
""" 
spark.sql(query).show()

+--------+----------+------------+-----------+----+--------+------+--------+-------------+
|Sales Id|Product Id|Product Name|Customer Id|Date|Location|Source|Quantity|Product Price|
+--------+----------+------------+-----------+----+--------+------+--------+-------------+
|     117|      NULL|        NULL|       NULL|NULL|    NULL|  NULL|    NULL|         NULL|
|    NULL|         7|     Boogers|       NULL|NULL|    NULL|  NULL|    NULL|        999.0|
+--------+----------+------------+-----------+----+--------+------+--------+-------------+



In [81]:
# Clean NULL record:
filter = """
select * from completeTable
where `Sales Id` is null or `Sales Id` != 117
"""
table = spark.sql(filter)

table.createOrReplaceTempView("completeTable")

In [82]:
# Q. Total amount spend by each customer:
query = """
select
    `Customer Id`,
    sum(ifnull(`Product Price`,0)*`Quantity`) as `Total Spent`,
    sum(`Quantity`) as `Items Bought`
from
    completeTable
where
    `Sales Id` is not null
group by
    `Customer Id`
order by
    `Total Spent` desc
"""
spark.sql(query).show()

+-----------+-----------+------------+
|Customer Id|Total Spent|Items Bought|
+-----------+-----------+------------+
|          B|    19440.0|         154|
|          E|    15630.0|         133|
|          A|    13830.0|         103|
|          C|     6560.0|          47|
|          D|     4280.0|          44|
+-----------+-----------+------------+



In [83]:
# Q. Total spend on each Product: (Assumption: Product Id 1-to-1 Product Name)
query = """
select
    `Product Name`,
    sum(ifnull(`Product Price`,0)*ifnull(`Quantity`,0)) as `Total Spent`,
    sum(ifnull(`Quantity`,0)) as `Units Bought`
from
    completeTable
where
    `Product Id` is not null
group by
    `Product Name`
order by
    `Total Spent` desc
"""
spark.sql(query).show()

+------------+-----------+------------+
|Product Name|Total Spent|Units Bought|
+------------+-----------+------------+
|    sandwich|    28560.0|         238|
|     Chowmin|    16350.0|         109|
|       PIZZA|     5600.0|          56|
|        Dosa|     3630.0|          33|
|       Pasta|     3600.0|          20|
|     Biryani|     2000.0|          25|
|     Boogers|        0.0|           0|
+------------+-----------+------------+



In [84]:
# Q. Total amount of sales in each month:
query = """
select
    year(`Date`) as Year,
    month(`Date`) as Month,
    sum(ifnull(`Product Price`,0)*Quantity) as `Total Spent`,
    sum(`Quantity`) as `Items Bought`
from
    completeTable
where
    `Date` is not null
group by
    year(`Date`), month(`Date`)
order by
    year(`Date`) asc, month(`Date`) asc
"""

spark.sql(query).show()

+----+-----+-----------+------------+
|Year|Month|Total Spent|Items Bought|
+----+-----+-----------+------------+
|2022|    1|     1860.0|          13|
|2022|    2|     6470.0|          55|
|2022|    3|      880.0|           7|
|2022|    5|     1890.0|          13|
|2022|    6|     2640.0|          18|
|2022|    7|      950.0|           7|
|2022|   11|     1560.0|          12|
|2023|    1|     6740.0|          50|
|2023|    2|    15680.0|         134|
|2023|    3|      880.0|           8|
|2023|    5|    10910.0|          81|
|2023|    6|     5590.0|          46|
|2023|    7|     1590.0|          15|
|2023|   11|     2100.0|          22|
+----+-----+-----------+------------+



In [85]:
# Q. Yearly Sales:
query = """
select
    year(`Date`) as Year,
    sum(ifnull(`Product Price`,0)*Quantity) as `Total Spent`,
    sum(`Quantity`) as `Items Bought`
from
    completeTable
where
    `Date` is not null
group by
    year(`Date`)
order by
    year(`Date`) asc
"""

spark.sql(query).show()

+----+-----------+------------+
|Year|Total Spent|Items Bought|
+----+-----------+------------+
|2022|    16250.0|         125|
|2023|    43490.0|         356|
+----+-----------+------------+



In [86]:
# Q. Quarterly Sales:
query = """
select
    year(`Date`) as Year,
    round((month(`Date`)-1)/4,0)+1 as Quarter,
    sum(ifnull(`Product Price`,0)*Quantity) as `Total Spent`,
    sum(Quantity) as `Items Bought`
from
    completeTable
where
    `Date` is not null
group by
    year(`Date`), round((month(`Date`)-1)/4,0)+1
order by
    year(`Date`), Quarter
"""

spark.sql(query).show()

+----+-------+-----------+------------+
|Year|Quarter|Total Spent|Items Bought|
+----+-------+-----------+------------+
|2022|    1.0|     8330.0|          68|
|2022|    2.0|     5410.0|          38|
|2022|    3.0|      950.0|           7|
|2022|    4.0|     1560.0|          12|
|2023|    1.0|    22420.0|         184|
|2023|    2.0|    17380.0|         135|
|2023|    3.0|     1590.0|          15|
|2023|    4.0|     2100.0|          22|
+----+-------+-----------+------------+



In [87]:
# Q. Total number of orders by each category: 
query = """
select
    `Product Id`,
    `Product Name`,
    count(`Sales Id`) as `Orders Placed`
from
    completeTable
where
    `Product Id` is not null
group by
    `Product Id`,`Product Name`
"""
spark.sql(query).show()

+----------+------------+-------------+
|Product Id|Product Name|Orders Placed|
+----------+------------+-------------+
|         5|     Biryani|            6|
|         3|    sandwich|           48|
|         4|        Dosa|           12|
|         2|     Chowmin|           24|
|         1|       PIZZA|           21|
|         6|       Pasta|            6|
|         7|     Boogers|            0|
+----------+------------+-------------+



In [94]:
# Q. Top 5 ordered items:
query = """
select
    `Product Id`,
    `Product Name`,
    ifnull(sum(Quantity),0) as `Items Bought`,
    count(`Sales Id`) as Orders
from
    completeTable
where
    `Product Id` is not null
group by
    `Product Id`, `Product Name`
order by
    `Items Bought` desc
limit
    5
"""
spark.sql(query).show()

+----------+------------+------------+------+
|Product Id|Product Name|Items Bought|Orders|
+----------+------------+------------+------+
|         3|    sandwich|         238|    48|
|         2|     Chowmin|         109|    24|
|         1|       PIZZA|          56|    21|
|         4|        Dosa|          33|    12|
|         5|     Biryani|          25|     6|
+----------+------------+------------+------+



In [95]:
# Q. Frequency of Customer visit:
query = """
select
    `Customer Id`,
    count(`Sales Id`) as `Frequency of Purchases`
from
    completeTable
where
    `Customer Id` is not null
group by
    `Customer Id`
"""
spark.sql(query).show()

+-----------+----------------------+
|Customer Id|Frequency of Purchases|
+-----------+----------------------+
|          E|                    18|
|          B|                    36|
|          D|                    12|
|          C|                    18|
|          A|                    33|
+-----------+----------------------+



In [32]:
# Q. Total sales by each country:
query = """
select
    A.`Location`,
    sum(ifnull(B.`Product Price`,0)*`Quantity`) as `Country Sales Amount`
from
    sales as A left outer join products as B on A.`Product Id` = B.`Product Id`
where
    A.`Location` is not null
group by
     A.`Location`
"""
spark.sql(query).show()

+--------+--------------------+
|Location|Country Sales Amount|
+--------+--------------------+
|   India|               19600|
|     USA|                7310|
|      UK|               32830|
+--------+--------------------+



In [176]:
# Q. Total sales by order source:
query = """
select
    `Source`,
    sum(ifnull(`Product Price`,0)*`Quantity`) as `Country Sales Amount`,
    count(`Sales Id`) as Orders
from
    completeTable
where
    `Source` is not null
group by
     `Source`
"""

spark.sql(query).show()

+----------+--------------------+------+
|    Source|Country Sales Amount|Orders|
+----------+--------------------+------+
|    zomato|             20900.0|    39|
|    Swiggy|             20260.0|    51|
|Restaurant|             18580.0|    27|
+----------+--------------------+------+

