In [None]:
from pyspark.sql import SparkSession
# NOTE(review): TimeType is not used below and may not exist in older PySpark releases — confirm this import succeeds on the target Spark version.
from pyspark.sql.types import StructType, StructField, StringType, FloatType, DoubleType, IntegerType, DateType, TimeType
from pyspark.sql import Window as W
import pyspark.sql.functions as F

# Obtain the Spark session used by every later cell; the application is
# registered under the name "WalmartSales".
spark = (
    SparkSession.builder
    .appName("WalmartSales")
    .getOrCreate()
)
spark  # bare last expression so the notebook renders the session summary

In [None]:
# Location of the Walmart sales CSV that this notebook analyses.
WALMART_CSV_PATH = "/data/Walmart-Sales-dataset.csv"

# Explicit schema for the CSV, so no inference pass over the file is needed.
# Fractional columns use DoubleType rather than FloatType: 32-bit floats cannot
# represent typical price/VAT values exactly enough, and the representation
# error becomes visible in the SUM/AVG aggregations the later questions ask for.
schema = StructType([
    StructField("invoice_id", StringType(), True),
    StructField("branch", StringType(), True),
    StructField("city", StringType(), True),
    StructField("customer_type", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("product_line", StringType(), True),
    StructField("unit_price", DoubleType(), True),
    StructField("quantity", IntegerType(), True),
    StructField("vat", DoubleType(), True),
    StructField("total", DoubleType(), True),
    # NOTE(review): no dateFormat is passed to the reader, so this relies on the
    # reader's default date pattern — confirm the file's dates match it,
    # otherwise the column may come back null.
    StructField("date", DateType(), True),
    # Read as a plain string: TimeType may not be supported by the Spark version in use.
    StructField("time", StringType(), True),
    StructField("payment_method", StringType(), True),
    StructField("rating", DoubleType(), True),
])

# Load the CSV (first line is a header row) using the schema above.
df = spark.read.csv(WALMART_CSV_PATH, header=True, schema=schema)

# Expose the DataFrame to Spark SQL as `walmart_sales` and preview ten rows.
df.createOrReplaceTempView("walmart_sales")
spark.sql("SELECT * FROM walmart_sales LIMIT 10").show()

In [None]:
# -- ---------------------------------------------
# -- Business Problems :: Basic Level
# -- ---------------------------------------------
# Q.1 Find the total sales amount for each branch.
# Q.2 Calculate the average customer rating for each city.
# Q.3 Count the number of sales transactions for each customer type.
# Q.4 Find the total quantity of products sold for each product line.
# Q.4b Calculate the total VAT collected for each payment method.


In [None]:
# -- ---------------------------------------------
# -- Business Problems :: Medium Level
# -- ---------------------------------------------
# Q.5 Find the total sales amount and average customer rating for each branch.
# Q.6 Calculate the total sales amount for each city and gender combination.
# Q.7 Find the average quantity of products sold for each product line to female customers.
# Q.8 Count the number of sales transactions for members in each branch.
# Q.9 Find the total sales amount for each day. (Return each day name and its total sales, ordered by amount descending.)

In [None]:
# -- ---------------------------------------------
# -- Business Problems :: Advanced Level
# -- ---------------------------------------------
# Q.10 Calculate the total sales amount for each hour of the day.
# Q.11 Find the total sales amount for each month. (Return each month name and its total sales.)
# Q.12 Calculate the total sales amount for each branch where the average customer rating is greater than 8.
# Q.13 Find the total VAT collected for each product line where the total sales amount is more than 500.
# Q.14 Calculate the average sales amount for each gender in each branch.
# Q.15 Count the number of sales transactions for each day of the week.
# Q.16 Find the total sales amount for each city and customer type combination where the number of sales transactions is greater than 50.
# Q.17 Calculate the average unit price for each product line and payment method combination.
# Q.18 Find the total sales amount for each branch and hour of the day combination.
# Q.19 Calculate the total sales amount and average customer rating for each product line where the total sales amount is greater than 1000.
# Q.20 Calculate the total sales amount for the morning (6 AM to 12 PM), afternoon (12 PM to 6 PM), and evening (6 PM to 12 AM) periods, based on the time column.