## Install PySpark and Java

In [None]:
# Install a Java runtime and fetch the Spark distribution used by this notebook.
# (-qq plus the /dev/null redirect keep apt-get's output out of the cell.)

!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download the Spark 3.1.2 build for Hadoop 3.2 from the Apache archive.
# If the version changes, update the tar command below and SPARK_HOME as well.
!wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz

# Unpack the archive into the current working directory
!tar xf spark-3.1.2-bin-hadoop3.2.tgz



In [None]:
# NOTE(review): openpyxl is not imported by any cell visible here - confirm it is
# still needed, and consider pinning a version so the environment is reproducible.
! pip install openpyxl



## Set up the Environment Variables

In [None]:
# Tell Spark/PySpark where the JDK and the unpacked Spark distribution live.

import os

os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.2-bin-hadoop3.2",
    }
)


In [None]:
# Install the PySpark package pinned to 3.1.2, the same release as the Spark
# distribution that SPARK_HOME points to.
!pip install -q pyspark==3.1.2


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.4/212.4 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.6/198.6 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
# Sanity check: report which PySpark release the kernel actually imports.
import pyspark
print("PySpark version:", pyspark.__version__)


PySpark version: 3.1.2


## Creating a Spark Session

In [None]:
from pyspark.sql import SparkSession

# Single Spark session for the whole notebook, running locally ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("WalmartSalesDataPipeline")
    .getOrCreate()
)


## Load the Data

In [None]:
# Load the data into DataFrames
#customers_df = spark.read.csv("/content/customers.tsv", header=True, inferSchema=True)



In [None]:
from pyspark.sql.types import StructType, StructField, StringType

# The Spark session created earlier in the notebook (`spark`) is reused here.
# Calling SparkSession.builder...getOrCreate() again with another appName would
# not start a new session - it returns the one that already exists.

# The file is read with header=False, so the column names are supplied
# explicitly. Every field is loaded as a nullable string; nothing is cast here.
schema = StructType([
    StructField("Customer ID", StringType(), True),
    StructField("Name", StringType(), True),
    StructField("City", StringType(), True),
    StructField("State", StringType(), True),
    StructField("Zip Code", StringType(), True)
])

# Read the tab-separated customer file using the schema above
file_path = "/content/customers.tsv"
customers_df = spark.read.option("delimiter", "\t").csv(file_path, schema=schema, header=False)


In [None]:
# Preview the customers DataFrame (show() prints the first 20 rows by default)
customers_df.show()

+-----------+----------------+-------------+-----+--------+
|Customer ID|            Name|         City|State|Zip Code|
+-----------+----------------+-------------+-----+--------+
|      11039|     Mary Torres|       Caguas|   PR|     725|
|       5623|      Jose Haley|     Columbus|   OH|   43207|
|       5829|      Mary Smith|      Houston|   TX|   77015|
|       6336|  Richard Maddox|       Caguas|   PR|     725|
|       1708|  Margaret Booth|    Arlington|   TX|   76010|
|      10227|  Mary Henderson|       Caguas|   PR|     725|
|        839|     Lisa Walker|       Caguas|   PR|     725|
|       7604|   Jonathan Hill|      Phoenix|   AZ|   85040|
|       6485|Carolyn Sheppard|Pompano Beach|   FL|   33063|
|       4737|    Mary Mendoza|       Caguas|   PR|     725|
|       5973|   Michael Smith|       Caguas|   PR|     725|
|       9205|    James Holmes|     Hilliard|   OH|   43026|
|        138|     Mary Dawson|       Caguas|   PR|     725|
|        371|    Adam Marquez|  San Anto

In [None]:
#sales_df = spark.read.csv("/content/salestxns.tsv", header=True, inferSchema=True)

In [None]:
from pyspark.sql.types import StructType, StructField, StringType

# The Spark session created earlier in the notebook (`spark`) is reused here.
# Calling SparkSession.builder...getOrCreate() again with another appName would
# not start a new session - it returns the one that already exists.

# The file is read with header=False, so the column names are supplied
# explicitly. Every field is loaded as a nullable string at this point.
# NOTE(review): "Category  ID" and "Product  ID" contain two spaces. They are
# kept exactly as they were because other cells may select them by name -
# confirm before normalising.
schema = StructType([
    StructField("Sales Txn ID", StringType(), True),
    StructField("Category  ID", StringType(), True),
    StructField("Category Name", StringType(), True),
    StructField("Product  ID", StringType(), True),
    StructField("Product Name", StringType(), True),
    StructField("Price", StringType(), True),
    StructField("Quantity", StringType(), True),
    StructField("Customer ID", StringType(), True)
])

# Read the tab-separated sales transactions using the schema above
file_path = "/content/salestxns.tsv"
sales_df = spark.read.option("delimiter", "\t").csv(file_path, schema=schema, header=False)

In [None]:
# Preview the sales DataFrame (show() prints the first 20 rows by default)
sales_df.show()

+------------+------------+--------------------+-----------+--------------------+------+--------+-----------+
|Sales Txn ID|Category  ID|       Category Name|Product  ID|        Product Name| Price|Quantity|Customer ID|
+------------+------------+--------------------+-----------+--------------------+------+--------+-----------+
|           1|          43|    Camping & Hiking|        957|Diamondback Women...|299.98|       1|      11599|
|           2|          48|        Water Sports|       1073|Pelican Sunstream...|199.99|       1|        256|
|           3|          24|     Women's Apparel|        502|Nike Men's Dri-FI...|    50|       5|        256|
|           4|          18|      Men's Footwear|        403|Nike Men's CJ Eli...|129.99|       1|        256|
|           5|          40|         Accessories|        897|Team Golf New Eng...| 24.99|       2|       8827|
|           6|          17|              Cleats|        365|Perfect Fitness P...| 59.99|       5|       8827|
|         

## Data Preprocessing ✅

#### Convert necessary columns to appropriate data types.

In [None]:
from pyspark.sql.types import DoubleType, FloatType, IntegerType

# Price and Quantity are loaded as strings; give them numeric types so they can
# be multiplied and aggregated.
# Price is cast to DoubleType rather than FloatType: FloatType is 4-byte single
# precision, which cannot hold values such as 299.98 accurately and would skew
# every revenue figure derived from it.
# (FloatType stays imported only in case other cells rely on the name.)
sales_df = (
    sales_df
    .withColumn("Price", sales_df["Price"].cast(DoubleType()))
    .withColumn("Quantity", sales_df["Quantity"].cast(IntegerType()))
)


## SQL Queries

#### 1. Total Number of Customers:
How many unique customers are there in the dataset?


In [None]:
# Number of distinct "Customer ID" values in the customers table
distinct_ids = customers_df.select("Customer ID").distinct()
unique_customers = distinct_ids.count()
print("Total number of unique customers:", unique_customers)


Total number of unique customers: 1244


#### 2. Total Sales by State:
What is the total sales amount for each state?

In [None]:
from pyspark.sql import functions as F

# Full outer join on "Customer ID": sales rows without a matching customer are
# kept (their State is null), and so are customers without any sales.
joined_df = sales_df.join(customers_df, on="Customer ID", how="outer")

# Revenue per row is Price * Quantity; sum it per state.
total_sales_by_state = (
    joined_df
    .withColumn("Total_Price", F.col("Price") * F.col("Quantity"))
    .groupBy("State")
    .agg(F.sum("Total_Price").alias("Total_Sales"))
)

total_sales_by_state.show()


+-----+--------------------+
|State|         Total_Sales|
+-----+--------------------+
|   AZ|            48702.68|
|   SC|             4144.68|
|   LA|            24449.42|
|   MN|  3549.6000000000004|
|   NJ|  52303.090000000004|
|   DC|             8798.76|
|   OR|             9544.78|
| null|3.0858028790000077E7|
|   VA|  30488.970000000005|
|   RI|   5424.410000000001|
|   KY|              2749.7|
|   MI|            83347.09|
|   NV|   47103.60999999999|
|   WI|             24561.3|
|   ID|  10098.949999999997|
|   CA|   503205.4899999998|
|   CT|  19206.769999999997|
|   NC|   45275.88999999999|
|   MD|  51982.490000000005|
|   DE|             1305.76|
+-----+--------------------+
only showing top 20 rows



#### 3. Top 10 Most Purchased Products:
Which are the top 10 most purchased products based on the quantity sold?

In [None]:
# Sum the quantity sold per product name and keep the ten largest totals.
top_products = (
    sales_df
    .groupBy("Product Name")
    .agg(F.sum("Quantity").alias("Total_Quantity"))
    .orderBy(F.col("Total_Quantity").desc())
    .limit(10)
)

top_products.show()


+--------------------+--------------+
|        Product Name|Total_Quantity|
+--------------------+--------------+
|Perfect Fitness P...|       73698.0|
|Nike Men's Dri-FI...|       62956.0|
|O'Brien Men's Neo...|       57803.0|
|Nike Men's Free 5...|       36680.0|
|Under Armour Girl...|       31735.0|
|Nike Men's CJ Eli...|       22246.0|
|Field & Stream Sp...|       17325.0|
|Pelican Sunstream...|       15500.0|
|Diamondback Women...|       13729.0|
|ENO Atlas Hammock...|         998.0|
+--------------------+--------------+



#### 4. Average Transaction Value:
What is the average value (price × quantity) of a transaction across all sales?


In [None]:
# Value of a transaction = Price * Quantity; average it over every sales row.
avg_transaction_value = (
    sales_df
    .withColumn("Total_Price", F.col("Price") * F.col("Quantity"))
    .agg(F.avg("Total_Price").alias("Avg_Transaction_Value"))
)

avg_transaction_value.show()


+---------------------+
|Avg_Transaction_Value|
+---------------------+
|   199.32066533882224|
+---------------------+



#### 5. Top 5 Customers by Expenditure:
Who are the top 5 customers by total amount spent?

In [None]:
# Total spend per customer, highest first, top five only.
# Name is null for customer IDs that have no row in the customers table.
top_customers = (
    joined_df
    .withColumn("Total_Price", F.col("Price") * F.col("Quantity"))
    .groupBy("Customer ID", "Name")
    .agg(F.sum("Total_Price").alias("Total_Expenditure"))
    .orderBy(F.col("Total_Expenditure").desc())
    .limit(5)
)

top_customers.show()


+-----------+--------------+------------------+
|Customer ID|          Name| Total_Expenditure|
+-----------+--------------+------------------+
|        791|          null|10524.169999999993|
|       9371|Mary Patterson| 9299.029999999997|
|       8766|          null| 9296.139999999998|
|       1657|          null| 9223.709999999994|
|       2641|          null| 9130.919999999995|
+-----------+--------------+------------------+



#### 6. Product Purchases by a Specific Customer:
List all products purchased by a specific customer (e.g., customer with ID 256)

In [None]:
customer_id = 256  # Example Customer ID

# Every sales row of that customer, with the amount spent on each row.
# NOTE(review): "Customer ID" is a string column while customer_id is an int;
# the comparison presumably relies on Spark's implicit cast - confirm.
customer_purchases = (
    sales_df
    .where(sales_df["Customer ID"] == customer_id)
    .select(
        "Product Name",
        "Quantity",
        (F.col("Price") * F.col("Quantity")).alias("Total_Spent"),
    )
)

customer_purchases.show()




+--------------------+--------+------------------+
|        Product Name|Quantity|       Total_Spent|
+--------------------+--------+------------------+
|Pelican Sunstream...|       1|            199.99|
|Nike Men's Dri-FI...|       5|             250.0|
|Nike Men's CJ Eli...|       1|            129.99|
|Team Golf St. Lou...|       5|124.94999999999999|
|TYR Boys' Team Di...|       5|199.95000000000002|
|Field & Stream Sp...|       1|            399.98|
|Field & Stream Sp...|       1|            399.98|
|Nike Men's Dri-FI...|       5|             250.0|
|Nike Men's CJ Eli...|       1|            129.99|
|Nike Men's CJ Eli...|       1|            129.99|
|Perfect Fitness P...|       5|            299.95|
|O'Brien Men's Neo...|       5|249.89999999999998|
|Nike Men's CJ Eli...|       1|            129.99|
|O'Brien Men's Neo...|       4|            199.92|
|Under Armour Wome...|       1|             54.97|
|Nike Women's Temp...|       4|             120.0|
|Nike Men's Dri-FI...|       1|

#### 7. Monthly Sales Trends:
Assuming there is a date field, analyze the sales trends over the months. Which month had the highest sales?


- The `sales_df` dataset has no date or month column, so monthly sales cannot be derived from it directly.

- Because `sales_df` has no date field, we need to add a date column manually.

- We generate a random date within a specified range for each transaction and store it in a new `Transaction_Date` column. Because these dates are synthetic, the resulting monthly trend is illustrative only.


So what do we need in order to get the desired output? ☝


We can implement three things here:


1) **Define the date range**: choose a start and an end date.

2) **Generate random dates** within that range for each row.

3) **Extract the month** from the random dates for monthly analysis.



In [None]:
import random
from pyspark.sql import functions as F
from pyspark.sql.types import DateType
from datetime import date, timedelta
# NOTE: `random`, `DateType` and `timedelta` are no longer used by this cell;
# the imports are kept only in case other cells rely on them.

# Inclusive date range the synthetic transaction dates are drawn from
start_date = date(2023, 1, 1)
end_date = date(2023, 12, 31)

# Fixed seed so the synthetic dates (and therefore the monthly ranking) can be
# reproduced, as long as the input data and its partitioning do not change.
RANDOM_SEED = 42

# Number of calendar days in the inclusive range (365 for the range above)
days_in_range = (end_date - start_date).days + 1

# A Python UDF wrapping random.randint is avoided on purpose: Spark treats UDFs
# as deterministic by default, so it may call them more or fewer times than
# expected, and an unseeded UDF yields different dates on every action.
# The built-in rand(seed) is evaluated inside Spark instead.
# rand() is in [0, 1), so FLOOR(rand * days_in_range) is an offset in
# 0 .. days_in_range - 1, i.e. every day from start_date to end_date inclusive.
random_date_expr = F.expr(
    f"date_add(DATE '{start_date.isoformat()}', "
    f"CAST(FLOOR(rand({RANDOM_SEED}) * {days_in_range}) AS INT))"
)

# Add the synthetic Transaction_Date column and derive its month number (1-12)
sales_df = sales_df.withColumn("Transaction_Date", random_date_expr)
sales_df = sales_df.withColumn("Month", F.month("Transaction_Date"))

# Revenue per month, highest first
monthly_sales = (
    sales_df
    .withColumn("Total_Price", F.col("Price") * F.col("Quantity"))
    .groupBy("Month")
    .agg(F.sum("Total_Price").alias("Monthly_Sales"))
    .orderBy(F.desc("Monthly_Sales"))
)

# Show the result
monthly_sales.show()


+-----+------------------+
|Month|     Monthly_Sales|
+-----+------------------+
|    7| 2940069.659999812|
|    8|  2926173.94999981|
|    1| 2924858.619999815|
|   10|2910291.5399998133|
|    5|2897295.5099998154|
|    3| 2875720.519999815|
|   12|2860295.8299998157|
|   11| 2848909.149999821|
|    9|2847548.4699998177|
|    4|  2831549.59999982|
|    6|  2826857.76999982|
|    2|2633049.3099998385|
+-----+------------------+



#### 8. Category with Highest Sales:
Which product category generated the highest total sales revenue?


In [None]:
# Revenue per category (Price * Quantity summed); keep only the top category.
category_sales = (
    sales_df
    .withColumn("Total_Price", F.col("Price") * F.col("Quantity"))
    .groupBy("Category Name")
    .agg(F.sum("Total_Price").alias("Total_Category_Sales"))
    .orderBy(F.col("Total_Category_Sales").desc())
    .limit(1)
)

category_sales.show()


+-------------+--------------------+
|Category Name|Total_Category_Sales|
+-------------+--------------------+
|      Fishing|   6929653.500000114|
+-------------+--------------------+



#### 9. State-wise Sales Comparison:
Compare the total sales between two specific states (e.g., Texas vs. Ohio). Which state had higher sales?

In [None]:
states_to_compare = ["TX", "OH"]  # Example states

# Restrict the joined data to the chosen states, then total the revenue
# (Price * Quantity) per state, largest first.
state_comparison = (
    joined_df
    .where(joined_df["State"].isin(states_to_compare))
    .withColumn("Total_Price", F.col("Price") * F.col("Quantity"))
    .groupBy("State")
    .agg(F.sum("Total_Price").alias("Total_Sales"))
    .orderBy(F.col("Total_Sales").desc())
)

state_comparison.show()


+-----+------------------+
|State|       Total_Sales|
+-----+------------------+
|   TX|184629.30000000045|
|   OH| 82342.95000000014|
+-----+------------------+



#### 10. Detailed Customer Purchase Report:
Generate a detailed report showing each customer along with their total purchases, the total number of transactions they have made, and the average transaction value.


In [None]:
# Per-customer report: total spend, number of transactions and average
# transaction value.
# The value of a transaction is Price * Quantity, so the average is taken over
# Total_Price - averaging the unit Price would ignore the quantity bought and
# would not agree with Total_Purchases / Total_Transactions.
# count("Sales Txn ID") counts non-null IDs only, so customers that appear in
# the joined data without any sale are reported with 0 transactions.
customer_report = (
    joined_df
    .withColumn("Total_Price", F.col("Price") * F.col("Quantity"))
    .groupBy("Customer ID", "Name")
    .agg(
        F.sum("Total_Price").alias("Total_Purchases"),
        F.count("Sales Txn ID").alias("Total_Transactions"),
        F.avg("Total_Price").alias("Avg_Transaction_Value"),
    )
)

customer_report.show()


+-----------+----+------------------+------------------+---------------------+
|Customer ID|Name|   Total_Purchases|Total_Transactions|Avg_Transaction_Value|
+-----------+----+------------------+------------------+---------------------+
|      10096|null|           2211.76|                11|   158.53272727272727|
|      10351|null| 8339.259999999998|                28|    212.4871428571428|
|      10436|null|2484.6299999999997|                16|   112.80125000000001|
|       1090|null|1387.8100000000002|                 7|   141.70285714285714|
|      11078|null| 919.9000000000001|                 4|              84.9925|
|      11332|null|3295.6399999999994|                22|   108.71818181818183|
|      11563|null|2619.7100000000005|                16|   118.11562500000001|
|       1159|null|3567.6500000000005|                19|   139.25052631578947|
|      11722|null|           1529.81|                 9|    71.65777777777778|
|      12394|null|3882.5199999999995|               