<a href="https://colab.research.google.com/github/shubacca/pyspark-retail/blob/main/retail_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install necessary packages
!pip install faker pyspark

Collecting faker
  Downloading Faker-26.2.0-py3-none-any.whl.metadata (15 kB)
Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading Faker-26.2.0-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=31e742df5f474472184e49b4610ec3c89aa1802989c10107541349d7119fb8f7
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark, faker
Successfully installed faker-26.2.0 pyspark-3.5.1


In [3]:
import random
from datetime import datetime, timedelta

from faker import Faker
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, BooleanType, DateType

In [4]:
# Start (or reuse) a local Spark session that uses all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("RetailDataset")
    .getOrCreate()
)

In [10]:
# Shared Faker instance; the generate_* functions in this cell read it as a global.
fake = Faker()

def generate_customers(n):
    """Build ``n`` synthetic customer records.

    Each record is a dict with a sequential ``customer_id`` (0 .. n-1),
    Faker-generated ``first_name``, ``last_name`` and ``email`` values, and
    two random booleans: ``is_active`` and ``is_loyalty_member``.

    Args:
        n: Number of customers to generate.

    Returns:
        A list of ``n`` dicts, ordered by ``customer_id``.
    """
    return [
        {
            "customer_id": customer_id,
            "first_name": fake.first_name(),
            "last_name": fake.last_name(),
            "email": fake.email(),
            "is_active": fake.boolean(),
            "is_loyalty_member": fake.boolean(),
        }
        for customer_id in range(n)
    ]

def generate_products(n):
    """Build ``n`` synthetic product records.

    Each record is a dict with a sequential ``product_id`` (0 .. n-1), a
    random word as ``product_name``, a ``category`` picked from a fixed set
    of four, and a ``price`` drawn uniformly from [10, 1000] and rounded to
    two decimals.

    Args:
        n: Number of products to generate.

    Returns:
        A list of ``n`` dicts, ordered by ``product_id``.
    """
    categories = ("Electronics", "Clothing", "Food", "Books")
    return [
        {
            "product_id": product_id,
            "product_name": fake.word(),
            "category": fake.random_element(elements=categories),
            "price": round(random.uniform(10, 1000), 2),
        }
        for product_id in range(n)
    ]

def generate_sales(n, num_customers, num_products):
    """Build ``n`` synthetic sale records.

    Each record is a dict with a sequential ``sale_id`` (0 .. n-1), a
    ``customer_id`` in 0 .. num_customers-1, a ``product_id`` in
    0 .. num_products-1, a ``quantity`` in 1 .. 10, a ``total_amount`` drawn
    uniformly from [20, 2000] (rounded to two decimals, independent of the
    quantity and of any product price), and a ``sale_date`` between one year
    ago and today.

    Args:
        n: Number of sales to generate.
        num_customers: Number of customer ids to draw from.
        num_products: Number of product ids to draw from.

    Returns:
        A list of ``n`` dicts, ordered by ``sale_id``.
    """
    last_customer_id = num_customers - 1
    last_product_id = num_products - 1
    return [
        {
            "sale_id": sale_id,
            "customer_id": random.randint(0, last_customer_id),
            "product_id": random.randint(0, last_product_id),
            "quantity": random.randint(1, 10),
            "total_amount": round(random.uniform(20, 2000), 2),
            "sale_date": fake.date_between(start_date='-1y', end_date='today'),
        }
        for sale_id in range(n)
    ]


In [14]:
# Generate data
num_customers = 500
num_products = 50
num_sales = 10000

# Seed both randomness sources (the stdlib `random` module and Faker) so that
# re-running this cell reproduces the same dataset. Sale dates are generated
# relative to 'today', so they still shift with the day the notebook is run.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

customers_data = generate_customers(num_customers)
products_data = generate_products(num_products)
sales_data = generate_sales(num_sales, num_customers, num_products)

# Define schemas
# One explicit schema per table; every column is declared nullable (True).
customer_schema = (
    StructType()
    .add("customer_id", IntegerType(), True)
    .add("first_name", StringType(), True)
    .add("last_name", StringType(), True)
    .add("email", StringType(), True)
    .add("is_active", BooleanType(), True)
    .add("is_loyalty_member", BooleanType(), True)
)

product_schema = (
    StructType()
    .add("product_id", IntegerType(), True)
    .add("product_name", StringType(), True)
    .add("category", StringType(), True)
    .add("price", FloatType(), True)
)

sales_schema = (
    StructType()
    .add("sale_id", IntegerType(), True)
    .add("customer_id", IntegerType(), True)
    .add("product_id", IntegerType(), True)
    .add("quantity", IntegerType(), True)
    .add("total_amount", FloatType(), True)
    .add("sale_date", DateType(), True)
)

# Create Spark DataFrames
# Turn each list of generated dicts into a DataFrame with its explicit schema.
customers_df = spark.createDataFrame(customers_data, schema=customer_schema)
products_df = spark.createDataFrame(products_data, schema=product_schema)
sales_df = spark.createDataFrame(sales_data, schema=sales_schema)


In [15]:
# Preview the sales table (20 rows, long values truncated: the show() defaults).
sales_df.show(n=20, truncate=True)

+-------+-----------+----------+--------+------------+----------+
|sale_id|customer_id|product_id|quantity|total_amount| sale_date|
+-------+-----------+----------+--------+------------+----------+
|      0|        450|        38|       9|     1285.13|2023-09-15|
|      1|         17|        45|       4|     1953.44|2024-07-08|
|      2|        104|        38|       4|     1185.28|2024-03-20|
|      3|        481|         4|       8|      318.61|2024-02-18|
|      4|         83|        35|       1|      238.84|2024-03-08|
|      5|        478|        46|       6|      527.09|2023-12-25|
|      6|        403|        27|       2|      893.08|2024-01-03|
|      7|        198|        24|       5|      264.86|2024-05-24|
|      8|         62|        22|       5|      565.63|2024-02-06|
|      9|         20|        39|       3|      965.33|2023-10-13|
|     10|        347|        20|       2|     1464.17|2023-09-23|
|     11|        462|        10|      10|       515.0|2023-08-27|
|     12| 

In [16]:
# How would you select the first_name and last_name columns from the customers_df DataFrame using the col() function?
# col() builds a Column reference by name; it is imported in the notebook's import cell.
names = customers_df.select(col("first_name"), col("last_name"))
names.show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|    Martha|  Ramirez|
|   Tiffany|   Harris|
|   Tristan|   Benson|
| Elizabeth|   Herman|
|     Lance|     Wade|
|   Rebecca|  Jackson|
|      Dana|   Nguyen|
|      Tara|  Watkins|
|   Vanessa|   Garcia|
|  Jonathan|    White|
|    Thomas|  Collins|
|   Spencer| Thompson|
|    Daniel|Hernandez|
|    Deanna| Johnston|
|    Isabel|    Clark|
|    Dennis|    Casey|
|   Michael|     Wood|
|   Michael|  Sanchez|
|     Barry|    Perez|
|      Paul|     Cole|
+----------+---------+
only showing top 20 rows



In [19]:
# Using the expr() function, how would you create a new column in the sales_df DataFrame that calculates the total price as quantity * total_amount?
# expr() parses a SQL expression string into a Column; it is imported in the
# notebook's import cell, so this cell does not depend on any cell below it.
# The result is only displayed here; sales_df itself is not reassigned.
sales_df.withColumn('total_price', expr('quantity * total_amount')).show()

+-------+-----------+----------+--------+------------+----------+-----------+
|sale_id|customer_id|product_id|quantity|total_amount| sale_date|total_price|
+-------+-----------+----------+--------+------------+----------+-----------+
|      0|        450|        38|       9|     1285.13|2023-09-15|   11566.17|
|      1|         17|        45|       4|     1953.44|2024-07-08|    7813.76|
|      2|        104|        38|       4|     1185.28|2024-03-20|    4741.12|
|      3|        481|         4|       8|      318.61|2024-02-18|    2548.88|
|      4|         83|        35|       1|      238.84|2024-03-08|     238.84|
|      5|        478|        46|       6|      527.09|2023-12-25|    3162.54|
|      6|        403|        27|       2|      893.08|2024-01-03|    1786.16|
|      7|        198|        24|       5|      264.86|2024-05-24|  1324.2999|
|      8|         62|        22|       5|      565.63|2024-02-06|    2828.15|
|      9|         20|        39|       3|      965.33|2023-10-13

In [18]:
# Alternative to expr(): compute the same total_price column (quantity * total_amount)
# with Column arithmetic via col(), and keep the result on sales_df.
# withColumn replaces an existing column of the same name, so re-running this
# cell does not add a duplicate column.
sales_df = sales_df.withColumn("total_price", col('quantity') * col('total_amount'))
sales_df.show()

+-------+-----------+----------+--------+------------+----------+-----------+
|sale_id|customer_id|product_id|quantity|total_amount| sale_date|total_price|
+-------+-----------+----------+--------+------------+----------+-----------+
|      0|        450|        38|       9|     1285.13|2023-09-15|   11566.17|
|      1|         17|        45|       4|     1953.44|2024-07-08|    7813.76|
|      2|        104|        38|       4|     1185.28|2024-03-20|    4741.12|
|      3|        481|         4|       8|      318.61|2024-02-18|    2548.88|
|      4|         83|        35|       1|      238.84|2024-03-08|     238.84|
|      5|        478|        46|       6|      527.09|2023-12-25|    3162.54|
|      6|        403|        27|       2|      893.08|2024-01-03|    1786.16|
|      7|        198|        24|       5|      264.86|2024-05-24|  1324.2999|
|      8|         62|        22|       5|      565.63|2024-02-06|    2828.15|
|      9|         20|        39|       3|      965.33|2023-10-13