In [1]:
# ===== SALES ANALYSIS PROJECT =====

# Part II: SQL Queries for Analysis

In [2]:
# Import libraries

import pandas as pd
import os
from sqlalchemy import create_engine
from sqlalchemy import text

# Fetch environment variables
username = os.getenv('MYSQL_USER')
password = os.getenv('MYSQL_PASSWORD')
host = os.getenv('MYSQL_HOST')
database = os.getenv('MYSQL_DB')

# Create sqlalchemy engine
engine = create_engine(f'mysql+pymysql://{username}:{password}@{host}/{database}')

# Formats all floats to two decimal places
pd.options.display.float_format = '{:,.2f}'.format 

In [3]:
# Test Query: Retrieve column names and sample rows

query = "SELECT * FROM sales_data LIMIT 5;"
sample_data = pd.read_sql(query, engine)
sample_data.head()

Unnamed: 0,row_id,order_id,product,quantity_ordered,price_each,order_date,street,city,state,zip,total_price
0,1,176558.0,USB-C Charging Cable,2.0,11.95,2019-04-19 08:46:00,917 1st St,Dallas,TX,75001,23.9
1,2,176559.0,Bose SoundSport Headphones,1.0,99.99,2019-04-07 22:30:00,682 Chestnut St,Boston,MA,2215,99.99
2,3,176560.0,Google Phone,1.0,600.0,2019-04-12 14:38:00,669 Spruce St,Los Angeles,CA,90001,600.0
3,4,176560.0,Wired Headphones,1.0,11.99,2019-04-12 14:38:00,669 Spruce St,Los Angeles,CA,90001,11.99
4,5,176561.0,Wired Headphones,1.0,11.99,2019-04-30 09:27:00,333 8th St,Los Angeles,CA,90001,11.99


In [4]:
# I had to use .text() to wrap the queries bc .read_sql was not interpreting the raw SQL string 
# Comments for SQL queries are included in the queries themselves, denoted by "--" at the beginning of each comment

sql_sales_by_month = text("""
-- Total Sales by Month
SELECT 
    DATE_FORMAT(order_date, '%m') AS month,
    SUM(total_price) AS monthly_sales
FROM
    sales_data
GROUP BY month
ORDER BY monthly_sales DESC;
""")

total_sales_by_month = pd.read_sql(sql_sales_by_month, engine)
total_sales_by_month.head(10)

Unnamed: 0,month,monthly_sales
0,12,4608295.7
1,10,3734777.86
2,4,3389217.98
3,11,3197875.05
4,5,3150616.23
5,3,2804973.35
6,7,2646461.32
7,6,2576280.15
8,8,2241083.37
9,2,2200078.08


In [5]:
sql_sales_by_dayofweek = text( """
-- Total Sales by Day of the Week
SELECT 
    DAYNAME(order_date) AS day_of_week,
    SUM(total_price) AS daily_sales
FROM
    sales_data
GROUP BY day_of_week
ORDER BY daily_sales DESC;
""")

total_sales_by_dayofweek = pd.read_sql(sql_sales_by_dayofweek, engine)
total_sales_by_dayofweek.head(10)

Unnamed: 0,day_of_week,daily_sales
0,Tuesday,5086275.4
1,Wednesday,4986823.36
2,Sunday,4927249.4
3,Saturday,4900195.58
4,Monday,4877588.21
5,Friday,4853642.26
6,Thursday,4833763.73


In [6]:
sql_sales_by_day = text( """
-- Total Sales by Day
SELECT 
    DATE(order_date) AS day, SUM(total_price) AS daily_sales
FROM
    sales_data
GROUP BY day
ORDER BY daily_sales DESC
LIMIT 10;
""")

total_sales_by_day = pd.read_sql(sql_sales_by_day, engine)
total_sales_by_day.head(10)

Unnamed: 0,day,daily_sales
0,2019-12-04,166577.69
1,2019-12-16,162970.61
2,2019-12-10,162820.75
3,2019-12-20,160178.31
4,2019-12-17,157357.04
5,2019-12-29,156009.67
6,2019-12-05,155079.3
7,2019-12-21,154656.88
8,2019-12-18,154598.77
9,2019-12-24,152888.82


In [7]:
sql_sales_by_hour = text("""
-- Total Sales by Hour
SELECT 
    HOUR(order_date) AS hour, SUM(total_price) AS hourly_sales
FROM
    sales_data
GROUP BY hour
ORDER BY hourly_sales DESC;
""")

total_sales_by_hour = pd.read_sql(sql_sales_by_hour, engine)
total_sales_by_hour.head(12)

Unnamed: 0,hour,hourly_sales
0,19,2411971.14
1,12,2314359.85
2,11,2296619.84
3,20,2280784.36
4,18,2218374.01
5,13,2152369.98
6,17,2126553.97
7,14,2082513.7
8,21,2040790.48
9,10,1942988.08


In [8]:
sql_repeat_v_onetime = text("""
-- Number of Repeat Customers vs One-time Customers 
SELECT 
    CASE
        WHEN order_count = 1 THEN 'One-time Customer'
        ELSE 'Repeat Customer'
    END AS customer_type,
    COUNT(*) AS customer_count
FROM
    (SELECT 
        order_id, COUNT(*) AS order_count
    FROM
        sales_data
    GROUP BY order_id) customer_orders
GROUP BY customer_type;
""")

number_repeat_v_onetime = pd.read_sql(sql_repeat_v_onetime, engine)
number_repeat_v_onetime.head()

Unnamed: 0,customer_type,customer_count
0,One-time Customer,171558
1,Repeat Customer,6879


In [9]:
sql_top5_quantity_repeat_v_onetime = text("""
-- Top 5 Products by Repeat Customer vs One-time Customer
WITH customer_orders AS (
    -- Identify One-time vs Repeat Customers using order_id
    SELECT order_id, COUNT(*) AS order_count
    FROM sales_data
    GROUP BY order_id
),
customer_classification AS (
    -- Classify Orders into One-time or Repeat Customers
    SELECT 
        order_id,
        CASE 
            WHEN order_count = 1 THEN 'One-time Customer'
            ELSE 'Repeat Customer'
        END AS customer_type
    FROM customer_orders
),
customer_product_sales AS (
    -- Join with sales_data to get Product Purchases
    SELECT 
        c.customer_type,
        s.product AS product_name,
        SUM(s.quantity_ordered) AS total_quantity
    FROM sales_data s
    JOIN customer_classification c ON s.order_id = c.order_id
    GROUP BY c.customer_type, s.product
)
-- Top 5 Products for One-time or Repeat Customers
SELECT * FROM (
    SELECT 
        customer_type,
        product_name,
        total_quantity,
        RANK() OVER (PARTITION BY customer_type ORDER BY total_quantity DESC) AS rank_order
    FROM customer_product_sales
) ranked
WHERE rank_order <= 5;
""")

top5_quantity_repeat_v_onetime = pd.read_sql(sql_top5_quantity_repeat_v_onetime, engine)
top5_quantity_repeat_v_onetime.head(10)

Unnamed: 0,customer_type,product_name,total_quantity,rank_order
0,One-time Customer,AAA Batteries (4-pack),29797.0,1
1,One-time Customer,AA Batteries (4-pack),26642.0,2
2,One-time Customer,USB-C Charging Cable,21725.0,3
3,One-time Customer,Lightning Charging Cable,21313.0,4
4,One-time Customer,Wired Headphones,18764.0,5
5,Repeat Customer,USB-C Charging Cable,2206.0,1
6,Repeat Customer,iPhone,1864.0,2
7,Repeat Customer,Lightning Charging Cable,1856.0,3
8,Repeat Customer,Wired Headphones,1760.0,4
9,Repeat Customer,Google Phone,1636.0,5


In [10]:
sql_top5_sales_repeat_v_onetime = text("""
-- Top 5 Products by Sales for Repeat Customers and One-time Customers
WITH customer_orders AS (
    -- Identify One-time vs Repeat Customers using order_id
    SELECT order_id, COUNT(*) AS order_count
    FROM sales_data
    GROUP BY order_id
),
customer_classification AS (
    -- Classify Orders into One-time or Repeat Customers
    SELECT 
        order_id,
        CASE 
            WHEN order_count = 1 THEN 'One-time Customer'
            ELSE 'Repeat Customer'
        END AS customer_type
    FROM customer_orders
),
customer_product_sales AS (
    -- Join with sales_data to get Product Purchases by Total Revenue
    SELECT 
        c.customer_type,
        s.product AS product_name,
        SUM(s.total_price) AS total_sales  
    FROM sales_data s
    JOIN customer_classification c ON s.order_id = c.order_id
    GROUP BY c.customer_type, s.product
)
-- Get the Top 5 Products for Each Customer Type (One-time vs. Repeat)
SELECT * FROM (
    SELECT 
        customer_type,
        product_name,
        total_sales,  
        RANK() OVER (PARTITION BY customer_type ORDER BY total_sales DESC) AS rank_order
    FROM customer_product_sales
) ranked
WHERE rank_order <= 5;
""")

top5_sales_repeat_v_onetime = pd.read_sql(sql_top5_sales_repeat_v_onetime, engine)
top5_sales_repeat_v_onetime.head(10)

Unnamed: 0,customer_type,product_name,total_sales,rank_order
0,One-time Customer,Macbook Pro Laptop,7707800.0,1
1,One-time Customer,ThinkPad Laptop,3955960.44,2
2,One-time Customer,iPhone,3488100.0,3
3,One-time Customer,27in 4K Gaming Monitor,2339160.02,4
4,One-time Customer,Google Phone,2335800.0,5
5,Repeat Customer,iPhone,1304800.0,1
6,Repeat Customer,Google Phone,981600.0,2
7,Repeat Customer,Macbook Pro Laptop,324700.0,3
8,Repeat Customer,Vareebadd Phone,240800.0,4
9,Repeat Customer,ThinkPad Laptop,171998.28,5


In [11]:
sql_ltv_repeat_v_onetime = text("""
-- Lifetime Value of Repeat and One-Time Customers and their Average Order Value
WITH customer_orders AS (
    -- Count how many times each order_id appears and sum their total_price
    SELECT order_id, SUM(total_price) AS total_spent, COUNT(order_id) AS order_count
    FROM sales_data
    GROUP BY order_id
),
customer_classification AS (
    -- Label Orders as One-time or Repeat Customers
    SELECT 
        order_id,
        total_spent,
        CASE 
            WHEN order_count = 1 THEN 'One-time Customer'
            ELSE 'Repeat Customer'
        END AS customer_type
    FROM customer_orders
)
-- Calculate Lifetime Value by Customer Type
SELECT 
    customer_type,
    COUNT(order_id) AS total_customers,
    SUM(total_spent) AS total_revenue,
    AVG(total_spent) AS avg_lifetime_value
FROM customer_classification
GROUP BY customer_type;
""")

ltv_repeat_v_onetime = pd.read_sql(sql_ltv_repeat_v_onetime, engine)
ltv_repeat_v_onetime.head()

Unnamed: 0,customer_type,total_customers,total_revenue,avg_lifetime_value
0,One-time Customer,171558,30818273.75,179.64
1,Repeat Customer,6879,3647264.19,530.2


In [12]:
sql_products_sold_together = text("""
-- Products Frequently Purchased Together
SELECT 
    LEAST(a.product, b.product) AS product_1, 
    GREATEST(a.product, b.product) AS product_2, 
    COUNT(*) AS purchase_count
FROM sales_data a
JOIN sales_data b 
    ON a.order_id = b.order_id 
    AND a.product <> b.product
GROUP BY product_1, product_2
ORDER BY purchase_count DESC
LIMIT 20;
""")

products_most_sold_together = pd.read_sql(sql_products_sold_together, engine)
products_most_sold_together.head(20)

Unnamed: 0,product_1,product_2,purchase_count
0,iPhone,Lightning Charging Cable,2022
1,Google Phone,USB-C Charging Cable,1994
2,iPhone,Wired Headphones,924
3,Google Phone,Wired Headphones,844
4,Apple Airpods Headphones,iPhone,746
5,USB-C Charging Cable,Vareebadd Phone,736
6,Bose SoundSport Headphones,Google Phone,456
7,USB-C Charging Cable,Wired Headphones,406
8,Vareebadd Phone,Wired Headphones,298
9,Lightning Charging Cable,Wired Headphones,258


In [13]:
sql_missed_product_bundles = text("""
-- Commonly Bundled Products That Were Purchased Separately
-- Step 1: Top 10 Frequently Purchased Together Product Pairs
WITH top_product_pairs AS (
    SELECT 
        a.product AS product_1, 
        b.product AS product_2, 
        COUNT(*) AS purchase_count
    FROM sales_data a
    JOIN sales_data b 
        ON a.order_id = b.order_id 
        AND a.product <> b.product  -- Ensures it's not the same product
    GROUP BY product_1, product_2
    ORDER BY purchase_count DESC
    LIMIT 20
)

-- Step 2: Count How Often Each Product Was Purchased Alone
SELECT 
    p.product, 
    COUNT(*) AS purchase_alone_count
FROM sales_data p
WHERE p.product IN (
    -- Get all products from the top 10 pairs
    SELECT product_1 FROM top_product_pairs
    UNION
    SELECT product_2 FROM top_product_pairs
)
AND p.order_id NOT IN (
    -- Exclude orders that contain both products in a top 10 pair
    SELECT DISTINCT a.order_id 
    FROM sales_data a
    JOIN sales_data b 
        ON a.order_id = b.order_id 
        AND a.product <> b.product
    JOIN top_product_pairs tp 
        ON (a.product = tp.product_1 AND b.product = tp.product_2)
)
GROUP BY p.product
ORDER BY purchase_alone_count DESC;
""")

missed_product_bundles = pd.read_sql(sql_missed_product_bundles, engine)
missed_product_bundles.head(20)

Unnamed: 0,product,purchase_alone_count
0,Lightning Charging Cable,20531
1,USB-C Charging Cable,20410
2,Wired Headphones,17671
3,Apple Airpods Headphones,15147
4,Bose SoundSport Headphones,13050
5,iPhone,5126
6,Google Phone,4017
7,Vareebadd Phone,1580


In [14]:
missed_product_bundles['purchase_alone_count'].sum()

97532

In [15]:
sql_zip_by_sales = text("""
-- Zip Codes by Total Sales (There are only Ten Distinct Zip Codes in the Dataset)
SELECT 
    zip, SUM(total_price) AS total_sales
FROM
    sales_data
GROUP BY zip
ORDER BY total_sales DESC;
""")

zip_by_sales = pd.read_sql(sql_zip_by_sales, engine)
zip_by_sales.head(20)

Unnamed: 0,zip,total_sales
0,94016,8254743.55
1,90001,5448304.28
2,10001,4661867.14
3,2215,3658627.65
4,30301,2794199.07
5,75001,2765373.96
6,98101,2745046.02
7,97035,1870010.56
8,73301,1818044.33
9,4101,449321.38


In [16]:
sql_top5_quantity_by_top5_zip = text("""
-- Top 5 Most Sold Products for Each of the Top 5 Zip Codes by Total Sales
-- Step 1: Get the Top 5 Zip Codes by Total Sales
WITH top_zipcodes AS (
    SELECT 
        zip, 
        SUM(total_price) AS total_sales
    FROM sales_data
    GROUP BY zip
    ORDER BY total_sales DESC
    LIMIT 5  -- Fix: Changed from 10 to 5
)
-- Step 2: Get the Top 5 Products Sold in Each of These Zip Codes
SELECT s.zip, s.product, s.total_quantity_sold, s.rank_order
FROM (
    SELECT 
        s.zip, 
        s.product, 
        SUM(s.quantity_ordered) AS total_quantity_sold,
        RANK() OVER (PARTITION BY s.zip ORDER BY SUM(s.quantity_ordered) DESC) AS rank_order
    FROM sales_data s
    JOIN top_zipcodes tz ON s.zip = tz.zip
    GROUP BY s.zip, s.product
) s
WHERE s.rank_order <= 5  -- Fix: Changed from <= 1 to <= 5
ORDER BY s.zip, s.rank_order;
""")

top5_quantity_by_top5_zip = pd.read_sql(sql_top5_quantity_by_top5_zip, engine)
top5_quantity_by_top5_zip.head(25)

Unnamed: 0,zip,product,total_quantity_sold,rank_order
0,2215,AAA Batteries (4-pack),3458.0,1
1,2215,AA Batteries (4-pack),3011.0,2
2,2215,USB-C Charging Cable,2555.0,3
3,2215,Lightning Charging Cable,2482.0,4
4,2215,Wired Headphones,2222.0,5
5,10001,AAA Batteries (4-pack),4119.0,1
6,10001,AA Batteries (4-pack),3629.0,2
7,10001,USB-C Charging Cable,3263.0,3
8,10001,Lightning Charging Cable,3039.0,4
9,10001,Wired Headphones,2702.0,5


In [17]:
sql_top5_product_by_totalsales_by_top5_zip = text("""
-- Top 5 Products by Total Sales for Each of the Top 5 Zip Codes by Total Sales
-- Step 1: Get the Top 5 Zip Codes by Total Sales
WITH top_zipcodes AS (
    SELECT 
        zip, 
        SUM(total_price) AS total_sales
    FROM sales_data
    GROUP BY zip
    ORDER BY total_sales DESC
    LIMIT 5  -- Fix: Changed from 10 to 5
)
-- Step 2: Get the Top 5 Products Sold in Each of These Zip Codes by Total Revenue
SELECT s.zip, s.product, s.total_sales, s.rank_order
FROM (
    SELECT 
        s.zip, 
        s.product, 
        SUM(s.total_price) AS total_sales,
        RANK() OVER (PARTITION BY s.zip ORDER BY SUM(s.total_price) DESC) AS rank_order
    FROM sales_data s
    JOIN top_zipcodes tz ON s.zip = tz.zip
    GROUP BY s.zip, s.product
) s
WHERE s.rank_order <= 5  -- Fix: Changed from <= 1 to <= 5
ORDER BY s.zip, s.rank_order;

""")

top5_product_by_totalsales_by_top5_zip = pd.read_sql(sql_top5_product_by_totalsales_by_top5_zip, engine)
top5_product_by_totalsales_by_top5_zip.head(25)

Unnamed: 0,zip,product,total_sales,rank_order
0,2215,Macbook Pro Laptop,814300.0,1
1,2215,iPhone,526400.0,2
2,2215,ThinkPad Laptop,446995.53,3
3,2215,Google Phone,355200.0,4
4,2215,27in 4K Gaming Monitor,263243.25,5
5,10001,Macbook Pro Laptop,1116900.0,1
6,10001,iPhone,616700.0,2
7,10001,ThinkPad Laptop,559994.4,3
8,10001,Google Phone,454200.0,4
9,10001,34in Ultrawide Monitor,329451.33,5


In [18]:
# Dispose of the engine

engine.dispose()  
print("Engine disposed")

Engine disposed


In [19]:
# Part II: SQL Queries for Analysis is complete!

# Time to distill these result sets into insights that drive business!