# Order Data Analysis with Python and SQL

In [4]:
# Import Libaries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mysql.connector
import plotly.express as px


In [5]:
# Establish a connection to the MySQL database
db = mysql.connector.connect(
    host="localhost",
    user="root", 
    password="123456",
    database="retail_database"
)

# Create a cursor object to interact with the database
cur = db.cursor()

## Quaries

### 1. Find top 10 highest reveue generating products

In [6]:
query="""
SELECT product_id, SUM(sale_price) AS sales
FROM retail_order
GROUP BY product_id
ORDER BY sales DESC
LIMIT 10;
"""
# Execute the query
cur.execute(query)

# Fetch the result
data = cur.fetchall()

# Build a DataFrame
data_df = pd.DataFrame(data)

# Rename columns
data_df.rename(columns={0: 'product_id', 1: 'sales'}, inplace=True)
data_df.head()

Unnamed: 0,product_id,sales
0,TEC-CO-10004722,59514.0
1,OFF-BI-10003527,26525.3
2,TEC-MA-10002412,21734.4
3,FUR-CH-10002024,21096.2
4,OFF-BI-10001359,19090.2


### 2. Find top 5 highest selling products in each region

In [7]:
query="""
WITH cte AS (
    SELECT region, product_id, SUM(sale_price) AS sales
    FROM retail_order
    GROUP BY region, product_id
)
SELECT * 
FROM (
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY region ORDER BY sales DESC) AS rn
    FROM cte
) AS A
WHERE rn <= 5;
"""
# Execute the query
cur.execute(query)

# Fetch the result
data = cur.fetchall()

# Build a DataFrame
data_df = pd.DataFrame(data)
data_df.rename(columns={0: 'region'}, inplace=True)
data_df['region']

0     Central
1     Central
2     Central
3     Central
4     Central
5        East
6        East
7        East
8        East
9        East
10      South
11      South
12      South
13      South
14      South
15       West
16       West
17       West
18       West
19       West
Name: region, dtype: object

### 3. Find month over month growth comparison for 2022 and 2023 sales eg : jan 2022 vs jan 2023

In [8]:
query="""WITH cte AS (
    SELECT 
        YEAR(order_date) AS order_year,
        MONTH(order_date) AS order_month,
        SUM(sale_price) AS sales
    FROM retail_order
    GROUP BY 
        YEAR(order_date),
        MONTH(order_date)
)
SELECT 
    order_month,
    SUM(CASE WHEN order_year = 2022 THEN sales ELSE 0 END) AS sales_2022,
    SUM(CASE WHEN order_year = 2023 THEN sales ELSE 0 END) AS sales_2023
FROM cte
GROUP BY order_month
ORDER BY order_month;
"""

# Execute the query
cur.execute(query)

# Fetch the result
data = cur.fetchall()

# Build a DataFrame
data_df = pd.DataFrame(data)

# Rename the column
data_df.rename(columns={0: 'order_month', 1: 'sales_2022', 2: 'sales_2023'}, inplace=True)

data_df

Unnamed: 0,order_month,sales_2022,sales_2023
0,1,94712.5,88632.6
1,2,90091.0,128124.2
2,3,80106.0,82512.3
3,4,95451.6,111568.6
4,5,79448.3,86447.9
5,6,94170.5,68976.5
6,7,78652.2,90563.8
7,8,104808.0,87733.6
8,9,79142.2,76658.6
9,10,118912.7,121061.5


### 4. For each category which month had highest sales

In [9]:
query="""WITH cte AS (
    SELECT 
        category,
        FORMAT(order_date, 'yyyyMM') AS order_year_month,
        SUM(sale_price) AS sales
    FROM retail_order
    GROUP BY 
        category,
        FORMAT(order_date, 'yyyyMM')
)
SELECT * 
FROM (
    SELECT *,
        ROW_NUMBER() OVER (PARTITION BY category ORDER BY sales DESC) AS rn
    FROM cte
) a
WHERE rn = 1;
"""
cur.execute(query)
data = cur.fetchall()
data_df = pd.DataFrame(data)
data_df.rename(columns={0: 'category', 1: 'order_year_month', 2: 'sales', 3:'row_number'}, inplace=True)

data_df

Unnamed: 0,category,order_year_month,sales,row_number
0,Furniture,20230208,6247.0,1
1,Office Supplies,20230227,10474.6,1
2,Technology,20231013,23064.4,1


### 5. Which sub category had highest growth by profit in 2023 compare to 2022

In [10]:
query = """
WITH cte AS (
    SELECT sub_category,
           YEAR(order_date) AS order_year,
           SUM(sale_price) AS sales
    FROM retail_order
    GROUP BY sub_category, YEAR(order_date)
),
cte2 AS (
    SELECT sub_category,
           SUM(CASE WHEN order_year = 2022 THEN sales ELSE 0 END) AS sales_2022,
           SUM(CASE WHEN order_year = 2023 THEN sales ELSE 0 END) AS sales_2023
    FROM cte 
    GROUP BY sub_category
)
SELECT sub_category,
       (sales_2023 - sales_2022) AS sales_growth
FROM cte2
ORDER BY (sales_2023 - sales_2022) DESC
LIMIT 1;
"""
cur.execute(query)
data = cur.fetchall()
data_df = pd.DataFrame(data, columns=['sub_category', 'sales_growth'])
data_df


Unnamed: 0,sub_category,sales_growth
0,Machines,35455.3
