Module 01: Exploratory Data Analysis for Demand & Inventory

This notebook performs exploratory data analysis (EDA) for Module 01 of the **"Intelligent System for Supply Chain Management"** project.  

The primary goal is to optimize inventory and purchasing management, with a target of **reducing overstocking by 20%** within six months.

---

## Data Acquisition
### Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import json
import plotly.express as px
import plotly.io as pio

from plotly.subplots import make_subplots

from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from smart_supply_chain_ai.data_processing import get_data

import warnings
# NOTE(review): this silences every warning category for the rest of the notebook,
# including deprecation warnings from pandas/plotly — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set up display options and plotting template
pd.set_option('display.max_columns', None)  # never truncate columns when displaying wide frames
pio.templates.default = "plotly_white"  # default Plotly template for every figure
px.defaults.width = 800  # default Plotly Express figure width
px.defaults.height = 600  # default Plotly Express figure height

### Load Dataset

In [2]:
# Folders (relative to the notebook) holding the raw input data and the project docs.
# Both values keep their trailing slash because later cells append file names to them.
raw_data_path = os.path.join('../data', 'raw/')
docs_path = '../docs/'

In [3]:
# Read the raw grocery stock CSV from the raw-data folder and preview the first rows
df_raw = pd.read_csv(os.path.join(raw_data_path, 'synthetic_data_grocery_stock.csv'))
df_raw.head()

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity
0,2025-02-20,2025-02-15,False,Canned Beans,1913772|P,Pantry,Canned Goods,1095,90,unit,2,PantryEssentials Ltd.,1859586|S,95,130,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,80,5,420,525,574
1,2025-05-08,2025-04-28,False,Rice Flour,1787175|P,Pantry,Baking Supplies,365,90,lb,4,BakeWell Supplies,1803930|S,90,50,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,9,4,40,50,47
2,2023-11-22,2023-11-18,False,Maple Syrup,1027236|P,Pantry,Sweeteners,365,90,unit,5,Sugar & Spice Co.,1076080|S,85,70,Room Temperature,Mild to Temperate,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,30,4,152,190,165
3,2024-01-30,2024-01-22,False,Spinach,1300280|P,Fresh Foods,Vegetables,5,2,lb,3,GreenFields Co.,1094553|S,127,150,Refrigerated,Mild to Temperate,Light Rain,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,923,3,4200,5250,6498
4,2024-10-20,2024-10-15,False,Rye Bread,1261252|P,Bakery,Bread,5,2,unit,1,Bakery Fresh Co.,1042337|S,45,75,Room Temperature,Mild to Temperate,No precipitation,Gentle to Fresh Breeze,Moderate,Sunday,False,True,High,1020,5,2418,3224,3562


In [4]:
# Load column descriptions from the JSON file into a dictionary for reference or documentation.
# The encoding is stated explicitly so the file is decoded the same way on every
# platform instead of depending on the locale default used by open().
with open(docs_path + 'column_descriptions.json', encoding='utf-8') as f:
    column_descriptions = json.load(f)

## Data Cleaning and Preprocessing

In [5]:
# Create a copy for cleaning and preprocessing, so that df_raw stays exactly as loaded
df = df_raw.copy()

In [6]:
# Count the missing values in every column of the DataFrame (not just 'category')
df.isna().sum()

received_date                   0
lpo                             0
in_season                       0
product                         0
product_id                      0
category                        0
sub_category                    0
shelf_life_days                 0
maximum_days_on_sale            0
unit_of_measurement             0
supplier_rating                 0
supplier                        0
supplier_id                     0
distance_km                     0
moq                             0
storage_recommendation          0
temperature_classification      0
precipitation_classification    0
wind_classification             0
weather_severity                0
day_classification              0
is_holiday                      0
is_weekend                      0
sales_demand                    0
sales_volume                    0
lead_time                       0
min_stock                       0
max_stock                       0
stock_quantity                  0
dtype: int64

### Convert Data Types for Analysis

In [7]:
# Check the data types and non-null counts before any conversion
# (the date columns 'received_date' and 'lpo' are still plain 'object' here)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   received_date                 3000 non-null   object
 1   lpo                           3000 non-null   object
 2   in_season                     3000 non-null   bool  
 3   product                       3000 non-null   object
 4   product_id                    3000 non-null   object
 5   category                      3000 non-null   object
 6   sub_category                  3000 non-null   object
 7   shelf_life_days               3000 non-null   int64 
 8   maximum_days_on_sale          3000 non-null   int64 
 9   unit_of_measurement           3000 non-null   object
 10  supplier_rating               3000 non-null   int64 
 11  supplier                      3000 non-null   object
 12  supplier_id                   3000 non-null   object
 13  distance_km       

In [8]:
# Parse the two date columns into datetime values.
# errors='coerce' turns any entry that cannot be parsed into NaT instead of raising.
date_columns = ['received_date', 'lpo']
df[date_columns] = df[date_columns].apply(lambda col: pd.to_datetime(col, errors='coerce'))

In [9]:
# Columns holding a small, fixed set of labels; stored as pandas 'category'
# (note that the integer 'supplier_rating' is treated as a label here as well)
cat_columns = [
    'category',
    'sub_category',
    'unit_of_measurement',
    'supplier_rating',
    'temperature_classification',
    'precipitation_classification',
    'wind_classification',
    'weather_severity',
    'day_classification',
    'sales_demand',
]
df[cat_columns] = df[cat_columns].astype('category')

In [10]:
# Store 'distance_km' under the generic object dtype.
# Note: astype(object) is not a string conversion. Its effect in this notebook is that
# the column drops out of the numeric selections (describe, histograms, correlation)
# and is reported together with the text columns instead.
df['distance_km'] = df['distance_km'].astype(object)

In [11]:
# Summary statistics for the numeric and datetime columns, transposed so that
# each column of df becomes one row of the table
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
received_date,3000.0,2024-04-21 12:08:09.600000,2022-12-09 00:00:00,2023-08-16 00:00:00,2024-04-17 00:00:00,2024-12-20 06:00:00,2025-09-22 00:00:00,
lpo,3000.0,2024-04-15 13:06:14.400000256,2022-11-30 00:00:00,2023-08-09 00:00:00,2024-04-12 00:00:00,2024-12-15 06:00:00,2025-09-20 00:00:00,
shelf_life_days,3000.0,260.115333,2.0,7.0,30.0,365.0,1825.0,381.724587
maximum_days_on_sale,3000.0,55.585667,1.0,3.0,14.0,90.0,365.0,72.748563
moq,3000.0,78.119,10.0,45.0,70.0,90.0,300.0,53.175948
sales_volume,3000.0,361.709667,3.0,34.0,142.5,553.25,4068.0,478.92775
lead_time,3000.0,4.704333,2.0,4.0,5.0,5.0,12.0,1.201692
min_stock,3000.0,1499.986333,27.0,168.0,672.0,2298.0,8250.0,1803.73094
max_stock,3000.0,1864.185333,50.0,203.0,812.0,3015.0,9625.0,2198.27288
stock_quantity,3000.0,1642.222667,1.0,178.0,516.0,2537.0,11795.0,2131.250666


In [12]:
# Summary (count, unique, top, freq) for the columns cast to the 'category' dtype
df.describe(include=['category']).T

Unnamed: 0,count,unique,top,freq
category,3000,6,Pantry,1176
sub_category,3000,27,Vegetables,381
unit_of_measurement,3000,2,unit,1591
supplier_rating,3000,5,5,825
temperature_classification,3000,5,Mild to Temperate,1442
precipitation_classification,3000,5,No precipitation,1773
wind_classification,3000,3,Gentle to Fresh Breeze,2343
weather_severity,3000,3,Moderate,2232
day_classification,3000,4,Weekdays,2117
sales_demand,3000,3,Normal,1785


In [13]:
# Summary for the object-dtype columns: the text columns plus 'distance_km',
# which was cast to object in an earlier cell
df.describe(include=['object']).T

Unnamed: 0,count,unique,top,freq
product,3000,170,Shrimp,52
product_id,3000,170,1177768|P,52
supplier,3000,50,Oil & Vinegar Co.,138
supplier_id,3000,50,1878487|S,138
distance_km,3000,42,95,205
storage_recommendation,3000,3,Room Temperature,1619


In [14]:
# Display the earliest receipt date and the earliest order (lpo) date in the dataset
df[['received_date', 'lpo']].min()

received_date   2022-12-09
lpo             2022-11-30
dtype: datetime64[ns]

In [15]:
# Display the latest receipt date and the latest order (lpo) date in the dataset
df[['received_date', 'lpo']].max()

received_date   2025-09-22
lpo             2025-09-20
dtype: datetime64[ns]

In [16]:
# Preview all columns with a numeric dtype (int, float).
# 'distance_km' does not appear because it was cast to object earlier.
df.select_dtypes(include=np.number)


Unnamed: 0,shelf_life_days,maximum_days_on_sale,moq,sales_volume,lead_time,min_stock,max_stock,stock_quantity
0,1095,90,130,80,5,420,525,574
1,365,90,50,9,4,40,50,47
2,365,90,70,30,4,152,190,165
3,5,2,150,923,3,4200,5250,6498
4,5,2,75,1020,5,2418,3224,3562
...,...,...,...,...,...,...,...,...
2995,365,90,40,161,3,784,980,821
2996,21,7,120,372,6,3198,3731,3943
2997,21,7,40,276,6,1640,1968,2140
2998,5,2,150,3131,6,5704,7130,492


In [17]:
# Preview all columns that were cast to the 'category' dtype
df.select_dtypes(include='category')


Unnamed: 0,category,sub_category,unit_of_measurement,supplier_rating,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,sales_demand
0,Pantry,Canned Goods,unit,2,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,Normal
1,Pantry,Baking Supplies,lb,4,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,Normal
2,Pantry,Sweeteners,unit,5,Mild to Temperate,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,Normal
3,Fresh Foods,Vegetables,lb,3,Mild to Temperate,Light Rain,Gentle to Fresh Breeze,Moderate,Weekdays,Normal
4,Bakery,Bread,unit,1,Mild to Temperate,No precipitation,Gentle to Fresh Breeze,Moderate,Sunday,High
...,...,...,...,...,...,...,...,...,...,...
2995,Pantry,Snacks,unit,5,Cool,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,Normal
2996,Fresh Foods,Fruits,lb,1,Warm,No precipitation,Calm / Light Breeze,Moderate,Weekdays,Normal
2997,Dairy & Alternatives,Dairy,lb,3,Warm,Heavy Rain,Gentle to Fresh Breeze,Severe,Saturday,High
2998,Fresh Foods,Vegetables,lb,3,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Holiday,Very High


In [18]:
# Preview everything that is neither numeric nor categorical:
# the datetime, boolean and object columns
df.select_dtypes(exclude=['category', np.number])

Unnamed: 0,received_date,lpo,in_season,product,product_id,supplier,supplier_id,distance_km,storage_recommendation,is_holiday,is_weekend
0,2025-02-20,2025-02-15,False,Canned Beans,1913772|P,PantryEssentials Ltd.,1859586|S,95,Room Temperature,False,False
1,2025-05-08,2025-04-28,False,Rice Flour,1787175|P,BakeWell Supplies,1803930|S,90,Room Temperature,False,False
2,2023-11-22,2023-11-18,False,Maple Syrup,1027236|P,Sugar & Spice Co.,1076080|S,85,Room Temperature,False,False
3,2024-01-30,2024-01-22,False,Spinach,1300280|P,GreenFields Co.,1094553|S,127,Refrigerated,False,False
4,2024-10-20,2024-10-15,False,Rye Bread,1261252|P,Bakery Fresh Co.,1042337|S,45,Room Temperature,False,True
...,...,...,...,...,...,...,...,...,...,...,...
2995,2025-05-21,2025-05-12,False,Chocolate Bar,1614167|P,Chocolate Heaven,1788546|S,140,Room Temperature,False,False
2996,2024-03-19,2024-03-15,False,Apple,1004205|P,OrchardBest Fruits,1677419|S,200,Refrigerated,False,False
2997,2023-12-16,2023-12-14,False,Gouda Cheese,1738341|P,Artisan Cheesemakers,1631099|S,95,Refrigerated,False,True
2998,2023-12-25,2023-12-23,True,Kale,1488875|P,GreenFields Co.,1094553|S,127,Refrigerated,True,False


## Feature Engineering: Create New Features

In [19]:
# Supplier delivery lag: whole days elapsed between order placement (lpo) and receipt
df['delivery_lag'] = df['received_date'].sub(df['lpo']).dt.days

# Add a short description for the 'delivery_lag' column
# column_descriptions.update({'delivery_lag': 'Delivery delay in days.'})

In [20]:
# Subset of records whose delivery lag is longer than the product's shelf life
lag_df = df[df['delivery_lag'] > df['shelf_life_days']]

In [21]:
# Histogram of the delivery lag, restricted to the records in lag_df
# (those whose lag exceeds the shelf life)
lag = px.histogram(
    lag_df,
    x='delivery_lag',
    title="Delivery lags exceed the product's shelf life.",
    labels={'delivery_lag': 'lag'},
)

# Leave a small gap between the bars, then render the chart
lag.update_layout(bargap=0.1)
lag.show()

In [22]:
# Summary statistics of delivery_lag (days between lpo and received_date) over all rows
df['delivery_lag'].describe()

count    3000.000000
mean        5.959667
std         2.575186
min         2.000000
25%         4.000000
50%         6.000000
75%         8.000000
max        10.000000
Name: delivery_lag, dtype: float64

In [23]:
# Classify each record by comparing its delivery lag with the product's time limits.
# The conditions are evaluated in order, and the first match wins:
#   'Expired' -> lag is greater than shelf_life_days
#   'Nearing' -> lag is greater than maximum_days_on_sale
#   'Safe'    -> neither of the above
df['expiration_status'] = np.select(
    [
        df['delivery_lag'] > df['shelf_life_days'],
        df['delivery_lag'] > df['maximum_days_on_sale'],
    ],
    ['Expired', 'Nearing'],
    default='Safe',
)
df['expiration_status'] = df['expiration_status'].astype('category')

# Add a description for the 'expiration_status' column
# column_descriptions.update({'expiration_status': 'Indicates whether the product is expired or still valid.'})

In [24]:
# Tag every record with the calendar year of its order (lpo) date
df['year'] = df['lpo'].dt.year

# Per product and year: summed sales volume and *mean* stock quantity.
# (The column is named 'total_stock', but it holds an average, not a sum.)
df_annual_turnover = (
    df.groupby(['product', 'year'])
    .agg(total_sales=('sales_volume', 'sum'), total_stock=('stock_quantity', 'mean'))
    .reset_index()
)

# Turnover rate of a product-year = summed sales divided by mean stock
df_annual_turnover['inventory_turnover_rate'] = df_annual_turnover['total_sales'].div(
    df_annual_turnover['total_stock']
)

# Average the yearly rates to obtain a single turnover figure per product
df_average_turnover = df_annual_turnover.groupby('product', as_index=False)['inventory_turnover_rate'].mean()

In [25]:
# Add the per-product average turnover rate to the main DataFrame.
# - on='product' states the join key explicitly; without it, merge joins on every
#   shared column, which would silently include 'inventory_turnover_rate' if this
#   cell were executed a second time.
# - how='left' guarantees that no record of df can be dropped by the join.
# - validate='many_to_one' raises if df_average_turnover ever holds a product twice.
# - a stale 'inventory_turnover_rate' column from a previous run is dropped first,
#   and 'year' is removed afterwards because it is only a helper column.
df = (
    df.drop(columns='inventory_turnover_rate', errors='ignore')
    .merge(df_average_turnover, on='product', how='left', validate='many_to_one')
    .drop(columns='year', errors='ignore')
)

# Add a description for the 'inventory_turnover_rate' column
# column_descriptions.update({'inventory_turnover_rate': 'Shows how often inventory is sold and replaced over a period.'})

In [26]:
# Calculate the number of days in the dataset's period
# (span between the earliest and the latest received_date)
period_days = (df['received_date'].max() - df['received_date'].min()).days

# Use 'inventory_turnover_rate' to calculate Days of Inventory (DOI)
# DOI indicates how many days the current stock can cover based on turnover
# The quotient is cast with astype('int'), so the fractional part is dropped.
# NOTE(review): 'inventory_turnover_rate' is the mean of rates computed per
# (product, year) group, whereas period_days spans the whole dataset (several
# calendar years according to the min/max dates shown earlier). Dividing a multi-year
# span by a per-year rate looks like a unit mismatch that inflates DOI — confirm
# whether 365 days or a whole-period turnover rate was intended.
df['doi_inventory_turnover'] = (period_days / df['inventory_turnover_rate']).astype('int')

# Add description of the new column to the dictionary
# column_descriptions.update({'doi_inventory_turnover': 'Stock coverage in days.'})


## Exploratory Data Analysis (EDA)

### Distribution of Numerical Variables

In [27]:
# Histograms for every numeric column of df.
# Only the column labels are needed, so take `.columns` instead of keeping a second
# copy of the numeric data under a name that suggests a list of column names.
# ('distance_km' is not included because it was cast to object earlier.)
num_cols = df.select_dtypes(include=np.number).columns

# Draw one histogram per numeric column
for col in num_cols:
    fig = px.histogram(df, x=col, title=f'Distribution of {col}', nbins=30)

    # Leave a small gap between the bars for readability
    fig.update_layout(bargap=0.1)

    fig.show()


### Distribution of Categorical Variables

In [28]:
# Bar charts for every column of df with the 'category' dtype.
# Only the column labels are needed, so take `.columns` instead of keeping a second
# copy of the categorical data under a name that suggests a list of column names.
cat_cols = df.select_dtypes(include='category').columns

# Draw one bar chart per categorical column
for col in cat_cols:
    fig = px.bar(
        df[col].value_counts(),  # number of records per category level
        title=f'Distribution of {col}',
        labels={'value': 'Count', 'index': col}  # axis labels
    )

    fig.show()


### Relationship Between Stock and Sales

In [29]:
# Scatter plot of stock quantity (x) against sales volume (y), one point per record,
# coloured by the sales-demand class
fig = px.scatter(
    df,
    x='stock_quantity',
    y='sales_volume',
    color='sales_demand',
    title='Relation between Stock and Sales Volume',
    labels={'stock_quantity': 'Stock', 'sales_volume': 'Sales', 'sales_demand': 'Demand'},
)

# Render the interactive chart
fig.show()


### Expiration Status by Category

In [30]:
# Grouped (side-by-side) bar chart: number of records per product category,
# split by expiration status
fig = px.histogram(
    df,
    x='category',
    color='expiration_status',
    barmode='group',
    title='Expiration Status by Category',
    labels={'expiration_status': 'Status'},
)

# Render the interactive chart
fig.show()


### Inventory Turnover by Subcategory

In [31]:
# Box plot of the inventory turnover rate for each sub-category;
# the x-axis tick labels are rotated by 45 degrees so they stay readable
fig = px.box(
    df, x='sub_category', y='inventory_turnover_rate',
    title='Inventory Turnover Rate by Subcategory',
).update_xaxes(tickangle=45)

# Render the chart
fig.show()


### Suppliers with the Longest Delivery Delays

In [32]:
# Top 10 suppliers with the highest average delivery delay
supplier_delay = (
    df.groupby('supplier')['delivery_lag']  # average delivery lag per supplier
    .mean()
    .sort_values(ascending=False)  # longest average lag first
    .head(10)  # keep the ten slowest suppliers
)

# Bar chart of the result. Without explicit labels Plotly Express shows the generic
# "value" / "variable" names for a Series, so the axes are labelled here and the
# single-entry legend is hidden.
fig = px.bar(
    supplier_delay,
    title='Top 10 Suppliers by Average Delivery Delay',
    labels={'value': 'Average delivery lag (days)', 'supplier': 'Supplier'}
)
fig.update_layout(showlegend=False)

# Display the chart
fig.show()


### Sales Demand by Weather Severity

In [33]:
# Grouped bar chart: number of records per weather-severity level,
# split by sales-demand class
fig = px.histogram(
    df, x='weather_severity', color='sales_demand', barmode='group',
    title='Sales Demand by Weather Severity',
    labels={'sales_demand': 'Demand'},
)
fig.show()


### Correlation between Numerical Variables

In [34]:
# Pairwise correlation between all numeric columns
corr = df.select_dtypes(include=np.number).corr()

# Keep only the cells strictly below the diagonal (k=-1); the diagonal and the
# upper triangle become NaN, so each pair of columns appears exactly once
mask = np.tril(np.ones(corr.shape, dtype=bool), k=-1)
masked_corr = corr.where(mask)

# Heatmap of the remaining coefficients, with the values printed in the cells
fig = px.imshow(masked_corr, text_auto=True, aspect="auto", title='Correlation Matrix')

# Enlarge the figure so the annotations stay legible, then render it
fig.update_layout(width=1200, height=1200)
fig.show()


In [35]:
# Output locations: the folder for processed data and the column-description JSON file
processed_data_path = os.path.join('../data', 'processed')
utils_data_path = '../docs/column_descriptions.json'

In [36]:
# Order the records chronologically by receipt date and renumber the index from 0
df = df.sort_values('received_date', ignore_index=True)

In [37]:
# Show 10 random rows from the DataFrame as a spot check of the engineered columns.
# No random_state is passed, so a different set of rows is drawn on every run.
df.sample(10)

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status,inventory_turnover_rate,doi_inventory_turnover
1006,2023-11-10,2023-11-06,False,Soy Sauce,1812675|P,Pantry,Condiments,1095,180,unit,4,International Foods Inc.,1041131|S,250,70,Room Temperature,Mild to Temperate,Light Rain,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,40,5,230,276,251,4,Safe,2.531523,402
426,2023-04-30,2023-04-27,False,Gouda Cheese,1738341|P,Dairy & Alternatives,Dairy,21,7,lb,3,Artisan Cheesemakers,1631099|S,95,40,Refrigerated,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,False,True,High,443,5,1640,1968,1653,3,Safe,0.743825,1368
2065,2024-10-21,2024-10-13,False,Dijon Mustard,1070686|P,Pantry,Condiments,365,90,unit,3,Condiment Masters,1184993|S,75,60,Room Temperature,Mild to Temperate,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,43,3,240,300,320,8,Safe,0.839816,1212
1432,2024-03-24,2024-03-18,False,Zucchini,1575618|P,Fresh Foods,Vegetables,7,3,lb,2,Local Farm Collective,1188022|S,35,75,Refrigerated,Mild to Temperate,No precipitation,Gentle to Fresh Breeze,Moderate,Sunday,False,True,High,1332,4,3375,4500,5295,6,Nearing,2.050838,496
2590,2025-04-14,2025-04-10,False,Cheddar Cheese,1179690|P,Dairy & Alternatives,Dairy,21,7,lb,4,Artisan Cheesemakers,1631099|S,95,40,Refrigerated,Mild to Temperate,Light Rain,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,222,5,1084,1355,1285,4,Safe,1.127651,902
2598,2025-04-17,2025-04-15,False,Plum,1698666|P,Fresh Foods,Fruits,5,2,lb,4,Stone Fruit Specialists,1820407|S,165,38,Refrigerated,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,740,5,6432,7504,8374,2,Safe,0.717905,1418
288,2023-03-13,2023-03-10,True,Kiwi,1991641|P,Fresh Foods,Fruits,14,7,lb,5,Tropical Fruits Ltd.,1939085|S,350,80,Refrigerated,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,548,6,3606,4207,529,3,Safe,0.510231,1995
925,2023-10-15,2023-10-13,False,Frozen Berries,1344073|P,Frozen Foods,Fruits,270,90,lb,5,FrozenFoods Express,1579962|S,65,90,Frozen,Mild to Temperate,No precipitation,Gentle to Fresh Breeze,Moderate,Sunday,False,True,High,611,5,2004,2505,2450,2,Safe,0.958726,1061
1615,2024-05-23,2024-05-19,False,Yogurt,1203174|P,Dairy & Alternatives,Dairy,14,5,unit,4,DairyPure Inc.,1240194|S,50,80,Refrigerated,Mild to Temperate,Heavy Rain,Gentle to Fresh Breeze,Severe,Weekdays,False,False,Normal,238,3,975,1300,1450,4,Safe,1.503581,677
1956,2024-09-16,2024-09-08,False,Banana,1440054|P,Fresh Foods,Fruits,7,3,lb,2,Emergency Supplier,1454719|S,15,10,Room Temperature,Mild to Temperate,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,765,3,2607,3476,3860,8,Expired,2.060725,494


In [38]:
# Make sure the output folder exists; to_pickle does not create missing directories
os.makedirs(processed_data_path, exist_ok=True)

# Save the processed DataFrame
df.to_pickle(processed_data_path + '/grocery.pkl')

# Write the column-description dictionary back to its JSON file.
# NOTE(review): the column_descriptions.update(...) calls in the feature-engineering
# cells are commented out, so unless the JSON file already describes the engineered
# columns, this writes back the same entries that were loaded — confirm that is intended.
with open(utils_data_path, 'w', encoding='utf-8') as f:
    json.dump(column_descriptions, f, indent=4)