Module 01: Exploratory Data Analysis for Demand & Inventory

This notebook performs exploratory data analysis (EDA) for Module 01 of the **"Intelligent System for Supply Chain Management"** project.  

The primary goal is to optimize inventory and purchasing management, with a target of **reducing overstocking by 20%** within six months.

---

## Data Acquisition
### Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import json
import plotly.express as px
import plotly.io as pio

from plotly.subplots import make_subplots

from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from smart_supply_chain_ai.data_processing import get_data

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): with no category given this also hides deprecation warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

# pandas: never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

# plotly: shared theme and default figure size (in pixels) for the charts below
pio.templates.default = "plotly_white"
px.defaults.width, px.defaults.height = 800, 600

### Load Dataset

In [2]:
# Relative locations of the docs folder and the raw-data folder.
# Both values end with a slash because file names are appended to them by string concatenation.
docs_path = '../docs/'
raw_data_path = os.path.join('../data', 'raw/')

In [3]:
# Read the synthetic grocery stock CSV from the raw-data folder and preview its first rows
raw_csv_file = f'{raw_data_path}synthetic_data_grocery_stock.csv'
df_raw = pd.read_csv(raw_csv_file)
df_raw.head()

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,stock_quantity,sales_volume,min_stock,max_stock,reorder_point,unit_of_measurement,barcode_ean,supplier_rating,supplier,supplier_id,distance_km,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand
0,2023-04-12,2023-04-05,False,Milk,1246179|P,Dairy,Milk,7,4,20,94,11,25,14,carton,8712345000483,5,SupplyTotal Logistics,1252625|S,1237,Refrigerated,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal
1,2025-02-26,2025-02-16,False,Banana,1532114|P,Fresh Foods,Fruits,3,2,16,92,8,24,14,lb,8712345000100,5,AgroExpress Supplies,1113380|S,276,"Room temperature, away from other fruits",Warm,Light Rain,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal
2,2023-07-16,2023-07-10,False,Banana,1532114|P,Fresh Foods,Fruits,3,2,11,156,8,24,14,lb,8712345000100,2,BioSupply,1082312|S,421,"Room temperature, away from other fruits",Cool,No precipitation,Calm / Light Breeze,Normal,Sunday,False,True,High
3,2023-09-07,2023-09-04,False,Milk,1246179|P,Dairy,Milk,7,4,14,233,11,25,14,carton,8712345000483,1,SupplyQuality Foods,1235356|S,1210,Refrigerated,Mild to Temperate,Light Rain,Gentle to Fresh Breeze,Moderate,Holiday,True,False,Very High
4,2025-07-14,2025-07-09,False,Peas,1457605|P,Fresh Foods,Vegetables,4,2,12,54,10,25,12,pack,8712345000087,5,AgroPrime Foods,1656636|S,101,Refrigerated,Cool,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal


In [4]:
# Load the column descriptions (column name -> human-readable meaning) from the JSON file
# into a dictionary for reference and documentation.
# The encoding is pinned because open() otherwise falls back to the platform's locale
# encoding, which is not guaranteed to be UTF-8.
with open(docs_path + 'column_descriptions.json', encoding='utf-8') as f:
    column_descriptions = json.load(f)

In [5]:
# Display the loaded column descriptions for reference
column_descriptions

{'received_date': 'Date the product was received in inventory',
 'lpo': 'Date the Last Purchase Order',
 'in_season': 'Indicates if the product is currently in season',
 'product': 'Name or description of the product',
 'product_id': 'Unique identifier for the product',
 'category': 'Main classification of the product',
 'sub_category': 'Sub-classification within the main category',
 'shelf_life_days': 'Number of days the product remains sellable',
 'maximum_days_on_sale': 'Maximum number of days the product can be on sale',
 'stock_quantity': 'Current quantity of the product in stock',
 'sales_volume': 'Total units sold over a given period',
 'min_stock': 'Minimum stock level before restocking is required',
 'max_stock': 'Maximum allowable stock level',
 'reorder_point': 'Stock level at which a reorder should be triggered',
 'unit_of_measurement': 'Unit used to quantify the product (e.g., kg, liters)',
 'barcode_ean': 'European Article Number barcode for the product',
 'supplier_ratin

## Data Cleaning and Preprocessing

In [6]:
# Dimensions of the raw dataset as (rows, columns)
df_raw.shape

(3000, 29)

In [7]:
# Work on a copy during cleaning and preprocessing so that df_raw keeps the data exactly as loaded
df = df_raw.copy()

In [8]:
# Count the missing values in every column of the DataFrame
df.isna().sum()

received_date                   0
lpo                             0
in_season                       0
product                         0
product_id                      0
category                        0
sub_category                    0
shelf_life_days                 0
maximum_days_on_sale            0
stock_quantity                  0
sales_volume                    0
min_stock                       0
max_stock                       0
reorder_point                   0
unit_of_measurement             0
barcode_ean                     0
supplier_rating                 0
supplier                        0
supplier_id                     0
distance_km                     0
storage_recommendation          0
temperature_classification      0
precipitation_classification    0
wind_classification             0
weather_severity                0
day_classification              0
is_holiday                      0
is_weekend                      0
sales_demand                    0
dtype: int64

### Convert Data Types for Analysis

In [9]:
# Inspect each column's dtype and non-null count before converting data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   received_date                 3000 non-null   object
 1   lpo                           3000 non-null   object
 2   in_season                     3000 non-null   bool  
 3   product                       3000 non-null   object
 4   product_id                    3000 non-null   object
 5   category                      3000 non-null   object
 6   sub_category                  3000 non-null   object
 7   shelf_life_days               3000 non-null   int64 
 8   maximum_days_on_sale          3000 non-null   int64 
 9   stock_quantity                3000 non-null   int64 
 10  sales_volume                  3000 non-null   int64 
 11  min_stock                     3000 non-null   int64 
 12  max_stock                     3000 non-null   int64 
 13  reorder_point     

In [10]:
# Parse the date columns into datetime values, one column at a time.
# errors='coerce' turns any value that cannot be parsed into NaT instead of raising.
date_columns = ['received_date', 'lpo']
for date_col in date_columns:
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

In [11]:
# Store the categorical columns with the pandas 'category' dtype for memory efficiency
cat_columns = [
    'category',
    'sub_category',
    'unit_of_measurement',
    'barcode_ean',
    'supplier_rating',
    'temperature_classification',
    'precipitation_classification',
    'wind_classification',
    'weather_severity',
    'day_classification',
    'sales_demand',
]
# Map every listed column to 'category'; all other columns keep their dtype
df = df.astype(dict.fromkeys(cat_columns, 'category'))

In [12]:
# Summary statistics from describe()'s default column selection, transposed to one row per column
# (the output covers the numeric columns and the two datetime columns)
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
received_date,3000.0,2024-04-19 20:52:19.200000,2022-12-09 00:00:00,2023-08-10 18:00:00,2024-04-23 00:00:00,2024-12-20 06:00:00,2025-09-22 00:00:00,
lpo,3000.0,2024-04-13 19:46:04.800000256,2022-12-01 00:00:00,2023-08-05 00:00:00,2024-04-18 00:00:00,2024-12-14 00:00:00,2025-09-20 00:00:00,
shelf_life_days,3000.0,24.784333,3.0,4.0,7.0,14.0,730.0,89.887654
maximum_days_on_sale,3000.0,16.099,2.0,2.0,4.0,9.0,486.0,59.929974
stock_quantity,3000.0,18.825333,0.0,13.0,19.0,26.0,45.0,8.969038
sales_volume,3000.0,110.229,1.0,52.0,92.0,147.25,627.0,82.912283
min_stock,3000.0,9.725,8.0,9.0,10.0,11.0,11.0,1.136576
max_stock,3000.0,24.557667,23.0,24.0,25.0,25.0,26.0,0.973819
reorder_point,3000.0,11.181333,9.0,10.0,11.0,12.0,14.0,1.428448
distance_km,3000.0,357.475,84.0,105.0,174.0,365.0,1890.0,393.615377


In [13]:
# Summary statistics (count, unique, top, freq) for the columns stored with the 'category' dtype
df.describe(include=['category']).T

Unnamed: 0,count,unique,top,freq
category,3000,5,Fresh Foods,2322
sub_category,3000,10,Vegetables,1014
unit_of_measurement,3000,13,lb,1129
barcode_ean,3000,36,8712345000483,230
supplier_rating,3000,5,5,1560
temperature_classification,3000,5,Mild to Temperate,1430
precipitation_classification,3000,5,No precipitation,1770
wind_classification,3000,3,Gentle to Fresh Breeze,2310
weather_severity,3000,3,Moderate,2191
day_classification,3000,4,Weekdays,2078


In [14]:
# Summary statistics (count, unique, top, freq) for the string columns still stored as 'object'
df.describe(include=['object']).T

Unnamed: 0,count,unique,top,freq
product,3000,36,Milk,230
product_id,3000,36,1246179|P,230
supplier,3000,32,FreshHarvest Ltd.,300
supplier_id,3000,32,1926670|S,300
storage_recommendation,3000,12,Refrigerated,1920


In [15]:
# Earliest date found in each of the two date columns
df[['received_date', 'lpo']].min()

received_date   2022-12-09
lpo             2022-12-01
dtype: datetime64[ns]

In [16]:
# Latest date found in each of the two date columns
df[['received_date', 'lpo']].max()

received_date   2025-09-22
lpo             2025-09-20
dtype: datetime64[ns]

In [17]:
# Show only the columns that have a numeric dtype (e.g., int, float)
df.select_dtypes(include=np.number)


Unnamed: 0,shelf_life_days,maximum_days_on_sale,stock_quantity,sales_volume,min_stock,max_stock,reorder_point,distance_km
0,7,4,20,94,11,25,14,1237
1,3,2,16,92,8,24,14,276
2,3,2,11,156,8,24,14,421
3,7,4,14,233,11,25,14,1210
4,4,2,12,54,10,25,12,101
...,...,...,...,...,...,...,...,...
2995,14,9,9,19,9,24,10,621
2996,14,9,11,57,11,25,12,84
2997,4,2,23,155,10,23,11,105
2998,5,3,5,26,10,25,11,142


In [18]:
# Show only the columns that are stored with the 'category' dtype
df.select_dtypes(include='category')


Unnamed: 0,category,sub_category,unit_of_measurement,barcode_ean,supplier_rating,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,sales_demand
0,Dairy,Milk,carton,8712345000483,5,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,Normal
1,Fresh Foods,Fruits,lb,8712345000100,5,Warm,Light Rain,Gentle to Fresh Breeze,Moderate,Weekdays,Normal
2,Fresh Foods,Fruits,lb,8712345000100,2,Cool,No precipitation,Calm / Light Breeze,Normal,Sunday,High
3,Dairy,Milk,carton,8712345000483,1,Mild to Temperate,Light Rain,Gentle to Fresh Breeze,Moderate,Holiday,Very High
4,Fresh Foods,Vegetables,pack,8712345000087,5,Cool,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,Normal
...,...,...,...,...,...,...,...,...,...,...,...
2995,Dairy,Cheeses,tub,8712345000223,1,Mild to Temperate,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,Normal
2996,Fresh Foods,Vegetables,unit,8712345000612,5,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,High
2997,Fresh Foods,Vegetables,pack,8712345000032,5,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,High
2998,Fresh Foods,Fruits,pack,8712345000179,5,Mild to Temperate,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,Normal


In [19]:
# Show the remaining columns, i.e. those that are neither numeric nor categorical
# (here: the datetime, boolean and object/string columns)
df.select_dtypes(exclude=['category', np.number])

Unnamed: 0,received_date,lpo,in_season,product,product_id,supplier,supplier_id,storage_recommendation,is_holiday,is_weekend
0,2023-04-12,2023-04-05,False,Milk,1246179|P,SupplyTotal Logistics,1252625|S,Refrigerated,False,False
1,2025-02-26,2025-02-16,False,Banana,1532114|P,AgroExpress Supplies,1113380|S,"Room temperature, away from other fruits",False,False
2,2023-07-16,2023-07-10,False,Banana,1532114|P,BioSupply,1082312|S,"Room temperature, away from other fruits",False,True
3,2023-09-07,2023-09-04,False,Milk,1246179|P,SupplyQuality Foods,1235356|S,Refrigerated,True,False
4,2025-07-14,2025-07-09,False,Peas,1457605|P,AgroPrime Foods,1656636|S,Refrigerated,False,False
...,...,...,...,...,...,...,...,...,...,...
2995,2025-03-26,2025-03-17,False,Ricotta Cheese,1967642|P,SupplyWorld Logistics,1530086|S,Refrigerated,False,False
2996,2025-05-04,2025-04-27,False,Bell Pepper,1026354|P,FreshHarvest Ltd.,1926670|S,Refrigerated,False,True
2997,2023-09-17,2023-09-11,False,Mushrooms,1747170|P,AgroNova,1258259|S,"Refrigerated, in a paper bag",False,True
2998,2024-10-28,2024-10-21,False,Cherry,1322487|P,AgroExpress Foods,1779611|S,Refrigerated,False,False


## Feature Engineering: Create New Features

In [20]:
# Supplier delivery lag: whole days elapsed between the last purchase order (lpo) and receipt
order_to_receipt = df['received_date'].sub(df['lpo'])
df['delivery_lag'] = order_to_receipt.dt.days

In [21]:
# Subset of rows whose delivery lag is longer than the product's shelf life
lag_exceeds_shelf_life = df['delivery_lag'] > df['shelf_life_days']
lag_df = df[lag_exceeds_shelf_life]

In [22]:
# Histogram of the delivery lag for the rows in lag_df, with a small gap between the bars
lag = px.histogram(
    lag_df,
    x='delivery_lag',
    title="Delivery lags exceed the product's shelf life.",
).update_layout(bargap=0.1)

# Render the chart
lag.show()

In [23]:
# Summary statistics of delivery_lag over all rows, to understand its distribution
df['delivery_lag'].describe()

count    3000.000000
mean        6.046000
std         2.599129
min         2.000000
25%         4.000000
50%         6.000000
75%         8.000000
max        10.000000
Name: delivery_lag, dtype: float64

In [24]:
# Document the new 'delivery_lag' column in the descriptions dictionary
column_descriptions['delivery_lag'] = 'Delivery delay in days.'

In [25]:
# Derive 'expiration_status' from the delivery lag:
#   'Expired' - lag longer than shelf_life_days
#   'Nearing' - not expired, but lag longer than maximum_days_on_sale
#   'Safe'    - everything else
status_rules = [
    df['delivery_lag'] > df['shelf_life_days'],
    df['delivery_lag'] > df['maximum_days_on_sale'],
]
status_labels = ['Expired', 'Nearing']

# np.select uses the first rule that matches, which keeps 'Expired' ahead of 'Nearing'
df['expiration_status'] = np.select(status_rules, status_labels, default='Safe')
df['expiration_status'] = df['expiration_status'].astype('category')

# Document the new 'expiration_status' column in the descriptions dictionary
column_descriptions['expiration_status'] = 'Indicates whether the product is expired or still valid.'

In [26]:
# Tag every row with the calendar year of its 'lpo' date.
# The column is added to df itself and serves as a grouping key below.
df['year'] = df['lpo'].dt.year

# Per product and year: summed sales volume and mean stock quantity.
# Note that 'total_stock' holds the *mean* stock of the group, despite its name.
yearly_groups = df.groupby(['product', 'year'])
df_annual_turnover = yearly_groups.agg(
    total_sales=('sales_volume', 'sum'),
    total_stock=('stock_quantity', 'mean'),
).reset_index()

# Inventory turnover rate of each (product, year): summed sales divided by mean stock.
# NOTE(review): a group whose mean stock is 0 would produce an infinite rate — confirm this cannot occur.
df_annual_turnover = df_annual_turnover.assign(
    inventory_turnover_rate=lambda t: t['total_sales'] / t['total_stock']
)

# Average the yearly rates into a single rate per product
df_average_turnover = (
    df_annual_turnover
    .groupby('product')[['inventory_turnover_rate']]
    .mean()
    .reset_index()
)

In [27]:
# Add each product's average turnover rate to the main DataFrame.
# - The join key is spelled out so that no other shared column can silently become part of the key.
# - how='left' keeps every row of df; validate='many_to_one' raises if df_average_turnover
#   ever holds more than one row for a product.
# - The helper 'year' column (no longer needed) and any 'inventory_turnover_rate' left over from a
#   previous run of this cell are dropped first, so the cell can be re-run without a KeyError
#   or duplicated columns.
df = (
    df
    .drop(columns=['year', 'inventory_turnover_rate'], errors='ignore')
    .merge(df_average_turnover, on='product', how='left', validate='many_to_one')
)

# Add a description for the 'inventory_turnover_rate' column
column_descriptions.update({'inventory_turnover_rate': 'Shows how often inventory is sold and replaced over a period.'})

In [28]:
# Length of the dataset's period: days between the earliest and the latest received_date
received = df['received_date']
period_days = (received.max() - received.min()).days

# Days of Inventory (DOI): period length divided by the turnover rate, converted to an integer.
# NOTE(review): inventory_turnover_rate is an average of per-calendar-year rates, while
# period_days spans the whole dataset — confirm that mixing the two periods is intended.
days_of_inventory = period_days / df['inventory_turnover_rate']
df['doi_inventory_turnover'] = days_of_inventory.astype('int')

# Document the new column in the descriptions dictionary
column_descriptions['doi_inventory_turnover'] = 'Stock coverage in days.'


## Exploratory Data Analysis (EDA)

### Distribution of Numerical Variables

In [29]:
# Draw a 30-bin histogram for each numerical variable of interest
num_cols = ['stock_quantity', 'sales_volume', 'shelf_life_days', 'distance_km', 'delivery_lag']

for col in num_cols:
    # Build the histogram and leave a small gap between bars for readability
    fig = px.histogram(df, x=col, nbins=30, title=f'Distribution of {col}').update_layout(bargap=0.1)

    # Render the histogram
    fig.show()


### Distribution of Categorical Variables

In [30]:
# Draw one bar chart per categorical variable, showing how often each value occurs
cat_cols = ['category', 'sub_category', 'sales_demand', 'expiration_status']

for col in cat_cols:
    # Occurrences of each value of the current column
    value_frequencies = df[col].value_counts()

    fig = px.bar(
        value_frequencies,
        labels={'value': 'Count', 'index': col},  # axis labels
        title=f'Distribution of {col}',
    )

    # Render the bar chart
    fig.show()


### Relation Between Stock and Sales

In [31]:
# Scatter plot of stock quantity (x-axis) against sales volume (y-axis),
# with the points coloured by sales demand
axis_labels = {
    'stock_quantity': 'Stock',
    'sales_volume': 'Sales',
}
fig = px.scatter(
    df,
    x='stock_quantity',
    y='sales_volume',
    color='sales_demand',
    labels=axis_labels,
    title='Relation between Stock and Sales Volume',
)

# Render the interactive chart
fig.show()


### Expiration Status by Category

In [32]:
# Grouped bar chart (barmode='group'): row counts per product category,
# with a separate bar for each expiration status
fig = px.histogram(
    data_frame=df,
    x='category',
    color='expiration_status',
    barmode='group',
    title='Expiration Status by Category',
)

# Render the interactive chart
fig.show()


### Inventory Turnover by Subcategory

In [33]:
# Box plot of the inventory turnover rate for each subcategory,
# with the x-axis tick labels rotated by 45 degrees for readability
fig = px.box(
    df,
    x='sub_category',
    y='inventory_turnover_rate',
    title='Inventory Turnover Rate by Subcategory',
).update_xaxes(tickangle=45)

# Render the chart
fig.show()


### Suppliers with the Longest Delivery Delays

In [34]:
# Mean delivery lag of every supplier
mean_lag_by_supplier = df.groupby('supplier')['delivery_lag'].mean()

# Keep the 10 suppliers with the highest mean lag, sorted in descending order
supplier_delay = mean_lag_by_supplier.sort_values(ascending=False).head(10)

# Bar chart of the result
fig = px.bar(supplier_delay, title='Top 10 Suppliers by Average Delivery Delay')

# Render the chart
fig.show()


### Sales Demand by Weather Severity

In [35]:
# Grouped bar chart: row counts per weather severity level,
# with a separate bar for each sales demand level
fig = px.histogram(
    data_frame=df,
    x='weather_severity',
    color='sales_demand',
    barmode='group',
    title='Sales Demand by Weather Severity',
)

fig.show()


### Correlation between Numerical Variables

In [40]:
# Correlation matrix of the numeric columns
numeric_df = df.select_dtypes(include=np.number)
corr = numeric_df.corr()

# Heatmap of the matrix with the correlation values printed in the cells,
# enlarged to 1200 x 1200 pixels
fig = px.imshow(
    corr,
    title='Correlation Matrix',
    aspect="auto",
    text_auto=True,
).update_layout(width=1200, height=1200)
fig.show()


In [41]:
# Output locations: the folder for processed data and the JSON file with the column descriptions
utils_data_path = '../docs/column_descriptions.json'
processed_data_path = os.path.join('../data', 'processed')

In [42]:
# Order the rows by received_date (ascending) and renumber the index from 0
df = df.sort_values(by='received_date', ignore_index=True)

In [44]:
# Save the processed DataFrame as a pickle.
# The output folder is created first if it is missing, because to_pickle does not create
# directories; the file path is built with os.path.join instead of string concatenation.
os.makedirs(processed_data_path, exist_ok=True)
df.to_pickle(os.path.join(processed_data_path, 'grocery.pkl'))

# Save the updated column descriptions as JSON.
# The encoding is pinned so the file is written as UTF-8 regardless of the platform default.
with open(utils_data_path, 'w', encoding='utf-8') as f:
    json.dump(column_descriptions, f, indent=4)