---

This notebook contains the Feature Engineering for the first module of the **"Intelligent System for Supply Chain Management"** project. 

---

Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import json
import plotly.express as px
import plotly.io as pio


# Configure display and graphs:
# - show every column when a DataFrame is rendered (no column truncation)
# - use the "plotly_white" template as the default for all Plotly figures
pd.set_option('display.max_columns', None)
pio.templates.default = "plotly_white"

# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation warnings raised by pandas/numpy calls further down.
import warnings
warnings.filterwarnings('ignore')

Load Data

In [2]:
# Define data paths (values unchanged so later cells can keep using them)
data_path = os.path.join('../data', 'processed')
docs_path = os.path.join('../docs/')

# Load the processed grocery dataset. The file is Parquet (not pickle);
# the path is assembled with os.path.join instead of manual '/' concatenation.
read_data = pd.read_parquet(os.path.join(data_path, 'grocery.parquet'))

# Load column descriptions from JSON file into a dictionary for reference or documentation.
# The encoding is stated explicitly so the read does not depend on the platform's
# locale default (JSON files are UTF-8 by convention).
with open(os.path.join(docs_path, 'column_descriptions.json'), encoding='utf-8') as f:
    column_descriptions = json.load(f)

In [3]:
# Inspect the (rows, columns) shape of the loaded frame, before rows sharing the same
# key are aggregated further down
read_data.shape

(97646, 33)

In [4]:
# Remember the original column order; it is restored after the groupby aggregation,
# which emits columns in the order of the aggregation rules instead
order_columns = list(read_data.columns)

In [5]:
# Partition the columns by the aggregation rule they will receive when duplicated
# keys are collapsed:
#   - number_max:  numeric columns aggregated with 'max'
#   - number_mean: every other numeric column, except the three that get a dedicated
#                  rule (sales_volume, stock_quantity, min_stock)
#   - not_numbers: all non-numeric columns
number_max = ['delivery_lag', 'lead_time', 'max_stock']
explicit_rule_cols = set(number_max) | {'sales_volume', 'stock_quantity', 'min_stock'}
# Filter in DataFrame column order rather than via a set difference: set iteration
# order of strings changes between kernel sessions, which made this list (and the
# rule dict built from it) ordered differently on every run.
number_mean = [
    name
    for name in read_data.select_dtypes(include=[np.number]).columns
    if name not in explicit_rule_cols
]
not_numbers = read_data.select_dtypes(exclude=[np.number]).columns.tolist()

In [6]:
# Build the column -> aggregation mapping used when collapsing duplicated keys.
# Three columns get a dedicated rule; the rest inherit the rule of their column group.
agg_rules = {
    'sales_volume': 'sum',
    'stock_quantity': 'last',
    'min_stock': 'min',
}

# Groups are applied in this order, so a column listed in several groups ends up
# with the rule of the last group that contains it.
for rule, columns in (('max', number_max), ('mean', number_mean), ('last', not_numbers)):
    for col in columns:
        agg_rules[col] = rule

In [7]:
# Collapse rows that share the same (received_date, product_id, supplier_id) key into a
# single row, combining each column according to agg_rules.
# as_index=False keeps the grouping keys as regular columns.
read_data = read_data.groupby(['received_date', 'product_id', 'supplier_id'], as_index=False).agg(agg_rules)

In [8]:
# Confirm that no duplicated (received_date, product_id, supplier_id) keys remain:
# keep=False flags every member of a duplicated group, so the sum should be 0
read_data.duplicated(subset=['received_date', 'product_id', 'supplier_id'], keep=False).sum()

0

In [9]:
# Put the columns back in the order they had before the aggregation
read_data = read_data.loc[:, order_columns]

In [10]:
# Sort chronologically by 'received_date', breaking ties by 'product_id',
# and rebuild a clean 0..n-1 index
read_data = read_data.sort_values(['received_date', 'product_id'], ascending=True).reset_index(drop=True)

# Cutoff for the split, defined once so both sides of the split always
# use exactly the same boundary
split_date = '2025-05-10'

# Split the data into two parts:
# - Data on/after the cutoff, for comparison/validation
# - Data before the cutoff, for training and analysis
# .copy() detaches both parts from read_data so later column assignments are safe
compare_data = read_data[read_data['received_date'] >= split_date].copy()
df = read_data[read_data['received_date'] < split_date].copy()

In [11]:
# Check the last few rows of the historical (training) data
df.tail()

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status,inventory_turnover_rate,doi_inventory_turnover
70714,2025-05-09,2025-05-03,False,Brown Rice,1961227|P,Pantry,Grains & Rice,180.0,60.0,lb,4.0,GrainWorld Distributors,1807382|S,150.0,200.0,Refrigerated,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,102,4,452,565,65,6,Safe,27.19756,37.0
70715,2025-05-09,2025-05-03,False,Walnuts,1966948|P,Pantry,Nuts & Seeds,180.0,60.0,lb,5.0,Nut & Seed Co.,1911483|S,125.0,55.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,10,3,64,80,71,6,Safe,26.122325,38.0
70716,2025-05-09,2025-05-01,False,Peanut Oil,1979791|P,Pantry,Oils & Vinegars,365.0,90.0,unit,4.0,Oil & Vinegar Co.,1464761|S,120.0,80.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,35,4,164,205,181,8,Safe,25.860183,39.0
70717,2025-05-09,2025-04-29,False,Shrimp,1992397|P,Fresh Foods,Seafood,2.0,1.0,lb,5.0,Coastal Catch,1999523|S,220.0,25.0,Refrigerated,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,45,5,150,180,172,10,Expired,66.16518,15.0
70718,2025-05-09,2025-05-03,False,Ground Coffee,1994909|P,Beverages,Coffee,180.0,60.0,unit,1.0,BeverageSource Co.,1820500|S,110.0,70.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,43,3,284,355,288,6,Safe,26.853389,37.0


In [12]:
# Check the first few rows of the comparison data (rows dated on/after the split cutoff)
compare_data.head()

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status,inventory_turnover_rate,doi_inventory_turnover
70719,2025-05-11,2025-05-08,False,Vegetable Oil,1021354|P,Pantry,Oils & Vinegars,365.0,90.0,unit,3.0,Oil & Vinegar Co.,1464761|S,120.0,80.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,False,True,High,69,5,164,205,199,3,Safe,26.798828,37.0
70720,2025-05-11,2025-05-05,False,Wild Rice,1048863|P,Pantry,Grains & Rice,730.0,180.0,lb,4.0,GrainWorld Distributors,1807382|S,150.0,200.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,False,True,High,189,4,464,580,505,6,Safe,27.315464,37.0
70721,2025-05-11,2025-05-02,False,Soy Milk,1049385|P,Dairy & Alternatives,Plant-Based Milk,7.0,3.0,unit,3.0,Plant-Based Alternatives,1918650|S,105.0,60.0,Refrigerated,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,False,True,High,29,4,244,305,342,9,Expired,26.679341,38.0
70722,2025-05-11,2025-05-05,False,Chocolate Bar,1068521|P,Pantry,Snacks,365.0,90.0,unit,5.0,Chocolate Heaven,1806034|S,140.0,40.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,False,True,High,363,4,840,1050,842,6,Safe,28.866901,35.0
70723,2025-05-11,2025-05-07,False,Halibut,1086277|P,Fresh Foods,Seafood,2.0,1.0,lb,1.0,OceanHarvest Seafood,1168079|S,180.0,40.0,Refrigerated,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,False,True,High,47,5,150,180,161,4,Expired,21.952559,46.0


Save Split DataFrame

In [13]:
# Save both parts of the split as Parquet files (written with the fastparquet engine)
# for efficient loading in future steps
df.to_parquet(data_path + '/data_for_train.parquet', engine="fastparquet")
compare_data.to_parquet(data_path + '/data_for_compare.parquet', engine="fastparquet")


In [14]:
# Display the shapes of both datasets (training, comparison) to verify the split
df.shape, compare_data.shape

((70719, 33), (8736, 33))

In [15]:
# Delivery time variation = lead_time - delivery_lag.
# Positive values mean the delivery arrived early, negative values mean it was delayed
# (this reads lead_time as the expected duration and delivery_lag as the actual one).
df['delivery_time_variation'] = df['lead_time'].sub(df['delivery_lag'])

# Document the new column alongside the existing column descriptions
column_descriptions['delivery_time_variation'] = 'Number of days between the expected delivery date and the actual delivery date. A positive value indicates early delivery, while a negative value indicates a delay.'


In [16]:
# Move 'received_date' into the index so the calendar attributes can be read from it
df = df.set_index('received_date')


In [17]:
# Derive calendar features from the 'received_date' index.
# The index level is looked up once and reused for every feature.
received_dates = df.index.get_level_values('received_date')

df['Year'] = received_dates.year
df['Month'] = received_dates.month
df['Day'] = received_dates.day
df['DayOfYear'] = received_dates.dayofyear
df['Weekday'] = received_dates.weekday
df['QuarterOfYear'] = received_dates.quarter
# isocalendar() returns a frame indexed by date; .values drops that index so the
# week numbers are assigned positionally
df['WeekOfYear'] = received_dates.isocalendar().week.values

In [18]:
# Turn the 'received_date' index back into a regular column (restores a 0..n-1 index),
# then display the frame with the new calendar features
df = df.reset_index()
df

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status,inventory_turnover_rate,doi_inventory_turnover,delivery_time_variation,Year,Month,Day,DayOfYear,Weekday,QuarterOfYear,WeekOfYear
0,2022-12-09,2022-11-29,True,Lime,1019979|P,Fresh Foods,Fruits,21.0,7.0,lb,5.0,Lemon & Lime Co.,1536622|S,190.0,35.0,Refrigerated,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,526,7,3520,4224,5275,10,Nearing,21.494078,47.0,-3,2022,12,9,343,4,4,49
1,2022-12-09,2022-12-01,False,White Bread,1033235|P,Bakery,Bread,5.0,2.0,unit,3.0,Bakery Fresh Co.,1644445|S,45.0,75.0,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,688,3,2271,3028,2436,8,Expired,35.607450,28.0,-5,2022,12,9,343,4,4,49
2,2022-12-09,2022-12-07,False,Sardines,1093592|P,Fresh Foods,Seafood,2.0,1.0,lb,2.0,OceanHarvest Seafood,1168079|S,180.0,40.0,Refrigerated,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,33,5,155,186,155,2,Nearing,22.882402,44.0,3,2022,12,9,343,4,4,49
3,2022-12-09,2022-11-29,False,Avocado,1113134|P,Fresh Foods,Fruits,7.0,3.0,unit,4.0,Avocado Avenue,1840542|S,280.0,30.0,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,718,7,5975,7170,6677,10,Expired,42.740888,23.0,-3,2022,12,9,343,4,4,49
4,2022-12-09,2022-12-03,False,Cod,1119767|P,Fresh Foods,Seafood,2.0,1.0,lb,1.0,OceanHarvest Seafood,1168079|S,180.0,40.0,Refrigerated,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,45,6,150,180,156,9,Expired,44.152735,23.0,-3,2022,12,9,343,4,4,49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70714,2025-05-09,2025-05-03,False,Brown Rice,1961227|P,Pantry,Grains & Rice,180.0,60.0,lb,4.0,GrainWorld Distributors,1807382|S,150.0,200.0,Refrigerated,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,102,4,452,565,65,6,Safe,27.197560,37.0,-2,2025,5,9,129,4,2,19
70715,2025-05-09,2025-05-03,False,Walnuts,1966948|P,Pantry,Nuts & Seeds,180.0,60.0,lb,5.0,Nut & Seed Co.,1911483|S,125.0,55.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,10,3,64,80,71,6,Safe,26.122325,38.0,-3,2025,5,9,129,4,2,19
70716,2025-05-09,2025-05-01,False,Peanut Oil,1979791|P,Pantry,Oils & Vinegars,365.0,90.0,unit,4.0,Oil & Vinegar Co.,1464761|S,120.0,80.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,35,4,164,205,181,8,Safe,25.860183,39.0,-4,2025,5,9,129,4,2,19
70717,2025-05-09,2025-04-29,False,Shrimp,1992397|P,Fresh Foods,Seafood,2.0,1.0,lb,5.0,Coastal Catch,1999523|S,220.0,25.0,Refrigerated,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,45,5,150,180,172,10,Expired,66.165180,15.0,-5,2025,5,9,129,4,2,19


In [19]:
def calculate_std_demand_lead_time(df, product_col='product', sales_col='sales_volume', lead_col='delivery_lag'):
    """
    Add per-product demand and lead-time statistics, plus the standard deviation
    of demand during lead time, to a DataFrame.

    The new columns are written onto the DataFrame that is passed in (it is
    modified in place) and the same object is returned. Every row of a product
    receives that product's statistic.

    Parameters:
    - df (DataFrame): Input DataFrame containing sales and delivery data.
    - product_col (str): Column used to group rows by product.
    - sales_col (str): Column holding the sales volume.
    - lead_col (str): Column holding the delivery lead time (in days).

    Returns:
    - DataFrame: The input DataFrame with these additional columns:
        - 'avg_daily_sales': Mean of sales_col per product.
        - 'std_daily_sales': Standard deviation of sales_col per product.
        - 'avg_daily_lag': Mean of lead_col per product.
        - 'std_daily_lag': Standard deviation of lead_col per product.
        - 'std_demand_lead_time': sqrt(avg_daily_lag * std_daily_sales^2
          + avg_daily_sales^2 * std_daily_lag^2).
    """
    # (new column, source column, per-product statistic), in output column order
    per_product_stats = (
        ('avg_daily_sales', sales_col, 'mean'),
        ('std_daily_sales', sales_col, 'std'),
        ('avg_daily_lag', lead_col, 'mean'),
        ('std_daily_lag', lead_col, 'std'),
    )
    for new_col, source_col, stat in per_product_stats:
        # transform() broadcasts the group statistic back onto every row of the group
        df[new_col] = df.groupby(product_col)[source_col].transform(stat)

    # Variance contributed by demand fluctuation over the average lead time
    demand_term = df['avg_daily_lag'] * df['std_daily_sales'].pow(2)
    # Variance contributed by lead-time fluctuation at the average demand
    lag_term = df['avg_daily_sales'].pow(2) * df['std_daily_lag'].pow(2)

    df['std_demand_lead_time'] = np.sqrt(demand_term + lag_term)

    return df


In [20]:
# Apply the function to compute demand variability during lead time.
# The helper writes the new columns onto df itself and returns the same frame.
df = calculate_std_demand_lead_time(df=df)



---

### **Service Level Factor (Z) Based on the Standard Normal Distribution**

The Service Level Factor (Z) is obtained from a standard normal distribution table.

- **80% Service Level**: Z = 0.84  
- **85% Service Level**: Z = 1.04  
- **90% Service Level**: Z = 1.28  
- **95% Service Level**: Z = 1.64  
- **98% Service Level**: Z = 2.05  
- **99% Service Level**: Z = 2.33

---


In [21]:
# Service level factor for a 95% service level (see the table above)
Z = 1.64

# Safety stock = Z * standard deviation of demand during lead time
df['safety_stock'] = df['std_demand_lead_time'].mul(Z)

# Document the new column alongside the existing column descriptions
column_descriptions['safety_stock'] = 'Extra inventory buffer maintained to protect against uncertainties in demand and delivery lead time.'



In [22]:
# Preview the first two rows of the numeric columns, including the new statistics
df.select_dtypes(include=np.number).head(2)

Unnamed: 0,shelf_life_days,maximum_days_on_sale,supplier_rating,distance_km,moq,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,inventory_turnover_rate,doi_inventory_turnover,delivery_time_variation,Year,Month,Day,DayOfYear,Weekday,QuarterOfYear,WeekOfYear,avg_daily_sales,std_daily_sales,avg_daily_lag,std_daily_lag,std_demand_lead_time,safety_stock
0,21.0,7.0,5.0,190.0,35.0,526,7,3520,4224,5275,10,21.494078,47.0,-3,2022,12,9,343,4,4,49,863.320809,490.36015,6.632948,2.52086,2516.196241,4126.561835
1,5.0,2.0,3.0,45.0,75.0,688,3,2271,3028,2436,8,35.60745,28.0,-5,2022,12,9,343,4,4,49,965.951009,540.321774,6.423631,2.540269,2810.04928,4608.480819


In [23]:
# Reorder Point (ROP) per row:
#   per-product average sales * the row's delivery_lag + safety stock
demand_over_lag = df['avg_daily_sales'].mul(df['delivery_lag'])
df['rop'] = demand_over_lag.add(df['safety_stock'])

# Document the new column: the ROP marks when replenishment should be triggered
column_descriptions['rop'] = 'ROP is the inventory level at which a new order should be placed to avoid stock outs.'


In [24]:
# Reorder point coverage = ROP / per-product average sales,
# i.e. how many days of average demand the reorder point represents
df['reorder_point_coverage'] = df['rop'].div(df['avg_daily_sales'])

# Document the new column alongside the existing column descriptions
column_descriptions['reorder_point_coverage'] = 'Number of days of demand that the reorder point (ROP) can cover'


In [25]:
# Reorder quantity = per-product average sales * the row's lead_time, truncated to an integer.
# The aggregation is named by string ('mean') rather than passed as np.mean: pandas 2.x
# deprecates the callable form and warns that its behaviour will change, while the
# string form is the stable way to get the same per-group mean.
df['reorder_quantity'] = (df.groupby('product')['sales_volume'].transform('mean') * df['lead_time']).astype(int)

# Add description for 'reorder_quantity' to define the optimal quantity to order when replenishing stock
column_descriptions.update({'reorder_quantity': 'Optimal number of units to order once inventory reaches the reorder point, based on expected demand during lead time.'})


In [26]:
# (ROP - reorder quantity) / inventory turnover rate:
# the gap between the reorder point and the reorder quantity, scaled by turnover
rop_minus_order = df['rop'].sub(df['reorder_quantity'])
df['reorder_point_quantity_turnover'] = rop_minus_order.div(df['inventory_turnover_rate'])

# Document the new column alongside the existing column descriptions
column_descriptions['reorder_point_quantity_turnover'] = 'Rate at which the buffer between the reorder level and reorder quantity is consumed'


In [27]:
# Cast distance_km from float to integer for use in model training.
# NOTE(review): astype(int) truncates any fractional part — the column was averaged
# when duplicated keys were aggregated, so confirm no fractional distances are lost.
df['distance_km'] = df['distance_km'].astype(int)

In [28]:
# Display the frame with all engineered inventory features
df

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status,inventory_turnover_rate,doi_inventory_turnover,delivery_time_variation,Year,Month,Day,DayOfYear,Weekday,QuarterOfYear,WeekOfYear,avg_daily_sales,std_daily_sales,avg_daily_lag,std_daily_lag,std_demand_lead_time,safety_stock,rop,reorder_point_coverage,reorder_quantity,reorder_point_quantity_turnover
0,2022-12-09,2022-11-29,True,Lime,1019979|P,Fresh Foods,Fruits,21.0,7.0,lb,5.0,Lemon & Lime Co.,1536622|S,190,35.0,Refrigerated,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,526,7,3520,4224,5275,10,Nearing,21.494078,47.0,-3,2022,12,9,343,4,4,49,863.320809,490.360150,6.632948,2.520860,2516.196241,4126.561835,12759.769928,14.779871,6043,312.493985
1,2022-12-09,2022-12-01,False,White Bread,1033235|P,Bakery,Bread,5.0,2.0,unit,3.0,Bakery Fresh Co.,1644445|S,45,75.0,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,688,3,2271,3028,2436,8,Expired,35.607450,28.0,-5,2022,12,9,343,4,4,49,965.951009,540.321774,6.423631,2.540269,2810.049280,4608.480819,12336.088888,12.770926,2897,265.087469
2,2022-12-09,2022-12-07,False,Sardines,1093592|P,Fresh Foods,Seafood,2.0,1.0,lb,2.0,OceanHarvest Seafood,1168079|S,180,40.0,Refrigerated,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,33,5,155,186,155,2,Nearing,22.882402,44.0,3,2022,12,9,343,4,4,49,40.364179,25.390222,6.113433,2.657995,124.305179,203.860494,284.588852,7.050530,201,3.652975
3,2022-12-09,2022-11-29,False,Avocado,1113134|P,Fresh Foods,Fruits,7.0,3.0,unit,4.0,Avocado Avenue,1840542|S,280,30.0,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,718,7,5975,7170,6677,10,Expired,42.740888,23.0,-3,2022,12,9,343,4,4,49,1542.605183,988.380985,6.371951,2.661929,4804.840326,7879.938134,23305.989963,15.108202,10798,292.646935
4,2022-12-09,2022-12-03,False,Cod,1119767|P,Fresh Foods,Seafood,2.0,1.0,lb,1.0,OceanHarvest Seafood,1168079|S,180,40.0,Refrigerated,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,45,6,150,180,156,9,Expired,44.152735,23.0,-3,2022,12,9,343,4,4,49,35.506550,19.776953,6.276565,2.550211,103.218676,169.278629,488.837581,13.767532,213,6.247350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70714,2025-05-09,2025-05-03,False,Brown Rice,1961227|P,Pantry,Grains & Rice,180.0,60.0,lb,4.0,GrainWorld Distributors,1807382|S,150,200.0,Refrigerated,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,102,4,452,565,65,6,Safe,27.197560,37.0,-2,2025,5,9,129,4,2,19,141.690544,79.620518,6.441261,2.543894,413.225134,677.689220,1527.832487,10.782882,566,35.364661
70715,2025-05-09,2025-05-03,False,Walnuts,1966948|P,Pantry,Nuts & Seeds,180.0,60.0,lb,5.0,Nut & Seed Co.,1911483|S,125,55.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,10,3,64,80,71,6,Safe,26.122325,38.0,-3,2025,5,9,129,4,2,19,19.881098,11.143814,6.387195,2.513466,57.360600,94.071384,213.357969,10.731700,59,5.909044
70716,2025-05-09,2025-05-01,False,Peanut Oil,1979791|P,Pantry,Oils & Vinegars,365.0,90.0,unit,4.0,Oil & Vinegar Co.,1464761|S,120,80.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,35,4,164,205,181,8,Safe,25.860183,39.0,-4,2025,5,9,129,4,2,19,48.765957,27.841613,6.197568,2.538819,141.889003,232.697965,622.825624,12.771730,195,16.543797
70717,2025-05-09,2025-04-29,False,Shrimp,1992397|P,Fresh Foods,Seafood,2.0,1.0,lb,5.0,Coastal Catch,1999523|S,220,25.0,Refrigerated,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,45,5,150,180,172,10,Expired,66.165180,15.0,-5,2025,5,9,129,4,2,19,37.756972,21.928387,6.409363,2.608131,113.045638,185.394846,562.964567,14.910215,188,5.667098


In [29]:
# Keep only numeric, datetime and boolean columns: every column of any other dtype
# (e.g. text labels and identifiers) is listed and dropped
columns_dropped = list(df.select_dtypes(exclude=[np.number, 'datetime', bool]).columns)

df = df.drop(columns=columns_dropped)


In [30]:
# Calculate the pairwise correlation matrix of the remaining columns.
# Besides the numeric columns this also covers the datetime and boolean columns that
# survived the drop above (e.g. received_date, lpo, is_weekend).
correlation_matrix = df.corr()

In [31]:
# Keep only strong positive correlations (>= 0.7) between *different* variables.
# Self-correlations are removed by position (the diagonal) instead of by value:
# replacing every 1.0 would also hide distinct columns that are perfectly correlated
# (e.g. one column that is a constant multiple of another), and would miss a
# diagonal entry that is not exactly 1.0 due to floating-point rounding.
is_off_diagonal = ~np.eye(len(correlation_matrix), dtype=bool)
corr1 = correlation_matrix.where((correlation_matrix >= 0.7) & is_off_diagonal).dropna(how='all', axis=1)
# Drop variables (rows) without any strong partner and blank the remaining NaNs for display
corr1.dropna(how='all').replace(np.nan, '')

Unnamed: 0,received_date,lpo,shelf_life_days,maximum_days_on_sale,is_weekend,sales_volume,min_stock,max_stock,stock_quantity,delivery_lag,Year,Month,DayOfYear,Weekday,QuarterOfYear,WeekOfYear,avg_daily_sales,std_daily_sales,std_demand_lead_time,safety_stock,rop,reorder_point_coverage,reorder_quantity,reorder_point_quantity_turnover
received_date,,0.999949,,,,,,,,,0.916281,,,,,,,,,,,,,
lpo,0.999949,,,,,,,,,,0.91626,,,,,,,,,,,,,
shelf_life_days,,,,0.900836,,,,,,,,,,,,,,,,,,,,
maximum_days_on_sale,,,0.900836,,,,,,,,,,,,,,,,,,,,,
is_weekend,,,,,,,,,,,,,,0.789994,,,,,,,,,,
sales_volume,,,,,,,0.745612,0.752773,,,,,,,,,0.768107,0.766157,0.767724,0.767724,0.768248,,0.774728,
min_stock,,,,,,0.745612,,0.999028,0.906716,,,,,,,,0.970032,0.967024,0.969973,0.969973,0.926481,,0.974399,0.743888
max_stock,,,,,,0.752773,0.999028,,0.907104,,,,,,,,0.979408,0.976502,0.979263,0.979263,0.935444,,0.973706,0.750908
stock_quantity,,,,,,,0.906716,0.907104,,,,,,,,,0.886256,0.883884,0.886201,0.886201,0.846213,,0.883035,
delivery_lag,,,,,,,,,,,,,,,,,,,,,,0.999025,,


In [32]:
# Keep only the strictly lower triangle of the matrix (k=-1 also hides the diagonal),
# so every pair of variables appears once and self-correlations are not drawn
mask = np.tril(np.ones(correlation_matrix.shape), k=-1)
masked_corr = correlation_matrix.where(mask == 1)

# Create a heatmap visualization of the correlation matrix
fig_corr = px.imshow(masked_corr,
                    title='Correlation Matrix - Numeric Variables',
                    color_continuous_scale='RdBu_r',  # Red-Blue reversed color scale
                    aspect="auto",                   # Automatic aspect ratio
                    text_auto=False,                 # Do not print correlation values on cells
                    zmin=-1, zmax=1)                 # Fix color scale from -1 to +1

# Adjust the figure dimensions
fig_corr.update_layout(width=1200, height=1200)

# Display the interactive heatmap
fig_corr.show()

In [33]:
# Display a concise summary of the DataFrame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70719 entries, 0 to 70718
Data columns (total 36 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   received_date                    70719 non-null  datetime64[ns]
 1   lpo                              70719 non-null  datetime64[ns]
 2   in_season                        70719 non-null  bool          
 3   shelf_life_days                  70719 non-null  float64       
 4   maximum_days_on_sale             70719 non-null  float64       
 5   supplier_rating                  70719 non-null  float64       
 6   distance_km                      70719 non-null  int64         
 7   moq                              70719 non-null  float64       
 8   is_holiday                       70719 non-null  bool          
 9   is_weekend                       70719 non-null  bool          
 10  sales_volume                     70719 non-null  int64    