---

This notebook contains the Feature Engineering for the first module of the **"Intelligent System for Supply Chain Management"** project. 

---

Import Libraries

In [12]:
import pandas as pd
import numpy as np
import os
import json
import plotly.express as px
import plotly.io as pio


# Configure display and graphs
# Show every column when a DataFrame is rendered (no column truncation)
pd.set_option('display.max_columns', None)
# Use the "plotly_white" template as the default for all Plotly figures
pio.templates.default = "plotly_white"

import warnings
# NOTE(review): this suppresses every warning category for the whole session,
# including pandas deprecation warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

Load Data

In [13]:
# Define data paths (relative to the notebook's working directory)
data_path = os.path.join('../data', 'processed')
docs_path = os.path.join('../docs/')

# Load the processed dataset from its pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# produced by this project's own pipeline.
read_data = pd.read_pickle(data_path + '/grocery.pkl')

# Load column descriptions from JSON file into a dictionary for reference or documentation.
# The encoding is given explicitly so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open(docs_path + 'column_descriptions.json', encoding='utf-8') as f:
    column_descriptions = json.load(f)

In [14]:
# View the initial shape (rows, columns) of the loaded DataFrame, before duplicates are grouped
read_data.shape

(97571, 33)

In [15]:
# Remember the original column order so it can be restored after the aggregation step
order_columns = list(read_data.columns)

In [16]:
# Split the columns by the aggregation they will receive when duplicates are grouped.

# Numeric columns aggregated with 'max'
number_max = ['delivery_lag', 'lead_time', 'max_stock']

# Remaining numeric columns are aggregated with 'mean', except the three columns
# that get their own dedicated rule (sum / last / min).
# Filtering the column list (instead of subtracting sets) keeps the DataFrame's
# column order, so the result is the same on every kernel run.
numeric_cols = read_data.select_dtypes(include=[np.number]).columns.tolist()
non_mean_cols = set(number_max) | {'sales_volume', 'stock_quantity', 'min_stock'}
number_mean = [name for name in numeric_cols if name not in non_mean_cols]

# Every non-numeric column (strings, booleans, datetimes, ...)
not_numbers = read_data.select_dtypes(exclude=[np.number]).columns.tolist()

In [17]:
# Aggregation rule applied to each column when duplicated rows are collapsed:
# three dedicated rules first, then 'max', 'mean' and 'last' for the column groups
agg_rules = {
    'sales_volume': 'sum',
    'stock_quantity': 'last',
    'min_stock': 'min',
    **dict.fromkeys(number_max, 'max'),
    **dict.fromkeys(number_mean, 'mean'),
    **dict.fromkeys(not_numbers, 'last'),
}

In [18]:
# Collapse rows sharing the same (received_date, product_id, supplier_id) key
# into a single row, combining their values according to agg_rules
duplicate_keys = ['received_date', 'product_id', 'supplier_id']
read_data = (
    read_data
    .groupby(duplicate_keys, as_index=False)
    .agg(agg_rules)
)

In [19]:
# Confirm no duplicates remain: count the rows whose
# (received_date, product_id, supplier_id) key occurs more than once
read_data.duplicated(subset=['received_date', 'product_id', 'supplier_id'], keep=False).sum()

0

In [20]:
# Restore the column order the DataFrame had before the aggregation
read_data = read_data.loc[:, order_columns]

In [22]:
# Order rows chronologically (ties broken by 'product_id') and rebuild a clean 0..n-1 index
read_data = (
    read_data
    .sort_values(by=['received_date', 'product_id'])
    .reset_index(drop=True)
)

# Hold-out split around the cut-off date:
# - rows dated on/after May 10, 2025 are set aside for comparison/validation
# - rows dated before May 10, 2025 form the history used for training and analysis
cutoff_date = '2025-05-10'
received = read_data['received_date']
compare_data = read_data.loc[received >= cutoff_date].copy()
df = read_data.loc[received < cutoff_date].copy()

In [23]:
# Check the last few rows of the historical data (rows before the cut-off date)
df.tail()

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status,inventory_turnover_rate,doi_inventory_turnover
70874,2025-05-09,2025-04-29,False,Canned Beans,1928869|P,Pantry,Canned Goods,1095.0,90.0,unit,3,Wholesale Warehouse,1363063|S,25.0,300.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,81,3,291,388,297,10,Safe,61.642582,16.0
70875,2025-05-09,2025-05-03,False,Ground Black Pepper,1937715|P,Pantry,Spices,365.0,90.0,unit,4,SpiceWorld Imports,1391975|S,160.0,30.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,49,6,145,174,167,6,Safe,24.635106,41.0
70876,2025-05-09,2025-05-04,False,All-Purpose Flour,1940872|P,Pantry,Baking Supplies,365.0,60.0,lb,3,BakeWell Supplies,1552913|S,90.0,50.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,9,5,44,55,51,5,Safe,28.095371,36.0
70877,2025-05-09,2025-05-02,False,Popcorn Kernels,1992802|P,Pantry,Snacks,365.0,90.0,lb,4,SnackTime Distributors,1414750|S,80.0,110.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,135,4,848,1060,1342,7,Safe,25.934508,39.0
70878,2025-05-09,2025-05-07,False,Plum,1998069|P,Fresh Foods,Fruits,5.0,2.0,lb,4,Stone Fruit Specialists,1405032|S,165.0,38.0,Refrigerated,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,754,6,6156,7182,6187,2,Safe,19.17862,53.0


In [24]:
# Check the first few rows of the comparison data (rows on/after the cut-off date)
compare_data.head()

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status,inventory_turnover_rate,doi_inventory_turnover
70879,2025-05-11,2025-05-01,False,Arborio Rice,1003530|P,Pantry,Grains & Rice,730.0,180.0,lb,4,GrainWorld Distributors,1792439|S,150.0,200.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,False,True,High,186,5,468,585,662,10,Safe,28.680627,35.0
70880,2025-05-11,2025-05-06,False,Canned Tomatoes,1007004|P,Pantry,Canned Goods,1095.0,90.0,unit,2,Wholesale Warehouse,1363063|S,25.0,300.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,False,True,High,243,4,285,380,308,5,Safe,64.442801,15.0
70881,2025-05-11,2025-05-05,False,Canned Tuna,1017723|P,Pantry,Canned Fish,1095.0,90.0,unit,4,PantryEssentials Ltd.,1141220|S,95.0,130.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,False,True,High,25,4,56,130,89,6,Safe,17.540236,58.0
70882,2025-05-11,2025-05-04,False,Basmati Rice,1018159|P,Pantry,Grains & Rice,730.0,180.0,lb,2,GrainWorld Distributors,1792439|S,150.0,200.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,False,True,High,82,5,476,595,24,7,Safe,49.19511,20.0
70883,2025-05-11,2025-05-08,False,Egg (Duck),1023873|P,Dairy & Alternatives,Eggs,28.0,14.0,unit,3,FreshEggs Co.,1255339|S,65.0,30.0,Refrigerated,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,False,True,High,458,5,1128,1410,128,3,Safe,25.702697,39.0


Save Split DataFrame

In [25]:
# Persist both splits as pickle files so later steps can load them quickly
split_outputs = (
    (df, '/data_for_train.pkl'),
    (compare_data, '/data_for_compare.pkl'),
)
for frame, file_name in split_outputs:
    frame.to_pickle(data_path + file_name)


In [26]:
# Show the (rows, columns) shape of the training and comparison sets to verify the split
df.shape, compare_data.shape

((70879, 33), (8678, 33))

In [27]:
# Gap between the expected lead time and the observed delivery lag:
# positive -> delivered early, negative -> delivered late
df['delivery_time_variation'] = df['lead_time'].sub(df['delivery_lag'])

# Register the new column in the column-description dictionary
column_descriptions['delivery_time_variation'] = (
    'Number of days between the expected delivery date and the actual delivery date. A positive value indicates early delivery, while a negative value indicates a delay.'
)


In [28]:
# Use 'received_date' as the index so the date components can be read from it
df = df.set_index('received_date')


In [29]:
# Derive calendar features from the 'received_date' index
received_idx = df.index.get_level_values('received_date')

# New column name -> datetime attribute it is read from
calendar_attrs = {
    'Year': 'year',
    'Month': 'month',
    'Day': 'day',
    'DayOfYear': 'dayofyear',
    'Weekday': 'weekday',
    'QuarterOfYear': 'quarter',
}
for feature_name, attr_name in calendar_attrs.items():
    df[feature_name] = getattr(received_idx, attr_name)

# The ISO week number comes from isocalendar(); .values assigns it by position
df['WeekOfYear'] = received_idx.isocalendar().week.values

In [30]:
# Move 'received_date' back from the index into a regular column
df = df.reset_index(drop=False)
df

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status,inventory_turnover_rate,doi_inventory_turnover,delivery_time_variation,Year,Month,Day,DayOfYear,Weekday,QuarterOfYear,WeekOfYear
0,2022-12-09,2022-12-01,False,Canned Tomatoes,1007004|P,Pantry,Canned Goods,1095.0,90.0,unit,2,PantryEssentials Ltd.,1141220|S,95.0,130.0,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,47,3,376,470,52,8,Safe,64.442801,15.0,-5,2022,12,9,343,4,4,49
1,2022-12-09,2022-11-29,False,Canned Tomatoes,1007004|P,Pantry,Canned Goods,1095.0,90.0,unit,2,Wholesale Warehouse,1363063|S,25.0,300.0,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,54,4,285,380,354,10,Safe,64.442801,15.0,-6,2022,12,9,343,4,4,49
2,2022-12-09,2022-12-03,False,Black Tea,1009699|P,Beverages,Tea,365.0,90.0,unit,5,TeaTime Imports,1479828|S,160.0,55.0,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,19,7,135,162,160,6,Safe,23.466802,43.0,1,2022,12,9,343,4,4,49
3,2022-12-09,2022-12-03,False,Canned Tuna,1017723|P,Pantry,Canned Fish,1095.0,90.0,unit,4,PantryEssentials Ltd.,1141220|S,95.0,130.0,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,17,5,56,130,77,10,Safe,17.540236,58.0,-5,2022,12,9,343,4,4,49
4,2022-12-09,2022-12-06,False,Basmati Rice,1018159|P,Pantry,Grains & Rice,730.0,180.0,lb,3,International Foods Inc.,1392771|S,250.0,70.0,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,63,6,555,666,804,3,Safe,49.195110,20.0,3,2022,12,9,343,4,4,49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70874,2025-05-09,2025-04-29,False,Canned Beans,1928869|P,Pantry,Canned Goods,1095.0,90.0,unit,3,Wholesale Warehouse,1363063|S,25.0,300.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,81,3,291,388,297,10,Safe,61.642582,16.0,-7,2025,5,9,129,4,2,19
70875,2025-05-09,2025-05-03,False,Ground Black Pepper,1937715|P,Pantry,Spices,365.0,90.0,unit,4,SpiceWorld Imports,1391975|S,160.0,30.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,49,6,145,174,167,6,Safe,24.635106,41.0,0,2025,5,9,129,4,2,19
70876,2025-05-09,2025-05-04,False,All-Purpose Flour,1940872|P,Pantry,Baking Supplies,365.0,60.0,lb,3,BakeWell Supplies,1552913|S,90.0,50.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,9,5,44,55,51,5,Safe,28.095371,36.0,0,2025,5,9,129,4,2,19
70877,2025-05-09,2025-05-02,False,Popcorn Kernels,1992802|P,Pantry,Snacks,365.0,90.0,lb,4,SnackTime Distributors,1414750|S,80.0,110.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,135,4,848,1060,1342,7,Safe,25.934508,39.0,-3,2025,5,9,129,4,2,19


In [31]:
def calculate_std_demand_lead_time(df, product_col='product', sales_col='sales_volume', lead_col='delivery_lag'):
    """
    Calculate the standard deviation of demand during lead time for each product.

    The input DataFrame is left untouched: all new columns are added to a copy,
    which is returned.

    Parameters:
    - df (DataFrame): Input DataFrame containing sales and delivery data.
    - product_col (str): Column used to group rows into products.
    - sales_col (str): Column holding the sales volume of each row.
    - lead_col (str): Column holding the delivery lead time of each row.

    Returns:
    - DataFrame: Copy of the input with additional columns, each holding the
      per-product value repeated on every row of that product:
        - 'avg_daily_sales': Mean of `sales_col` per product.
        - 'std_daily_sales': Sample standard deviation of `sales_col` per product.
        - 'avg_daily_lag': Mean of `lead_col` per product.
        - 'std_daily_lag': Sample standard deviation of `lead_col` per product.
        - 'std_demand_lead_time': Standard deviation of demand during lead time per product.

    Notes:
    - A product with a single row has an undefined sample standard deviation,
      so its 'std_*' columns (and 'std_demand_lead_time') are NaN.
    """
    # Work on a copy so the caller's DataFrame is never modified as a side effect
    result = df.copy()

    # Group the untouched input once and reuse the grouping for every statistic
    by_product = df.groupby(product_col)

    # Per-product mean / standard deviation of sales, broadcast back to every row
    result['avg_daily_sales'] = by_product[sales_col].transform('mean')
    result['std_daily_sales'] = by_product[sales_col].transform('std')

    # Per-product mean / standard deviation of the delivery lead time
    result['avg_daily_lag'] = by_product[lead_col].transform('mean')
    result['std_daily_lag'] = by_product[lead_col].transform('std')

    # Standard deviation of demand during lead time:
    # sqrt(LeadTimeAvg * SalesStd^2 + SalesAvg^2 * LeadTimeStd^2)
    sales_variance_term = result['avg_daily_lag'] * (result['std_daily_sales'] ** 2)
    lead_variance_term = (result['avg_daily_sales'] ** 2) * (result['std_daily_lag'] ** 2)
    result['std_demand_lead_time'] = np.sqrt(sales_variance_term + lead_variance_term)

    return result


In [32]:
# Apply the function to compute demand variability during lead time and update the DataFrame with new metrics
# (adds avg_daily_sales, std_daily_sales, avg_daily_lag, std_daily_lag and std_demand_lead_time)
df = calculate_std_demand_lead_time(df=df)



---

### **Service Level Factor (Z) Based on the Standard Normal Distribution**

The Service Level Factor (Z) is obtained from a standard normal distribution table.

- **80% Service Level**: Z = 0.84  
- **85% Service Level**: Z = 1.04  
- **90% Service Level**: Z = 1.28  
- **95% Service Level**: Z = 1.64  
- **98% Service Level**: Z = 2.05  
- **99% Service Level**: Z = 2.33

---


In [33]:
# Service-level factor (Z) for a 95% service level
Z = 1.64

# Safety stock = Z x standard deviation of demand during lead time
df['safety_stock'] = df['std_demand_lead_time'].mul(Z)

# Register the new column in the column-description dictionary
column_descriptions['safety_stock'] = (
    'Extra inventory buffer maintained to protect against uncertainties in demand and delivery lead time.'
)



In [34]:
# Preview the first two rows of the numeric columns
df.select_dtypes(include=np.number).head(2)

Unnamed: 0,shelf_life_days,maximum_days_on_sale,distance_km,moq,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,inventory_turnover_rate,doi_inventory_turnover,delivery_time_variation,Year,Month,Day,DayOfYear,Weekday,QuarterOfYear,WeekOfYear,avg_daily_sales,std_daily_sales,avg_daily_lag,std_daily_lag,std_demand_lead_time,safety_stock
0,1095.0,90.0,95.0,130.0,47,3,376,470,52,8,64.442801,15.0,-5,2022,12,9,343,4,4,49,116.492435,70.983932,6.420908,2.588702,351.132662,575.857565
1,1095.0,90.0,25.0,300.0,54,4,285,380,354,10,64.442801,15.0,-6,2022,12,9,343,4,4,49,116.492435,70.983932,6.420908,2.588702,351.132662,575.857565


In [35]:
# Reorder Point (ROP) = per-product average sales x the row's delivery lag + safety stock
# NOTE(review): this uses each row's own 'delivery_lag' rather than the per-product
# 'avg_daily_lag', so ROP varies row by row — confirm this is intended.
demand_over_lag = df['avg_daily_sales'] * df['delivery_lag']
df['rop'] = demand_over_lag + df['safety_stock']

# Register the new column in the column-description dictionary
column_descriptions['rop'] = (
    'ROP is the inventory level at which a new order should be placed to avoid stock outs.'
)


In [36]:
# Reorder point expressed in days of demand: ROP divided by the per-product average sales
df['reorder_point_coverage'] = df['rop'].div(df['avg_daily_sales'])

# Register the new column in the column-description dictionary
column_descriptions['reorder_point_coverage'] = (
    'Number of days of demand that the reorder point (ROP) can cover'
)


In [37]:
# Reorder quantity = per-product average sales volume x the row's lead time,
# truncated to an integer.
# The aggregation is named with the string 'mean': passing the NumPy callable
# (np.mean) to a groupby transform is deprecated in pandas >= 2.1 and will
# change behaviour in a future release.
avg_sales_per_product = df.groupby('product')['sales_volume'].transform('mean')
df['reorder_quantity'] = (avg_sales_per_product * df['lead_time']).astype(int)

# Add description for 'reorder_quantity' to define the optimal quantity to order when replenishing stock
column_descriptions.update({'reorder_quantity': 'Optimal number of units to order once inventory reaches the reorder point, based on expected demand during lead time.'})


In [38]:
# Gap between the reorder point and the reorder quantity, scaled by the inventory turnover rate
rop_minus_quantity = df['rop'].sub(df['reorder_quantity'])
df['reorder_point_quantity_turnover'] = rop_minus_quantity.div(df['inventory_turnover_rate'])

# Register the new column in the column-description dictionary
column_descriptions['reorder_point_quantity_turnover'] = (
    'Rate at which the buffer between the reorder level and reorder quantity is consumed'
)


In [39]:
# Cast 'distance_km' to an integer dtype so it can be used when training models
df = df.astype({'distance_km': int})

In [40]:
df

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status,inventory_turnover_rate,doi_inventory_turnover,delivery_time_variation,Year,Month,Day,DayOfYear,Weekday,QuarterOfYear,WeekOfYear,avg_daily_sales,std_daily_sales,avg_daily_lag,std_daily_lag,std_demand_lead_time,safety_stock,rop,reorder_point_coverage,reorder_quantity,reorder_point_quantity_turnover
0,2022-12-09,2022-12-01,False,Canned Tomatoes,1007004|P,Pantry,Canned Goods,1095.0,90.0,unit,2,PantryEssentials Ltd.,1141220|S,95,130.0,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,47,3,376,470,52,8,Safe,64.442801,15.0,-5,2022,12,9,343,4,4,49,116.492435,70.983932,6.420908,2.588702,351.132662,575.857565,1507.797043,12.943304,349,17.981792
1,2022-12-09,2022-11-29,False,Canned Tomatoes,1007004|P,Pantry,Canned Goods,1095.0,90.0,unit,2,Wholesale Warehouse,1363063|S,25,300.0,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,54,4,285,380,354,10,Safe,64.442801,15.0,-6,2022,12,9,343,4,4,49,116.492435,70.983932,6.420908,2.588702,351.132662,575.857565,1740.781912,14.943304,465,19.797121
2,2022-12-09,2022-12-03,False,Black Tea,1009699|P,Beverages,Tea,365.0,90.0,unit,5,TeaTime Imports,1479828|S,160,55.0,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,19,7,135,162,160,6,Safe,23.466802,43.0,1,2022,12,9,343,4,4,49,33.533898,20.354792,6.440678,2.604884,101.483227,166.432492,367.635882,10.963112,234,5.694678
3,2022-12-09,2022-12-03,False,Canned Tuna,1017723|P,Pantry,Canned Fish,1095.0,90.0,unit,4,PantryEssentials Ltd.,1141220|S,95,130.0,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,17,5,56,130,77,10,Safe,17.540236,58.0,-5,2022,12,9,343,4,4,49,18.756923,11.416065,6.532308,2.598023,56.798202,93.149051,280.718282,14.966116,93,10.702153
4,2022-12-09,2022-12-06,False,Basmati Rice,1018159|P,Pantry,Grains & Rice,730.0,180.0,lb,3,International Foods Inc.,1392771|S,250,70.0,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,63,6,555,666,804,3,Safe,49.195110,20.0,3,2022,12,9,343,4,4,49,143.723032,90.572561,6.482507,2.470051,423.327112,694.256464,1125.425560,7.830516,862,5.354710
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70874,2025-05-09,2025-04-29,False,Canned Beans,1928869|P,Pantry,Canned Goods,1095.0,90.0,unit,3,Wholesale Warehouse,1363063|S,25,300.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,81,3,291,388,297,10,Safe,61.642582,16.0,-7,2025,5,9,129,4,2,19,118.623563,65.734449,6.540230,2.573488,348.502608,571.544277,1757.779909,14.818134,355,22.756670
70875,2025-05-09,2025-05-03,False,Ground Black Pepper,1937715|P,Pantry,Spices,365.0,90.0,unit,4,SpiceWorld Imports,1391975|S,160,30.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,49,6,145,174,167,6,Safe,24.635106,41.0,0,2025,5,9,129,4,2,19,38.163934,21.569014,6.412568,2.564618,112.084636,183.818804,412.802410,10.816558,228,7.501588
70876,2025-05-09,2025-05-04,False,All-Purpose Flour,1940872|P,Pantry,Baking Supplies,365.0,60.0,lb,3,BakeWell Supplies,1552913|S,90,50.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,9,5,44,55,51,5,Safe,28.095371,36.0,0,2025,5,9,129,4,2,19,14.353474,8.950985,6.450151,2.612283,43.848458,71.911472,143.678843,10.010039,71,2.586862
70877,2025-05-09,2025-05-02,False,Popcorn Kernels,1992802|P,Pantry,Snacks,365.0,90.0,lb,4,SnackTime Distributors,1414750|S,80,110.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,135,4,848,1060,1342,7,Safe,25.934508,39.0,-3,2025,5,9,129,4,2,19,258.425595,147.665295,6.476190,2.547348,758.005605,1243.129193,3052.108359,11.810395,1033,77.854121


In [41]:
# Keep only numeric, datetime and boolean columns: every column of another dtype is dropped
columns_dropped = list(
    df.select_dtypes(exclude=[np.number, 'datetime', bool]).columns
)

df = df.drop(columns=columns_dropped)


In [42]:
# Pairwise Pearson correlation between the remaining columns
# (numeric, datetime and boolean columns are all still in the frame)
correlation_matrix = df.corr(method='pearson')

In [43]:
# Keep only strong positive correlations (>= 0.7) between *different* columns.
# The self-correlation diagonal is masked by position rather than by value, so a
# genuinely perfect correlation between two distinct columns stays visible
# instead of being erased along with the diagonal.
off_diagonal = ~np.eye(len(correlation_matrix), dtype=bool)
strong_corr = correlation_matrix.where((correlation_matrix >= 0.7) & off_diagonal)

# Drop columns, then rows, that have no strong correlation at all;
# NaN is replaced by an empty string purely for a more readable display
corr1 = strong_corr.dropna(how='all', axis=1)
corr1.dropna(how='all').replace(np.nan, '')

Unnamed: 0,received_date,lpo,shelf_life_days,maximum_days_on_sale,is_weekend,sales_volume,min_stock,max_stock,stock_quantity,delivery_lag,Year,Month,DayOfYear,Weekday,QuarterOfYear,WeekOfYear,avg_daily_sales,std_daily_sales,std_demand_lead_time,safety_stock,rop,reorder_point_coverage,reorder_quantity,reorder_point_quantity_turnover
received_date,,0.999949,,,,,,,,,0.916374,,,,,,,,,,,,,
lpo,0.999949,,,,,,,,,,0.916359,,,,,,,,,,,,,
shelf_life_days,,,,0.901432,,,,,,,,,,,,,,,,,,,,
maximum_days_on_sale,,,0.901432,,,,,,,,,,,,,,,,,,,,,
is_weekend,,,,,,,,,,,,,,0.789435,,,,,,,,,,
sales_volume,,,,,,,0.738291,0.746069,,,,,,,,,0.762398,0.759243,0.762025,0.762025,0.76137,,0.771079,
min_stock,,,,,,0.738291,,0.998923,0.906213,,,,,,,,0.968071,0.966033,0.968876,0.968876,0.925611,,0.97315,0.736381
max_stock,,,,,,0.746069,0.998923,,0.906896,,,,,,,,0.978293,0.975706,0.978834,0.978834,0.935271,,0.973054,0.744145
stock_quantity,,,,,,,0.906213,0.906896,,,,,,,,,0.885981,0.883581,0.886454,0.886454,0.845747,,0.883529,
delivery_lag,,,,,,,,,,,,,,,,,,,,,,0.999125,,


In [44]:
# Keep only the cells strictly below the diagonal so each pair of columns appears once
# (the upper half and the diagonal itself are hidden)
mask = np.tril(np.ones(correlation_matrix.shape), k=-1)
masked_corr = correlation_matrix.where(mask.astype(bool))

# Heatmap settings for the masked correlation matrix
heatmap_options = dict(
    title='Correlation Matrix - Numeric Variables',
    color_continuous_scale='RdBu_r',  # Red-Blue reversed color scale
    aspect="auto",                    # Automatic aspect ratio
    text_auto=False,                  # Do not print the correlation values on the cells
    zmin=-1,                          # Fix the color scale from -1 ...
    zmax=1,                           # ... to +1
)
fig_corr = px.imshow(masked_corr, **heatmap_options)

# Adjust the figure dimensions
fig_corr.update_layout(width=1200, height=1200)

# Display the interactive heatmap
fig_corr.show()

In [45]:
# Display a concise summary of the DataFrame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70879 entries, 0 to 70878
Data columns (total 35 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   received_date                    70879 non-null  datetime64[ns]
 1   lpo                              70879 non-null  datetime64[ns]
 2   in_season                        70879 non-null  bool          
 3   shelf_life_days                  70879 non-null  float64       
 4   maximum_days_on_sale             70879 non-null  float64       
 5   distance_km                      70879 non-null  int64         
 6   moq                              70879 non-null  float64       
 7   is_holiday                       70879 non-null  bool          
 8   is_weekend                       70879 non-null  bool          
 9   sales_volume                     70879 non-null  int64         
 10  lead_time                        70879 non-null  int64    