---

This notebook contains the Feature Engineering for the first module of the **"Intelligent System for Supply Chain Management"** project. 

---

Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import json
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

# import functions personalized
from smart_supply_chain_ai.utils.time_series_functions import TimeSeriesFeatureGenerator

# Configure display and graphs
pd.set_option('display.max_columns', None)
pio.templates.default = "plotly_white"

import warnings
warnings.filterwarnings('ignore')

Load Data

In [2]:
# Define data paths
data_path = os.path.join('../data', 'processed')
docs_path = os.path.join('../docs/')

# Load Pickle file
read_data = pd.read_pickle(data_path + '/grocery.pkl', )

# Load column descriptions from JSON file into a dictionary for reference or documentation
with open(docs_path + 'column_descriptions.json') as f:
    column_descriptions = json.load(f)

In [3]:
# Split the data into two parts:
# - compare_data: from May 30, 2025 onward, used for comparing with predictions.
# - df_target: up to May 29, 2025, used for training the model.
compare_data = read_data.loc['2025-05-30':]
df = read_data.loc[:'2025-05-29']

In [4]:
# Returns a tuple representing the dimensions of the DataFrame: (number of rows, number of columns)
compare_data.shape

(974, 33)

In [5]:
# Returns a tuple representing the dimensions of the DataFrame: (number of rows, number of columns)
df.shape

(2026, 33)

In [6]:
# View the firsts lines of data
df.tail()

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status,inventory_turnover_rate,doi_inventory_turnover
2021,2024-10-07,2024-10-02,False,Coconut Oil,1700395|P,Pantry,Oils & Vinegars,730,180,unit,1,Oil & Vinegar Co.,1878487|S,120,80,Room Temperature,Mild to Temperate,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,19,4,156,195,166,5,Safe,0.909132,1119
2022,2024-10-07,2024-09-28,False,Tilapia,1156946|P,Fresh Foods,Seafood,2,1,lb,3,OceanHarvest Seafood,1265248|S,180,40,Refrigerated,Mild to Temperate,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,26,7,198,231,248,9,Expired,0.555356,1833
2023,2024-10-08,2024-10-06,False,Salmon,1110065|P,Fresh Foods,Seafood,2,1,lb,2,OceanHarvest Seafood,1265248|S,180,40,Refrigerated,Mild to Temperate,Moderate Rain,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,32,7,162,189,205,2,Nearing,0.995607,1022
2024,2024-10-08,2024-10-05,False,Canned Green Beans,1285500|P,Pantry,Canned Goods,1095,90,unit,1,PantryEssentials Ltd.,1859586|S,95,130,Room Temperature,Mild to Temperate,Moderate Rain,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,92,5,348,435,370,3,Safe,2.070304,491
2025,2024-10-09,2024-10-05,False,Crackers,1768765|P,Pantry,Snacks,90,30,unit,3,SnackTime Distributors,1830370|S,80,110,Room Temperature,Mild to Temperate,Light Rain,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,141,6,776,970,67,4,Safe,1.162734,875


In [7]:
# Calculate the difference between expected and actual delivery time; positive means early, negative means delayed
df['delivery_time_variation'] = df['lead_time'] - df['delivery_lag']

# Add a description for the 'delivery_time_variation' column to clarify its meaning and interpretation
column_descriptions.update({
    'delivery_time_variation': 'Number of days between the expected delivery date and the actual delivery date. A positive value indicates early delivery, while a negative value indicates a delay.'
})


In [8]:
# Set 'received_date' column as the DataFrame index to enable time-based operations
df.set_index('received_date', inplace=True)


In [9]:
# Take information of Date
df['Year'] = df.index.get_level_values('received_date').year
df['Month'] = df.index.get_level_values('received_date').month
df['Day'] = df.index.get_level_values('received_date').day
df['DayOfYear'] = df.index.get_level_values('received_date').dayofyear
df['Weekday'] = df.index.get_level_values('received_date').weekday
df['QuarterOfYear'] = df.index.get_level_values('received_date').quarter
df['WeekOfYear'] = df.index.get_level_values('received_date').isocalendar().week.values

In [10]:
# Remove multi index
df = df.reset_index()

In [11]:
def calculate_std_demand_lead_time(df, product_col='product', sales_col='sales_volume', lead_col='delivery_lag'):
    """
    Calculate the standard deviation of demand during lead time for each product.

    Parameters:
    - df (DataFrame): Input DataFrame containing sales and delivery data.
    - product_col (str): Column name representing the product identifier.
    - sales_col (str): Column name representing daily sales volume.
    - lead_col (str): Column name representing delivery lead time (in days).

    Returns:
    - DataFrame: Original DataFrame with additional columns:
        - 'avg_daily_sales': Average daily sales per product.
        - 'std_daily_sales': Standard deviation of daily sales per product.
        - 'avg_daily_lag': Average delivery lead time per product.
        - 'std_daily_lag': Standard deviation of delivery lead time per product.
        - 'std_demand_lead_time': Standard deviation of demand during lead time per product.
    """

    # Calculate the average daily sales for each product
    df['avg_daily_sales'] = df.groupby(product_col)[sales_col].transform('mean')

    # Calculate the standard deviation of daily sales for each product
    df['std_daily_sales'] = df.groupby(product_col)[sales_col].transform('std')

    # Calculate the average delivery lead time for each product
    df['avg_daily_lag'] = df.groupby(product_col)[lead_col].transform('mean')

    # Calculate the standard deviation of delivery lead time for each product
    df['std_daily_lag'] = df.groupby(product_col)[lead_col].transform('std')

    # Compute the standard deviation of demand during lead time using the formula:
    # sqrt(LeadTimeAvg * SalesStd^2 + SalesAvg^2 * LeadTimeStd^2)
    df['std_demand_lead_time'] = np.sqrt(
        (df['avg_daily_lag'] * (df['std_daily_sales'] ** 2)) +
        ((df['avg_daily_sales'] ** 2) * (df['std_daily_lag'] ** 2))
    )

    return df


In [12]:
# Apply the function to compute demand variability during lead time and update the DataFrame with new metrics
df = calculate_std_demand_lead_time(df=df)



---

### **Service Level Factor (Z) Based on the Standard Normal Distribution**

The Service Level Factor (Z) is obtained from a standard normal distribution table.

- **80% Service Level**: Z = 0.84  
- **85% Service Level**: Z = 1.04  
- **90% Service Level**: Z = 1.28  
- **95% Service Level**: Z = 1.64  
- **98% Service Level**: Z = 2.05  
- **99% Service Level**: Z = 2.33

---


In [13]:
# Set the Z factor for a 95% service level
Z = 1.64

# Calculate the Safety Stock
df['safety_stock'] = (Z * df['std_demand_lead_time'])

# Add description for the 'safety_stock' column to clarify its role in inventory management
column_descriptions.update({'safety_stock': 'Extra inventory buffer maintained to protect against uncertainties in demand and delivery lead time.'})



In [14]:
df.select_dtypes(np.number)[:2]

Unnamed: 0,shelf_life_days,maximum_days_on_sale,moq,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,inventory_turnover_rate,doi_inventory_turnover,delivery_time_variation,Year,Month,Day,DayOfYear,Weekday,QuarterOfYear,WeekOfYear,avg_daily_sales,std_daily_sales,avg_daily_lag,std_daily_lag,std_demand_lead_time,safety_stock
0,365,90,60,50,4,240,300,317,2,0.839816,1212,2,2022,12,9,343,4,4,49,76.833333,31.801992,4.0,3.162278,251.155485,411.894996
1,14,7,120,590,6,3678,4291,4608,2,0.378488,2689,4,2022,12,9,343,4,4,49,618.6,390.07025,4.6,2.701851,1869.056871,3065.253269


In [15]:
# Calculates the Reorder Point (ROP) for each product.
df['rop'] = (df['avg_daily_sales'] * df['delivery_lag']) + df['safety_stock']

# Define description for 'rop' (Reorder Point) to indicate when replenishment should be triggered
column_descriptions.update({'rop': 'ROP is the inventory level at which a new order should be placed to avoid stock outs.'})


In [16]:
# Create Variable Reorder Point / Sales Volume
df['reorder_point_coverage'] = df['rop'] / df['avg_daily_sales']

# Add description for 'reorder_point_coverage' to indicate how many days of demand the ROP supports
column_descriptions.update({'reorder_point_coverage': 'Number of days of demand that the reorder point (ROP) can cover'})


In [17]:
# Calculates the reorder quantity as the average sales per product multiplied by the lead time
df['reorder_quantity'] = (df.groupby('product')['sales_volume'].transform(np.mean) * df['lead_time']).astype(int)

# Add description for 'reorder_quantity' to define the optimal quantity to order when replenishing stock
column_descriptions.update({'reorder_quantity': 'Optimal number of units to order once inventory reaches the reorder point, based on expected demand during lead time.'})


In [18]:
# Create Variable Reorder Level, Reorder Quantity and Inventory Turnover Rate
df['reorder_point_quantity_turnover'] = (df['rop'] - df['reorder_quantity']) / df['inventory_turnover_rate']

# Add description for 'reorder_point_quantity_turnover' to explain how quickly the buffer stock is depleted
column_descriptions.update({'reorder_point_quantity_turnover': 'Rate at which the buffer between the reorder level and reorder quantity is consumed'})


In [19]:
# Transform string distance_km in integer for include in train models
df['distance_km'] = df['distance_km'].astype(int)

# Transform Objects, Booleans and Category columns
# Objects
df['product_code'] = df['product'].astype('category').cat.codes
df['supplier_code'] = df['supplier'].astype('category').cat.codes

# Categories
df['category_code'] = df['category'].cat.codes
df['sub_category_code'] = df['sub_category'].cat.codes
df['supplier_rating_code'] = df['supplier_rating'].cat.codes
df['weather_severity_code'] = df['weather_severity'].cat.codes
df['day_classification_code'] = df['day_classification'].cat.codes
df['sales_demand_code'] = df['sales_demand'].cat.codes
df['expiration_status_code'] = df['expiration_status'].cat.codes

# Booleans
df['in_season_code'] = df['in_season'].astype('category').cat.codes
df['is_holiday_code'] = df['is_holiday'].astype('category').cat.codes
df['is_weekend_code'] = df['is_weekend'].astype('category').cat.codes

# Save Complete dataframe with Feature Engineering

In [20]:
# Save the processed DataFrame as a pickle file for efficient loading in future steps
df.to_pickle(data_path + '/feature_eng_complete.pkl')
compare_data.to_pickle(data_path + '/data_for_compare.pkl')

# Organize columns descriptions in alphabetical order names
column_descriptions_feat_eng = dict(sorted(column_descriptions.items()))

# save Dictionary JSON archive
with open(docs_path + 'column_descriptions_feat_eng.json', 'w') as f:
    json.dump(column_descriptions_feat_eng, f, indent=4)

In [21]:
del df, read_data, compare_data

# Load Completed Dataframe

In [22]:
# Define data paths
data_path = os.path.join('../data/', 'processed/')
docs_path = os.path.join('../docs/')

# Load Pickle file
df_load = pd.read_pickle(data_path + 'feature_eng_complete.pkl')

# Load column descriptions from JSON file into a dictionary for reference or documentation
with open(docs_path + 'column_descriptions_feat_eng.json') as f:
    column_descriptions = json.load(f)

In [23]:
# Select only datetime and numeric columns from the loaded DataFrame for time series analysis
df_ts = df_load.select_dtypes(['datetime', np.number])

In [24]:
# Drop non-essential or redundant columns from the time series DataFrame to streamline analysis
columns_dropped = ['max_stock', 'avg_daily_sales', 'std_daily_sales', 'avg_daily_lag', 'std_daily_lag', 'Year','min_stock', 'Month', 'lpo', 'DayOfYear', 'Weekday', 'WeekOfYear', 'avg_daily_sales',
       'std_daily_sales', 'avg_daily_lag', 'std_daily_lag', 'std_demand_lead_time', 'reorder_quantity', 'delivery_lag', 'safety_stock', 'maximum_days_on_sale', 'reorder_point_quantity_turnover',
       'reorder_point_coverage',  'doi_inventory_turnover', 'is_weekend_code']

df_ts.drop(columns=columns_dropped, inplace=True)


In [25]:
# Calculate the correlation matrix for all numeric columns
correlation_matrix = df_ts.corr()

In [26]:
corr1 = correlation_matrix[correlation_matrix >= 0.7].replace(1.0, np.nan).dropna(how='all', axis=1)
corr1.dropna(how='all').replace(np.nan, '')

Unnamed: 0,sales_volume,stock_quantity,rop
sales_volume,,0.756408,0.805085
stock_quantity,0.756408,,0.840619
rop,0.805085,0.840619,


In [27]:
# Hide the top half of the matrix to avoid repeating values
mask = np.tril(np.ones(correlation_matrix.shape), k=-1)
masked_corr = correlation_matrix.where(mask == 1)

# Create a heatmap visualization of the correlation matrix
fig_corr = px.imshow(masked_corr,
                    title='Correlation Matrix - Numeric Variables',
                    color_continuous_scale='RdBu_r',  # Red-Blue reversed color scale
                    aspect="auto",                   # Automatic aspect ratio
                    text_auto=False,                 # Display correlation values on cells
                    zmin=-1, zmax=1)                 # Fix color scale from -1 to +1

# Adjust the figure dimensions
fig_corr.update_layout(width=1200, height=1200)

# Display the interactive heatmap
fig_corr.show()

In [28]:
# Displays a concise summary of the DataFrame
df_ts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2026 entries, 0 to 2025
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   received_date            2026 non-null   datetime64[ns]
 1   shelf_life_days          2026 non-null   int64         
 2   distance_km              2026 non-null   int64         
 3   moq                      2026 non-null   int64         
 4   sales_volume             2026 non-null   int64         
 5   lead_time                2026 non-null   int64         
 6   stock_quantity           2026 non-null   int64         
 7   inventory_turnover_rate  2026 non-null   float64       
 8   delivery_time_variation  2026 non-null   int64         
 9   Day                      2026 non-null   int32         
 10  QuarterOfYear            2026 non-null   int32         
 11  rop                      2026 non-null   float64       
 12  product_code             2026 non-

In [29]:
# Save the processed DataFrame as a pickle file for efficient loading in future steps
df_ts.to_pickle(data_path + '/data_for_train.pkl')