---

This notebook contains the Feature Engineering for the first module of the **"Intelligent System for Supply Chain Management"** project. 

---

Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import json
import plotly.express as px
import plotly.io as pio

# Import personalized classes 
from smart_supply_chain_ai.utils.preprocess_gapFreq import PrepareForecastingData, TimeSeriesIntegrityTransformer


# Configure display and graphs: show every column when rendering DataFrames
# and use the "plotly_white" template as the default for all Plotly figures
pd.set_option('display.max_columns', None)
pio.templates.default = "plotly_white"

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation/future warnings
# raised by pandas and other libraries — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

Load Data

In [2]:
# Define data paths (relative to the notebook's working directory).
# docs_path keeps its trailing slash because later cells append file names to it.
data_path = os.path.join('../data', 'processed')
docs_path = os.path.join('../docs/')

# Load the Parquet file; os.path.join avoids hand-built separators
read_data = pd.read_parquet(os.path.join(data_path, 'grocery.parquet'))

# Load column descriptions from JSON file into a dictionary for reference or documentation.
# The encoding is set explicitly: the descriptions are written back as UTF-8 with
# ensure_ascii=False at the end of this notebook, so reading with the platform
# default encoding could fail or garble non-ASCII text.
with open(os.path.join(docs_path, 'column_descriptions.json'), encoding='utf-8') as f:
    column_descriptions = json.load(f)

In [3]:
# View dataframe initial shape
read_data.shape

(97646, 33)

In [4]:
# Build the project's time-series integrity transformer, keyed by product id and receipt date
timeTransformer = TimeSeriesIntegrityTransformer(
    date_col='received_date',
    id_col='product_id',
    target_col='sales_volume',
)

# Fit on the raw data and transform it in a single step.
# NOTE(review): the output has more rows than the input (97,646 raw rows vs
# 173,230 across both splits below) and some rows carry 'missing' placeholders —
# presumably date gaps are filled per product; confirm in preprocess_gapFreq.
df_transformed = timeTransformer.fit_transform(read_data)

In [5]:
# Keep the transformed frame's column names, in their original order, for later reference
order_columns = df_transformed.columns.tolist()

In [6]:
# Chronological hold-out split on 'received_date':
# - rows dated on/after the cutoff are kept aside for comparison/validation
# - rows dated before the cutoff are used for training and analysis
split_date = '2025-05-10'
received_dates = df_transformed['received_date']

compare_data = df_transformed.loc[received_dates >= split_date].copy()
df = df_transformed.loc[received_dates < split_date].copy()

In [7]:
# Check the last few rows of the historical data
df.tail()

Unnamed: 0,product_id,received_date,lpo,in_season,product,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status,inventory_turnover_rate,doi_inventory_turnover
173089,1994909|P,2025-05-05,2025-04-29 00:00:00,False,Ground Coffee,Beverages,Coffee,180.0,60.0,unit,1.0,BeverageSource Co.,1820500|S,110.0,70.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Saturday,False,True,High,0.0,5.0,284.0,355.0,34.0,4.0,Safe,26.853389,37.0
173090,1994909|P,2025-05-06,2025-04-30 00:00:00,False,Ground Coffee,Beverages,Coffee,180.0,60.0,unit,1.0,BeverageSource Co.,1820500|S,110.0,70.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,69.0,4.0,284.0,355.0,347.0,6.0,Safe,26.853389,37.0
173091,1994909|P,2025-05-07,2025-04-29 00:00:00,False,Ground Coffee,Beverages,Coffee,180.0,60.0,unit,1.0,BeverageSource Co.,1820500|S,110.0,70.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,78.0,4.0,284.0,355.0,321.0,8.0,Safe,26.853389,37.0
173092,1994909|P,2025-05-08,2025-04-29 00:00:00,False,Ground Coffee,Beverages,Coffee,180.0,60.0,unit,1.0,BeverageSource Co.,1820500|S,110.0,70.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,0.0,4.0,284.0,355.0,321.0,8.0,Safe,26.853389,37.0
173093,1994909|P,2025-05-09,2025-05-03 00:00:00,False,Ground Coffee,Beverages,Coffee,180.0,60.0,unit,1.0,BeverageSource Co.,1820500|S,110.0,70.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,43.0,3.0,284.0,355.0,288.0,6.0,Safe,26.853389,37.0


In [8]:
# Check the first few rows of the comparison data
compare_data.head()

Unnamed: 0,product_id,received_date,lpo,in_season,product,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status,inventory_turnover_rate,doi_inventory_turnover
883,1010497|P,2025-05-10,2025-05-03 00:00:00,False,Canned Tomatoes,Pantry,Canned Goods,1095.0,90.0,unit,2.0,PantryEssentials Ltd.,1764687|S,95.0,130.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,0.0,4.0,388.0,485.0,77.0,5.0,Safe,58.659824,17.0
884,1010497|P,2025-05-11,2025-05-03 00:00:00,False,Canned Tomatoes,Pantry,Canned Goods,1095.0,90.0,unit,2.0,PantryEssentials Ltd.,1764687|S,95.0,130.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,0.0,4.0,388.0,485.0,77.0,5.0,Safe,58.659824,17.0
885,1010497|P,2025-05-12,2025-05-07 00:00:00,False,Canned Tomatoes,Pantry,Canned Goods,1095.0,90.0,unit,2.0,Wholesale Warehouse,1141069|S,25.0,300.0,Room Temperature,Mild to Temperate,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,100.0,3.0,285.0,380.0,377.0,5.0,Safe,58.659824,17.0
886,1010497|P,2025-05-13,2025-05-07 00:00:00,False,Canned Tomatoes,Pantry,Canned Goods,1095.0,90.0,unit,2.0,Wholesale Warehouse,1141069|S,25.0,300.0,Room Temperature,Mild to Temperate,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,0.0,3.0,285.0,380.0,377.0,5.0,Safe,58.659824,17.0
887,1010497|P,2025-05-14,2025-05-07 00:00:00,False,Canned Tomatoes,Pantry,Canned Goods,1095.0,90.0,unit,2.0,Wholesale Warehouse,1141069|S,25.0,300.0,Room Temperature,Mild to Temperate,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,0.0,3.0,285.0,380.0,377.0,5.0,Safe,58.659824,17.0


In [9]:
# Columns that identify each observation (date, product) plus the forecasting target
prior_columns = ['received_date', 'product_id', 'sales_volume']

# Every remaining column is treated as an exogenous variable. A list comprehension
# (instead of a set difference) keeps the original column order, so the result is
# identical on every run rather than depending on set iteration order.
exog_columns = [col for col in order_columns if col not in prior_columns]

Save Split DataFrame

In [10]:
# Return the shapes of both datasets to verify the split
df.shape, compare_data.shape

((150110, 33), (23120, 33))

In [11]:
# Lead time minus delivery lag, in days: positive means early, negative means delayed
df['delivery_time_variation'] = df['lead_time'].sub(df['delivery_lag'])

# Document the new column alongside the existing column descriptions
column_descriptions['delivery_time_variation'] = (
    'Number of days between the expected delivery date and the actual delivery date. A positive value indicates early delivery, while a negative value indicates a delay.'
)


In [12]:
# Use 'received_date' as the index so the calendar attributes can be read from it
df = df.set_index('received_date')


In [13]:
# Derive calendar features from the 'received_date' index
received_index = df.index.get_level_values('received_date')

calendar_features = {
    'Year': received_index.year,
    'Month': received_index.month,
    'Day': received_index.day,
    'DayOfYear': received_index.dayofyear,
    'Weekday': received_index.weekday,  # Monday = 0 ... Sunday = 6
    'QuarterOfYear': received_index.quarter,
    # isocalendar() returns a frame indexed by date; .values strips that index
    'WeekOfYear': received_index.isocalendar().week.values,
}

# Add the columns one by one, in the same order as listed above
for feature_name, feature_values in calendar_features.items():
    df[feature_name] = feature_values

In [14]:
# Move 'received_date' back from the index to a regular column
# (the frame gets a default integer index again), then preview the result
df = df.reset_index()
df

Unnamed: 0,received_date,product_id,lpo,in_season,product,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status,inventory_turnover_rate,doi_inventory_turnover,delivery_time_variation,Year,Month,Day,DayOfYear,Weekday,QuarterOfYear,WeekOfYear
0,2022-12-09,1010497|P,0,missing,missing,missing,missing,1095.0,90.0,missing,2.0,missing,missing,25.0,300.0,missing,missing,missing,missing,missing,missing,missing,missing,missing,0.0,4.0,285.0,380.0,536.0,4.0,missing,58.659824,17.0,0.0,2022,12,9,343,4,4,49
1,2022-12-10,1010497|P,2022-12-02 00:00:00,False,Canned Tomatoes,Pantry,Canned Goods,1095.0,90.0,unit,2.0,Wholesale Warehouse,1141069|S,25.0,300.0,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Saturday,False,True,High,131.0,4.0,285.0,380.0,317.0,8.0,Safe,58.659824,17.0,-4.0,2022,12,10,344,5,4,49
2,2022-12-11,1010497|P,2022-12-01 00:00:00,False,Canned Tomatoes,Pantry,Canned Goods,1095.0,90.0,unit,2.0,Wholesale Warehouse,1141069|S,25.0,300.0,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Sunday,False,True,High,274.0,4.0,285.0,380.0,292.0,10.0,Safe,58.659824,17.0,-6.0,2022,12,11,345,6,4,49
3,2022-12-12,1010497|P,2022-12-05 00:00:00,False,Canned Tomatoes,Pantry,Canned Goods,1095.0,90.0,unit,2.0,Wholesale Warehouse,1141069|S,25.0,300.0,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,66.0,3.0,285.0,380.0,405.0,7.0,Safe,58.659824,17.0,-4.0,2022,12,12,346,0,4,50
4,2022-12-13,1010497|P,2022-12-05 00:00:00,False,Canned Tomatoes,Pantry,Canned Goods,1095.0,90.0,unit,2.0,Wholesale Warehouse,1141069|S,25.0,300.0,Room Temperature,Mild to Temperate,Moderate Rain,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,107.0,3.0,285.0,380.0,367.0,8.0,Safe,58.659824,17.0,-5.0,2022,12,13,347,1,4,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150105,2025-05-05,1994909|P,2025-04-29 00:00:00,False,Ground Coffee,Beverages,Coffee,180.0,60.0,unit,1.0,BeverageSource Co.,1820500|S,110.0,70.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Saturday,False,True,High,0.0,5.0,284.0,355.0,34.0,4.0,Safe,26.853389,37.0,1.0,2025,5,5,125,0,2,19
150106,2025-05-06,1994909|P,2025-04-30 00:00:00,False,Ground Coffee,Beverages,Coffee,180.0,60.0,unit,1.0,BeverageSource Co.,1820500|S,110.0,70.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,69.0,4.0,284.0,355.0,347.0,6.0,Safe,26.853389,37.0,-2.0,2025,5,6,126,1,2,19
150107,2025-05-07,1994909|P,2025-04-29 00:00:00,False,Ground Coffee,Beverages,Coffee,180.0,60.0,unit,1.0,BeverageSource Co.,1820500|S,110.0,70.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,78.0,4.0,284.0,355.0,321.0,8.0,Safe,26.853389,37.0,-4.0,2025,5,7,127,2,2,19
150108,2025-05-08,1994909|P,2025-04-29 00:00:00,False,Ground Coffee,Beverages,Coffee,180.0,60.0,unit,1.0,BeverageSource Co.,1820500|S,110.0,70.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,0.0,4.0,284.0,355.0,321.0,8.0,Safe,26.853389,37.0,-4.0,2025,5,8,128,3,2,19


In [15]:
def calculate_std_demand_lead_time(df, product_col='product', sales_col='sales_volume', lead_col='delivery_lag'):
    """
    Add per-product demand and lead-time statistics, plus the standard deviation
    of demand during lead time, to a DataFrame.

    The input DataFrame is modified in place (new columns are assigned on it)
    and the same object is returned.

    Parameters:
    - df (DataFrame): Input DataFrame containing sales and delivery data.
    - product_col (str): Column used as the grouping key (one group per product).
    - sales_col (str): Column holding the daily sales volume.
    - lead_col (str): Column holding the delivery lead time (in days).

    Returns:
    - DataFrame: The input DataFrame with these additional columns, each value
      broadcast to every row of its product group:
        - 'avg_daily_sales': Mean of `sales_col` per product.
        - 'std_daily_sales': Standard deviation of `sales_col` per product.
        - 'avg_daily_lag': Mean of `lead_col` per product.
        - 'std_daily_lag': Standard deviation of `lead_col` per product.
        - 'std_demand_lead_time': sqrt(avg_lag * std_sales^2 + avg_sales^2 * std_lag^2).
    """
    # Build the grouping once and reuse it for every statistic
    grouped = df.groupby(product_col)
    sales_by_product = grouped[sales_col]
    lag_by_product = grouped[lead_col]

    # Per-product statistics, aligned back onto the original rows
    df['avg_daily_sales'] = sales_by_product.transform('mean')
    df['std_daily_sales'] = sales_by_product.transform('std')
    df['avg_daily_lag'] = lag_by_product.transform('mean')
    df['std_daily_lag'] = lag_by_product.transform('std')

    # Variance of demand during lead time, split into its two contributions:
    # demand variability over the average lag, and lag variability at average demand
    demand_variance_term = df['avg_daily_lag'] * (df['std_daily_sales'] ** 2)
    lag_variance_term = (df['avg_daily_sales'] ** 2) * (df['std_daily_lag'] ** 2)
    df['std_demand_lead_time'] = np.sqrt(demand_variance_term + lag_variance_term)

    return df


In [16]:
# Apply the function to compute demand variability during lead time and update the DataFrame with new metrics.
# NOTE(review): the default grouping key is 'product'. Rows whose 'product' is the
# 'missing' placeholder (see row 0 of the preview above) are pooled into one
# pseudo-product with its own statistics — confirm this is intended, or pass
# product_col='product_id'.
df = calculate_std_demand_lead_time(df=df)



---

### **Service Level Factor (Z) Based on the Standard Normal Distribution**

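The Service Level Factor (Z) is the one-sided quantile of the standard normal distribution for the target service level, read from a standard normal table (values rounded to two decimals).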

- **80% Service Level**: Z = 0.84  
- **85% Service Level**: Z = 1.04  
- **90% Service Level**: Z = 1.28  
- **95% Service Level**: Z = 1.64  
- **98% Service Level**: Z = 2.05  
- **99% Service Level**: Z = 2.33

---


In [17]:
# Z-score used for a 95% service level (see the table above)
Z = 1.64

# Safety stock = Z x standard deviation of demand during lead time
df['safety_stock'] = df['std_demand_lead_time'] * Z

# Document the new column and its role in inventory management
column_descriptions['safety_stock'] = 'Extra inventory buffer maintained to protect against uncertainties in demand and delivery lead time.'

In [18]:
# Preview the first two rows of the numeric columns only
df.select_dtypes(include=np.number).head(2)

Unnamed: 0,shelf_life_days,maximum_days_on_sale,supplier_rating,distance_km,moq,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,inventory_turnover_rate,doi_inventory_turnover,delivery_time_variation,Year,Month,Day,DayOfYear,Weekday,QuarterOfYear,WeekOfYear,avg_daily_sales,std_daily_sales,avg_daily_lag,std_daily_lag,std_demand_lead_time,safety_stock
0,1095.0,90.0,2.0,25.0,300.0,0.0,4.0,285.0,380.0,536.0,4.0,58.659824,17.0,0.0,2022,12,9,343,4,4,49,0.0,0.0,5.433594,2.507144,0.0,0.0
1,1095.0,90.0,2.0,25.0,300.0,131.0,4.0,285.0,380.0,317.0,8.0,58.659824,17.0,-4.0,2022,12,10,344,5,4,49,92.941043,107.19155,5.956916,2.452352,346.979213,569.04591


In [19]:
# Reorder Point (ROP): average daily sales times the row's delivery lag, plus the safety stock buffer
expected_lag_demand = df['avg_daily_sales'] * df['delivery_lag']
df['rop'] = expected_lag_demand + df['safety_stock']

# Document 'rop' so it is clear when replenishment should be triggered
column_descriptions['rop'] = 'ROP is the inventory level at which a new order should be placed to avoid stock outs.'


In [20]:
# Reorder point coverage: the ROP divided by the average daily sales of the product
df['reorder_point_coverage'] = (
    (df['rop'] / df['avg_daily_sales'])
    # A zero average yields NaN (0/0) or +/-inf (x/0). Map every undefined ratio
    # to the -1 sentinel so no infinite value reaches the model features.
    .replace([np.inf, -np.inf], np.nan)
    .fillna(-1)
)

# Add description for 'reorder_point_coverage' to indicate how many days of demand the ROP supports
column_descriptions.update({'reorder_point_coverage': 'Number of days of demand that the reorder point (ROP) can cover'})


In [21]:
# Reorder quantity: average sales per product multiplied by the row's lead time.
# The 'mean' string alias replaces the np.mean callable, which pandas 2.x deprecates
# inside groupby.transform (the FutureWarning is hidden by the global warnings filter).
# astype(int) truncates the fractional part of the result.
df['reorder_quantity'] = (
    df.groupby('product')['sales_volume'].transform('mean') * df['lead_time']
).astype(int)

# Add description for 'reorder_quantity' to define the optimal quantity to order when replenishing stock
column_descriptions.update({'reorder_quantity': 'Optimal number of units to order once inventory reaches the reorder point, based on expected demand during lead time.'})


In [22]:
# Gap between the reorder point and the reorder quantity, divided by the inventory turnover rate
rop_minus_quantity = df['rop'] - df['reorder_quantity']
df['reorder_point_quantity_turnover'] = rop_minus_quantity / df['inventory_turnover_rate']

# Document the new column
column_descriptions['reorder_point_quantity_turnover'] = 'Rate at which the buffer between the reorder level and reorder quantity is consumed'


In [23]:
# Cast distance_km to integer so it can be included when training models.
# NOTE(review): the previews above show this column as float values (e.g. 25.0),
# not strings; astype(int) would raise if any value were NaN.
df['distance_km'] = df['distance_km'].astype(int)

In [24]:
# Keep only numeric, datetime and boolean columns; every column of another dtype is dropped
kept_dtypes = [np.number, 'datetime', bool]
columns_dropped = list(df.select_dtypes(exclude=kept_dtypes).columns)

df = df.drop(columns=columns_dropped)


In [25]:
# Calculate the pairwise correlation matrix over every remaining column
# (the datetime column 'received_date' is included, as the output below shows)
correlation_matrix = df.corr()

In [26]:
# Keep only strong positive correlations (>= 0.7), blank out exact 1.0 values
# (such as the diagonal) and drop the columns left with no strong correlation
strong_corr = correlation_matrix.where(correlation_matrix >= 0.7)
corr1 = strong_corr.replace(1.0, np.nan).dropna(axis=1, how='all')

# Drop the rows left empty as well and show blanks instead of NaN for readability
corr1.dropna(how='all').replace(np.nan, '')

Unnamed: 0,received_date,shelf_life_days,maximum_days_on_sale,min_stock,max_stock,stock_quantity,delivery_lag,Year,Month,DayOfYear,QuarterOfYear,WeekOfYear,avg_daily_sales,std_daily_sales,std_demand_lead_time,safety_stock,rop,reorder_point_coverage,reorder_quantity,reorder_point_quantity_turnover
received_date,,,,,,,,0.916248,,,,,,,,,,,,
shelf_life_days,,,0.895012,,,,,,,,,,,,,,,,,
maximum_days_on_sale,,0.895012,,,,,,,,,,,,,,,,,,
min_stock,,,,,0.999142,0.910169,,,,,,,0.866768,0.939056,0.913068,0.913068,0.862858,,0.868634,0.887898
max_stock,,,,0.999142,,0.910503,,,,,,,0.874043,0.946856,0.9207,0.9207,0.870151,,0.867911,0.893839
stock_quantity,,,,0.910169,0.910503,,,,,,,,0.794062,0.860302,0.836507,0.836507,0.790869,,0.789915,0.812976
delivery_lag,,,,,,,,,,,,,,,,,,0.949299,,
Year,0.916248,,,,,,,,,,,,,,,,,,,
Month,,,,,,,,,,0.996773,0.973065,0.979564,,,,,,,,
DayOfYear,,,,,,,,,0.996773,,0.970046,0.981369,,,,,,,,


In [27]:
# Keep only the strictly lower triangle (upper half and diagonal hidden)
# so that each pair of variables appears once
mask = np.tril(np.ones(correlation_matrix.shape, dtype=bool), k=-1)
masked_corr = correlation_matrix.where(mask)

# Create a heatmap visualization of the masked correlation matrix
fig_corr = px.imshow(
    masked_corr,
    title='Correlation Matrix - Numeric Variables',
    zmin=-1,                          # Fix color scale from -1 ...
    zmax=1,                           # ... to +1
    color_continuous_scale='RdBu_r',  # Red-Blue reversed color scale
    aspect="auto",                    # Automatic aspect ratio
    text_auto=False,                  # Do not print correlation values on cells
)

# Adjust the figure dimensions
fig_corr.update_layout(width=1200, height=1200)

# Display the interactive heatmap
fig_corr.show()

In [28]:
# Displays a concise summary of the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150110 entries, 0 to 150109
Data columns (total 32 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   received_date                    150110 non-null  datetime64[ns]
 1   shelf_life_days                  150110 non-null  float64       
 2   maximum_days_on_sale             150110 non-null  float64       
 3   supplier_rating                  150110 non-null  float64       
 4   distance_km                      150110 non-null  int64         
 5   moq                              150110 non-null  float64       
 6   sales_volume                     150110 non-null  float64       
 7   lead_time                        150110 non-null  float64       
 8   min_stock                        150110 non-null  float64       
 9   max_stock                        150110 non-null  float64       
 10  stock_quantity                   150110 non-

In [29]:
# Save the column descriptions, including the entries added in this notebook
output_file = os.path.join(docs_path, 'feat_eng_column_descriptions.json')
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(column_descriptions, f, ensure_ascii=False, indent=4)