## 3.1 Load Cleaned Data

In [1]:
import pandas as pd
import numpy as np

DATA_PATH = "../data/processed/cleaned_inventory_sales.csv"
df = pd.read_csv(DATA_PATH)

# Convert dates AFTER loading
date_cols = ["date_received", "last_order_date", "expiration_date"]
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors="coerce")

df.head()

Unnamed: 0,product_id,product_name,catagory,supplier_id,supplier_name,stock_quantity,reorder_level,reorder_quantity,unit_price,date_received,last_order_date,expiration_date,warehouse_location,sales_volume,inventory_turnover_rate,status
0,29-205-1132,Sushi Rice,Grains & Pulses,38-037-1699,Jaxnation,22,72,70,4.5,2024-08-16,2024-06-29,2024-09-19,48 Del Sol Trail,32,19,Discontinued
1,40-681-9981,Arabica Coffee,Beverages,54-470-2479,Feedmix,45,77,2,20.0,NaT,2024-05-29,NaT,36 3rd Place,85,1,Discontinued
2,06-955-3428,Black Rice,Grains & Pulses,54-031-2945,Vinder,30,38,83,6.0,NaT,NaT,2024-09-22,3296 Walton Court,31,34,Backordered
3,71-594-6552,Long Grain Rice,Grains & Pulses,63-492-7603,Brightbean,12,59,62,1.5,NaT,2025-02-19,2024-04-17,3 Westerfield Crossing,95,99,Active
4,57-437-1828,Plum,Fruits & Vegetables,54-226-4308,Topicstorm,37,30,74,4.0,NaT,NaT,NaT,15068 Scoville Court,62,25,Backordered


## 3.2 Core Business Features

In [2]:
df["supply_demand_gap"] = df["stock_quantity"] - df["sales_volume"]

In [3]:
df["supply_demand_gap"]

0     -10
1     -40
2      -1
3     -83
4     -25
       ..
985    30
986   -35
987    -4
988     8
989   -72
Name: supply_demand_gap, Length: 990, dtype: int64

In [4]:
df["inventory_turnover_calc"] = (
    df["sales_volume"] / (df["stock_quantity"] + 1)
)

In [5]:
df["inventory_turnover_calc"]

0      1.391304
1      1.847826
2      1.000000
3      7.307692
4      1.631579
         ...   
985    0.651685
986    1.557377
987    1.031579
988    0.709677
989    4.550000
Name: inventory_turnover_calc, Length: 990, dtype: float64

In [6]:
df["reorder_pressure"] = (
    df["reorder_level"] - df["stock_quantity"]
)

In [7]:
df["reorder_pressure"]

0      50
1      32
2       8
3      47
4      -7
       ..
985   -10
986   -51
987    -4
988    18
989     9
Name: reorder_pressure, Length: 990, dtype: int64

## 3.3 Time-Based Features

In [8]:
df["inventory_age_days"] = (
    df["expiration_date"] - df["date_received"]
).dt.days

In [9]:
df["inventory_age_days"]

0      34.0
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
985     NaN
986     NaN
987     NaN
988     NaN
989   -91.0
Name: inventory_age_days, Length: 990, dtype: float64

In [10]:
df["days_since_last_order"] = (
    pd.Timestamp.today() - df["last_order_date"]
).dt.days

In [11]:
df["days_since_last_order"]

0      563.0
1      594.0
2        NaN
3      328.0
4        NaN
       ...  
985    381.0
986      NaN
987    446.0
988      NaN
989      NaN
Name: days_since_last_order, Length: 990, dtype: float64

## 3.4 Stock Risk Flags

In [12]:
df["stockout_flag"] = (df["stock_quantity"] < df["sales_volume"]).astype(int)

df["overstock_flag"] = (
    df["stock_quantity"] > df["sales_volume"] * 1.5
).astype(int)

## 3.5 Target Variable â€” Mismatch Risk

In [13]:
def assign_mismatch_risk(row):
    if row["stockout_flag"] == 1:
        return "High"
    elif row["overstock_flag"] == 1:
        return "Medium"
    else:
        return "Low"

df["mismatch_risk"] = df.apply(assign_mismatch_risk, axis=1)

In [14]:
df["mismatch_risk"].value_counts(normalize=True) * 100

mismatch_risk
High      53.434343
Medium    23.434343
Low       23.131313
Name: proportion, dtype: float64

## 3.6 Handle Missing Feature Values

In [15]:
num_cols = df.select_dtypes(include=np.number).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

## 3.7 Select Final Feature Set

In [16]:
feature_cols = [
    "stock_quantity",
    "sales_volume",
    "reorder_level",
    "reorder_quantity",
    "unit_price",
    "inventory_turnover_calc",
    "supply_demand_gap",
    "reorder_pressure",
    "inventory_age_days",
    "days_since_last_order"
]

X = df[feature_cols]
y = df["mismatch_risk"]

## 3.8 Encode Target

In [17]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(y)

y_encoded[:10], le.classes_

(array([0, 0, 0, 0, 0, 2, 1, 2, 2, 2]),
 array(['High', 'Low', 'Medium'], dtype=object))

## 3.9 Save Feature Dataset

In [18]:
df.to_csv("../data/processed/feature_engineered_inventory.csv", index=False)