In [13]:
!pip -q install scikit-learn pandas numpy joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

import io

from google.colab import files

# Colab renames an upload when a file with the same name already exists
# (e.g. "Shipment_Delay_Clean (1).csv"), so reading the hard-coded path can
# silently load an older copy. Parse the bytes returned by the upload instead,
# which always correspond to the file that was just selected.
uploaded = files.upload()  # select your Shipment_Delay_Clean.csv
if not uploaded:
    raise FileNotFoundError(
        "No file was uploaded; select Shipment_Delay_Clean.csv and re-run this cell."
    )

# Prefer the expected file name; otherwise fall back to the first uploaded file.
csv_name = (
    "Shipment_Delay_Clean.csv"
    if "Shipment_Delay_Clean.csv" in uploaded
    else next(iter(uploaded))
)
df = pd.read_csv(io.BytesIO(uploaded[csv_name]))


Saving Shipment_Delay_Clean.csv to Shipment_Delay_Clean (1).csv


In [14]:
# Normalise the headers (trim surrounding whitespace, lower-case) so the later
# cells can refer to columns by fixed names, then preview the result.
clean_headers = df.columns.str.strip().str.lower()
df.columns = clean_headers
print(list(df.columns))
display(df.head())

['order_id', 'supplier_name', 'supplier_country', 'product_name', 'order_item_quantity', 'expected_delivery_date', 'actual_delivery_date', 'days_delayed']


Unnamed: 0,order_id,supplier_name,supplier_country,product_name,order_item_quantity,expected_delivery_date,actual_delivery_date,days_delayed
0,75834,BrewMaster,Italy,Coffee Maker,1,03-01-2025,06-01-2025,3
1,75833,BrewMaster,Italy,Coffee Maker,1,02-01-2025,03-01-2025,1
2,75832,BrewMaster,Italy,Coffee Maker,1,02-01-2025,03-01-2025,1
3,75831,BrewMaster,Italy,Coffee Maker,1,02-01-2025,03-01-2025,1
4,75830,BrewMaster,Italy,Coffee Maker,1,05-01-2025,07-01-2025,2


In [21]:
# Modelling schema: the regression target and the predictors used for it.
target_col   = 'days_delayed'
feature_cols = [
    'supplier_name',
    'supplier_country',
    'product_name',
    'order_item_quantity',
]

# Force the target to numeric (unparseable entries become NaN) and keep only
# the rows where a target value is actually available.
df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
df = df.dropna(subset=[target_col])

In [31]:
# Feature matrix and float target for the regressor.
X = df[feature_cols].copy()
y = df[target_col].astype(float)

# Split the predictors by dtype. Classifying on "numeric vs. everything else"
# (rather than `dtype == 'object'`) also routes `string` / `category` columns
# to the one-hot encoder instead of passing raw text through to the forest.
num_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(X[c])]
cat_cols = [c for c in feature_cols if c not in num_cols]

# One-hot encode categoricals (categories unseen at fit time encode as all
# zeros); numeric columns are forwarded unchanged.
preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
        ('num', 'passthrough', num_cols)
    ]
)

# Reproducible 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [32]:
# Random-forest hyper-parameters, collected in one place for easy tuning.
rf_params = dict(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)
rf = RandomForestRegressor(**rf_params)

# Chain preprocessing and the model so the encoder is fitted on the training
# split only and is re-applied automatically at prediction time.
pipe = Pipeline([('prep', preprocess), ('model', rf)])
pipe.fit(X_train, y_train)


In [34]:
# Score the hold-out split with the fitted pipeline. (`model` was never defined
# in this notebook; `pipe` is the estimator trained above.)
y_pred = pipe.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

Mean Absolute Error (MAE): 1.19
Mean Squared Error (MSE): 2.22
R-squared (R2): -0.01


In [35]:
# Assemble a per-row prediction table for the hold-out split and export it.
pred_df = X_test.copy()
pred_df['actual_days_delayed']    = y_test.values
# `y_pred` holds the hold-out predictions from the evaluation cell
# (`pred` was never defined in this notebook).
pred_df['predicted_days_delayed'] = y_pred
# add an ID if you want to link back
if 'order_id' in df.columns:
    # X_test keeps df's row index, so order_id can be looked up by label.
    pred_df = df.loc[pred_df.index, ['order_id']].join(pred_df)
pred_df.to_csv("predictions_delay.csv", index=False)


In [36]:
# Pair each column of the transformed matrix with the forest's importance
# score. Names are assembled in the transformer order declared for the
# ColumnTransformer: one-hot columns first, then the passthrough numerics.
fitted_prep = pipe.named_steps['prep']
ohe = fitted_prep.named_transformers_['cat']
if len(cat_cols):
    cat_names = list(ohe.get_feature_names_out(cat_cols))
else:
    cat_names = []
all_feature_names = cat_names + num_cols

importances = pipe.named_steps['model'].feature_importances_
fi = (
    pd.DataFrame({'Feature': all_feature_names, 'Importance': importances})
    .sort_values('Importance', ascending=False)
)
fi.to_csv("feature_importance_delay.csv", index=False)


In [38]:
# Summarise the hold-out metrics as a two-column table and write it to CSV.
# RMSE is derived from the MSE computed in the evaluation cell.
rmse = np.sqrt(mse)
metric_rows = [('MAE', mae), ('RMSE', rmse), ('R²', r2)]
metrics = pd.DataFrame(metric_rows, columns=['Metric', 'Value'])
metrics.to_csv("metrics_delay.csv", index=False)

In [40]:
# Persist the whole fitted pipeline (preprocessing + forest) so it can be
# reloaded with joblib.load and applied to raw feature columns directly.
joblib.dump(pipe, "rf_delay_pipeline.joblib")

# List every artefact written by the notebook, including the model file.
print("\n✅ Saved: predictions_delay.csv, feature_importance_delay.csv, metrics_delay.csv, rf_delay_pipeline.joblib")


✅ Saved: predictions_delay.csv, feature_importance_delay.csv, metrics_delay.csv
