In [1]:
import pandas as pd
import numpy as np

In [2]:
raw_data = pd.read_csv("DataCoSupplyChainDataset.csv",encoding = "ISO-8859-1")

# Preprocessing

In [3]:
data = raw_data.drop(columns=[
                                  'Delivery Status', 'Late_delivery_risk', 
                                  'shipping date (DateOrders)', 'Benefit per order', 'Sales per customer', 'Category Id',
                                  'Order Profit Per Order', 'Order Item Discount', 'Order Item Total', 'Order Status', 
                                  'Customer Email', 'Customer Password', 'Latitude', 'Longitude', 'Product Description', 'Product Image',
                                  'Customer Fname', 'Customer Id', 'Customer Lname', 'Department Id',
                                  'Order Customer Id', 'Order Item Cardprod Id', 'Order Item Id',
                                  'Product Card Id', 'Product Category Id', 'Order Id', 'Customer Street',
                                  'Customer Zipcode', 'Order Zipcode', 'Order Item Product Price',
                                   'Order Item Profit Ratio'
])

In [4]:
len(data.columns)

22

In [5]:
data.columns


Index(['Type', 'Days for shipping (real)', 'Days for shipment (scheduled)',
       'Category Name', 'Customer City', 'Customer Country',
       'Customer Segment', 'Customer State', 'Department Name', 'Market',
       'Order City', 'Order Country', 'order date (DateOrders)',
       'Order Item Discount Rate', 'Order Item Quantity', 'Sales',
       'Order Region', 'Order State', 'Product Name', 'Product Price',
       'Product Status', 'Shipping Mode'],
      dtype='object')

In [6]:
data["order date (DateOrders)"] = pd.to_datetime(data["order date (DateOrders)"])

In [7]:
data = data.rename(columns={"order date (DateOrders)":"order_date"})

In [8]:
data['Year'] = data['order_date'].dt.year
data['Month'] = data['order_date'].dt.month
data.sort_values(by='order_date', inplace=True)
data.drop(columns=['order_date'], inplace=True)

In [9]:
grouped_data = data.groupby(['Year', 'Market', 'Month', 'Order Country',
                                        'Order Region', 'Order State', 'Product Name', 
                                         'Category Name','Customer Segment'])

In [10]:
grouped_data_1 = grouped_data["Order Item Quantity"].sum().reset_index()

In [11]:
grouped_data_2 = grouped_data["Sales"].sum().reset_index()

In [12]:
grouped_data_3 = grouped_data["Days for shipping (real)"].mean().reset_index()
grouped_data_4 = grouped_data["Product Price"].mean().reset_index()
grouped_data_5 = grouped_data["Product Status"].mean().reset_index()

In [13]:
grouped_data_df = grouped_data_1.copy()
grouped_data_df["Sales"] = grouped_data_2["Sales"]
grouped_data_df["Days for shipping (real)"] = grouped_data_3["Days for shipping (real)"]
grouped_data_df["Product Price"] = grouped_data_4["Product Price"]
grouped_data_df["Product Status"] = grouped_data_5["Product Status"]

In [14]:
grouped_data_df.columns

Index(['Year', 'Market', 'Month', 'Order Country', 'Order Region',
       'Order State', 'Product Name', 'Category Name', 'Customer Segment',
       'Order Item Quantity', 'Sales', 'Days for shipping (real)',
       'Product Price', 'Product Status'],
      dtype='object')

In [15]:
# Dummy variables
grouped_data_df = pd.get_dummies(grouped_data_df, columns = ['Market', 'Order Country',
                                        'Order Region', 'Order State', 'Product Name', 
                                         'Category Name','Customer Segment'],drop_first=True, dtype=int)

In [16]:
grouped_data_df.shape

(72679, 1452)

In [17]:
x=grouped_data_df.drop(columns=['Order Item Quantity'])
y=grouped_data_df['Order Item Quantity']

In [18]:
# Standardization
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_X = x.copy()
scaled_X = scaler.fit_transform(scaled_X)
scaled_X = pd.DataFrame(scaled_X, columns = x.columns)

In [19]:
scaled_X.isnull().sum() 

Year                                0
Month                               0
Sales                               0
Days for shipping (real)            0
Product Price                       0
                                   ..
Category Name_Women's Apparel       0
Category Name_Women's Clothing      0
Category Name_Women's Golf Clubs    0
Customer Segment_Corporate          0
Customer Segment_Home Office        0
Length: 1451, dtype: int64

In [20]:
scaled_X=scaled_X.loc[:, (scaled_X!=0).any(axis=0)]

# Feature Engineering

In [21]:

from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(random_state=42)
rfr.fit(scaled_X, y)

In [22]:
importances = rfr.feature_importances_
feature_names = scaled_X.columns

feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)


In [23]:
feature_importance_df

Unnamed: 0,Feature,Importance
2,Sales,0.649694
4,Product Price,0.341478
1410,Category Name_Cleats,0.001592
1352,Product Name_Perfect Fitness Perfect Rip Deck,0.001539
1436,Category Name_Shop By Sport,0.000923
...,...,...
554,Order State_Gaziantep,0.000000
552,Order State_Garb-Chrarda-Beni Hsen,0.000000
551,Order State_Gansu,0.000000
550,Order State_Galway,0.000000


In [24]:
selected_ind = np.argsort(rfr.feature_importances_)[1400:] # sorting the features from lowest to highest
selected_features = scaled_X.columns[selected_ind]
selected_features
print(len(selected_features))

50


In [25]:
X_selected = scaled_X[selected_features]
X_selected.head()

Unnamed: 0,Order Country_Australia,Category Name_Cameras,Order State_Madrid,Product Name_Lawn mower,Market_USCA,Market_Pacific Asia,Customer Segment_Home Office,Order Region_US Center,Category Name_Garden,Order State_Queensland,...,Category Name_Men's Footwear,Days for shipping (real),Product Name_Nike Men's Free 5.0+ Running Shoe,Category Name_Cardio Equipment,Product Name_Under Armour Girls' Toddler Spine Surge Runni,Category Name_Shop By Sport,Product Name_Perfect Fitness Perfect Rip Deck,Category Name_Cleats,Product Price,Sales
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.195975,0.008126
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.060302,0.002501
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.416667,0.0,0.0,0.0,0.0,0.0,0.0,0.020106,0.006043
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.666667,1.0,1.0,0.0,0.0,0.0,0.0,0.045226,0.006043
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.020095,0.002916


In [26]:
X_selected.columns

Index(['Order Country_Australia', 'Category Name_Cameras ',
       'Order State_Madrid', 'Product Name_Lawn mower', 'Market_USCA',
       'Market_Pacific Asia', 'Customer Segment_Home Office',
       'Order Region_US Center ', 'Category Name_Garden',
       'Order State_Queensland', 'Order Country_Estados Unidos',
       'Order Country_Francia', 'Product Name_Clicgear 8.0 Shoe Brush',
       'Category Name_Fishing', 'Order State_Texas',
       'Category Name_Golf Gloves', 'Order State_Isla de Francia',
       'Product Name_Field & Stream Sportsman 16 Gun Fire Safe',
       'Order State_Inglaterra', 'Order Country_Reino Unido', 'Market_Europe',
       'Order State_Nueva York', 'Order Region_East of USA',
       'Product Name_O'Brien Men's Neoprene Life Vest',
       'Order Region_Northern Europe', 'Order Region_Western Europe',
       'Product Name_Nike Men's Dri-FIT Victory Golf Polo',
       'Category Name_Women's Apparel', 'Category Name_Camping & Hiking',
       'Year', 'Product Nam

In [27]:
X_selected.shape

(72679, 50)

# Model Building

## Random Forest

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size = 0.3, random_state = 36)

In [29]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

rfr = RandomForestRegressor(random_state = 695)
forest_params = [{'n_estimators': [30, 50, 70, 90],
                  'max_features': [5, 10, 15, 20, 'sqrt', 'log2'],
                  'min_samples_split': [1, 50, 100, 150, 200]}]

In [30]:
reg = GridSearchCV(rfr, forest_params, cv = 3, scoring='neg_mean_squared_error', verbose = 2)
reg.fit(X_train, y_train)

Fitting 3 folds for each of 120 candidates, totalling 360 fits
[CV] END max_features=5, min_samples_split=1, n_estimators=30; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=30; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=30; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=50; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=50; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=50; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=70; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=70; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=70; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=90; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=90; total time=   0.0s
[CV] END max_features=5, min_sample

72 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/home/user/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/user/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/home/user/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/home/user/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constrain

In [31]:
print(reg.best_params_)
print(reg.best_score_) # MSE

{'max_features': 20, 'min_samples_split': 50, 'n_estimators': 90}
-4.031620454612725


In [32]:
# Best model rerun
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
rfr_best = RandomForestRegressor(random_state = 36, 
                            max_features = 20, min_samples_split = 50, n_estimators = 90)
rfr_best.fit(X_train, y_train)
y_pred_rf = rfr_best.predict(X_test)
print("R^2:", r2_score(y_test, y_pred_rf))
print("MSE:", mean_squared_error(y_test, y_pred_rf))
print("MAE:", mean_absolute_error(y_test, y_pred_rf))

R^2: 0.9545394818833998
MSE: 3.873249868082718
MAE: 0.2230146532916066


In [33]:
import pickle

with open ("randomforest_model.pkl","wb") as file:
    pickle.dump(rfr_best,file)

# Gradient Boosting

In [34]:
from sklearn.ensemble import GradientBoostingRegressor

gbt = GradientBoostingRegressor(random_state = 695)
boost_params = [{'n_estimators': [30, 50, 70, 90],
                  'max_features': [5, 10, 15, 20, 'sqrt', 'log2'],
                  'min_samples_split': [1, 50, 100, 150, 200]}]

In [35]:
reg = GridSearchCV(gbt, boost_params, cv = 3, scoring='neg_mean_squared_error', verbose = 2)
reg.fit(X_train, y_train)

Fitting 3 folds for each of 120 candidates, totalling 360 fits
[CV] END max_features=5, min_samples_split=1, n_estimators=30; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=30; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=30; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=50; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=50; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=50; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=70; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=70; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=70; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=90; total time=   0.0s
[CV] END max_features=5, min_samples_split=1, n_estimators=90; total time=   0.0s
[CV] END max_features=5, min_sample

72 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/home/user/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/user/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/home/user/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/home/user/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constrain

In [36]:
print(reg.best_params_)
print(reg.best_score_) # MSE

{'max_features': 20, 'min_samples_split': 50, 'n_estimators': 90}
-1.9972665876932385


In [37]:
y_pred_gbm = reg.predict(X_test)
mean_squared_error(y_test, y_pred_gbm)

1.5227823388409703

In [38]:
# Best model rerun
gbt_finetuned = GradientBoostingRegressor(random_state = 695, 
                            max_features = 20, min_samples_split = 50, n_estimators = 90)
gbt_finetuned.fit(X_train, y_train)
y_pred_gbt = gbt_finetuned.predict(X_test)
print("R^2:", r2_score(y_test, y_pred_gbt))
print("MSE:", mean_squared_error(y_test, y_pred_gbt))
print("MAE:", mean_absolute_error(y_test, y_pred_gbt))

R^2: 0.9821270311856265
MSE: 1.5227823388409703
MAE: 0.5477645735305599


In [39]:
import pickle

with open ("gradientboost_model.pkl","wb") as file:
    pickle.dump(gbt_finetuned,file)