In [59]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

warnings.filterwarnings('ignore')
pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
file_name ='original_sales_data.csv'

try:
    sales = pd.read_csv(r"C:\Users\JASWANTH REDDY\OneDrive\Desktop\Projects\Demand_Forecasting_Thesis\data\original_sales_data.csv")
except:
    print("Provided file name not found in the local, downloading from s3 bucket....")
    

In [5]:
df.head()

Unnamed: 0,order_number,order_date,sku_id,warehouse_id,customer_type,order_quantity,unit_sale_price,revenue
0,SO - 018900,2021-01-01,3551CA,GUT930,Export,105.0,7.07,742
1,SO - 018901,2021-01-01,3079BA,AXW291,Wholesale,151.0,134.5,20310
2,SO - 018902,2021-01-01,3250CA,AXW291,Distributor,300.0,34.75,10426
3,SO - 018903,2021-01-01,1161AA,GUT930,Wholesale,50.0,136.59,6830
4,SO - 018904,2021-01-01,3512AA,GUT930,Distributor,1000.0,0.1,103


In [4]:
df = sales.loc[:, ~sales.columns.str.startswith('Unnamed')]

df['order_date'] = pd.to_datetime(df['order_date'], format='%d-%b-%y')

Aggregate quantity by sku_id per each a day

In [23]:
sku_per_type_wh_daily_sales = df.groupby(['order_date', 'sku_id'])['order_quantity'].sum().reset_index()

sku_per_type_wh_daily_sales = sku_per_type_wh_daily_sales.sort_values(by=['sku_id', 'order_date'])

In [52]:
sku_per_type_wh_daily_sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33299 entries, 39 to 33118
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   order_date          33299 non-null  datetime64[ns]
 1   sku_id              33299 non-null  object        
 2   customer_type       33299 non-null  object        
 3   warehouse_id        33299 non-null  object        
 4   order_quantity      33299 non-null  float64       
 5   lag_1               30155 non-null  float64       
 6   lag_7               17105 non-null  float64       
 7   rolling_avg_7_days  18751 non-null  float64       
 8   cumulative_sum      33299 non-null  float64       
 9   year                33299 non-null  int32         
 10  month               33299 non-null  int32         
 11  day_of_week         33299 non-null  int32         
dtypes: datetime64[ns](1), float64(5), int32(3), object(3)
memory usage: 2.9+ MB


Feature Engineering to feed model with extra features for more accuracy

In [25]:
sku_per_type_wh_daily_sales['lag_1'] = sku_per_type_wh_daily_sales.groupby('sku_id')['order_quantity'].shift(1)
sku_per_type_wh_daily_sales['lag_7'] = sku_per_type_wh_daily_sales.groupby('sku_id')['order_quantity'].shift(7)
sku_per_type_wh_daily_sales['rolling_avg_7_days'] = sku_per_type_wh_daily_sales.groupby('sku_id')['order_quantity'].transform(lambda x: x.rolling(window=7).mean())
sku_per_type_wh_daily_sales['cumulative_sum'] = sku_per_type_wh_daily_sales.groupby('sku_id')['order_quantity'].cumsum()
sku_per_type_wh_daily_sales['year'] = sku_per_type_wh_daily_sales['order_date'].dt.year
sku_per_type_wh_daily_sales['month'] = sku_per_type_wh_daily_sales['order_date'].dt.month
sku_per_type_wh_daily_sales['day_of_week'] = sku_per_type_wh_daily_sales['order_date'].dt.dayofweek
sku_per_type_wh_daily_sales['week'] = sku_per_type_wh_daily_sales['order_date'].dt.isocalendar().week
# Removing events that have NaN values
sku_daily_sales = sku_per_type_wh_daily_sales.dropna(subset=['lag_1', 'lag_7', 'rolling_avg_7_days', 'cumulative_sum'])

Spliting data: 80% as train data and 20% as test data

In [26]:
split_date_filtered = sku_daily_sales['order_date'].quantile(0.8)
train_data_filtered = sku_daily_sales[sku_daily_sales['order_date'] <= split_date_filtered]
test_data_filtered = sku_daily_sales[sku_daily_sales['order_date'] > split_date_filtered]

Drop order_date as we captured everything from it

In [27]:
train_data_filtered.drop(columns='order_date', inplace=True)
test_data_filtered.drop(columns='order_date', inplace=True)

This function convert object data type categorical columns into integer type columns.

In [28]:
def oneHotEncoding(df, index_column):
    df=df.set_index(index_column)
    
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded_categorical = encoder.fit_transform(df[categorical_cols])
    one_hot_df = pd.DataFrame(one_hot_encoded_categorical, columns=encoder.get_feature_names_out(categorical_cols))
    
    one_hot_encoded = pd.concat([df.reset_index(),one_hot_df], axis=1)
    one_hot_encoded = one_hot_encoded.drop(categorical_cols, axis=1)
    df = one_hot_encoded.set_index(index_column)

    return df

train_data_filtered = oneHotEncoding(train_data_filtered, 'sku_id')
test_data_filtered = oneHotEncoding(test_data_filtered, 'sku_id')

Separate features and target for sets 

In [29]:
X_train_original = train_data_filtered.iloc[:,1:]
y_train_original = train_data_filtered['order_quantity']
X_test_original = test_data_filtered.iloc[:,1:]
y_test_original = test_data_filtered['order_quantity']

Fine tuning the model to get the most suitable hyperparameters for the model

In [36]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid to fine tune for Random Forest Model
param_grid = {
    'n_estimators': [100, 150, 200, 250, 300],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2,5,7,10],
    'min_samples_leaf': [1, 2, 4]
}

# Random Forest model 
rf_model = RandomForestRegressor(random_state=42)

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
grid_search.fit(X_train_original, y_train_original)

# Best parameters and performance
best_params = grid_search.best_params_

best_params


Fitting 3 folds for each of 240 candidates, totalling 720 fits


{'max_depth': 5,
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'n_estimators': 250}

({'max_depth': 10,
  'min_samples_leaf': 2,
  'min_samples_split': 10,
  'n_estimators': 300},
 np.float64(3169267.57223538))

In [37]:
# Train a Random Forest model
rf_model = RandomForestRegressor(random_state=42, n_estimators=250, max_depth=5, min_samples_leaf=2, min_samples_split=10)
rf_model.fit(X_train_original, y_train_original)

# Predict on the test set
rf_predictions = rf_model.predict(X_test_original)

# Evaluate the model
rf_mse = mean_squared_error(y_test_original, rf_predictions)
rf_rmse = np.sqrt(rf_mse)
rf_r2 = r2_score(y_test_original, rf_predictions)

rf_mse, rf_rmse, rf_r2

(np.float64(2600842.9805036364),
 np.float64(1612.7129256329647),
 0.6357431970409999)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

# Define the hyperparameter grid for XGBoost
param_grid_xgb = {
    'n_estimators': [100, 200, 300, 400],          # Number of boosting rounds
    'max_depth': [2, 3, 5, 7, 10],                   # Maximum tree depth
    'learning_rate': [0.01, 0.05, 0.1, 0.2],      # Step size for weight updates
    'subsample': [0.6, 0.8, 1.0],                 # Fraction of samples used for training each tree
    'colsample_bytree': [0.6, 0.8, 1.0],          # Fraction of features considered for each split
    'gamma': [0, 1, 5]                            # Minimum loss reduction required to make a split
}

# Initialize the XGBoost model
xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror')

# Perform Randomized Search with cross-validation
random_search_xgb = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid_xgb,
    n_iter=50,  # Number of parameter combinations to try
    cv=3,       # 3-fold cross-validation
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit the model to the training data
random_search_xgb.fit(X_train_original, y_train_original)

# Best parameters and performance
best_params_xgb = random_search_xgb.best_params_
best_score_xgb = -random_search_xgb.best_score_  # Convert from negative MSE to MSE

best_params_xgb, best_score_xgb


Fitting 3 folds for each of 50 candidates, totalling 150 fits


({'subsample': 1.0,
  'n_estimators': 100,
  'max_depth': 2,
  'learning_rate': 0.2,
  'gamma': 1,
  'colsample_bytree': 0.6},
 np.float64(3140223.3115354306))

({'subsample': 0.8,
  'n_estimators': 200,
  'max_depth': 3,
  'learning_rate': 0.05,
  'gamma': 0,
  'colsample_bytree': 0.8},
 np.float64(3222520.287230836))

In [50]:

from xgboost import XGBRegressor

xgb_model_original = XGBRegressor(random_state=42, n_estimators=100, learning_rate=0.2, subsample=1.0, max_depth= 2, gamma= 1, colsample_bytree=0.6)
xgb_model_original.fit(X_train_original, y_train_original)

# Predict on the test set
xgb_predictions_original_scale = xgb_model_original.predict(X_test_original)

# Evaluate the XGBoost model in the original scale
xgb_mse_no_log = mean_squared_error(y_test_original, xgb_predictions_original_scale)
xgb_rmse_no_log = np.sqrt(xgb_mse_no_log)
xgb_r2_no_log = r2_score(y_test_original, xgb_predictions_original_scale)

xgb_mse_no_log, xgb_rmse_no_log, xgb_r2_no_log


(np.float64(2846917.1009056633),
 np.float64(1687.2809786474993),
 0.6012796892243022)