In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

warnings.filterwarnings('ignore')
pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
file_name ='original_sales_data.csv'

try:
    df = pd.read_csv(r"C:\Users\JASWANTH REDDY\OneDrive\Desktop\Projects\Demand_Forecasting_Thesis\data\original_sales_data.csv")
except:
    print("Provided file name not found in the local!")
    raise Exception('FileNotFoundError')

In [3]:
df = df.loc[:, ~df.columns.str.startswith('Unnamed')]
df['order_date'] = pd.to_datetime(df['order_date'], format='%d-%b-%y')

Extract features from date attribute for furthur analysis

In [4]:
df['year'] = df['order_date'].dt.year
df['month'] = df['order_date'].dt.month
df['week'] = df['order_date'].dt.isocalendar().week

Aggregate quanity by sku_id per week

In [5]:
weekly_sales = df.groupby(['sku_id','warehouse_id','year','month','week'])['order_quantity'].sum().reset_index()
weekly_sales = weekly_sales.sort_values(by=['year','month','week'])

Feature Engineering to feed model with extra features for more accuracy

In [6]:
weekly_sales['lag_1'] = weekly_sales.groupby(['sku_id','warehouse_id'])['order_quantity'].shift(1)
weekly_sales['lag_2'] = weekly_sales.groupby(['sku_id','warehouse_id'])['order_quantity'].shift(2)
weekly_sales['lag_7'] = weekly_sales.groupby(['sku_id','warehouse_id'])['order_quantity'].shift(7)
weekly_sales['rolling_avg_3_weeks'] = weekly_sales.groupby(['sku_id','warehouse_id'])['order_quantity'].transform(lambda x: x.rolling(window=3).mean())
weekly_sales['cumulative_sum'] = weekly_sales.groupby(['sku_id','warehouse_id'])['order_quantity'].cumsum()

# Removing events that have NaN values
sku_weekly_sales = weekly_sales.dropna(subset=['lag_1', 'lag_2', 'lag_7', 'rolling_avg_3_weeks', 'cumulative_sum'])

This function convert object data type categorical columns into integer type columns.

In [7]:
def oneHotEncoding(df, index_column):
    df=df.set_index(index_column)
    
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded_categorical = encoder.fit_transform(df[categorical_cols])
    one_hot_df = pd.DataFrame(one_hot_encoded_categorical, columns=encoder.get_feature_names_out(categorical_cols))
    
    one_hot_encoded = pd.concat([df.reset_index(),one_hot_df], axis=1)
    one_hot_encoded = one_hot_encoded.drop(categorical_cols, axis=1)
    df = one_hot_encoded.set_index(index_column)

    return df

data = oneHotEncoding(sku_weekly_sales, 'sku_id')

In [8]:
X = data.drop(columns= ['order_quantity'])
y = data['order_quantity']

Spliting data: 75% as train data and 25% as test data

In [9]:
split_ratio = 0.75
split_index = int(len(data) * split_ratio)

In [10]:
X_train = X.iloc[:split_index] 
y_train = y.iloc[:split_index] 

X_test = X.iloc[split_index:]
y_test = y.iloc[split_index:]  

Fine tuning the model to get the most suitable hyperparameters for the model

In [13]:
params = {
    'n_estimators': [100, 150, 200, 250, 300],
    'max_depth': [2,3,5,7,9],
    'min_samples_split': [6, 8, 10, 12],
    'min_samples_leaf': [2,4,6]
}

random_forest_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=random_forest_model, scoring='neg_mean_squared_error', param_grid=params, cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

best_params

Fitting 3 folds for each of 300 candidates, totalling 900 fits


{'max_depth': 9,
 'min_samples_leaf': 2,
 'min_samples_split': 6,
 'n_estimators': 250}

In [14]:
random_forest_model = RandomForestRegressor(random_state=42, n_estimators=250, max_depth=9, min_samples_leaf=2, min_samples_split=6)
random_forest_model.fit(X_train, y_train)

predictions = random_forest_model.predict(X_test)

model_mse = mean_squared_error(y_test, predictions)
model_rmse = np.sqrt(model_mse)
model_r2 = r2_score(y_test, predictions)

model_mse, model_rmse, model_r2

(np.float64(555573.619994382),
 np.float64(745.3681103953818),
 0.9313806879071653)

In [382]:
xgb_params = {
    'n_estimators': [100, 150, 200, 250, 300],
    'max_depth': [2, 3, 5, 7, 10],
    'learning_rate': [0.01, 0.03, 0.05, 0.06, 0.07],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 5]
}

xgboost_model = XGBRegressor(random_state=42, objective='reg:squarederror')

xgboost_random_search = RandomizedSearchCV(
    estimator=xgboost_model,
    param_distributions=xgb_params,
    n_iter=50,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

xgboost_random_search.fit(X_train, y_train)

best_params_xgb = xgboost_random_search.best_params_

best_params_xgb


Fitting 3 folds for each of 50 candidates, totalling 150 fits


{'subsample': 1.0,
 'n_estimators': 300,
 'max_depth': 2,
 'learning_rate': 0.03,
 'gamma': 1,
 'colsample_bytree': 1.0}

In [12]:
xgboost_model = XGBRegressor(random_state=42, n_estimators=300, learning_rate=0.03, subsample=1.0, max_depth= 2, gamma= 1, colsample_bytree=1)
xgboost_model.fit(X_train, y_train)

xgboost_predictions = xgboost_model.predict(X_test)

xgboost_mse = mean_squared_error(y_test, xgboost_predictions)
xgboost_rmse = np.sqrt(xgboost_mse)
xgboost_r2 = r2_score(y_test, xgboost_predictions)

xgboost_mse, xgboost_rmse, xgboost_r2


(np.float64(1224574.512021412),
 np.float64(1106.6049484894834),
 0.8487518888636619)