In [184]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

warnings.filterwarnings('ignore')
pd.set_option('display.float_format', '{:.2f}'.format)

In [185]:
file_name ='original_sales_data.csv'

try:
    df = pd.read_csv(r"C:\Users\JASWANTH REDDY\OneDrive\Desktop\Projects\Demand_Forecasting_Thesis\data\original_sales_data.csv")
except:
    print("Provided file name not found in the local!")
    raise Exception('FileNotFoundError')

In [186]:
df = df.loc[:, ~df.columns.str.startswith('Unnamed')]
df['order_date'] = pd.to_datetime(df['order_date'], format='%d-%b-%y')

Extract some features from date attribute

In [187]:
df['year'] = df['order_date'].dt.year
df['month'] = df['order_date'].dt.month

In [188]:
monthly_sales = df.groupby(['sku_id','warehouse_id','year','month'])['order_quantity'].sum().reset_index()
monthly_sales = monthly_sales.sort_values(by=['year','month'])

In [189]:
monthly_sales['lag_1'] = monthly_sales.groupby(['sku_id','warehouse_id'])['order_quantity'].shift(1)
monthly_sales['lag_2'] = monthly_sales.groupby(['sku_id','warehouse_id'])['order_quantity'].shift(2)
monthly_sales['rolling_avg_3_months'] = monthly_sales.groupby(['sku_id','warehouse_id'])['order_quantity'].transform(lambda x: x.rolling(window=3).mean())
monthly_sales['cumulative_sum'] = monthly_sales.groupby(['sku_id','warehouse_id'])['order_quantity'].cumsum()

# Removing events that have NaN values
sku_monthly_sales = monthly_sales.dropna(subset=['lag_1', 'lag_2', 'rolling_avg_3_months', 'cumulative_sum'])

This function convert object data type categorical columns into integer type columns.

In [190]:
def oneHotEncoding(df, index_column):
    df=df.set_index(index_column)
    
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded_categorical = encoder.fit_transform(df[categorical_cols])
    one_hot_df = pd.DataFrame(one_hot_encoded_categorical, columns=encoder.get_feature_names_out(categorical_cols))
    
    one_hot_encoded = pd.concat([df.reset_index(),one_hot_df], axis=1)
    one_hot_encoded = one_hot_encoded.drop(categorical_cols, axis=1)
    df = one_hot_encoded.set_index(index_column)

    return df

data = oneHotEncoding(sku_monthly_sales, 'sku_id')

In [191]:
X = data.drop(columns= ['order_quantity'])
y = data['order_quantity']

Spliting data: 75% as train data and 25% as test data

In [192]:
split_ratio = 0.75
split_index = int(len(data) * split_ratio)

In [193]:
X_train = X.iloc[:split_index] 
y_train = y.iloc[:split_index] 

X_test = X.iloc[split_index:]
y_test = y.iloc[split_index:]  

Fine tuning the model to get the most suitable hyperparameters for the model

In [159]:
params = {
    'n_estimators': [100, 150, 200, 250, 300, 350, 400],
    'max_depth': [2,3,5,7,9],
    'min_samples_split': [6, 8, 10, 12],
    'min_samples_leaf': [1, 2, 3, 4]
}

random_forest_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=random_forest_model, scoring='neg_mean_squared_error', param_grid=params, cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

best_params

Fitting 3 folds for each of 560 candidates, totalling 1680 fits


{'max_depth': 9,
 'min_samples_leaf': 1,
 'min_samples_split': 6,
 'n_estimators': 250}

In [194]:
random_forest_model = RandomForestRegressor(random_state=42, n_estimators=250, max_depth=9, min_samples_leaf=1, min_samples_split=6)
random_forest_model.fit(X_train, y_train)

predictions = random_forest_model.predict(X_test)

model_mse = mean_squared_error(y_test, predictions)
model_rmse = np.sqrt(model_mse)
model_r2 = r2_score(y_test, predictions)

model_mse, model_rmse, model_r2

(np.float64(3344924.7273885757),
 np.float64(1828.9135374283212),
 0.8595069467189429)

In [None]:
xgb_params = {
    'n_estimators': [100, 150, 200, 250],
    'max_depth': [2, 3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 5]
}

xgboost_model = XGBRegressor(random_state=42, objective='reg:squarederror')

xgboost_random_search = RandomizedSearchCV(
    estimator=xgboost_model,
    param_distributions=xgb_params,
    n_iter=50,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

xgboost_random_search.fit(X_train, y_train)

best_params_xgb = xgboost_random_search.best_params_

best_params_xgb


In [183]:
xgboost_model = XGBRegressor(random_state=42, n_estimators=200, learning_rate=0.05, subsample=0.8, max_depth= 2, gamma= 1, colsample_bytree=1)
xgboost_model.fit(X_train, y_train)

xgboost_predictions = xgboost_model.predict(X_test)

xgboost_mse = mean_squared_error(y_test, xgboost_predictions)
xgboost_rmse = np.sqrt(xgboost_mse)
xgboost_r2 = r2_score(y_test, xgboost_predictions)

xgboost_mse, xgboost_rmse, xgboost_r2


(np.float64(2844381.143029355),
 np.float64(1686.5293187577129),
 0.811440227056152)