In [14]:
import pandas as pd
import numpy as np
import warnings
from sklearn.metrics import mean_squared_error, r2_score

warnings.filterwarnings('ignore')
pd.set_option('display.float_format', '{:.2f}'.format)

In [15]:
sales = pd.read_csv("data/original_sales_data.csv")

In [24]:
sales.head()

Unnamed: 0,order_number,order_date,sku_id,warehouse_id,customer_type,order_quantity,unit_sale_price,revenue,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,SO - 018900,01-Jan-21,3551CA,GUT930,Export,105.0,7.07,742,,,,,
1,SO - 018901,01-Jan-21,3079BA,AXW291,Wholesale,151.0,134.5,20310,,,,,
2,SO - 018902,01-Jan-21,3250CA,AXW291,Distributor,300.0,34.75,10426,,,,,
3,SO - 018903,01-Jan-21,1161AA,GUT930,Wholesale,50.0,136.59,6830,,,,,
4,SO - 018904,01-Jan-21,3512AA,GUT930,Distributor,1000.0,0.1,103,,,,,


#####Include customer_type and warehouse_id to the training data they might add some weightage to improve the accuracy of the model.

In [16]:
sales_data_cleaned = sales.loc[:, ~sales.columns.str.startswith('Unnamed')]

sales_data_cleaned['order_date'] = pd.to_datetime(sales_data_cleaned['order_date'], format='%d-%b-%y')

In [17]:
sku_daily_sales = sales_data_cleaned.groupby(['order_date', 'sku_id'])['order_quantity'].sum().reset_index()

sku_daily_sales = sku_daily_sales.sort_values(by=['sku_id', 'order_date'])

# Feature Engineering:
sku_daily_sales['lag_1'] = sku_daily_sales.groupby('sku_id')['order_quantity'].shift(1)
sku_daily_sales['lag_7'] = sku_daily_sales.groupby('sku_id')['order_quantity'].shift(7)
sku_daily_sales['rolling_avg_7_days'] = sku_daily_sales.groupby('sku_id')['order_quantity'].transform(lambda x: x.rolling(window=7).mean())
sku_daily_sales['cumulative_sum'] = sku_daily_sales.groupby('sku_id')['order_quantity'].cumsum()
sku_daily_sales['year'] = sku_daily_sales['order_date'].dt.year
sku_daily_sales['month'] = sku_daily_sales['order_date'].dt.month
sku_daily_sales['day_of_week'] = sku_daily_sales['order_date'].dt.dayofweek

# Drop rows with NaN values caused by lagging and rolling calculations
sku_daily_sales = sku_daily_sales.dropna(subset=['lag_1', 'lag_7', 'rolling_avg_7_days', 'cumulative_sum'])

In [18]:
# Split data temporally: Use last 20% of dates as the test set
split_date_filtered = sku_daily_sales['order_date'].quantile(0.8)
train_data_filtered = sku_daily_sales[sku_daily_sales['order_date'] <= split_date_filtered]
test_data_filtered = sku_daily_sales[sku_daily_sales['order_date'] > split_date_filtered]

# Separate features and target for train and test sets
X_train_original = train_data_filtered[['lag_1', 'lag_7', 'rolling_avg_7_days', 'cumulative_sum', 'year', 'month', 'day_of_week']]
y_train_original = train_data_filtered['order_quantity']
X_test_original = test_data_filtered[['lag_1', 'lag_7', 'rolling_avg_7_days', 'cumulative_sum', 'year', 'month', 'day_of_week']]
y_test_original = test_data_filtered['order_quantity']

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define the hyperparameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'max_depth': [10, 20, None],      # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4]     # Minimum samples required in a leaf node
}

# Initialize the Random Forest model
rf_model = RandomForestRegressor(random_state=42)

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
grid_search.fit(X_train_original, y_train_original)

# Best parameters and performance
best_params = grid_search.best_params_
best_score = -grid_search.best_score_  # Convert from negative MSE to MSE

best_params, best_score


Fitting 3 folds for each of 81 candidates, totalling 243 fits


({'max_depth': 10,
  'min_samples_leaf': 2,
  'min_samples_split': 10,
  'n_estimators': 300},
 np.float64(3169267.57223538))

({'max_depth': 10,
  'min_samples_leaf': 2,
  'min_samples_split': 10,
  'n_estimators': 300},
 np.float64(3169267.57223538))

In [20]:
# Train a Random Forest model
rf_model = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10, min_samples_leaf=2, min_samples_split=10)
rf_model.fit(X_train_original, y_train_original)

# Predict on the test set
rf_predictions = rf_model.predict(X_test_original)

# Evaluate the model
rf_mse = mean_squared_error(y_test_original, rf_predictions)
rf_rmse = np.sqrt(rf_mse)
rf_r2 = r2_score(y_test_original, rf_predictions)

rf_mse, rf_rmse, rf_r2

(np.float64(2650096.7426475133),
 np.float64(1627.911773606762),
 0.6288450420709688)

In [21]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

# Define the hyperparameter grid for XGBoost
param_grid_xgb = {
    'n_estimators': [100, 200, 300, 400],          # Number of boosting rounds
    'max_depth': [3, 5, 7, 10],                   # Maximum tree depth
    'learning_rate': [0.01, 0.05, 0.1, 0.2],      # Step size for weight updates
    'subsample': [0.6, 0.8, 1.0],                 # Fraction of samples used for training each tree
    'colsample_bytree': [0.6, 0.8, 1.0],          # Fraction of features considered for each split
    'gamma': [0, 1, 5]                            # Minimum loss reduction required to make a split
}

# Initialize the XGBoost model
xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror')

# Perform Randomized Search with cross-validation
random_search_xgb = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid_xgb,
    n_iter=50,  # Number of parameter combinations to try
    cv=3,       # 3-fold cross-validation
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit the model to the training data
random_search_xgb.fit(X_train_original, y_train_original)

# Best parameters and performance
best_params_xgb = random_search_xgb.best_params_
best_score_xgb = -random_search_xgb.best_score_  # Convert from negative MSE to MSE

best_params_xgb, best_score_xgb


Fitting 3 folds for each of 50 candidates, totalling 150 fits


({'subsample': 0.8,
  'n_estimators': 200,
  'max_depth': 3,
  'learning_rate': 0.05,
  'gamma': 0,
  'colsample_bytree': 0.8},
 np.float64(3222520.287230836))

({'subsample': 0.8,
  'n_estimators': 200,
  'max_depth': 3,
  'learning_rate': 0.05,
  'gamma': 0,
  'colsample_bytree': 0.8},
 np.float64(3222520.287230836))

In [23]:

from xgboost import XGBRegressor

xgb_model_original = XGBRegressor(random_state=42, n_estimators=200, learning_rate=0.03, subsample=0.8, max_depth= 3, gamma= 0, colsample_bytree=0.8)
xgb_model_original.fit(X_train_original, y_train_original)

# Predict on the test set
xgb_predictions_original_scale = xgb_model_original.predict(X_test_original)

# Evaluate the XGBoost model in the original scale
xgb_mse_no_log = mean_squared_error(y_test_original, xgb_predictions_original_scale)
xgb_rmse_no_log = np.sqrt(xgb_mse_no_log)
xgb_r2_no_log = r2_score(y_test_original, xgb_predictions_original_scale)

xgb_mse_no_log, xgb_rmse_no_log, xgb_r2_no_log


(np.float64(2821763.2521880954),
 np.float64(1679.8104810329335),
 0.6048025702996532)