# Import Libraries

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
from joblib import dump
from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.model_selection import RandomizedSearchCV, train_test_split

from cstm_pkg_grp_9.data.sets import pop_target

# Loading Data

In [2]:
# Load the pre-processed train and test splits from their relative CSV paths.
train_df = pd.read_csv(filepath_or_buffer="../../data/processed/train_processed_1.csv")
test_df = pd.read_csv(filepath_or_buffer="../../data/processed/test_processed_1.csv")

In [3]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,year,month,day,sales_revenue
0,-0.328719,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152,5.52
1,-0.454687,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152,3.12
2,-0.392039,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152,0.0
3,-0.244169,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152,0.0
4,-0.158854,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152,2.8


In [4]:
# (rows, columns) of the full training frame.
train_df.shape

(34720691, 9)

In [5]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,year,month,day,sales_revenue
0,-0.20372,-0.096686,0.128218,-1.574645,-1.092484,1.66924,-0.679833,0.378393,0.0
1,-0.534879,-0.096686,0.128218,-1.574645,-1.092484,1.66924,-0.679833,0.378393,0.0
2,-0.658027,-0.096686,0.128218,-1.574645,-1.092484,1.66924,-0.679833,0.378393,0.0
3,0.976588,-0.096686,0.128218,-1.574645,-1.092484,1.66924,-0.679833,0.378393,18.56
4,-0.332285,-0.096686,0.128218,-1.574645,-1.092484,1.66924,-0.679833,0.378393,8.64


In [6]:
# (rows, columns) of the full test frame.
test_df.shape

(12160986, 9)

# Sampling Data

In [7]:
# The test frame is used in full; this is a plain alias, not a copy.
df_test_sample = test_df
# Train on a seeded 60% random sample of the training rows.
df_train_sample = train_df.sample(
    frac=0.6,
    random_state=42,
)

In [8]:
# Split each frame into its feature part and the 'sales_revenue' target.
# NOTE(review): pop_target is a project helper not visible here. Confirm whether
# it mutates the frame it receives: df_test_sample is an alias of test_df, so an
# in-place pop would also alter test_df and make this cell non-idempotent.
features_train, target_train = pop_target(df_train_sample, 'sales_revenue')
X_test, y_test = pop_target(df_test_sample, 'sales_revenue')

# Splitting Data

In [9]:
# Hold out 30% of the sampled training rows as a validation set (seeded split).
X_train, X_val, y_train, y_val = train_test_split(
    features_train,
    target_train,
    test_size=0.3,
    random_state=42,
)

# Baseline Model

In [10]:
# Baseline predictor: the mean of the training target, reused below as a
# constant prediction for the train, validation and test sets.
y_mean = y_train.mean()
y_mean

np.float64(4.100991207383548)

In [11]:
# Constant-mean baseline scored against the training target.
y_base = np.full(shape=y_train.shape, fill_value=y_mean)
print("RMSE on Training Data:", rmse(y_true=y_train, y_pred=y_base))

RMSE on Training Data: 10.485546198416937


In [12]:
# Same training-mean constant, scored against the validation target.
y_val_base = np.full(shape=y_val.shape, fill_value=y_mean)
print("RMSE on Validation Data:", rmse(y_true=y_val, y_pred=y_val_base))

RMSE on Validation Data: 10.421423291969589


In [13]:
# Same training-mean constant, scored against the test target.
y_test_base = np.full(shape=y_test.shape, fill_value=y_mean)
print("RMSE on Testing Data:", rmse(y_true=y_test, y_pred=y_test_base))

RMSE on Testing Data: 11.313842329627265


# Modelling

## XGBoost Model 1

In [14]:
# Model 1: XGBoost regressor with no hyperparameters overridden.
xgb_1 = xgb.XGBRegressor()

In [15]:
# Train model 1 on the training split.
xgb_1.fit(X=X_train, y=y_train)

In [16]:
# Training-split RMSE for model 1.
y_train_pred = xgb_1.predict(X_train)
train_rmse = rmse(y_true=y_train, y_pred=y_train_pred)
print(train_rmse)

8.094090687242801


In [17]:
# Validation-split RMSE for model 1.
y_val_pred = xgb_1.predict(X_val)
val_rmse = rmse(y_true=y_val, y_pred=y_val_pred)
print(val_rmse)

8.086534600532271


In [25]:
# Test-set RMSE for model 1.
y_test_pred = xgb_1.predict(X_test)
test_rmse = rmse(y_true=y_test, y_pred=y_test_pred)
print(test_rmse)

8.695374841821373


## XGBoost Model 2

In [20]:
# Model 2: only the learning rate is overridden (0.1).
xgb_2 = xgb.XGBRegressor(
    learning_rate=0.1,
)

In [21]:
# Train model 2 on the training split.
xgb_2.fit(X=X_train, y=y_train)

In [22]:
# Training-split RMSE for model 2.
y_train_pred = xgb_2.predict(X_train)
train_rmse = rmse(y_true=y_train, y_pred=y_train_pred)
print(train_rmse)

8.268369761487197


In [23]:
# Validation-split RMSE for model 2.
y_val_pred = xgb_2.predict(X_val)
val_rmse = rmse(y_true=y_val, y_pred=y_val_pred)
print(val_rmse)

8.229352760448114


## XGBoost Model 3

In [29]:
# Model 3: only the maximum tree depth is overridden (5).
xgb_3 = xgb.XGBRegressor(
    max_depth=5,
)

In [30]:
# Train model 3 on the training split.
xgb_3.fit(X=X_train, y=y_train)

In [31]:
# Training-split RMSE for model 3.
y_train_pred = xgb_3.predict(X_train)
train_rmse = rmse(y_true=y_train, y_pred=y_train_pred)
print(train_rmse)

8.214832813199678


In [33]:
# Validation-split RMSE for model 3.
y_val_pred = xgb_3.predict(X_val)
val_rmse = rmse(y_true=y_val, y_pred=y_val_pred)
print(val_rmse)

8.179516336870437


## XGBoost Model 4

In [42]:
# Model 4: only the maximum tree depth is overridden (7).
xgb_4 = xgb.XGBRegressor(
    max_depth=7,
)

In [43]:
# Train model 4 on the training split.
xgb_4.fit(X=X_train, y=y_train)

In [44]:
# Training-split RMSE for model 4.
y_train_pred = xgb_4.predict(X_train)
train_rmse = rmse(y_true=y_train, y_pred=y_train_pred)
print(train_rmse)

7.974778205797334


In [45]:
# Validation-split RMSE for model 4.
y_val_pred = xgb_4.predict(X_val)
val_rmse = rmse(y_true=y_val, y_pred=y_val_pred)
print(val_rmse)

7.996934548344966


## XGBoost Model 5

In [58]:
# Model 5: depth-7 trees with min_child_weight set to 5.
xgb_5 = xgb.XGBRegressor(
    max_depth=7,
    min_child_weight=5,
)

In [59]:
# Train model 5 on the training split.
xgb_5.fit(X=X_train, y=y_train)

In [60]:
# Training-split RMSE for model 5.
y_train_pred = xgb_5.predict(X_train)
train_rmse = rmse(y_true=y_train, y_pred=y_train_pred)
print(train_rmse)

7.981239945889989


In [61]:
# Validation-split RMSE for model 5.
y_val_pred = xgb_5.predict(X_val)
val_rmse = rmse(y_true=y_val, y_pred=y_val_pred)
print(val_rmse)

8.000789875327802


## XGBoost Model 6

In [72]:
# Model 6: depth-7 trees with gamma (minimum split loss) set to 2.
xgb_6 = xgb.XGBRegressor(
    max_depth=7,
    gamma=2,
)

In [73]:
# Train model 6 on the training split.
xgb_6.fit(X=X_train, y=y_train)

In [74]:
# Training-split RMSE for model 6.
y_train_pred = xgb_6.predict(X_train)
train_rmse = rmse(y_true=y_train, y_pred=y_train_pred)
print(train_rmse)

7.974778266911249


In [75]:
# Validation-split RMSE for model 6.
y_val_pred = xgb_6.predict(X_val)
val_rmse = rmse(y_true=y_val, y_pred=y_val_pred)
print(val_rmse)

7.996934608437609


## XGBoost Model 7

In [76]:
# Model 7: depth-7 trees with a row subsample ratio of 0.5.
xgb_7 = xgb.XGBRegressor(
    max_depth=7,
    subsample=0.5,
)

In [77]:
# Train model 7 on the training split.
xgb_7.fit(X=X_train, y=y_train)

In [78]:
# Training-split RMSE for model 7.
y_train_pred = xgb_7.predict(X_train)
train_rmse = rmse(y_true=y_train, y_pred=y_train_pred)
print(train_rmse)

7.993490226884053


In [79]:
# Validation-split RMSE for model 7.
y_val_pred = xgb_7.predict(X_val)
val_rmse = rmse(y_true=y_val, y_pred=y_val_pred)
print(val_rmse)

8.042236956147752


## XGBoost Model 8

In [84]:
# Model 8: depth-7 trees with the L1 regularisation term (reg_alpha) set to 0.01.
xgb_8 = xgb.XGBRegressor(
    max_depth=7,
    reg_alpha=0.01,
)

In [85]:
# Train model 8 on the training split.
xgb_8.fit(X=X_train, y=y_train)

In [86]:
# Training-split RMSE for model 8.
y_train_pred = xgb_8.predict(X_train)
train_rmse = rmse(y_true=y_train, y_pred=y_train_pred)
print(train_rmse)

7.974778592266647


In [87]:
# Validation-split RMSE for model 8.
y_val_pred = xgb_8.predict(X_val)
val_rmse = rmse(y_true=y_val, y_pred=y_val_pred)
print(val_rmse)

7.996934613525186


## XGBoost Model 9

In [88]:
# Model 9: depth-7 trees with the L2 regularisation term (reg_lambda) set to 0.01.
xgb_9 = xgb.XGBRegressor(
    max_depth=7,
    reg_lambda=0.01,
)

In [89]:
# Train model 9 on the training split.
xgb_9.fit(X=X_train, y=y_train)

In [90]:
# Training-split RMSE for model 9.
y_train_pred = xgb_9.predict(X_train)
train_rmse = rmse(y_true=y_train, y_pred=y_train_pred)
print(train_rmse)

7.976566431324437


In [91]:
# Validation-split RMSE for model 9.
y_val_pred = xgb_9.predict(X_val)
val_rmse = rmse(y_true=y_val, y_pred=y_val_pred)
print(val_rmse)

8.002137307773783


## XGBoost Model 10 (default hyperparameters)

In [38]:
xgb_4 = xgb.XGBRegressor()

In [39]:
xgb_4.fit(X_train, y_train)

In [40]:
y_train_pred = xgb_4.predict(X_train)
train_rmse = rmse(y_train, y_train_pred)
print(train_rmse)

8.116208105910685


In [41]:
y_val_pred = xgb_4.predict(X_val)
val_rmse = rmse(y_val, y_val_pred)
print(val_rmse)

7.992108653385777


In [42]:
y_test_pred = xgb_4.predict(X_test)
test_rmse = rmse(y_test, y_test_pred)
print(test_rmse)

8.817446782861964


# Best Model on Test set

In [93]:
# Test-set RMSE of the model bound to xgb_4, the one persisted in the next section.
y_test_pred = xgb_4.predict(X_test)
test_rmse = rmse(y_true=y_test, y_pred=y_test_pred)
print(test_rmse)

8.597856885355244


# Saving Model

In [94]:
# Persist the model bound to xgb_4 with joblib. The target directory is created
# first because joblib.dump does not create missing parent folders, and a
# failure here would come only after all the training above has run.
model_path = Path('../../models/predictive/xgb_4.joblib')
model_path.parent.mkdir(parents=True, exist_ok=True)
# Pass the path as a string so the cell output stays a list of plain strings.
dump(xgb_4, str(model_path))

['../../models/predictive/xgb_4.joblib']