# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from cstm_pkg_grp_9.data.sets import pop_target
from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

# Loading Data

In [2]:
train_df = pd.read_csv("../../data/processed/train_processed_1.csv")
test_df = pd.read_csv("../../data/processed/test_processed_1.csv")

In [3]:
train_df.head()

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,year,month,day,sales_revenue
0,-0.328719,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152,5.52
1,-0.454687,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152,3.12
2,-0.392039,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152,0.0
3,-0.244169,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152,0.0
4,-0.158854,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152,2.8


In [4]:
test_df.head()

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,year,month,day,sales_revenue
0,-0.20372,-0.096686,0.128218,-1.574645,-1.092484,1.66924,-0.679833,0.378393,0.0
1,-0.534879,-0.096686,0.128218,-1.574645,-1.092484,1.66924,-0.679833,0.378393,0.0
2,-0.658027,-0.096686,0.128218,-1.574645,-1.092484,1.66924,-0.679833,0.378393,0.0
3,0.976588,-0.096686,0.128218,-1.574645,-1.092484,1.66924,-0.679833,0.378393,18.56
4,-0.332285,-0.096686,0.128218,-1.574645,-1.092484,1.66924,-0.679833,0.378393,8.64


In [5]:
features_train, target_train = pop_target(train_df, 'sales_revenue')
features_test, target_test = pop_target(test_df, 'sales_revenue')

# Sample Dataset

In [26]:
df_train_sample = train_df.sample(frac=0.6, random_state=42)
df_test_sample = test_df

In [27]:
features_train, target_train = pop_target(df_train_sample, 'sales_revenue')
X_test, y_test = pop_target(df_test_sample, 'sales_revenue')

# Split Dataset

In [28]:
X_train, X_val, y_train, y_val = train_test_split(features_train, target_train, test_size=0.3, random_state=42)

# Baseline Model

In [29]:
y_mean = y_train.mean()
y_mean

np.float64(4.100991207383548)

In [30]:
y_base = np.full(y_train.shape, y_mean)
print("RMSE on Training Data:", rmse(y_train, y_base))

RMSE on Training Data: 10.485546198416937


In [31]:
y_val_base = np.full(y_val.shape, y_mean)
print("RMSE on Validation Data:", rmse(y_val, y_val_base))

RMSE on Validation Data: 10.421423291969589


In [32]:
y_mean_test = y_test.mean()
y_mean_test

np.float64(4.042860433356301)

In [33]:
y_test_base = np.full(y_test.shape, y_mean_test)
print("RMSE on Testing Data:", rmse(y_test, y_test_base))

RMSE on Testing Data: 11.313692990035433


# Modelling

In [34]:
def train_and_evaluate(models, X_train, y_train, X_val, y_val, X_test, y_test):
    results = {}
    
    for name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)
        
        # Predictions on training and validation sets
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)
        y_test_pred = model.predict(X_test)
        
        # Calculate RMSE for training and validation sets
        train_rmse = rmse(y_train, y_train_pred)
        val_rmse = rmse(y_val, y_val_pred)
        test_rmse = rmse(y_test, y_test_pred)
        
        # Store the results
        results[name] = {'Train RMSE': train_rmse, 'Validation RMSE': val_rmse, 'Test RMSE': test_rmse}
        print(f"{name} - Train RMSE: {train_rmse}, Validation RMSE: {val_rmse}, Test RMSE: {test_rmse}")
    
    return pd.DataFrame(results).T

In [35]:
models = {
    'Linear Regression': LinearRegression(),
    'ElasticNet': ElasticNet(),
    'Decision Tree': DecisionTreeRegressor(),
    # 'AdaBoost': AdaBoostRegressor(),
    'XGBoost': xgb.XGBRegressor(),
    'LightGBM': lgb.LGBMRegressor()
}

20% data

In [16]:
results = train_and_evaluate(models, X_train, y_train, X_val, y_val, X_test, y_test)
print(results)

Linear Regression - Train RMSE: 8.793064711944254, Validation RMSE: 8.615876570766103, Test RMSE: 9.7517968576874
ElasticNet - Train RMSE: 9.08791713840053, Validation RMSE: 8.88647531886048, Test RMSE: 10.034485762227293
Decision Tree - Train RMSE: 4.7407626506080605e-17, Validation RMSE: 9.627809981763969, Test RMSE: 10.953607280715385
XGBoost - Train RMSE: 8.024782620004522, Validation RMSE: 7.971787317244758, Test RMSE: 8.708550658571044
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019406 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 333
[LightGBM] [Info] Number of data points in the train set: 4860896, number of used features: 8
[LightGBM] [Info] Start training from score 4.097789
LightGBM - Train RMSE: 8.130337901979562, Validation RMSE: 8.061476138584686, Test RMSE: 9.006700057690841
                     Train RMSE  Validati

40% training data

In [25]:
results_1 = train_and_evaluate(models, X_train, y_train, X_val, y_val, X_test, y_test)
print(results_1)

Linear Regression - Train RMSE: 8.836159620996066, Validation RMSE: 8.668901508902156, Test RMSE: 9.753320269815749
ElasticNet - Train RMSE: 9.129077423784018, Validation RMSE: 8.957881942111001, Test RMSE: 10.03653655196235
Decision Tree - Train RMSE: 4.622067298393757e-17, Validation RMSE: 9.199770890295138, Test RMSE: 10.675047290513044
XGBoost - Train RMSE: 8.116208105910685, Validation RMSE: 7.992108653385777, Test RMSE: 8.822862602657834
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037049 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 333
[LightGBM] [Info] Number of data points in the train set: 9721793, number of used features: 8
[LightGBM] [Info] Start training from score 4.101612
LightGBM - Train RMSE: 8.21952771707973, Validation RMSE: 8.083485489810728, Test RMSE: 8.981273910896729
                     Train RMSE  Validat

60% training data

In [36]:
results_2 = train_and_evaluate(models, X_train, y_train, X_val, y_val, X_test, y_test)
print(results_2)

Linear Regression - Train RMSE: 8.816224522479516, Validation RMSE: 8.755906411582133, Test RMSE: 9.753503945408692
ElasticNet - Train RMSE: 9.108715396030968, Validation RMSE: 9.046492615645578, Test RMSE: 10.0375584340897
Decision Tree - Train RMSE: 4.408134196468292e-17, Validation RMSE: 9.043207411742186, Test RMSE: 10.58928996444265
XGBoost - Train RMSE: 8.094090687242801, Validation RMSE: 8.086534600532271, Test RMSE: 8.695374841821373
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053410 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 333
[LightGBM] [Info] Number of data points in the train set: 14582690, number of used features: 8
[LightGBM] [Info] Start training from score 4.100991
LightGBM - Train RMSE: 8.207400826243104, Validation RMSE: 8.181724101262898, Test RMSE: 8.928876827424633
                     Train RMSE  Validat

80% training data

In [16]:
results_3 = train_and_evaluate(models, X_train, y_train, X_val, y_val, X_test, y_test)
print(results_3)

Linear Regression - Train RMSE: 8.796069368976394, Validation RMSE: 8.801307196665107, Test RMSE: 9.752984224547916
ElasticNet - Train RMSE: 9.090472075815525, Validation RMSE: 9.094848393606206, Test RMSE: 10.038057217032579
Decision Tree - Train RMSE: 4.27360386428157e-17, Validation RMSE: 8.952985536438257, Test RMSE: 10.521419415153748
XGBoost - Train RMSE: 8.06508732795485, Validation RMSE: 8.084722470417498, Test RMSE: 8.640922301651479
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.251099 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 343
[LightGBM] [Info] Number of data points in the train set: 19443587, number of used features: 12
[LightGBM] [Info] Start training from score 4.101842
LightGBM - Train RMSE: 8.193124017611431, Validation RMSE: 8.210556140354118, Test RMSE: 8.93857857205692
                     Train RMSE  Valida