##### Model Training

Import Data and Required Packages

In [1]:
# Basic Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

In [10]:
# Modelling imports
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [3]:
# Read the four CSV files under data/ whose names mark them as the scaled
# train/test feature and target splits (one DataFrame per file).
scaled_X_train, scaled_y_train, scaled_X_test, scaled_y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/scaled_X_train.csv',
        'data/scaled_y_train.csv',
        'data/scaled_X_test.csv',
        'data/scaled_y_test.csv',
    )
)

In [4]:
def evaluate_model(true, predicted):
    """Score a set of predictions against the observed targets.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Predicted values, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and R2 score, in that order. The mean squared error is only an
        intermediate value and is not returned.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it instead of calling
    # mean_squared_error a second time.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [11]:
# Candidate regressors, all with library-default hyperparameters
# (CatBoost only has its training log silenced).
models = {
    "Linear Regression": LinearRegression(),
    # Linear regression with L1 regularization
    "Lasso": Lasso(),
    # Linear regression with L2 regularization
    "Ridge": Ridge(),

    "K-Neighbors Regressor": KNeighborsRegressor(),

    "Decision Tree": DecisionTreeRegressor(),

    "Random Forest Regressor": RandomForestRegressor(),
    # extreme gradient boosting
    "XGBRegressor": XGBRegressor(),
    # gradient boosting from the catboost package
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),

    "AdaBoost Regressor": AdaBoostRegressor(),

    "Gradient Boosting Regressor": GradientBoostingRegressor()
}

# Parallel lists: model name and its test-set R2, filled in loop order
model_list = []
r2_list = []

# The target is read from CSV as a DataFrame; the recorded output of this cell
# shows scikit-learn's column_or_1d / fit_method warnings about a column-vector
# y. Flatten it once so every estimator is fitted on a 1-D target.
scaled_y_train_1d = np.ravel(scaled_y_train)

# Iterate name/estimator pairs directly instead of re-materialising
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    # Train model
    model.fit(scaled_X_train, scaled_y_train_1d)

    # Make predictions
    y_train_pred = model.predict(scaled_X_train)
    y_test_pred = model.predict(scaled_X_test)

    # Evaluate on the data the model was fitted on ...
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(scaled_y_train, y_train_pred)
    # ... and on the held-out test split
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(scaled_y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    # Only the test-set R2 is kept for ranking
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 0.2597
- Mean Absolute Error: 0.1988
- R2 Score: 0.7413
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.2538
- Mean Absolute Error: 0.1945
- R2 Score: 0.7581


Lasso
Model performance for Training set
- Root Mean Squared Error: 0.5106
- Mean Absolute Error: 0.4298
- R2 Score: 0.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.5160
- Mean Absolute Error: 0.4336
- R2 Score: -0.0003


Ridge
Model performance for Training set
- Root Mean Squared Error: 0.2597
- Mean Absolute Error: 0.1988
- R2 Score: 0.7413
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.2538
- Mean Absolute Error: 0.1944
- R2 Score: 0.7581


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 0.1698
- Mean Absolute Error: 0.1166
- R2 Score: 0.8895
----------------------

  return fit_method(estimator, *args, **kwargs)


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 0.0985
- Mean Absolute Error: 0.0587
- R2 Score: 0.9628
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.1763
- Mean Absolute Error: 0.1167
- R2 Score: 0.8832


XGBRegressor
Model performance for Training set
- Root Mean Squared Error: 0.1171
- Mean Absolute Error: 0.0806
- R2 Score: 0.9474
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.1575
- Mean Absolute Error: 0.1119
- R2 Score: 0.9068


CatBoosting Regressor
Model performance for Training set
- Root Mean Squared Error: 0.1340
- Mean Absolute Error: 0.0972
- R2 Score: 0.9311
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.1528
- Mean Absolute Error: 0.1128
- R2 Score: 0.9123




  y = column_or_1d(y, warn=True)


AdaBoost Regressor
Model performance for Training set
- Root Mean Squared Error: 0.2896
- Mean Absolute Error: 0.2398
- R2 Score: 0.6783
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.2854
- Mean Absolute Error: 0.2350
- R2 Score: 0.6941




  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


Gradient Boosting Regressor
Model performance for Training set
- Root Mean Squared Error: 0.1977
- Mean Absolute Error: 0.1552
- R2 Score: 0.8501
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.1992
- Mean Absolute Error: 0.1554
- R2 Score: 0.8509




In [12]:

# Pair each model name with its test-set R2 and show the best score first
(
    pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score'])
    .sort_values(by=["R2_Score"], ascending=False)
)

Unnamed: 0,Model Name,R2_Score
7,CatBoosting Regressor,0.912298
6,XGBRegressor,0.906762
5,Random Forest Regressor,0.883187
9,Gradient Boosting Regressor,0.850914
3,K-Neighbors Regressor,0.845077
4,Decision Tree,0.814294
2,Ridge,0.758088
0,Linear Regression,0.758084
8,AdaBoost Regressor,0.694059
1,Lasso,-0.000264


Also run the models that don't require feature scaling on the unscaled data to see whether they perform better.

In [13]:
# Read the train/test feature and target CSV files under data/
# (file names without the "scaled_" prefix), one DataFrame per file.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/X_train.csv',
        'data/y_train.csv',
        'data/X_test.csv',
        'data/y_test.csv',
    )
)

In [14]:
# Tree-based regressors only, all with library-default hyperparameters
# (CatBoost only has its training log silenced).
models = {
    "Decision Tree": DecisionTreeRegressor(),

    "Random Forest Regressor": RandomForestRegressor(),
    # extreme gradient boosting
    "XGBRegressor": XGBRegressor(),
    # gradient boosting from the catboost package
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),

    "AdaBoost Regressor": AdaBoostRegressor(),

    "Gradient Boosting Regressor": GradientBoostingRegressor()
}

# Parallel lists: model name and its test-set R2, filled in loop order
model_list = []
r2_list = []

# The target is read from CSV as a DataFrame; the recorded output of this cell
# shows scikit-learn's column_or_1d / fit_method warnings about a column-vector
# y. Flatten it once so every estimator is fitted on a 1-D target.
y_train_1d = np.ravel(y_train)

# Iterate name/estimator pairs directly instead of re-materialising
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    # Train model
    model.fit(X_train, y_train_1d)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate on the data the model was fitted on ...
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    # ... and on the held-out test split
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    # Only the test-set R2 is kept for ranking
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

Decision Tree
Model performance for Training set
- Root Mean Squared Error: 806.0401
- Mean Absolute Error: 317.9561
- R2 Score: 0.9661
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 2159.3622
- Mean Absolute Error: 1269.5685
- R2 Score: 0.7667




  return fit_method(estimator, *args, **kwargs)


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 955.0788
- Mean Absolute Error: 559.0844
- R2 Score: 0.9524
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1776.7055
- Mean Absolute Error: 1126.0822
- R2 Score: 0.8421


XGBRegressor
Model performance for Training set
- Root Mean Squared Error: 1126.5802
- Mean Absolute Error: 755.1634
- R2 Score: 0.9338
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1640.7978
- Mean Absolute Error: 1091.5347
- R2 Score: 0.8653


CatBoosting Regressor
Model performance for Training set
- Root Mean Squared Error: 1294.6588
- Mean Absolute Error: 902.1737
- R2 Score: 0.9126
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1548.7576
- Mean Absolute Error: 1067.9052
- R2 Score: 0.8800


AdaBoost Regressor
Model performance for Training set
- Root Mean Squared Error: 2720.2211
- Mean Abso

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


Gradient Boosting Regressor
Model performance for Training set
- Root Mean Squared Error: 1944.6405
- Mean Absolute Error: 1405.0743
- R2 Score: 0.8028
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1969.2182
- Mean Absolute Error: 1417.8224
- R2 Score: 0.8060




In [15]:

# Pair each model name with its test-set R2 and show the best score first
(
    pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score'])
    .sort_values(by=["R2_Score"], ascending=False)
)

Unnamed: 0,Model Name,R2_Score
3,CatBoosting Regressor,0.879983
2,XGBRegressor,0.865295
1,Random Forest Regressor,0.842055
5,Gradient Boosting Regressor,0.805973
0,Decision Tree,0.766694
4,AdaBoost Regressor,0.636479


You can see that the models perform better with feature scaling.
Let us use the CatBoosting Regressor with scaling as the best model.

In [31]:
# You can see that the CatBoost regressor has the best performance, followed by XGBRegressor and Random Forest.
# Interpretation:
# CatBoost and XGBoost offer better performance than AdaBoost since they include regularization, while AdaBoost is prone to overfitting.
# CatBoost and XGBoost both use gradient boosting with decision trees.
# Random forests are also more robust and reduce overfitting compared to single decision trees, and are better able to handle more features.

# Decision trees overall can model complex, non-linear relationships, handle a mix of categorical and numerical data without extensive preprocessing, and are more robust to outliers and missing values.


In [None]:
# It appears the best random forest parameters are 120 estimators, a max depth of 10, and considering all features.