In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score 

In [14]:
# Directory with the pre-split train/test CSV files, relative to this notebook.
DATA_DIR = "../data/processed"

# Feature matrices stay as DataFrames.
X_train = pd.read_csv(f"{DATA_DIR}/X_train.csv")
X_test = pd.read_csv(f"{DATA_DIR}/X_test.csv")

# Targets: squeeze only the column axis, so a single-column file always becomes a
# Series. A bare .squeeze() would collapse a one-row file to a scalar instead.
# presumably each y_*.csv holds exactly one column — verify against the export step
y_train = pd.read_csv(f"{DATA_DIR}/y_train.csv").squeeze("columns")
y_test = pd.read_csv(f"{DATA_DIR}/y_test.csv").squeeze("columns")

# Fail early, with a readable message, if features and targets are misaligned.
assert len(X_train) == len(y_train), "X_train and y_train have different numbers of rows"
assert len(X_test) == len(y_test), "X_test and y_test have different numbers of rows"

Multiple Linear Regression Model

In [9]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline: build the estimator and fit it on the
# training split in one expression (fit() returns the fitted estimator).
# NOTE(review): n_jobs is kept as in the original; scikit-learn documents it as
# only helping for multi-target / large problems — confirm it is needed here.
mlr = LinearRegression(fit_intercept=True, copy_X=True, n_jobs=-1).fit(X_train, y_train)

# Predictions on the held-out split, used for every metric below.
y_pred_lr = mlr.predict(X_test)

# Held-out error metrics, computed once and formatted when printed.
rmse_lr = root_mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Fitted parameters first, then a sample of predictions, then the metrics.
print("coeffients :", mlr.coef_)
print("intercept :", mlr.intercept_)
print("first 10 predictions :", y_pred_lr[:10])
print(f"RMSE score : {rmse_lr:.4f}")
print(f"MAE score : {mae_lr:.4f}")
print(f"R2 : {r2_lr:.4f}")

coeffients : [ 7.21210766e-01  3.86312742e-01 -1.08484178e-01 -5.25313301e-02
  2.85127361e-01 -4.51906139e-01  5.28704142e+00  9.02795183e-01
  1.26617830e-01 -3.86986728e-01 -1.44793961e-02 -5.81130255e+00
 -1.75127902e+02 -6.49224985e+00  1.15502124e+00  5.70626997e+00
 -8.46656882e-02]
intercept : 13346.71524878972
first 10 predictions : [546.50057618 499.7209888  383.43373548 366.55174284 291.13137725
 274.63567974 220.00664962 202.22133154 176.94822014 242.83815979]
RMSE score : 31.7792
MAE score : 25.0809
R2 : 0.8763


Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the gradient-boosting search.
# 4 * 3 * 4 * 3 = 144 candidates, each fitted on 5 CV folds (720 fits plus the
# final refit), so this cell is expensive to re-run.
# NOTE(review): the run recorded in this notebook selected the smallest value of
# every hyperparameter, so the optimum may lie below this grid — consider
# extending the ranges downwards.
param = {
    'n_estimators' : [100, 200, 300, 400],
    'learning_rate' : [0.05, 0.1, 0.2],
    'max_depth' : [7,9,11,13],
    'subsample' : [0.5,0.7,0.9]
}

grid = GridSearchCV(
    # subsample < 1 makes each boosting stage fit on a random subset of rows.
    # Seeding the estimator makes the search and the reported metrics repeatable;
    # 42 matches the seed used for the AdaBoost model in this notebook.
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=param,
    cv=5,
    # Candidates are ranked by negated MSE (higher is better).
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

grid.fit(X_train,y_train)

# grid.predict delegates to the refitted best estimator, which is also kept
# under its own name for later use.
y_pred_gb = grid.predict(X_test)
gboost = grid.best_estimator_

print("best parameters :", grid.best_params_)
print(f"RMSE score : {root_mean_squared_error(y_test,y_pred_gb):.4f}")
print(f"MAE score : {mean_absolute_error(y_test,y_pred_gb):.4f}")
print(f"R2 score : {r2_score(y_test,y_pred_gb):.4f}")

best parameters : {'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.5}
RMSE score : 28.3647
MAE score : 23.4889
R2 score : 0.9015


AdaBoost Regressor

In [19]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# Search space: depth of the weak learner plus the boosting schedule.
# The "estimator__" prefix routes max_depth to the wrapped decision tree.
param = dict(
    estimator__max_depth=[2, 3, 4, 5],
    n_estimators=[100, 200, 300, 400],
    learning_rate=[0.01, 0.02, 0.05],
)

# Weak learner and the seeded AdaBoost ensemble built around it.
base = DecisionTreeRegressor()
adaboost = AdaBoostRegressor(estimator=base, random_state=42)

# 5-fold grid search ranked by negated MSE, using all available cores.
grid = GridSearchCV(
    adaboost,
    param,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Predict with the search object, then rebind `adaboost` to the winning
# fitted model so it replaces the unfitted template defined above.
y_pred_ada = grid.predict(X_test)
adaboost = grid.best_estimator_

# Held-out error metrics, computed once and formatted when printed.
rmse_ada = root_mean_squared_error(y_test, y_pred_ada)
mae_ada = mean_absolute_error(y_test, y_pred_ada)
r2_ada = r2_score(y_test, y_pred_ada)

print("best parameters :", grid.best_params_)
print("first 10 predictions :", y_pred_ada[:10])
print(f"RMSE score : {rmse_ada:.4f}")
print(f"MAE score : {mae_ada:.4f}")
print(f"R2 score : {r2_ada:.4f}")

best parameters : {'estimator__max_depth': 5, 'learning_rate': 0.05, 'n_estimators': 300}
first 10 predictions : [527.2        469.54347826 386.81203008 385.36936937 333.65217391
 331.33628319 270.99052133 270.89361702 213.36774194 273.35810811]
RMSE score : 31.0149
MAE score : 26.8069
R2 score : 0.8822
