In [53]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.svm import SVR
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import json
import ast
import warnings

cwd = os.getcwd()
warnings.filterwarnings('ignore')

In [54]:
read_dir = os.path.join(cwd,'datasets')
train = pd.read_csv(os.path.join(read_dir,'train_pca.csv'))
val = pd.read_csv(os.path.join(read_dir,'val_pca.csv'))

In [55]:
index = train.keys()
index_x = [ind for ind in index if ind!= 'price' and ind!= 'index']

In [56]:
y_train = train['price']
X_train = train[index_x]

y_val = val['price']
X_val = val[index_x]

y_train = np.array(y_train).reshape(-1,1)
y_val = np.array(y_val).reshape(-1,1)

In [57]:
models = {'Linear Regression': LinearRegression(), 'Lasso': Lasso(),
          'Ridge':Ridge(), 'Random Forest': RandomForestRegressor(),'Gradient Boosting': GradientBoostingRegressor()}

for k,v in models.items():   
    model = v
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    print('{} RMSE: {:.3f}'.format(k,rmse))

Linear Regression RMSE: 0.369
Lasso RMSE: 0.673
Ridge RMSE: 0.369
Random Forest RMSE: 0.158
Gradient Boosting RMSE: 0.172


# It is clear that gradient boosting ensemble methods are the best for regression of this task - therefore the focus should be optimising boosting methods.

In [58]:
param_grid = {'n_estimators':[3,10,30],'max_depth':[2,4,6,8], 
              'learning_rate':[0.1,0.2,0.3,0.4,0.5,0.75,1.0]}
GBR = GradientBoostingRegressor()
grid_search_GBR = GridSearchCV(GBR, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search_GBR.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_sampl...=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [3, 10, 30], 'max_depth': [2, 4, 6, 8], 'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [None]:
y_predGSGBR = grid_search_GBR.predict(X_val)
mse_GBR = mean_squared_error(y_val,y_predGSGBR)
rmse_GBR = np.sqrt(mse_GBR)
print(rmse_GBR)

0.15018432147244953


In [None]:
param_grid = {'n_estimators':[100,200,300,400,500],'max_depth':[2,3,4,5], 
              'learning_rate':[0.1,0.2,0.3,0.4]}
GBR = GradientBoostingRegressor()
grid_search_GBR = GridSearchCV(GBR, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search_GBR.fit(X_train, y_train)

In [None]:
print(grid_search_GBR.best_params_)

In [None]:
break
y_predGSGBR = grid_search_GBR.predict(X_val)
mse_GBR = mean_squared_error(y_val,y_predGSGBR)
rmse_GBR = np.sqrt(mse_GBR)
print(rmse_GBR)

In [None]:
param_grid = {'n_estimators':[100,200,300,400,500],'max_depth':[2,3,4,5], 
              'max_leaf_nodes':[10,15,20]}
RFR = RandomForestRegressor()
grid_search_RFR = GridSearchCV(RFR, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search_RFR.fit(X_train, y_train)

In [None]:
y_predGSRF = grid_search_RFR.predict(X_val)
mse_RF = mean_squared_error(y_val,y_predGSRF)
rmse_RF = np.sqrt(mse_RF)
print(rmse_RF)