First

Evaluate the performance of the following individual regression models with 5-fold cross-validation, using only the training data:


Data split

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score

from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor

import pandas as pd
import numpy as np

# Load the California housing data through scikit-learn and keep only the
# first 2000 rows of features and targets.
# NOTE: the earlier read of '/content/sample_data/california_housing_test.csv'
# was dropped. Its DataFrame was never used (the `.head(2000)` result was
# discarded), and the hard-coded Colab path raises FileNotFoundError elsewhere.
X, y = fetch_california_housing(return_X_y=True)
X = X[:2000, :]
y = y[:2000]

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

1) CLF1: Linear regression model, i.e., LinearRegression()

In [None]:
# CLF1: standardise the features, then fit an ordinary least-squares model.
pipe_linR = Pipeline(steps=[('scaler', StandardScaler()), ('linR', LinearRegression())])
pipe_linR.fit(X_train, y_train)

label = 'CLF1'
# 5-fold CV on the training split only. The scorer returns negated MSE,
# so the mean is sign-flipped when printed to report a positive error.
scores_linR_1 = cross_val_score(
    pipe_linR, X_train, y_train,
    scoring='neg_mean_squared_error', cv=5,
)
print("MSE: %0.2f (+/- %0.2f) [%s]" % (-scores_linR_1.mean(), scores_linR_1.std(), label))

MSE: 0.30 (+/- 0.03) [CLF1]


2) Decision tree regressor, i.e., DecisionTreeRegressor(max_depth=1, criterion='squared_error')

In [None]:
# Depth-1 regression tree (a decision stump) behind the same scaling step
# used for the other base models.
stump = DecisionTreeRegressor(max_depth=1, criterion='squared_error')
pipe_deciTreeReg = Pipeline(steps=[('scaler', StandardScaler()), ('deciTreeReg', stump)])
pipe_deciTreeReg.fit(X_train, y_train)

labe2 = 'Decision tree regressor'
# 5-fold CV on the training data; scores are negated MSE, hence the minus sign below.
scores_deciTreeReg_1 = cross_val_score(
    pipe_deciTreeReg, X_train, y_train,
    scoring='neg_mean_squared_error', cv=5,
)
print("MSE: %0.2f (+/- %0.2f) [%s]" % (-scores_deciTreeReg_1.mean(), scores_deciTreeReg_1.std(), labe2))

MSE: 0.53 (+/- 0.04) [Decision tree regressor]


3) Support vector regression model with a linear kernel, i.e., SVR(kernel='linear')

In [None]:
# Linear-kernel support vector regressor, fed standardised features.
pipe_supVecReg = Pipeline(steps=[('scaler', StandardScaler()), ('supVecReg', SVR(kernel='linear'))])
pipe_supVecReg.fit(X_train, y_train)

labe3 = 'Support vector regression'
# 5-fold CV restricted to the training split; the scorer yields negated MSE,
# so the mean is negated again for display.
scores_supVecReg_1 = cross_val_score(
    pipe_supVecReg, X_train, y_train,
    scoring='neg_mean_squared_error', cv=5,
)
print("MSE: %0.2f (+/- %0.2f) [%s]" % (-scores_supVecReg_1.mean(), scores_supVecReg_1.std(), labe3))

MSE: 0.33 (+/- 0.06) [Support vector regression]


Second

Build a bagging regression model on top of each base regressor and evaluate the
performance of the bagging model using 5-fold cross-validation.


1) linear regression with bagging

In [None]:
# Search grid for the bagged linear model: ensemble size and the fraction of
# training rows drawn for each base learner.
params_linR = {'n_estimators': np.arange(50, 250, 20), 'max_samples': np.arange(0.01, 1.0, 0.2)}

# The base model is passed positionally on purpose: its keyword was
# `base_estimator` before scikit-learn 1.2 and is `estimator` afterwards (the
# old name was removed in 1.4), but it is the first positional argument in
# every release. random_state pins the bootstrap draws so a re-run reproduces
# the same numbers.
gridSea_cv_baglinR = GridSearchCV(BaggingRegressor(pipe_linR, random_state=42), params_linR, verbose=1, cv=5)
gridSea_cv_baglinR.fit(X_train, y_train)

labe_bagging1 = 'CLF1 with bagging'
# Nested CV: cross_val_score clones the grid search and re-runs it inside each
# of the 5 outer folds. Scores are negated MSE.
scores_linR_2 = cross_val_score(estimator=gridSea_cv_baglinR, X=X_train, y=y_train, cv=5, scoring='neg_mean_squared_error')

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [None]:
# Score the tuned bagging ensemble on the held-out test split.
# GridSearchCV was created without `scoring`, so .score() uses the estimator's
# own score method (R^2 for scikit-learn regressors), not MSE.
scoreOfTest_bagl = gridSea_cv_baglinR.score(X_test, y_test)

# Each entry pairs a format string with the tuple of values it renders;
# the CV scores are negated MSE, so the mean is sign-flipped for display.
report_lines = (
    ("MSE: %0.2f (+/- %0.2f) [%s]", (-scores_linR_2.mean(), scores_linR_2.std(), labe_bagging1)),
    ('Best parameters of max_samples : %0.2f', (gridSea_cv_baglinR.best_params_['max_samples'],)),
    ('Best parameters of n_estimators : %0.2f', (gridSea_cv_baglinR.best_params_['n_estimators'],)),
    ('Score of test data : %0.2f', (scoreOfTest_bagl,)),
)
for template, values in report_lines:
    print(template % values)

2) Decision tree regressor with bagging

In [None]:
# Search grid for the bagged decision stump: ensemble size and the fraction of
# training rows drawn for each base learner.
params_deciTreeReg = {'n_estimators': np.arange(50, 250, 20), 'max_samples': np.arange(0.01, 1.0, 0.2)}

# Base model passed positionally so the call works whether the keyword is
# `base_estimator` (scikit-learn < 1.2) or `estimator` (>= 1.2; the old name was
# removed in 1.4). random_state makes the bootstrap sampling reproducible.
gridSea_cv_bagdeciTreeReg = GridSearchCV(BaggingRegressor(pipe_deciTreeReg, random_state=42), params_deciTreeReg, verbose=1, cv=5)
gridSea_cv_bagdeciTreeReg.fit(X_train, y_train)

labe_bagging2 = 'Decision tree regressor with bagging'
# Nested CV: the grid search is cloned and re-run inside each of the 5 outer
# folds. Scores are negated MSE.
scores_deciTreeReg_2 = cross_val_score(estimator=gridSea_cv_bagdeciTreeReg, X=X_train, y=y_train, cv=5, scoring='neg_mean_squared_error')

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
MSE: 0.45 (+/- 0.04) [Decision tree regressor with bagging] Best parameters of max_samples : 0.21 Best parameters of n_estimators : 110.00


In [None]:
# Score the tuned bagged decision tree on the held-out test split.
# Fix: this previously called gridSea_cv_baglinR.score(...), which reported the
# linear model's test score under the decision-tree heading.
# GridSearchCV was created without `scoring`, so .score() uses the estimator's
# own score method (R^2 for scikit-learn regressors), not MSE.
scoreOfTest_bagd = gridSea_cv_bagdeciTreeReg.score(X_test, y_test)

# CV scores are negated MSE, so the mean is sign-flipped for display.
print("MSE: %0.2f (+/- %0.2f) [%s]" % (-scores_deciTreeReg_2.mean(), scores_deciTreeReg_2.std(), labe_bagging2))
print('Best parameters of max_samples : %0.2f' % gridSea_cv_bagdeciTreeReg.best_params_['max_samples'])
print('Best parameters of n_estimators : %0.2f' % gridSea_cv_bagdeciTreeReg.best_params_['n_estimators'])
print('Score of test data : %0.2f' % scoreOfTest_bagd)

3) Support vector regression with bagging

In [None]:
# Search grid for the bagged linear SVR: ensemble size (coarser step than the
# other models) and the fraction of training rows drawn for each base learner.
params_supVecReg = {'n_estimators': np.arange(50, 250, 50), 'max_samples': np.arange(0.01, 1.0, 0.2)}

# Base model passed positionally so the call works whether the keyword is
# `base_estimator` (scikit-learn < 1.2) or `estimator` (>= 1.2; the old name was
# removed in 1.4). random_state makes the bootstrap sampling reproducible.
gridSea_cv_bagsupVecReg = GridSearchCV(BaggingRegressor(pipe_supVecReg, random_state=42), params_supVecReg, verbose=1, cv=5)
gridSea_cv_bagsupVecReg.fit(X_train, y_train)

labe_bagging3 = 'Support vector regression with bagging'
# Nested CV: the grid search is cloned and re-run inside each of the 5 outer
# folds. Scores are negated MSE.
scores_supVecReg_2 = cross_val_score(estimator=gridSea_cv_bagsupVecReg, X=X_train, y=y_train, cv=5, scoring='neg_mean_squared_error')

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
MSE: 0.32 (+/- 0.05) [Support vector regression with bagging] Best parameters of max_samples : 0.51 Best parameters of n_estimators : 50.00


In [None]:
# Score the tuned bagged SVR on the held-out test split.
# Fix: this previously called gridSea_cv_baglinR.score(...), which reported the
# linear model's test score under the SVR heading.
# GridSearchCV was created without `scoring`, so .score() uses the estimator's
# own score method (R^2 for scikit-learn regressors), not MSE.
scoreOfTest_bags = gridSea_cv_bagsupVecReg.score(X_test, y_test)

# CV scores are negated MSE, so the mean is sign-flipped for display.
print("MSE: %0.2f (+/- %0.2f) [%s]" % (-scores_supVecReg_2.mean(), scores_supVecReg_2.std(), labe_bagging3))
print('Best parameters of max_samples : %0.2f' % gridSea_cv_bagsupVecReg.best_params_['max_samples'])
print('Best parameters of n_estimators : %0.2f' % gridSea_cv_bagsupVecReg.best_params_['n_estimators'])
print('Score of test data : %0.2f' % scoreOfTest_bags)

Third

Build an AdaBoost regression model on top of each base regressor and evaluate its
performance using 10-fold cross-validation.

1) linear regression with Adaboost

In [None]:
# Search grid for AdaBoost over the linear pipeline: number of boosting rounds
# and the learning rate.
params_linR_2 = {'n_estimators': np.arange(50, 250, 20), 'learning_rate': np.arange(0.01, 1.0, 0.2)}

# Base model passed positionally so the call works whether the keyword is
# `base_estimator` (scikit-learn < 1.2) or `estimator` (>= 1.2; the old name was
# removed in 1.4). random_state makes the boosting run reproducible.
gridSea_cv_adalinR = GridSearchCV(AdaBoostRegressor(pipe_linR, random_state=42), params_linR_2, verbose=1, cv=5)
gridSea_cv_adalinR.fit(X_train, y_train)

labe_ada1 = 'CLF1 with Adaboost'
# This section asks for 10-fold evaluation, so the outer CV uses cv=10 (it was
# 5); the inner grid search keeps its 5-fold tuning. Scores are negated MSE.
scores_linR_3 = cross_val_score(estimator=gridSea_cv_adalinR, X=X_train, y=y_train, cv=10, scoring='neg_mean_squared_error')

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
MSE: 0.29 (+/- 0.03) [CLF1 with Adaboost]
Best parameters of max_samples : 0.01
Best parameters of n_estimators : 70.00
Score of test data : 0.73


In [None]:
# Score the tuned AdaBoost/linear model on the held-out test split.
# GridSearchCV was created without `scoring`, so .score() uses the estimator's
# own score method (R^2 for scikit-learn regressors), not MSE.
scoreOfTest_adal = gridSea_cv_adalinR.score(X_test, y_test)

# Each entry pairs a format string with the tuple of values it renders;
# the CV scores are negated MSE, so the mean is sign-flipped for display.
report_lines = (
    ("MSE: %0.2f (+/- %0.2f) [%s]", (-scores_linR_3.mean(), scores_linR_3.std(), labe_ada1)),
    ('Best parameters of learning_rate : %0.2f', (gridSea_cv_adalinR.best_params_['learning_rate'],)),
    ('Best parameters of n_estimators : %0.2f', (gridSea_cv_adalinR.best_params_['n_estimators'],)),
    ('Score of test data : %0.2f', (scoreOfTest_adal,)),
)
for template, values in report_lines:
    print(template % values)

2) Decision tree regressor with Adaboost

In [None]:
# Search grid for AdaBoost over the decision-stump pipeline: number of boosting
# rounds and the learning rate.
params_deciTreeReg_2 = {'n_estimators': np.arange(50, 250, 20), 'learning_rate': np.arange(0.01, 1.0, 0.2)}

# Base model passed positionally so the call works whether the keyword is
# `base_estimator` (scikit-learn < 1.2) or `estimator` (>= 1.2; the old name was
# removed in 1.4). random_state makes the boosting run reproducible.
gridSea_cv_adadeciTreeReg = GridSearchCV(AdaBoostRegressor(pipe_deciTreeReg, random_state=42), params_deciTreeReg_2, verbose=1, cv=5)
gridSea_cv_adadeciTreeReg.fit(X_train, y_train)

labe_ada2 = 'Decision tree regressor with Adaboost'
# This section asks for 10-fold evaluation, so the outer CV uses cv=10 (it was
# 5); the inner grid search keeps its 5-fold tuning. Scores are negated MSE.
scores_deciTreeReg_3 = cross_val_score(estimator=gridSea_cv_adadeciTreeReg, X=X_train, y=y_train, cv=10, scoring='neg_mean_squared_error')

MSE: 0.49 (+/- 0.05) [Decision tree regressor with Adaboost]


In [None]:
# Score the tuned AdaBoost/decision-tree model on the held-out test split.
# GridSearchCV was created without `scoring`, so .score() uses the estimator's
# own score method (R^2 for scikit-learn regressors), not MSE.
scoreOfTest_adad = gridSea_cv_adadeciTreeReg.score(X_test, y_test)

# Each entry pairs a format string with the tuple of values it renders;
# the CV scores are negated MSE, so the mean is sign-flipped for display.
report_lines = (
    ("MSE: %0.2f (+/- %0.2f) [%s]", (-scores_deciTreeReg_3.mean(), scores_deciTreeReg_3.std(), labe_ada2)),
    ('Best parameters of learning_rate : %0.2f', (gridSea_cv_adadeciTreeReg.best_params_['learning_rate'],)),
    ('Best parameters of n_estimators : %0.2f', (gridSea_cv_adadeciTreeReg.best_params_['n_estimators'],)),
    ('Score of test data : %0.2f', (scoreOfTest_adad,)),
)
for template, values in report_lines:
    print(template % values)

3) Support vector regression with Adaboost

In [None]:
# Search grid for AdaBoost over the linear-SVR pipeline: number of boosting
# rounds and the learning rate (finer learning-rate step than the other models).
params_supVecReg_2 = {'n_estimators': np.arange(50, 250, 30), 'learning_rate': np.arange(0.01, 1.0, 0.1)}

# Base model passed positionally so the call works whether the keyword is
# `base_estimator` (scikit-learn < 1.2) or `estimator` (>= 1.2; the old name was
# removed in 1.4). random_state makes the boosting run reproducible.
gridSea_cv_adasupVecReg = GridSearchCV(AdaBoostRegressor(pipe_supVecReg, random_state=42), params_supVecReg_2, verbose=1, cv=5)
gridSea_cv_adasupVecReg.fit(X_train, y_train)

labe_ada3 = 'Support vector regression with Adaboost'
# This section asks for 10-fold evaluation, so the outer CV uses cv=10 (it was
# 5); the inner grid search keeps its 5-fold tuning. Scores are negated MSE.
scores_supVecReg_3 = cross_val_score(estimator=gridSea_cv_adasupVecReg, X=X_train, y=y_train, cv=10, scoring='neg_mean_squared_error')

MSE: 0.44 (+/- 0.13) [Support vector regression with Adaboost]


In [None]:
# Score the tuned AdaBoost/SVR model on the held-out test split.
# GridSearchCV was created without `scoring`, so .score() uses the estimator's
# own score method (R^2 for scikit-learn regressors), not MSE.
scoreOfTest_adas = gridSea_cv_adasupVecReg.score(X_test, y_test)

# Each entry pairs a format string with the tuple of values it renders;
# the CV scores are negated MSE, so the mean is sign-flipped for display.
report_lines = (
    ("MSE: %0.2f (+/- %0.2f) [%s]", (-scores_supVecReg_3.mean(), scores_supVecReg_3.std(), labe_ada3)),
    ('Best parameters of learning_rate : %0.2f', (gridSea_cv_adasupVecReg.best_params_['learning_rate'],)),
    ('Best parameters of n_estimators : %0.2f', (gridSea_cv_adasupVecReg.best_params_['n_estimators'],)),
    ('Score of test data : %0.2f', (scoreOfTest_adas,)),
)
for template, values in report_lines:
    print(template % values)