# Regresja

In [45]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Rozważmy zbiór Boston

In [30]:
boston = datasets.load_boston()
# print description
# print(boston.DESCR)

In [31]:
# get the data
boston_X = boston.data
boston_Y = boston.target

Podzielmy zbiór na część testową i treningową .

In [32]:
# Split the data into training/testing sets
boston_X_train = boston_X[:-50]
boston_X_test = boston_X[-50:]
 
# Split the targets into training/testing sets
boston_y_train = boston_Y[:-50]
boston_y_test = boston_Y[-50:]

In [33]:
X=boston_X_train
y=boston_y_train

# Zadanie

Znajdź najlepszy model dzieląc na zbiór testowy i treningowy **oraz wykorzystując maksymalnie 5 współrzędnych otrzymanych z algorytmu PCA**

Sprawdź:

* Linear Regression
* Ridge Regression
* Lasso Regression
* ElasticNet Regression
* SVR
* RandomForestRegressor
* MLPRegressor

Wszytkie wyniki prównaj za pomocą 
* R2 score

In [34]:
seed=123
kfold = model_selection.KFold(n_splits=5, random_state=seed)


In [36]:
pipe = Pipeline([
        ("scaler", StandardScaler()), 
        ("pca", PCA(n_components=2)),
        ("R", make_pipeline(PolynomialFeatures(degree=2), ElasticNet(alpha=1, tol=0.1)))    
    ])

grid = {
    'pca__n_components': [2, 3, 4, 5], 
    'R__polynomialfeatures__degree': [1, 2, 3, 4],
    'R__elasticnet__alpha': [0.3, 0.35, 0.4]}

grid_1 = GridSearchCV(pipe,
                    param_grid=grid,
                    cv=kfold,
                    refit=True)
grid_1.fit(X, y)
grid_1.best_params_



{'R__elasticnet__alpha': 0.3,
 'R__polynomialfeatures__degree': 1,
 'pca__n_components': 5}

In [37]:

pipe = Pipeline([
        ("scaler", StandardScaler()), 
        ("pca", PCA(n_components=2)),
        ("R", make_pipeline(PolynomialFeatures(degree=2), Lasso(alpha=1, tol=0.1)))    
    ])

grid = {
    'pca__n_components': [2, 3, 4, 5], 
    'R__polynomialfeatures__degree': [1, 2, 3, 4],
    'R__lasso__alpha': [0.1, 1, 2, 3]}


grid_2 = GridSearchCV(pipe,
                    param_grid=grid ,
                    cv=5,
                    refit=True)
grid_2.fit(X, y)
grid_2.best_params_



{'R__lasso__alpha': 0.1,
 'R__polynomialfeatures__degree': 1,
 'pca__n_components': 5}

In [38]:
pipe = Pipeline([
        ("scaler", StandardScaler()), 
        ("pca", PCA(n_components=2)),
        ("R", make_pipeline(PolynomialFeatures(degree=2), linear_model.Ridge(alpha=1, tol=0.1)))    
    ])

grid = {
    'pca__n_components': [2, 3, 4, 5], 
    'R__polynomialfeatures__degree': [1, 2, 3, 4],
    'R__ridge__alpha': [0.1, 1, 2, 3]}


grid_3 = GridSearchCV(pipe,
                    param_grid=grid,
                    cv=5,
                    refit=True)
grid_3.fit(X, y)
grid_3.best_params_

{'R__polynomialfeatures__degree': 1,
 'R__ridge__alpha': 0.1,
 'pca__n_components': 3}

In [39]:
pipe = Pipeline([
        ("scaler", StandardScaler()), 
        ("pca", PCA(n_components=2)),
        ("R", make_pipeline(PolynomialFeatures(degree=2), linear_model.LinearRegression()))    
    ])

grid = {
    'pca__n_components': [2, 3, 4, 5], 
    'R__polynomialfeatures__degree': [1, 2, 3, 4]}


grid_4 = GridSearchCV(pipe,
                    param_grid=grid,
                    cv=kfold,
                    refit=True)
grid_4.fit(X, y)
print(grid_4.best_params_)
grid_4.best_estimator_

{'R__polynomialfeatures__degree': 1, 'pca__n_components': 3}


Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('R', Pipeline(memory=None,
     steps=[('polynomialfeatures', PolynomialFeatures(degree=1, include_bias=True, interaction_only=False)), ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))]))])

In [42]:

pipe = Pipeline([
        ("scaler", StandardScaler()), 
        ("pca", PCA(n_components=2)),
        ("R", SVR(kernel='rbf'))    
    ])

grid = {
    'pca__n_components': [2, 3, 4, 5],     
    "R__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "R__gamma": [ 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
       }

grid_5 = GridSearchCV(pipe,
                   param_grid=grid,
                    cv=kfold,
                    refit=True)
grid_5.fit(X, y)
print(grid_5.best_params_)
grid_5.best_estimator_



{'R__C': 1000, 'R__gamma': 0.0001, 'pca__n_components': 5}


Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('R', SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.0001,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))])

In [44]:
pipe = Pipeline([
        ("scaler", StandardScaler()), 
        ("pca", PCA(n_components=2)),
        ("R", RandomForestRegressor(n_jobs=-1, max_features= 'sqrt', n_estimators=50, oob_score = True))    
    ])

grid = {
    'pca__n_components': [2, 3, 4, 5],     
    'R__n_estimators': [150, 300, 400],
    'R__max_features': ['auto', 'sqrt', 'log2'],  
    'R__max_depth': [ 400, 500, 600]
       }

grid_6 = GridSearchCV(pipe,
                   param_grid=grid,
                    cv=kfold,
                    refit=True)
grid_6.fit(X, y)
print(grid_6.best_params_)
grid_6.best_estimator_



{'R__max_depth': 400, 'R__max_features': 'auto', 'R__n_estimators': 300, 'pca__n_components': 5}


Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('R', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=400,
           max_featu...imators=300, n_jobs=-1,
           oob_score=True, random_state=None, verbose=0, warm_start=False))])

In [47]:
# pipe = Pipeline([
#         ("scaler", StandardScaler()), 
#         ("pca", PCA(n_components=2)),
#         ("R", MLPRegressor(hidden_layer_sizes=(100,100,100),activation='tanh',alpha=0.0001))    
#     ])

# grid = {
#     'pca__n_components': [2, 3, 4, 5],     
#     'R__hidden_layer_sizes': [(1000,10,10),(10,10),(100,100)],
#     'R__alpha': [1, 10, 100 ,1000],
#     'R__activation': ['identity', 'logistic', 'tanh', 'relu']  
#        }

# grid_7 = GridSearchCV(pipe,
#                     param_grid=grid,
#                     cv=kfold,
#                     n_jobs=2,
#                     refit=True)
# grid_7.fit(X, y)
# print(grid_7.best_params_)
# grid_7.best_estimator_

In [48]:
from sklearn import  metrics

X_test=boston_X_test
y_test=boston_y_test

models = []
models.append(('ElasticNet', grid_1.best_estimator_))
models.append(('Lasso', grid_2.best_estimator_))
models.append(('Ridge', grid_3.best_estimator_))
models.append(('LR', grid_4.best_estimator_))
models.append(('SVR', grid_5.best_estimator_))
models.append(('RFR', grid_6.best_estimator_))
# models.append(('MLP_R', grid_7.best_estimator_))

r2 = []
explained_variance_score = []
median_absolute_error = []
mean_squared_error = []
mean_absolute_error = []
for name, model in models:
    print(name)
    print("R^2: {}".format(metrics.r2_score(y_test, model.predict(X_test)) ))
    print("Explained variance score: {}".format( metrics.explained_variance_score(y_test, model.predict(X_test)) ))
    print("Median absolute error: {}".format( metrics.median_absolute_error(y_test, model.predict(X_test)) ))
    print("Mean squared error: {}".format( metrics.mean_squared_error(y_test, model.predict(X_test)) ))
    print("Mean absolute errors: {}".format(metrics.mean_absolute_error(y_test, model.predict(X_test)) ))
    r2.append(metrics.r2_score(y_test, model.predict(X_test)))
    explained_variance_score.append(metrics.explained_variance_score(y_test, model.predict(X_test)))
    median_absolute_error.append( metrics.median_absolute_error(y_test, model.predict(X_test)))
    mean_squared_error.append(metrics.mean_squared_error(y_test, model.predict(X_test)))
    mean_absolute_error.append(metrics.mean_absolute_error(y_test, model.predict(X_test)))

ElasticNet
R^2: 0.4526182873903407
Explained variance score: 0.4660260433026925
Median absolute error: 2.3898757741466
Mean squared error: 10.325206506784744
Mean absolute errors: 2.647959653280042
Lasso
R^2: 0.40736186435174804
Explained variance score: 0.42365798702917323
Median absolute error: 2.4274238973922815
Mean squared error: 11.178873888919414
Mean absolute errors: 2.7956278889453756
Ridge
R^2: 0.29895515113555937
Explained variance score: 0.47637239175082535
Median absolute error: 2.9633712128506087
Mean squared error: 13.22373887964506
Mean absolute errors: 3.007791521055942
LR
R^2: 0.2988854641297233
Explained variance score: 0.47637655260645284
Median absolute error: 2.9633048643185127
Mean squared error: 13.225053378667445
Mean absolute errors: 3.0079302556814556
SVR
R^2: 0.07057397906528062
Explained variance score: 0.3624811588349268
Median absolute error: 3.221597498390107
Mean squared error: 17.53167009028952
Mean absolute errors: 3.458243918062843
RFR
R^2: -1.854163

In [49]:
import pandas as pd
d = {'r2': r2, 
     'explained_variance_score': explained_variance_score, 
     'median_absolute_error': median_absolute_error,
     'mean_squared_error' : mean_squared_error,
     'mean_absolute_error' : mean_absolute_error,
    }
df = pd.DataFrame(data=d)
df.insert(loc=0, column='Method', value=['ElasticNet','Lasso','Ridge','LR','SVR','RFR'])#,'MLP_R'])
df

Unnamed: 0,Method,r2,explained_variance_score,median_absolute_error,mean_squared_error,mean_absolute_error
0,ElasticNet,0.452618,0.466026,2.389876,10.325207,2.64796
1,Lasso,0.407362,0.423658,2.427424,11.178874,2.795628
2,Ridge,0.298955,0.476372,2.963371,13.223739,3.007792
3,LR,0.298885,0.476377,2.963305,13.225053,3.00793
4,SVR,0.070574,0.362481,3.221597,17.53167,3.458244
5,RFR,-1.854164,-0.728082,5.592833,53.837805,6.101613
