In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

  from pandas.core import datetools


W poniższym przykładzie posłużymy się zestawem danych reklamowych, obejmującym sprzedaż produktów i ich budżet reklamowy w trzech różnych mediach telewizyjnych, radiu, gazetach.

In [48]:
df_adv = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
XX = df_adv[['TV', 'radio', 'newspaper', 'sales']]
y = XX.iloc[:,-1] 
X = XX.iloc[:,:-1] 

In [49]:
X.shape

(200, 3)

In [50]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# Zadanie 1
Dokonaj pełnej analizy zbioru i porównaj wszystkie modele.

In [51]:
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import make_pipeline
from sklearn import model_selection

seed=123
kfold = model_selection.KFold(n_splits=5, random_state=seed)

In [52]:
grid_1 = GridSearchCV(make_pipeline(PolynomialFeatures(degree=2), ElasticNet(alpha=1, tol=0.1)),
                    param_grid={'polynomialfeatures__degree': [1, 2, 3, 4],
                    'elasticnet__alpha': [0.3, 0.35, 0.4, 20]},
                    cv=kfold,
                    refit=True)
grid_1.fit(X, y)
grid_1.best_params_

{'elasticnet__alpha': 20, 'polynomialfeatures__degree': 3}

In [53]:
grid_2 = GridSearchCV(make_pipeline(PolynomialFeatures(degree=2), Lasso(alpha=1, tol=0.1)),
                    param_grid={'polynomialfeatures__degree': [1, 2, 3, 4, 5, 6],
                    'lasso__alpha': [0.1, 1, 2, 3, 4, 10, 20]},
                    cv=5,
                    refit=True)
grid_2.fit(X, y)
grid_2.best_params_

{'lasso__alpha': 2, 'polynomialfeatures__degree': 6}

In [54]:
grid_3 = GridSearchCV(make_pipeline(PolynomialFeatures(degree=2), linear_model.Ridge(alpha=1, tol=0.1)),
                    param_grid={'polynomialfeatures__degree': [1, 2, 3, 4],
                    'ridge__alpha': [0.1, 1, 2, 3, 4, 5, 6, 20]},
                    cv=5,
                    refit=True)
grid_3.fit(X, y)
grid_3.best_params_

{'polynomialfeatures__degree': 4, 'ridge__alpha': 20}

In [55]:
grid_4 = GridSearchCV(make_pipeline(PolynomialFeatures(degree=2), linear_model.LinearRegression()),
                    param_grid={'polynomialfeatures__degree': [1, 2, 3, 4]},
                    cv=kfold,
                    refit=True)
grid_4.fit(X, y)
print(grid_4.best_params_)
grid_4.best_estimator_

{'polynomialfeatures__degree': 3}


Pipeline(memory=None,
     steps=[('polynomialfeatures', PolynomialFeatures(degree=3, include_bias=True, interaction_only=False)), ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

In [62]:
from sklearn.svm import SVR
grid_5 = GridSearchCV(SVR(kernel='rbf', gamma=0.1),
                    param_grid={"C": [1e0, 1e1, 1e2, 1e3], "gamma": np.logspace(-2, 2, 5)},
                    cv=kfold,
                    refit=True)
grid_5.fit(X, y)
print(grid_5.best_params_)
grid_5.best_estimator_

{'C': 100.0, 'gamma': 0.01}


SVR(C=100.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.01,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [70]:
from sklearn.ensemble import RandomForestRegressor
param_grid = {
    'n_estimators': [200, 700],
    'max_depth':  [10, 100, 1000],
    'max_features': ['auto', 'sqrt', 'log2'],
    
}
rfc = RandomForestRegressor(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True) 
grid_6 = GridSearchCV(estimator=rfc, 
                      param_grid=param_grid, 
                      cv= kfold)
grid_6.fit(X, y)
print(grid_6.best_params_)
grid_6.best_estimator_

{'max_depth': 1000, 'max_features': 'auto', 'n_estimators': 700}


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=1000,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=700, n_jobs=-1,
           oob_score=True, random_state=None, verbose=0, warm_start=False)

In [71]:
from sklearn import  metrics

models = []
models.append(('ElasticNet', grid_1.best_estimator_))
models.append(('Lasso', grid_2.best_estimator_))
models.append(('Ridge', grid_3.best_estimator_))
models.append(('LR', grid_4.best_estimator_))
models.append(('SVR', grid_5.best_estimator_))
models.append(('RF', grid_6.best_estimator_))

r2 = []
explained_variance_score = []
median_absolute_error = []
mean_squared_error = []
mean_absolute_error = []
for name, model in models:
    print(name)
    print("R^2: {}".format(metrics.r2_score(y_test, model.predict(X_test)) ))
    print("Explained variance score: {}".format( metrics.explained_variance_score(y_test, model.predict(X_test)) ))
    print("Median absolute error: {}".format( metrics.median_absolute_error(y_test, model.predict(X_test)) ))
    print("Mean squared error: {}".format( metrics.mean_squared_error(y_test, model.predict(X_test)) ))
    print("Mean absolute errors: {}".format(metrics.mean_absolute_error(y_test, model.predict(X_test)) ))
    r2.append(metrics.r2_score(y_test, model.predict(X_test)))
    explained_variance_score.append(metrics.explained_variance_score(y_test, model.predict(X_test)))
    median_absolute_error.append( metrics.median_absolute_error(y_test, model.predict(X_test)))
    mean_squared_error.append(metrics.mean_squared_error(y_test, model.predict(X_test)))
    mean_absolute_error.append(metrics.mean_absolute_error(y_test, model.predict(X_test)))

ElasticNet
R^2: 0.9724676237402108
Explained variance score: 0.9724830407988655
Median absolute error: 0.38710446950646205
Mean squared error: 0.7247481193005187
Mean absolute errors: 0.524617713629671
Lasso
R^2: 0.9785715168227413
Explained variance score: 0.9785815253035994
Median absolute error: 0.31688110691734117
Mean squared error: 0.5640723755785249
Mean absolute errors: 0.4504783167943076
Ridge
R^2: 0.9933790264226015
Explained variance score: 0.9934143037909097
Median absolute error: 0.2278551696769373
Mean squared error: 0.17428710485720927
Mean absolute errors: 0.2919217418248228
LR
R^2: 0.9889630782979898
Explained variance score: 0.9890223389225979
Median absolute error: 0.20513645440356543
Mean squared error: 0.29053025321615344
Mean absolute errors: 0.3285145521615667
SVR
R^2: 0.9996315515652928
Explained variance score: 0.9996318478507834
Median absolute error: 0.10002915898153031
Mean squared error: 0.00969884718970693
Mean absolute errors: 0.09786230745172768
RF
R^2: 

In [72]:
import pandas as pd
d = {'r2': r2, 
     'explained_variance_score': explained_variance_score, 
     'median_absolute_error': median_absolute_error,
     'mean_squared_error' : mean_squared_error,
     'mean_absolute_error' : mean_absolute_error,
    }
df = pd.DataFrame(data=d)
df.insert(loc=0, column='Method', value=['ElasticNet','Lasso','Ridge','LR', 'SVR', 'RF'])
df

Unnamed: 0,Method,explained_variance_score,mean_absolute_error,mean_squared_error,median_absolute_error,r2
0,ElasticNet,0.972483,0.524618,0.724748,0.387104,0.972468
1,Lasso,0.978582,0.450478,0.564072,0.316881,0.978572
2,Ridge,0.993414,0.291922,0.174287,0.227855,0.993379
3,LR,0.989022,0.328515,0.29053,0.205136,0.988963
4,SVR,0.999632,0.097862,0.009699,0.100029,0.999632
5,RF,0.997108,0.199755,0.078083,0.157786,0.997034
