In [1]:
# Load the Boston housing data used by the regression examples below.
import numpy as np
import pandas as pd

# Names of the 13 predictor columns, in the order the dataset stores them.
BOSTON_FEATURE_NAMES = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']

try:
    # Only available in scikit-learn < 1.2 (deprecated since 1.0).
    from sklearn.datasets import load_boston
    dataset = load_boston()
    data, target = dataset.data, dataset.target
    feature_names = dataset.feature_names
except ImportError:
    # load_boston was removed in scikit-learn 1.2. Rebuild the same arrays
    # from the original StatLib file, following the recipe given in the
    # scikit-learn removal notice (requires network access).
    # Each record in that file is wrapped over two physical lines: the even
    # rows hold the first 11 values, the odd rows hold the remaining two
    # predictors followed by the target.
    raw_df = pd.read_csv('http://lib.stat.cmu.edu/datasets/boston',
                         sep=r'\s+', skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]
    feature_names = BOSTON_FEATURE_NAMES

# Wrap predictors and target in DataFrames; the target column is named 'y'.
X = pd.DataFrame(data, columns=feature_names)
y = pd.DataFrame(target, columns=['y'])

# Sanity checks: shapes, target summary statistics and the first rows.
separator = '----------------------------------------------------------------------------------------'
print(separator)
print('X shape: (%i,%i)' % X.shape)
print('y shape: (%i,%i)' % y.shape)
print(separator)
print(y.describe())
print(separator)
print(X.join(y).head())

----------------------------------------------------------------------------------------
X shape: (506,13)
y shape: (506,1)
----------------------------------------------------------------------------------------
                y
count  506.000000
mean    22.532806
std      9.197104
min      5.000000
25%     17.025000
50%     21.200000
75%     25.000000
max     50.000000
----------------------------------------------------------------------------------------
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT     y  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21

In [3]:
# import libraries
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

# Flatten the single-column target frame into the 1-D array the estimators
# expect. DataFrame.as_matrix() was removed in pandas 1.0; .values is
# available on every pandas version and returns the same ndarray.
y_true = y.values.ravel()

# Pipelines: standardize the features, then fit the linear estimator.
pipe_ols = Pipeline([('scl', StandardScaler()), ('est', LinearRegression())])
pipe_ridge = Pipeline([('scl', StandardScaler()), ('est', Ridge())])

# Fit both models on the full data set.
pipe_ols.fit(X, y_true)
pipe_ridge.fit(X, y_true)

# Predict on the same rows the models were fitted on, so the scores printed
# below are in-sample R2 values, not an estimate of generalization.
y_pred_ols = pipe_ols.predict(X)
y_pred_ridge = pipe_ridge.predict(X)

# print the performance
print('R2 score of the OLS model: %.6f' % r2_score(y_true, y_pred_ols))
print('R2 score of the Ridge model: %.6f' % r2_score(y_true, y_pred_ridge))

R2 score of the OLS model: 0.740608
R2 score of the Ridge model: 0.740596


In [6]:
# import libraries
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Regularization strength passed to Ridge for this comparison.
RIDGE_ALPHA = 10

# Hold out 20% of the rows as a test set. This is a single train/test split,
# not k-fold cross-validation; random_state fixes the split for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

# Flatten the single-column target frames to 1-D arrays.
# DataFrame.as_matrix() was removed in pandas 1.0; .values works everywhere.
y_train_arr = y_train.values.ravel()
y_test_arr = y_test.values.ravel()

# make pipelines for modeling: standardize, then fit the linear estimator
pipe_ols = Pipeline([('scl', StandardScaler()), ('est', LinearRegression())])
pipe_ridge = Pipeline([('scl', StandardScaler()), ('est', Ridge(alpha=RIDGE_ALPHA))])

# build models on the training portion only
pipe_ols.fit(X_train, y_train_arr)
pipe_ridge.fit(X_train, y_train_arr)

# Report R2 on the held-out test set.
print('-----------------------------------------------------')
print('Test Score of OLS : %.6f' % r2_score(y_test_arr, pipe_ols.predict(X_test)))
print('Test Score of Ridge : %.6f' % r2_score(y_test_arr, pipe_ridge.predict(X_test)))

# Compare the summed absolute regression coefficients of the two models.
# This illustrates the effect of the ridge penalty term; it is not a
# measure of model performance.
print('-----------------------------------------------------')
print('Absolute Sum of coefficient of OLS  model: %.6f' % np.absolute(pipe_ols.named_steps['est'].coef_).sum())
print('Absolute Sum of coefficient of Ridge  model: %.6f' % np.absolute(pipe_ridge.named_steps['est'].coef_).sum())

-----------------------------------------------------
Test Score of OLS : 0.763481
Test Score of Ridge : 0.761724
-----------------------------------------------------
Absolute Sum of coefficient of OLS  model: 22.070732
Absolute Sum of coefficient of Ridge  model: 19.487876
