<a href="https://colab.research.google.com/github/skyworld19/ds-school-advanced/blob/master/ML_05_%ED%9A%8C%EA%B7%80%EB%AA%A8%EB%8D%B8%ED%8F%89%EA%B0%80%EC%A7%80%ED%91%9C_GridSearchCV_LAB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 보스톤 데이터 - Ridge

In [1]:
# 라이브러리 import
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

In [2]:
# 데이터 가져오기
boston = load_boston()
X = boston['data']
y = boston['target']
feature_names = boston['feature_names']
df = pd.DataFrame(X, columns=feature_names)
df['PRICE'] = y
df.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [3]:
# 데이터 전처리, 분할
X = df.loc[:, 'CRIM':'LSTAT']
y = df.loc[:, 'PRICE']
degree = 3
x_scaled = MinMaxScaler().fit_transform(X)
P = PolynomialFeatures(degree=degree, include_bias=False)
x_poly = P.fit_transform(x_scaled)
x_train, x_test, y_train, y_test = train_test_split(x_poly, y, random_state=0)
[a.shape for a in (x_train, x_test, y_train, y_test)]

[(379, 559), (127, 559), (379,), (127,)]

In [4]:
# 모델링, 평가
alpha=0.1
model = Ridge(alpha=alpha).fit(x_train, y_train)
model.score(x_train, y_train), model.score(x_test, y_test)

(0.9537155397034656, 0.7905570432110397)

### 예측값 구하기

In [6]:
y_pred = model.predict(x_test)

### $R^2$ score
- ```model.score(X, y)```
- ```sklearn.metrics.r2_score(y_true, y_pred)```

In [11]:
from sklearn.metrics import r2_score
model.score(x_test, y_test), r2_score(y_test, y_pred) #y_test:실제정답

(0.7905570432110397, 0.7905570432110397)

### MAE(Mean Absolute Error)
```sklearn.metrics.mean_absolute_error(y_true, y_pred)```

In [8]:
from sklearn.metrics import mean_absolute_error as mae
mae(y_test, y_pred)

2.7419189417256002

### MSE(Mean Squared Error)
```sklearn.metrics.mean_squared_error(y_true, y_pred)```

In [9]:
from sklearn.metrics import mean_squared_error as mse
mse(y_test, y_pred)

17.111283360088102

### RMSE(Root Mean Squared Error)
- sklearn API에 없음
- MSE에 math.sqrt() 를 사용함

In [10]:
from sklearn.metrics import mean_squared_error as mse
import math
mse(y_test, y_pred)**0.5

4.136578702271734

### GridSearchCV를 사용한 hyperparameter 찾기
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
- https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

In [12]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from  sklearn.model_selection import GridSearchCV

In [13]:
# 데이터 가져오기
boston = load_boston()
X = boston['data']
y = boston['target']
feature_names = boston['feature_names']
df = pd.DataFrame(X, columns=feature_names)
df['PRICE'] = y
df.to_csv('boston.csv', index=False)

In [14]:
## 데이터 섞어 주기
from sklearn.utils import shuffle
df = pd.read_csv('boston.csv')
dfA = df.sample(frac=1, random_state=0) 
dfB = shuffle(df, random_state=0)

In [15]:
dfA.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
329,0.06724,0.0,3.24,0.0,0.46,6.333,17.2,5.2146,4.0,430.0,16.9,375.21,7.34,22.6
371,9.2323,0.0,18.1,0.0,0.631,6.216,100.0,1.1691,24.0,666.0,20.2,366.15,9.53,50.0
219,0.11425,0.0,13.89,1.0,0.55,6.373,92.4,3.3633,5.0,276.0,16.4,393.74,10.5,23.0
403,24.8017,0.0,18.1,0.0,0.693,5.349,96.0,1.7028,24.0,666.0,20.2,396.9,19.77,8.3
78,0.05646,0.0,12.83,0.0,0.437,6.232,53.7,5.0141,5.0,398.0,18.7,386.4,12.34,21.2


In [16]:
dfB.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
329,0.06724,0.0,3.24,0.0,0.46,6.333,17.2,5.2146,4.0,430.0,16.9,375.21,7.34,22.6
371,9.2323,0.0,18.1,0.0,0.631,6.216,100.0,1.1691,24.0,666.0,20.2,366.15,9.53,50.0
219,0.11425,0.0,13.89,1.0,0.55,6.373,92.4,3.3633,5.0,276.0,16.4,393.74,10.5,23.0
403,24.8017,0.0,18.1,0.0,0.693,5.349,96.0,1.7028,24.0,666.0,20.2,396.9,19.77,8.3
78,0.05646,0.0,12.83,0.0,0.437,6.232,53.7,5.0141,5.0,398.0,18.7,386.4,12.34,21.2


In [17]:
# 데이터 전처리, 분할
df = pd.read_csv('boston.csv')
df = df.sample(frac=1, random_state=0)  ## 데이터 섞어 주기
X = df.loc[:, 'CRIM':'LSTAT']
y = df.loc[:, 'PRICE']

degree = 3
scaler = MinMaxScaler()
x_scaled = scaler.fit_transform(X)
P = PolynomialFeatures(degree=degree, include_bias=False)
x_poly = P.fit_transform(x_scaled)

In [18]:
# 모델 생성
model = Ridge()    # hyper parameter를 넣지 않음

In [21]:
import numpy as np
# 파라미터를 딕셔너리 형태로 제공 (Key값 틀리면 ValueError 발생)
#alphas = [0.1, 0.2, 0.3, 1, 10]   
alphas = np.arange(0.01, 0.15, 0.01)
params = {'alpha': alphas}

In [22]:
from sklearn.model_selection import GridSearchCV
#param_grid의 하이퍼 파라미터를  3개의 train, testset fold로 나누어 테스트 수행 설정
#refit=True : 가장 좋은 파라미터 설정으로 재 학습시킴
grid_model = GridSearchCV(model, params, cv=10, refit=True)
grid_model.fit(x_poly, y)

#GridSearchCV 결과를 추출해 보기 편하게 DataFrame으로 변환
gm_df = pd.DataFrame(grid_model.cv_results_)
gm_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.031701,0.007999,0.001829,0.000544,0.01,{'alpha': 0.01},0.781692,0.817031,0.879153,0.897431,0.831928,0.936779,0.856031,0.921029,0.938065,0.943338,0.880248,0.053953,3
1,0.025301,0.007138,0.00174,0.000398,0.02,{'alpha': 0.02},0.768716,0.820748,0.887405,0.905069,0.81621,0.936607,0.88905,0.913928,0.948026,0.946741,0.88325,0.058362,1
2,0.029097,0.00753,0.002196,0.00139,0.03,{'alpha': 0.03},0.758488,0.823831,0.890149,0.906189,0.792769,0.936842,0.90262,0.909426,0.952301,0.945778,0.881839,0.063598,2
3,0.022323,0.006335,0.001352,9.1e-05,0.04,{'alpha': 0.04},0.750847,0.826202,0.891545,0.905687,0.771618,0.937157,0.909774,0.906212,0.954751,0.944024,0.879782,0.068295,4
4,0.0209,0.005933,0.001505,0.00045,0.05,{'alpha': 0.05},0.745056,0.828018,0.89238,0.904654,0.753413,0.937456,0.913972,0.903754,0.956323,0.942213,0.877724,0.072361,5
5,0.022695,0.007518,0.001561,0.000335,0.06,{'alpha': 0.060000000000000005},0.740592,0.829417,0.892927,0.903451,0.737754,0.937723,0.91657,0.901788,0.957396,0.940522,0.875814,0.075872,6
6,0.032857,0.002409,0.001882,0.000438,0.07,{'alpha': 0.06999999999999999},0.737097,0.830497,0.893312,0.902222,0.72417,0.937958,0.91821,0.900161,0.958157,0.938984,0.874077,0.078923,7
7,0.024559,0.007403,0.001528,0.000305,0.08,{'alpha': 0.08},0.734326,0.831333,0.893599,0.901025,0.712268,0.938165,0.91924,0.898779,0.958714,0.937591,0.872504,0.081595,8
8,0.024392,0.007857,0.001865,0.001073,0.09,{'alpha': 0.09},0.732106,0.831976,0.893825,0.899886,0.701738,0.93835,0.91986,0.897582,0.959129,0.936327,0.871078,0.083955,9
9,0.027218,0.006942,0.001803,0.000447,0.1,{'alpha': 0.09999999999999999},0.730313,0.832467,0.894013,0.898812,0.692338,0.938517,0.920195,0.896525,0.959445,0.935174,0.86978,0.086056,10


In [26]:
gm_df[['params','mean_test_score','rank_test_score',
       'split0_test_score','split1_test_score','split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,{'alpha': 0.01},0.880248,3,0.781692,0.817031,0.879153
1,{'alpha': 0.02},0.88325,1,0.768716,0.820748,0.887405
2,{'alpha': 0.03},0.881839,2,0.758488,0.823831,0.890149
3,{'alpha': 0.04},0.879782,4,0.750847,0.826202,0.891545
4,{'alpha': 0.05},0.877724,5,0.745056,0.828018,0.89238
5,{'alpha': 0.060000000000000005},0.875814,6,0.740592,0.829417,0.892927
6,{'alpha': 0.06999999999999999},0.874077,7,0.737097,0.830497,0.893312
7,{'alpha': 0.08},0.872504,8,0.734326,0.831333,0.893599
8,{'alpha': 0.09},0.871078,9,0.732106,0.831976,0.893825
9,{'alpha': 0.09999999999999999},0.86978,10,0.730313,0.832467,0.894013


In [23]:
gm_df.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_alpha', 'params', 'split0_test_score', 'split1_test_score',
       'split2_test_score', 'split3_test_score', 'split4_test_score',
       'split5_test_score', 'split6_test_score', 'split7_test_score',
       'split8_test_score', 'split9_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score'],
      dtype='object')

In [27]:
## 최적 파라미터, 최고 정확도 표출
print('GridSearchCV의 최적 파라미터 {}'.format(grid_model.best_params_))
print('GridSearchCV의 최고 정확도 {:.4f}'.format(grid_model.best_score_))

GridSearchCV의 최적 파라미터 {'alpha': 0.02}
GridSearchCV의 최고 정확도 0.8832


In [28]:
#최적의 Estimator를 이용하여 평가, 정확도 추측
from sklearn.metrics import r2_score
estimator = grid_model.best_estimator_
#값 예측 
pred = estimator.predict(x_test)
print('테스트 정확도 : {:.4f}'.format(r2_score(y_test, pred)))

테스트 정확도 : 0.9102
