## 모듈화
- 순서 (전처리 등은 생략)
   - import data
   - scaling
   - split
   - model fit
   - gridsearch
   - predict using best params

In [167]:
import pandas as pd
import numpy as np
data = pd.read_csv('./house_price_data/boston.csv')
X = data.drop('medv', axis = 1)
y = data['medv']

# scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
new_X = pd.DataFrame(scaled_X, columns = X.columns)

# split
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(new_X, y, test_size = 0.3, random_state = 42)

# model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

model_lr = LinearRegression()
model_lr.fit(train_X, train_y)
lr_pred_y = model_lr.predict(test_X)
print('LinearRegression R2:', r2_score(test_y, lr_pred_y))

# grid CV
from sklearn.model_selection import GridSearchCV
params_rf = {'n_estimators': [10, 100]}
model_rf = RandomForestRegressor()
grid_rf = GridSearchCV(model_rf, param_grid = params_rf, cv = 3, scoring = 'r2')
grid_rf.fit(train_X, train_y)
print('best_parameters:', grid_rf.best_params_)
print('best_score:', grid_rf.best_score_)

best_param = grid_rf.best_estimator_
rf_pred_y = best_param.predict(test_X)
print('RandomForest R2:', r2_score(test_y, rf_pred_y))

LinearRegression R2: 0.7112260057484926
best_parameters: {'n_estimators': 100}
best_score: 0.8092853506679635
RandomForest R2: 0.8698652941148588


## import data

In [3]:
import pandas as pd
import numpy as np

data = pd.read_csv('./house_price_data/boston.csv')
print(data.shape)
print(data.columns)
data.head()

(506, 14)
Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'black', 'lstat', 'medv'],
      dtype='object')


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


## scaling and train, test split (hold out CV)

In [78]:
## predictor and target
X = data.drop('medv', axis = 1)
y = data['medv']

## scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# StandardScaler는 (-Inf, Inf) 주의 & fit_transform 함수 지원
std_scaler = StandardScaler()
std_scaled_X = std_scaler.fit_transform(X)
scaled_X = pd.DataFrame(scaled_X, columns = X.columns)

# MinMaxScaler는 fit_transform 지원 안 함 & 원 데이터의 분포 유지
min_max_scaler = MinMaxScaler()
min_max_scaled_X = min_max_scaler.fit(X).transform(X)
minmax_X = pd.DataFrame(min_max_scaled_X, columns = X.columns)

# train_test_split
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(scaled_X, y, test_size = 0.3)
print(train_X.shape, test_X.shape)
print(train_y.shape, test_y.shape)

(354, 13) (152, 13)
(354,) (152,)


## 튜닝 필요없는 모형
- 선형 회귀모형
- 바로 적합한 후, predict 진행
- score는 mean_squared_error, mean_absolute_error,r2_score 사용한 후 추후 비교

In [74]:
## 튜닝 필요 없는 모형
# 선형 모형
from sklearn.linear_model import LinearRegression

model_lr = LinearRegression()
model_lr.fit(train_X, train_y)

## predict
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 적합된 모형 model_lr로 predict
lr_pred_y = model_lr.predict(train_X)

# train data의 성능 확인
models = {'LinearRegression':{'model': model_lr,
                              'RMSE': round(mean_squared_error(train_y, lr_pred_y, squared = False), 2),
                              'MAE': round(mean_absolute_error(train_y, lr_pred_y), 2),
                              'R2': round(r2_score(train_y, lr_pred_y), 2)}
                              }
models

RMSE of LinearRegression: 4.43
MAE of LinearRegression: 3.11
R2 of LinearRegression: 77.24


{'LinearRegression': {'model': LinearRegression(),
  'RMSE': 4.43,
  'MAE': 3.11,
  'R2': 0.77}}

## 튜닝 필요한 모형들
- GridSearchCV로 간단한 튜닝 진행 후 prediction

In [144]:
# StandardScaler 사용한 결과
## 튜닝 필요한 모형들
from sklearn.linear_model import Lasso, Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor

## model dictionary
model_name = {'Lasso': Lasso(),
              'Ridge': Ridge(),
              'SVR': SVR(),
              'DecisionTree': DecisionTreeRegressor(),
              'KNN': KNeighborsRegressor(),
              'RandomForest': RandomForestRegressor(),
              'GradientBoosting': GradientBoostingRegressor(),
              'AdaBoost': AdaBoostRegressor(),
              'ExtraTree': ExtraTreesRegressor(),
              'XGB': XGBRegressor()
              }

for name, model in model_name.items():
    models[name] = {'model': model}

# 튜닝 없이 fit and predict
for model in models.keys():
    models[model]['model'].fit(train_X, train_y)
    pred_y = models[model]['model'].predict(train_X)
    models[model]['RMSE'] = round(mean_squared_error(train_y, pred_y, squared = False), 2)
    models[model]['MAE'] = round(mean_absolute_error(train_y, pred_y), 2)
    models[model]['R2'] = round(r2_score(train_y, pred_y), 2)

    pred_test_y = models[model]['model'].predict(test_X)
    models[model]['test_RMSE'] = round(mean_squared_error(test_y, pred_test_y, squared = False), 2)
    models[model]['test_MAE'] = round(mean_absolute_error(test_y, pred_test_y), 2)
    models[model]['test_R2'] = round(r2_score(test_y, pred_test_y), 2)

score_df = pd.DataFrame(models.values())
score_df

Unnamed: 0,model,RMSE,MAE,R2,test_RMSE,test_MAE,test_R2
0,LinearRegression(),4.64,3.28,0.75,4.84,3.37,0.72
1,Lasso(),5.34,3.72,0.67,5.47,3.88,0.64
2,Ridge(),4.64,3.28,0.75,4.83,3.37,0.72
3,SVR(),5.11,2.9,0.69,5.96,3.33,0.57
4,DecisionTreeRegressor(),0.0,0.0,1.0,4.26,2.89,0.78
5,KNeighborsRegressor(),3.49,2.25,0.86,5.23,3.11,0.67
6,"(DecisionTreeRegressor(max_features=1.0, rando...",1.34,0.84,0.98,3.55,2.53,0.85
7,([DecisionTreeRegressor(criterion='friedman_ms...,1.25,0.98,0.98,3.24,2.36,0.87
8,"(DecisionTreeRegressor(max_depth=3, random_sta...",2.77,2.3,0.91,4.21,2.97,0.79
9,"(ExtraTreeRegressor(random_state=639252378), E...",0.0,0.0,1.0,3.24,2.24,0.87


In [77]:
# MinMaxScaler 사용한 결과
# StandardScaler 사용한 것이 훨씬 나았음.

Unnamed: 0,model,RMSE,MAE,R2,test_RMSE,test_MAE,test_R2
0,LinearRegression(),4.46,3.14,0.76,5.24,3.59,0.69
1,Lasso(),7.89,5.63,0.25,8.22,5.74,0.23
2,Ridge(),4.51,3.12,0.75,5.27,3.62,0.69
3,SVR(),5.61,3.33,0.62,6.14,3.8,0.57
4,DecisionTreeRegressor(),0.0,0.0,1.0,5.83,3.76,0.61
5,KNeighborsRegressor(),3.98,2.51,0.81,5.23,3.36,0.69
6,"(DecisionTreeRegressor(max_features=1.0, rando...",1.18,0.85,0.98,4.33,2.4,0.79
7,([DecisionTreeRegressor(criterion='friedman_ms...,1.31,1.01,0.98,4.22,2.42,0.8
8,"(DecisionTreeRegressor(max_depth=3, random_sta...",2.73,2.25,0.91,4.92,3.13,0.73
9,"(ExtraTreeRegressor(random_state=545581124), E...",0.0,0.0,1.0,3.93,2.22,0.82


In [52]:
# 이건 scaled 안 한 결과
# 확실히 scaling을 한 효과가 보임. SVR이나, KNN 등의 train 성능이 상당히 올라감.
# 반면, 부스팅의 test 성능은 약간 떨어짐.

Unnamed: 0,model,RMSE,MAE,R2,test_RMSE,test_MAE,test_R2
0,LinearRegression(),4.52,3.24,0.77,5.19,3.64,0.63
1,Lasso(),5.07,3.59,0.71,5.02,3.52,0.65
2,Ridge(),4.57,3.24,0.77,5.06,3.43,0.64
3,SVR(),8.54,5.51,0.19,7.63,4.87,0.19
4,DecisionTreeRegressor(),0.0,0.0,1.0,4.22,3.03,0.75
5,KNeighborsRegressor(),4.94,3.44,0.73,6.31,4.29,0.45
6,"(DecisionTreeRegressor(max_features=1.0, rando...",1.46,0.87,0.98,3.42,2.37,0.84
7,([DecisionTreeRegressor(criterion='friedman_ms...,1.29,1.01,0.98,2.71,2.14,0.9
8,"(DecisionTreeRegressor(max_depth=3, random_sta...",2.84,2.34,0.91,3.77,2.82,0.8
9,"(ExtraTreeRegressor(random_state=2115368027), ...",0.0,0.0,1.0,2.86,2.08,0.89


## 튜닝 필요 모형 튜닝 진행

In [145]:
## model params (맛보기만)
model_params = {'LinearRegression': None,
                'Lasso': {'alpha': [0.2, 0.5, 1]},
                'Ridge': {'alpha': [0.2, 0.5, 1]},
                'SVR': {'C': [100], 'kernel': ['linear', 'rbf'], 'epsilon': [0.1], 'gamma': [0.1]},
                'DecisionTree': {'max_depth': [2, 3, 4, 5]},
                'KNN': {'n_neighbors': [3, 4, 5, 6]},
                'RandomForest': {'max_depth': [3, 4, 5], 'n_estimators': [10, 100]},
                'GradientBoosting': {'n_estimators': [10, 50, 100]},
                'AdaBoost': {'n_estimators': [10, 50, 100]},
                'ExtraTree': {'n_estimators': [10, 50, 100]},
                'XGB': {'n_estimators': [10, 50, 100]}
                }

from sklearn.model_selection import GridSearchCV

for model in models.keys():
    if model != 'LinearRegression':
        model_fit = GridSearchCV(models[model]['model'], param_grid = model_params[model],
        cv = 3, scoring = 'r2')
        model_fit.fit(train_X, train_y)

        best_param = model_fit.best_estimator_
        pred_y = best_param.predict(test_X)

        models[model]['gridCV_test_RMSE'] = round(mean_squared_error(test_y, pred_y, squared = False), 2)
        models[model]['gridCV_test_MAE'] = round(mean_absolute_error(test_y, pred_y), 2)
        models[model]['gridCV_test_R2'] = round(r2_score(test_y, pred_y), 2)
    
GridsearchCV_score_df = pd.DataFrame(models.values())
GridsearchCV_score_df

Unnamed: 0,model,RMSE,MAE,R2,test_RMSE,test_MAE,test_R2,gridCV_test_RMSE,gridCV_test_MAE,gridCV_test_R2
0,LinearRegression(),4.64,3.28,0.75,4.84,3.37,0.72,,,
1,Lasso(),5.34,3.72,0.67,5.47,3.88,0.64,4.95,3.4,0.7
2,Ridge(),4.64,3.28,0.75,4.83,3.37,0.72,4.83,3.37,0.72
3,SVR(),5.11,2.9,0.69,5.96,3.33,0.57,4.03,2.38,0.8
4,DecisionTreeRegressor(),0.0,0.0,1.0,4.26,2.89,0.78,4.61,3.16,0.74
5,KNeighborsRegressor(),3.49,2.25,0.86,5.23,3.11,0.67,5.15,3.04,0.68
6,"(DecisionTreeRegressor(max_features=1.0, rando...",1.34,0.84,0.98,3.55,2.53,0.85,3.63,2.52,0.84
7,([DecisionTreeRegressor(criterion='friedman_ms...,1.25,0.98,0.98,3.24,2.36,0.87,3.28,2.38,0.87
8,"(DecisionTreeRegressor(max_depth=3, random_sta...",2.77,2.3,0.91,4.21,2.97,0.79,4.16,2.97,0.79
9,"(ExtraTreeRegressor(random_state=639252378), E...",0.0,0.0,1.0,3.24,2.24,0.87,3.29,2.28,0.87
