In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets, model_selection, linear_model
from sklearn.metrics import mean_squared_error

import warnings
# NOTE(review): this suppresses every warning for the rest of the session, so deprecation and
# data-conversion warnings raised by the cells below are never shown.
warnings.filterwarnings("ignore")

<br>
<br>

## 1. Feature engineering & Feature selection <span style="font-size:0.7em;"> (+ 데이터 읽어들이기 & Binary label 만들어주기)</span>

In [2]:
# Load the 13 feature columns; the first spreadsheet column is used as the row index.
df_data = pd.read_excel('boston_house_data.xlsx', index_col=0)
df_data.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']

# Load the target the same way and give its single column a readable name.
df_target = pd.read_excel('boston_house_target.xlsx', index_col=0)
df_target.columns = ['Price']

# Binarize the target: 1 if the price is above the mean price of the whole data set, else 0.
# The vectorized comparison replaces the per-row `apply(lambda ...)`; the explicit int64
# keeps the dtype identical to what the element-wise version produced.
mean_price = df_target['Price'].mean()
df_target['Price'] = (df_target['Price'] > mean_price).astype('int64')

In [3]:
# df_data['RAD'].value_counts(sort=False)

In [3]:
# Preview the first three feature rows.
df_data.head(3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03


- CRIM : **범죄율**
- ZN : **25,000 평방피트를 초과하는 거주지역 비율**
- INDUS : **비소매상업지역 면적 비율** 
- CHAS : **찰스강의 경계에 위치한 경우는 1, 아니면 0**
- NOX : **일산화질소 농도**
- RM : **주택당 방 수 (거실 외 subroom)**
- AGE : **1940년 이전에 건축된 주택의 비율**
- DIS : **보스턴 5개 직업센터까지의 가중 거리**
- RAD : **방사형 고속도로 접근성 지수**
- TAX : **재산세율**
- PTRATIO : **학생/교사 비율**
- B : **1000(Bk - 0.63)² (Bk : 인구 중 흑인 비율)**
- LSTAT : **인구 중 하위 계층 비율**

In [4]:
# Preview the binarized target (1 = price above the mean, 0 = otherwise).
df_target.head(3)

Unnamed: 0,Price
0,1
1,0
2,1


<br>
<br>

## 2. Train-Test split

In [5]:
# Hold out 30% of the rows as the test set; random_state=0 pins the shuffle so that
# the same split is produced on every run.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    df_data, df_target, test_size=0.3, random_state=0
)

<br>
<br>

## 3. Make Pipeline for feature-transformer (StandardScaler & OneHotEncoder)

In [6]:
# Preview the first three rows of the training features.
x_train.head(3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
141,1.62864,0.0,21.89,0,0.624,5.019,100.0,1.4394,4,437,21.2,396.9,34.41
272,0.1146,20.0,6.96,0,0.464,6.538,58.7,3.9175,3,223,18.6,394.96,7.73
135,0.55778,0.0,21.89,0,0.624,6.335,98.2,2.1107,4,437,21.2,394.67,16.96


In [7]:
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [10]:
# Columns handled as continuous values. This is the same list as
# list(df_data.columns) with 'CHAS' and 'RAD' removed.
numeric_features = [
    'CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE',
    'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]
numeric_transformer = StandardScaler()  # cf) RobustScaler

# Columns handled as discrete categories and one-hot encoded.
categorical_features = ['CHAS', 'RAD']
# categories='auto'       : spelled out only to avoid a warning message.
# handle_unknown='ignore' : a category that shows up at transform time but was not
#                           seen in fit is encoded as all zeros for that feature.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', categories='auto')

# Route each column group to its transformer.
# Each entry is (name, transformer, column(s)).
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

<br>
<br>

## 4. Pipeline usage - 1) Preprocessing-only (fit & transform)

In [11]:
# Preprocessing-only pipeline: a single ColumnTransformer step and no estimator.
preprocessor_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
])

In [12]:
# Fit the preprocessing steps on the training split only (the test split is never used for fitting).
preprocessor_pipe.fit(x_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['CRIM', 'ZN', 'INDUS', 'NOX',
                                                   'RM', 'AGE', 'DIS', 'TAX',
                                                   'PTRATIO', 'B', 'LSTAT']),
                                                 ('cat', OneHotEncoder(),
                                                  ['CHAS', 'RAD'])]))])

In [13]:
# Apply the already-fitted preprocessing to both splits.
x_train_transformed = preprocessor_pipe.transform(x_train)
x_test_transformed = preprocessor_pipe.transform(x_test)

# If a column listed in categorical_features above contains text (strings) rather than numbers,
# the object returned by .transform() may be a csr_matrix instead of an np.array.
# In that case, call .todense() right after .transform().
# NOTE(review): per the scikit-learn docs, the sparse/dense choice appears to depend on the overall
# density of the stacked output versus ColumnTransformer's `sparse_threshold`, not on the column
# dtype itself — confirm. Also consider `.toarray()`, which yields a plain ndarray where
# `.todense()` yields an np.matrix.

# ex) preprocessor_pipe.transform(x_train).todense()

In [14]:
# 11개의 numeric variables & 2개의 categorical variables (각각 2개 & 9개 카테고리) 
# -> 11 + (2 + 9) = 22개의 새로운 열

# x_train_transformed[0] 

In [15]:
# Show the first three rows of the transformed training matrix.
pd.DataFrame(x_train_transformed).head(3)

# When preprocessing is done through a Pipeline as above,
# inverse_transform does not work. (For new incoming data, only preprocessing -> predict is performed.)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,-0.204163,-0.499979,1.548016,0.588213,-1.839367,1.107402,-1.12511,0.206735,1.227257,0.424543,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,-0.385843,0.346774,-0.589747,-0.797821,0.327487,-0.367661,0.07509,-1.049493,0.056963,0.401853,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.332663,-0.499979,1.548016,0.588213,0.037907,1.043114,-0.799984,0.206735,1.227257,0.398461,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [16]:
from sklearn.ensemble import GradientBoostingClassifier

# Train on the matrix produced by the preprocessing-only pipeline.
# The labels are passed as the 1-D 'Price' column instead of the one-column DataFrame:
# scikit-learn expects y of shape (n_samples,) and otherwise raises a DataConversionWarning
# (hidden in this notebook by the global warnings filter) before flattening y itself.
model = GradientBoostingClassifier(n_estimators=200, random_state=0)
model.fit(x_train_transformed, y_train['Price']) # <- x_train_transformed (not x_train)

# Mean accuracy on the held-out split, which has gone through the same fitted preprocessing.
accuracy = model.score(x_test_transformed, y_test['Price'])
print("model score:", round(accuracy, 4))

model score: 0.8553


<br>
<br>

## 4. Pipeline usage - 2) Preprocessing + Training (at once)

In [17]:
from sklearn.ensemble import GradientBoostingClassifier

# Chain preprocessing and the classifier so that a single fit/score call handles both.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', GradientBoostingClassifier(n_estimators=200, random_state=0))])

# The labels are passed as the 1-D 'Price' column instead of the one-column DataFrame:
# scikit-learn expects y of shape (n_samples,) and otherwise raises a DataConversionWarning
# (hidden in this notebook by the global warnings filter) before flattening y itself.
model.fit(x_train, y_train['Price']) # <- x_train (not x_train_transformed)

# The pipeline applies the fitted preprocessing to x_test before scoring.
accuracy = model.score(x_test, y_test['Price'])
print("model score:", round(accuracy, 4))

model score: 0.8553


<br>
<br>

## 4. Pipeline usage - 3) Preprocessing + Training + Tuning hyper-params (at once)

In [18]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Same two-step pipeline, but the classifier is created with its default
# hyper-parameters so that the grid search can supply them.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier()),  # removed hyper-params
])

In [19]:
# List every tunable parameter name; parameters of nested steps are addressed as '<step>__<param>'.
model.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'classifier', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__num', 'preprocessor__cat', 'preprocessor__num__copy', 'preprocessor__num__with_mean', 'preprocessor__num__with_std', 'preprocessor__cat__categories', 'preprocessor__cat__drop', 'preprocessor__cat__dtype', 'preprocessor__cat__handle_unknown', 'preprocessor__cat__sparse', 'classifier__ccp_alpha', 'classifier__criterion', 'classifier__init', 'classifier__learning_rate', 'classifier__loss', 'classifier__max_depth', 'classifier__max_features', 'classifier__max_leaf_nodes', 'classifier__min_impurity_decrease', 'classifier__min_impurity_split', 'classifier__min_samples_leaf', 'classifier__min_samples_split', 'classifier__min_weight_fraction_leaf', 'classifier__n_estimators', 'classifier__n_iter_no_change', 'classifier__random_state', 

In [21]:
# Search space. Keys use the '<step>__<param>' form so they reach the pipeline's 'classifier' step.
# 2 * 2 * 2 * 2 * 2 * 1 = 32 candidates, each evaluated with 3-fold cross-validation.
# NOTE(review): newer scikit-learn releases rename the 'deviance' loss to 'log_loss';
# 'deviance' is kept here to match the version this notebook was run with — confirm before upgrading.
param_grid = {
    'classifier__loss': ['deviance', 'exponential'], # you can exclude "deviance"
    'classifier__learning_rate': [0.01, 0.001], # you can exclude "0.001"
    'classifier__n_estimators': [200, 400], # 500, 1000, 1500
    'classifier__min_samples_split': [2, 4],
    'classifier__max_depth': [2, 4],
    'classifier__random_state': [0]
}

# refit=True retrains the best candidate on the full training split, so grid_search
# can be scored directly afterwards.
grid_search = GridSearchCV(model, param_grid,
                           refit=True, cv=3, n_jobs=1, verbose=1, scoring='accuracy')

# The labels are passed as the 1-D 'Price' column instead of the one-column DataFrame:
# scikit-learn expects y of shape (n_samples,) and otherwise raises a DataConversionWarning
# on every fit (hidden in this notebook by the global warnings filter).
grid_search.fit(x_train, y_train['Price'])
print("Best params:", grid_search.best_params_)

# Accuracy of the refitted best pipeline on the held-out split.
accuracy = grid_search.score(x_test, y_test['Price'])
print("\nmodel score:", round(accuracy, 4))

Best params: {'classifier__learning_rate': 0.01, 'classifier__loss': 'deviance', 'classifier__max_depth': 4, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 400, 'classifier__random_state': 0}

model score: 0.8421


In [None]:
# model = Pipeline(steps=[('preprocessor', preprocessor),
#                         ('classifier', GradientBoostingClassifier(learning_rate=0.01, 
#                                                                   loss='exponential', 
#                                                                   max_depth=4, 
#                                                                   min_samples_split=2, 
#                                                                   n_estimators=200, 
#                                                                   random_state=0))])

# model.fit(x_train, y_train)

# accuracy = model.score(x_test, y_test)
# print("model score:", round(accuracy, 4))