<img align="right" src="https://ds-cs-images.s3.ap-northeast-2.amazonaws.com/Codestates_Fulllogo_Color.png" width=100>

## *AIB / SECTION 2 / SPRINT 2 / NOTE 4*

# 📝 Assignment
---

# 모델선택(Model Selection)

### 1) 캐글 대회를 이어서 진행합니다. RandomizedSearchCV 를 사용하여 하이퍼파라미터 튜닝을 진행합니다.

- [RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)를 사용하세요.
- 분류문제에서 맞는 [scoring parameter](https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values) metric을 사용하세요.
- [OrdinalEncoder](https://contrib.scikit-learn.org/categorical-encoding/ordinal.html) 사용을 권합니다.
- RandomizedSearchCV 를 사용해서 하이퍼파라미터 튜닝을 진행하고 최고 성능을 보이는 모델로 예측을 진행한 후 캐글에 제출합니다.
- **캐글 Leaderboard에서 개선된 본인 Score를 과제 제출폼에 제출하세요.**

In [None]:
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 /root/.kaggle/kaggle.json
# !pip install kaggle

# ! pip install category_encoders

In [3]:
import pandas as pd
import numpy as np

In [4]:
### EDA 시작
target = 'vacc_h1n1_f'
# target = 'vacc_seas_f'
train = pd.merge(pd.read_csv('https://ds-lecture-data.s3.ap-northeast-2.amazonaws.com/vacc_flu/train.csv'), 
                 pd.read_csv('https://ds-lecture-data.s3.ap-northeast-2.amazonaws.com/vacc_flu/train_labels.csv')[target], left_index=True, right_index=True)
test = pd.read_csv('https://ds-lecture-data.s3.ap-northeast-2.amazonaws.com/vacc_flu/test.csv')
sample_submission = pd.read_csv('https://ds-lecture-data.s3.ap-northeast-2.amazonaws.com/vacc_flu/submission.csv')

In [5]:
def engineer(df):
    """특성을 엔지니어링 하는 함수입니다."""
    
    # 높은 카디널리티를 가지는 특성을 제거합니다.
#     selected_cols = df.select_dtypes(include=['number', 'object'])
#     colnames = selected_cols.columns.tolist()
#     labels = selected_cols.nunique()
    
#     selected_features = labels[labels <= 30].index.tolist()
#     df = df[selected_features]
    
    # 새로운 특성을 생성합니다.
    behaviorals = [col for col in df.columns if 'behavioral' in col] 
    df['behaviorals'] = df[behaviorals].sum(axis=1)
    
    
    # dels = [col for col in df.columns if ('employment' in col or 'seas' in col)]
    # df.drop(columns=dels, inplace=True)
        
    return df


train = engineer(train)
test = engineer(test)

In [6]:
features = train.drop(columns=[target]).columns

X_train = train[features]
y_train = train[target]
X_test = test[features]

### EDA 끝

In [None]:
from sklearn.pipeline import make_pipeline
from category_encoders import OrdinalEncoder, TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import randint, uniform

In [None]:
%%time
pipe = make_pipeline(
    OrdinalEncoder(handle_missing="return_nan"), 
    SimpleImputer(), 
    RandomForestClassifier(random_state=2)
)

dists = {
    'randomforestclassifier__n_estimators': randint(160, 200),
    'randomforestclassifier__max_depth': randint(11, 14),
    'randomforestclassifier__max_features': uniform(0.5, 1)
}

clf = RandomizedSearchCV(
    pipe, 
    param_distributions=dists, 
    n_iter=60, 
    cv=10, 
    scoring='f1',  
    verbose=1,
    n_jobs=-1
)

clf.fit(X_train, y_train);

In [None]:
print('최적 하이퍼파라미터: ', clf.best_params_)
print('F1: ', clf.best_score_)

In [None]:
# 최적 하이퍼파라미터:  {'randomforestclassifier__max_depth': 10, 'randomforestclassifier__max_features': 0.8485466648445711, 'randomforestclassifier__n_estimators': 166}
# F1:  0.569834747536281

# 최적 하이퍼파라미터:  {'randomforestclassifier__max_depth': 11, 'randomforestclassifier__max_features': 0.7563127746684387, 'randomforestclassifier__n_estimators': 169}
# F1:  0.5699558806251526

# 최적 하이퍼파라미터:  {'randomforestclassifier__max_depth': 13, 'randomforestclassifier__max_features': 0.551813562377767, 'randomforestclassifier__n_estimators': 163}
# F1:  0.5714311138139805

# 최적 하이퍼파라미터:  {'randomforestclassifier__max_depth': 12, 'randomforestclassifier__max_features': 0.5383281712274792, 'randomforestclassifier__n_estimators': 184}
# F1:  0.5723167616525429

In [8]:
# 최고 성능 모델 불러오기
# pipe = clf.best_estimator_
pipe = make_pipeline(
    OrdinalEncoder(handle_missing="return_nan"), 
    SimpleImputer(), 
    RandomForestClassifier(max_depth=12, max_features=0.5383281712274792, n_estimators=184, random_state=2, n_jobs=-1)
)

pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['opinion_h1n1_vacc_effective',
                                      'opinion_h1n1_risk',
                                      'opinion_h1n1_sick_from_vacc',
                                      'opinion_seas_vacc_effective',
                                      'opinion_seas_risk',
                                      'opinion_seas_sick_from_vacc', 'agegrp',
                                      'employment_status', 'census_msa',
                                      'employment_industry',
                                      'employment_occupation', 'state'],
                                drop_invariant=False,
                                handl...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=12,
                                   

In [9]:
# 테스트 셋 예측 확률 얻기
y_pred_proba = pipe.predict_proba(X_train)[:, 1] # 타겟이 1(True)일 확률 가져오기

In [10]:
# ROC curve 확인하기
from sklearn.metrics import roc_curve

# roc_curve(타겟값, prob of 1)
fpr, tpr, thresholds = roc_curve(y_train, y_pred_proba)

# threshold 최대값의 인덱스, np.argmax()
optimal_idx = np.argmax(tpr - fpr) # 재현율은 최대로, 위양성률은 최소로 하는 데이터의 인덱스 값 찾기
optimal_threshold = thresholds[optimal_idx] # 해당 값의 임계값 가져오기

print('idx:', optimal_idx, ', threshold:', optimal_threshold)

idx: 3811 , threshold: 0.2727484419916789


In [11]:
# 최적 임계값 적용
y_test_pred_proba = pipe.predict_proba(X_test)[:, 1] # 타겟이 1(True)일 확률 가져오기
y_test_pred = y_test_pred_proba > optimal_threshold

In [12]:
# 최종 결과 담은 데이터프레임 생성
d = {'Id':X_test.index, 'vacc_h1n1_f': y_test_pred}
submission = pd.DataFrame(data = d)
submission.head()

Unnamed: 0,Id,vacc_h1n1_f
0,0,False
1,1,False
2,2,False
3,3,False
4,4,False


In [16]:
# 제출용 파일 생성
# submission.to_csv('submission.csv', index=False)

In [None]:
# !kaggle competitions submit -c prediction-of-h1n1-vaccination -f submission.csv -m "Gyeong ho Ahn, Fourth Submission(using Random Forest) using RandomizedSearchCV & threshold - upgraded"

## 🔥 도전과제


### 2) [`GridSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) 를 사용하여 하이퍼파라미터 튜닝을 진행합니다.
- 모델 성능을 높이기 위해 가능한 시도를 다 해보세요.
- 모델 성능에 가장 큰 영향을 준 하이퍼파라미터에 대해서 분석하고 설명해 보세요.



In [14]:
from sklearn.model_selection import GridSearchCV

pipe2 = make_pipeline(
    OrdinalEncoder(handle_missing="return_nan"), 
    SimpleImputer(), 
    RandomForestClassifier(random_state=2)
)

params = {
    'randomforestclassifier__max_depth': [5, 10, 15, 20, None],
    'randomforestclassifier__n_estimators': [50, 100, 150, 200, 250]
}

clf2 = GridSearchCV(
    pipe, 
    param_grid=params, 
    cv=3, 
    scoring='f1',  
    verbose=1,
    n_jobs=-1
)

clf2.fit(X_train, y_train);

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed: 13.4min finished


In [15]:
print('최적 하이퍼파라미터: ', clf2.best_params_)
print('F1: ', clf2.best_score_)

최적 하이퍼파라미터:  {'randomforestclassifier__max_depth': 20, 'randomforestclassifier__n_estimators': 150}
F1:  0.5731928383772006


In [23]:
pipe2 = clf2.best_estimator_

In [24]:
# 테스트 셋 예측 확률 얻기
y_pred_proba = pipe2.predict_proba(X_train)[:, 1] # 타겟이 1(True)일 확률 가져오기

In [25]:
# ROC curve 확인하기
# roc_curve(타겟값, prob of 1)
fpr, tpr, thresholds = roc_curve(y_train, y_pred_proba)

# threshold 최대값의 인덱스, np.argmax()
optimal_idx = np.argmax(tpr - fpr) # 재현율은 최대로, 위양성률은 최소로 하는 데이터의 인덱스 값 찾기
optimal_threshold = thresholds[optimal_idx] # 해당 값의 임계값 가져오기

print('idx:', optimal_idx, ', threshold:', optimal_threshold)

idx: 1306 , threshold: 0.31715270837494086


In [26]:
# 최적 임계값 적용
y_test_pred_proba = pipe.predict_proba(X_test)[:, 1] # 타겟이 1(True)일 확률 가져오기
y_test_pred = y_test_pred_proba > optimal_threshold

In [27]:
# 최종 결과 담은 데이터프레임 생성
d = {'Id':X_test.index, 'vacc_h1n1_f': y_test_pred}
submission = pd.DataFrame(data = d)
submission.head()

Unnamed: 0,Id,vacc_h1n1_f
0,0,False
1,1,False
2,2,False
3,3,False
4,4,False


In [31]:
# 제출용 파일 생성
# submission.to_csv('submission.csv', index=False)

In [None]:
# !kaggle competitions submit -c prediction-of-h1n1-vaccination -f submission.csv -m "Gyeong ho Ahn, Fourth Submission(using Random Forest) using GridSearchCV & threshold"