- smoking.csv와 competition_format에 있는 데이터에 전처리를 모두 적용하고자 전처리 코드 함수화 후 재업로드합니다.
- 추가하거나 수정하실 사항 있으시면 자유롭게 첨삭해주시면 감사하겠습니다.
- bmi 컬럼 추가를 위해 계산식을 포함시키고 주석처리 해뒀습니다 - 22/9/27
- wwi 컬럼 추가를 위해 계산식을 포함시키고 주석처리 해뒀습니다 - 22/9/28
    - 허리둘레를 체중의 제곱근으로 나눈 값
    - "waist(cm)"/sqrt("weight(kg)")
    - WWI 수치가 높으면 지방이 많은 비만 환자이고 수치가 낮으면 근육이 상대적으로 많은 정상 체중이라는 의미
- hearing(left), hearing(right) 피처 1, 2 => 1, 0으로 변환 - 22/9/28

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier # for modeling
from sklearn.preprocessing import LabelEncoder # for encoding
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler #for standardization
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import plot_confusion_matrix

In [None]:
def preprocessing(df_scaling, scaled_form = 'MinMaxScaler()'):
    """Clean, label-encode and scale one frame of the smoking dataset.

    Steps, in order: drop ``ID`` (when present) and ``oral``; label-encode
    ``gender`` and ``tartar``; recode the two ``hearing`` columns from
    1/2 to 1/0; scale every remaining non-categorical column.

    Args:
        df_scaling: DataFrame to process. The caller's frame is not
            modified; a new frame is returned.
        scaled_form: ``'StandardScaler()'`` selects ``StandardScaler``;
            any other value (including the default) selects
            ``MinMaxScaler``.

    Returns:
        A DataFrame with the scaled columns first, followed by the
        categorical columns ``gender``, ``tartar``, ``hearing(right)``,
        ``hearing(left)`` and ``dental caries``, indexed like the input.
        If only one column is left after dropping ``ID`` (e.g. a
        target-only frame), that frame is returned without further work.

    Raises:
        KeyError: if a feature frame lacks one of the columns named above
            or ``oral``.
    """
    # Drop the identifier when present. Frames without 'ID' continue
    # through the pipeline; previously they fell out of the function and
    # produced None.
    if 'ID' in df_scaling:
        df_scaling = df_scaling.drop('ID', axis=1)

    # A single remaining column means there is nothing to encode or scale.
    if len(df_scaling.columns) == 1:
        return df_scaling

    # drop() returns a new frame, so the assignments below never touch
    # the caller's data.
    df_scaling = df_scaling.drop('oral', axis=1)

    # Label-encode the categorical columns (one fresh encoder per column).
    for column in ('gender', 'tartar'):
        df_scaling[column] = LabelEncoder().fit_transform(df_scaling[column])

    # Recode hearing from 1/2 to 1/0: the value 2 becomes 0, others stay.
    for column in ('hearing(left)', 'hearing(right)'):
        df_scaling[column] = df_scaling[column].apply(
            lambda x: x - 2 if x == 2.0 else x
        )

    # Optional derived features, intentionally left disabled:
    #     # BMI = kg / m^2
    #     df_scaling['bmi'] = df_scaling['weight(kg)']/((df_scaling['height(cm)']*0.01)**2)
    #     # WWI = waist / sqrt(weight)
    #     df_scaling['wwi'] = df_scaling['waist(cm)']/(df_scaling['weight(kg)'].apply(np.sqrt))

    # Categorical columns bypass the scaler and are re-attached afterwards.
    # NOTE(review): any other column still in the frame is scaled as well,
    # including a 'smoking' target column if the caller left it in.
    cate_features = df_scaling[['gender', 'tartar', 'hearing(right)', 'hearing(left)', 'dental caries']]
    scaled_features = df_scaling.drop(cate_features.columns, axis=1)

    if scaled_form == 'StandardScaler()':
        scaler = StandardScaler()
    else:
        scaler = MinMaxScaler()

    # Keep the original index: with a fresh RangeIndex the column
    # assignment below would align on labels and fill NaN for any input
    # that is not 0..n-1 indexed (e.g. a train/test split).
    scaled = pd.DataFrame(
        scaler.fit_transform(scaled_features),
        columns=scaled_features.columns,
        index=scaled_features.index,
    )
    scaled[cate_features.columns] = cate_features

    return scaled

In [None]:
# Load the raw smoking dataset from the local CSV file.
smoking = pd.read_csv('./Smoking_raw/smoking.csv')

In [None]:
# scaled_data = preprocessing(smoking)

In [None]:
# Use the raw frame as-is: scaled_data is a second name for the same
# object as smoking (no copy is made and no preprocessing is applied).
scaled_data = smoking

In [None]:
# Target series: the 'smoking' column of scaled_data.
target = scaled_data['smoking']

In [None]:
# Hold out 10% of the rows for testing (fixed seed for reproducibility).
# The 'smoking' target is removed from the feature matrix; leaving it in
# would leak the label into the features.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_data.drop('smoking', axis=1),
    target,
    test_size=0.1,
    random_state=0,
)

# competition_format

In [None]:
# Competition-format split: read the four pre-split CSV files in the order
# x_train, x_test, y_train, y_test. This rebinds the split variables.
x_train, x_test, y_train, y_test = map(
    pd.read_csv,
    (
        './Smoking_raw/competition_format/x_train.csv',
        './Smoking_raw/competition_format/x_test.csv',
        './Smoking_raw/competition_format/y_train.csv',
        './Smoking_raw/competition_format/y_test.csv',
    ),
)

In [None]:
# x_train = preprocessing(x_train)
# x_test = preprocessing(x_test)
# y_train = preprocessing(y_train)
# y_test = preprocessing(y_test)

In [None]:
# Inspect x_test (bare expression; a notebook shows it as the cell output).
x_test

In [None]:
# Inspect y_train (bare expression; a notebook shows it as the cell output).
y_train

In [None]:
# Inspect y_test (bare expression; a notebook shows it as the cell output).
y_test

In [None]:
# Visualise x_train: histograms of its columns, 12 bins each, drawn on a
# 20x20 figure with the legend turned off.
x_train.hist(figsize = (20, 20), bins = 12, legend=False)

In [None]:
# Largest value in the 'eyesight(right)' column of x_train.
x_train['eyesight(right)'].max()

### 랜덤포레스트(Random Forest) 적용

In [None]:
# Dependent variable: the 'smoking' column of the raw frame.
y = smoking["smoking"]
# Independent variables: every other column of the raw frame.
x = smoking.drop(columns="smoking")

In [None]:
# Baseline random forest: 2000 trees with a fixed seed, fitted on the
# training split and scored by accuracy on the test split.
from sklearn.metrics import accuracy_score, precision_score, recall_score

rf_clf_2000 = RandomForestClassifier(n_estimators=2000, random_state=0)
rf_clf_2000.fit(x_train, y_train)

pred = rf_clf_2000.predict(x_test)
accuracy = accuracy_score(y_test, pred)
print('랜덤 포레스트 정확도: {0:.4f}'.format(accuracy))

In [None]:
# Accuracy / precision / recall of the latest predictions, printed in that
# order with the same labels.
for metric_label, metric_fn in (
    ("정확도 : ", accuracy_score),
    ("정밀도 : ", precision_score),
    ("재현율 : ", recall_score),
):
    print(metric_label, metric_fn(y_test, pred))

### GridSearchCV로 교차검증 및 하이퍼 파라미터 튜닝

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: tree count fixed at 2000; depth, leaf size and split size
# are varied.
params = dict(
    n_estimators=[2000],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# Seeded base estimator; both the forest and the search use all cores.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=0)

# 2-fold cross-validated exhaustive search over the grid above.
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(x_train, y_train)

print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

In [None]:
# Retrain, predict and evaluate with fixed hyperparameters and 2000 trees.
# The values are hard-coded here rather than read from grid_cv.best_params_.
rf_clf = RandomForestClassifier(
    n_estimators=2000,
    max_depth=10,
    min_samples_leaf=8,
    min_samples_split=8,
    random_state=0,
)
rf_clf.fit(x_train, y_train)

pred = rf_clf.predict(x_test)
tuned_accuracy = accuracy_score(y_test, pred)
print('예측 정확도: {0:.4f}'.format(tuned_accuracy))

### 개별 feature들의 중요도 시각화

In [None]:
# One-hot encode the gender column and append the indicators to x_train.
from sklearn.preprocessing import OneHotEncoder
oh_encoder = OneHotEncoder()
oh_labels = oh_encoder.fit_transform(x_train[["gender"]])
gender = oh_labels.toarray()

# OneHotEncoder emits its output columns in sorted category order, so the
# first column belongs to 'F' (or its label-encoded 0) and the second to
# 'M' (1). Labelling them ['M', 'F'] would swap the two indicators.
# NOTE(review): assumes gender holds exactly the two categories F and M;
# verify against oh_encoder.categories_.
# The frame reuses x_train's index so the assignments below align row by
# row even when x_train is not 0..n-1 indexed. Columns are then put back in
# the M, F display order.
test = pd.DataFrame(gender, columns=['F', 'M'], index=x_train.index)[['M', 'F']]

# NOTE(review): this adds columns to x_train in place. Any model fitted on
# x_train before this cell has not seen 'M' and 'F'.
x_train['M'] = test['M']
x_train['F'] = test['F']

In [None]:
# Inspect the one-hot gender frame (bare expression; a notebook shows it
# as the cell output).
test

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

ftr_import = rf_clf.feature_importances_
ftr_import_se = pd.Series(ftr_import, index=x_train.columns)
ftr_import_20 = ftr_import_se.sort_values(ascending=False)#[:20] # 중요도가 높은 20개의 피처만 추출

plt.figure(figsize=(8,6))
plt.title('Feature importances Top 20')
# x축은 중요도 값, y축은 ftr_top20 시리즈의 index
sns.barplot(x=ftr_import_20,y=ftr_import_20.index) # 가로막대 그래프
plt.show()

### ROC AUC Curve

In [None]:
# Positive-class probabilities feed both the ROC curve and the AUC.
# Computing the AUC from hard predict() labels collapses the scores to a
# single threshold, so the number in the legend would not be the area under
# the plotted curve.
positive_proba = rf_clf.predict_proba(x_test)[:, 1]
rf_roc_auc = roc_auc_score(y_test, positive_proba)
fpr, tpr, thresholds = roc_curve(y_test, positive_proba)

plt.figure()
plt.plot(fpr, tpr, label = 'AUC (area = %0.2f)' % rf_roc_auc)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1],'g--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.show()

### Confusion Matrix

In [None]:
# Confusion matrix of rf_clf on the test split, drawn with the Greens
# colormap; normalize="true" scales the counts over the true labels.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is the replacement.
# Confirm the installed version before relying on this cell.
plot_confusion_matrix(rf_clf, x_test, y_test, cmap = plt.cm.Greens, normalize = "true");