# Albiti 4주차 : 2021.05.31 ~ 2021.06.06
팀원: 김지환, 김채형, 이화영

1. 목표 : 당뇨 관련 지표들에 대한 이해 – open data 및 meta info.
2. 기한 : 2021.05.31 ~ 2021.06.06.
3. Task 1. Pima dataset을 사용한 분류모델 구축.
    - Kaggle에서 Pima dataset 다운로드.   
    - Accuracy 70% 이상, F1 70% 이상 모델 구축! 
4. Task 2. Higher and higher.
    - Accuracy 85% 이상, F1 85% 이상.

# 1. Load libraries and read the data

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore') 

In [None]:
# Read the Pima Indians Diabetes CSV from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer='../input/pima-indians-diabetes-database/diabetes.csv',
)

# 2. EDA

In [None]:
# df.info() prints its summary itself and returns None, so it is called on its
# own; passing its result to display() would render a stray "None".
df.info()
display(df.head())

In [None]:
# Descriptive statistics for the columns of df (shown as the cell output).
df.describe()

- outlier 확인

In [None]:
# Horizontal box plot of every column, used to look for outliers.
plt.style.use('ggplot')

f, ax = plt.subplots(figsize=(20, 12))

# Configure the freshly created axes directly instead of via pyplot state.
ax.set_facecolor('#fafafa')
ax.set_xlim(-1, 350)
ax.set_ylabel('Variables')
ax.set_title("Overview Data Set")
ax = sns.boxplot(data=df, orient='h', palette='Set2', ax=ax)

- 변수 상관관계 확인

In [None]:
# Annotated heat map of the pairwise correlations between the columns of df.
plt.figure(figsize=(15, 10))
sns.heatmap(
    data=df.corr(),
    annot=True,
    linewidths=0.5,
    cmap='Blues',
)

# 3. Preprocessing

## 3-1. Replace missing values(0)

In [None]:
# Columns in which this notebook treats a value of 0 as a missing measurement.
mv_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Mark those zeros as NaN so they can be counted and imputed.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df[mv_cols] = df[mv_cols].replace(0, np.nan)

In [None]:
# Visualise where the NaN values sit in the columns listed in mv_cols.
msno.matrix(df[mv_cols])

Missing values : 
* Insulin = 48.7% - 374
* SkinThickness = 29.56% - 227
* BloodPressure = 4.56% - 35
* BMI = 1.43% - 11
* Glucose = 0.65% - 5

In [None]:
# Fill each column's missing values with that column's median.
#
# The medians are deliberately NOT conditioned on 'Outcome'. Imputing a
# different value per class writes the label into the features (a filled-in
# value reveals the row's class), which inflates every downstream score and
# cannot be applied to new rows whose outcome is unknown.
#
# NOTE(review): the medians are still computed over all rows, including the
# ones that later end up in the test split. For a strictly clean evaluation,
# compute them on the training rows only.
df[mv_cols] = df[mv_cols].fillna(df[mv_cols].median())

In [None]:
# Completeness check per column: the missing values have now been replaced.
msno.bar(df=df[mv_cols], color=(0.1, 0.6, 0.8))

- 0 value 변경 후 boxplot 재확인

In [None]:
# Same overview box plot as before, redrawn after the zero values were replaced.
plt.style.use('ggplot')  # ggplot2-like visuals

f, ax = plt.subplots(figsize=(20, 12))

# Configure the freshly created axes directly instead of via pyplot state.
ax.set_facecolor('#fafafa')
ax.set_xlim(-.05, 350)
ax.set_ylabel('Variables')
ax.set_title("Overview Data Set")
ax = sns.boxplot(data=df, orient='h', palette='Set2', ax=ax)

# 3. Prepare Data

## 3-1. Scaling

In [None]:
# Separate the target column from the feature columns.
y = df['Outcome']
X = df.drop(columns='Outcome')

In [None]:
# Hold out 20% of the rows as a test set; the seed is fixed so the split is
# reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=6271,
)

# Report the shape of each of the four pieces on one line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler

def scaler(train, test, scaling):
    """Scale the train and test features with the scaler named by ``scaling``.

    The scaler is fitted on ``train`` only and the fitted scaler is then
    applied to ``test``.

    Args:
        train: Training features, passed to the scaler's ``fit_transform``.
        test: Test features, passed to the fitted scaler's ``transform``.
        scaling: Name of the scaler to use: ``'MinMaxScaler'``,
            ``'MaxAbsScaler'``, ``'StandardScaler'`` or ``'RobustScaler'``.

    Returns:
        A ``(train_scaled, test_scaled)`` tuple.

    Raises:
        ValueError: If ``scaling`` is not one of the supported names.
    """
    if scaling == 'MinMaxScaler':
        sc = MinMaxScaler()
    elif scaling == 'MaxAbsScaler':
        sc = MaxAbsScaler()
    elif scaling == 'StandardScaler':
        sc = StandardScaler()
    elif scaling == 'RobustScaler':
        sc = RobustScaler()
    else:
        # Fail fast with a clear message; without this branch an unknown name
        # would leave `sc` unbound and surface as an UnboundLocalError below.
        raise ValueError(
            f"Unknown scaling {scaling!r}; expected one of 'MinMaxScaler', "
            "'MaxAbsScaler', 'StandardScaler', 'RobustScaler'."
        )

    train_scaled = sc.fit_transform(train)
    test_scaled = sc.transform(test)

    return train_scaled, test_scaled

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, accuracy_score, f1_score

In [None]:
# Baseline comparison: for each scaler, fit an untuned random forest on the
# scaled training data and print its accuracy and F1 on the test data.
for scaling_type in ('MinMaxScaler', 'MaxAbsScaler', 'StandardScaler', 'RobustScaler'):
    X_train_scaled, X_test_scaled = scaler(X_train, X_test, scaling=scaling_type)

    # fit() returns the estimator itself, so construction and fitting are chained.
    rf_base = RandomForestClassifier(random_state=6271).fit(X_train_scaled, y_train)
    y_pred = rf_base.predict(X_test_scaled)

    print(f'======== {scaling_type} ========')
    print("Accuracy for Random Forest: ", accuracy_score(y_test, y_pred))
    print("F1 score for Random Forest: ", f1_score(y_test, y_pred), '\n')

- MinMaxScaler, MaxAbsScaler, StandardScaler, RobustScaler 중 MinMaxScaler 선택

In [None]:
# Produce the scaled train/test features used by every model below.
X_train_scaled, X_test_scaled = scaler(X_train, X_test, 'MinMaxScaler')

# 4. Machine Learning

## 4-1. RandomForest

In [None]:
# Hyperparameter tuning: exhaustive grid search over random-forest settings,
# scored by F1 with 5-fold cross-validation on the scaled training data.
from sklearn.model_selection import GridSearchCV

params = dict(
    n_estimators=[50, 100, 300, 500],
    max_depth=[4, 5, 6, 10],
    min_samples_leaf=[2, 3, 4, 5],
    min_samples_split=[2, 4, 6],
)

rf = RandomForestClassifier(random_state=6271)
grid_cv = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid_cv.fit(X_train_scaled, y_train)

# Last expression of the cell: shows the winning estimator.
grid_cv.best_estimator_

In [None]:
# Take the best random forest from the grid search, fit it on the scaled
# training data and score it on the test data.
rf_tuned = grid_cv.best_estimator_.fit(X_train_scaled, y_train)
pred = rf_tuned.predict(X_test_scaled)

print("Accuracy for Random Forest: ", accuracy_score(y_test, pred))
print("F1 score for Random Forest: ", f1_score(y_test, pred))

## 4-2. LGBM

In [None]:
import lightgbm as lgbm

# Search grid for the LightGBM classifier.
params = {
    'learning_rate': [0.01, 0.1, 0.2, 0.5],
    'n_estimators': [64, 128, 256, 512],
    'max_depth': [4, 6, 8],
}

# `verbose=-1` replaces `silent=True`: the `silent` argument was deprecated in
# LightGBM's scikit-learn API and later removed, and `verbose=-1` is the
# documented way to suppress training log output.
# NOTE(review): 'f1' does not appear among LightGBM's documented built-in
# metrics. Model selection here is driven by GridSearchCV's scoring='f1', so
# the `metric` setting is kept as-is but probably has no effect -- confirm.
model = lgbm.LGBMClassifier(random_state=401, verbose=-1, metric='f1', n_jobs=-1)

# 5-fold cross-validated grid search scored by F1 on the scaled training data.
grid_cv = GridSearchCV(model, param_grid=params, cv=5, scoring='f1', n_jobs=-1)
grid_cv.fit(X_train_scaled, y_train)

# Last expression of the cell: shows the winning estimator.
grid_cv.best_estimator_

In [None]:
# Take the best LightGBM model from the grid search, fit it on the scaled
# training data and score it on the test data.
lgbm_tuned = grid_cv.best_estimator_.fit(X_train_scaled, y_train)
pred = lgbm_tuned.predict(X_test_scaled)

print("Accuracy for LGBM: ", accuracy_score(y_test, pred))
print("F1 score for LGBM: ", f1_score(y_test, pred))

## 4-3. Voting (RF, LGBM, KNN, XGBoost)

In [None]:
import xgboost
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

In [None]:
# Soft-voting ensemble of the tuned random forest, the tuned LightGBM model,
# an XGBoost classifier and a k-nearest-neighbours classifier; the random
# forest's vote is weighted twice as heavily as each of the others.
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_tuned),
        ('lgbm_clf', lgbm_tuned),
        ('xgb', xgboost.XGBClassifier(n_estimators=300)),
        ('knn', KNeighborsClassifier()),
    ],
    voting='soft',
    weights=[2, 1, 1, 1],
)

# Only the KNN neighbour count is searched (values 1 through 29).
params = {'knn__n_neighbors': np.arange(1,30)}

# Despite the variable name, this is an exhaustive grid search.
random_search_vote = GridSearchCV(
    voting_clf,
    params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)

random_search_vote.fit(X_train_scaled, y_train)

In [None]:
# Take the best voting ensemble from the grid search, fit it on the scaled
# training data and score it on the test data.
sv_tuned = random_search_vote.best_estimator_
sv_tuned.fit(X_train_scaled, y_train)
pred = sv_tuned.predict(X_test_scaled)

print("Accuracy for Voting: ", accuracy_score(y_test, pred))
print("F1 score for Voting: ", f1_score(y_test, pred))

# 5. Best Result
- Accuracy:  0.9285714285714286
- F1 score:  0.8952380952380953

# 5. Best Result

- Accuracy:  0.9285714285714286
- F1 score:  0.8952380952380953