# 1. Pima Indian Diabetes

kaggle : https://www.kaggle.com/uciml/pima-indians-diabetes-database

wikipedia : https://en.wikipedia.org/wiki/Pima_people

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Silence every warning so library notices do not clutter the cell outputs.
# NOTE(review): this is a blanket filter; it also hides pandas/seaborn
# deprecation and chained-assignment warnings raised by later cells.
warnings.filterwarnings(action='ignore')

# Use seaborn's "darkgrid" theme for all figures drawn below.
sns.set(style = 'darkgrid')

In [None]:
# Load the diabetes CSV from the Kaggle input directory and show its
# (rows, columns) shape.
df = pd.read_csv('../input/pima-indians-diabetes-database/diabetes.csv')
df.shape

In [None]:
# Preview the first rows to check the column names and typical values.
df.head()

# 2. Exploratory Data Analysis

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Count NaN entries per column (isna is the same check as isnull).
df.isna().sum()

There are no values recorded as NaN.

In [None]:
# Summary statistics per column; the minimum row is what reveals the
# suspicious zeros discussed below.
df.describe()

There appear to be no missing values, but the minimum values of the Glucose, BloodPressure, SkinThickness, Insulin, and BMI variables are zero.

This means that missing values were recorded as 0 rather than NaN.

In [None]:
# Treat zeros as missing in every column except the first (Pregnancies) and
# the last (Outcome), then draw the NaN mask as a heatmap.
df_null = df.iloc[:, 1:-1].replace(0, np.nan)

fig, ax = plt.subplots(figsize = (15, 4))
sns.heatmap(df_null.isna(), cmap = 'Greys', ax = ax)
ax.set_title('Missing Value', size = 15)
plt.show()

There are a lot of missing values for insulin.

In [None]:
# Class balance of the target. The frame and column are passed by keyword
# (the same form the Pregnancies_high plot uses): recent seaborn releases
# treat the first positional argument as `data`, not as the x variable.
sns.countplot(data = df, x = 'Outcome')

plt.title('Count of Outcome')
plt.show()

We can see that class 0 is about twice as frequent as class 1.

In [None]:
# Separate the rows by target class so their distributions can be compared.
outcome = df['Outcome']
df_0 = df.loc[outcome.eq(0)]
df_1 = df.loc[outcome.eq(1)]
df_0.shape, df_1.shape

In [None]:
fig, axes = plt.subplots(nrows=4, ncols = 2, figsize = (15,15))

# One panel per column except the last one (Outcome), filled row by row.
# kdeplot draws the same density curve that the deprecated
# distplot(..., hist=False) produced; the labels and legend make it possible
# to tell which curve belongs to which class.
for i, col_name in enumerate(df.iloc[:,:-1]):
    ax = axes[i // 2, i % 2]
    sns.kdeplot(df_0[col_name], ax = ax, label = 'Outcome = 0')
    sns.kdeplot(df_1[col_name], ax = ax, label = 'Outcome = 1')
    ax.legend()
plt.show()

The higher the pregnancies, glucose, and BMI, the higher the risk of diabetes.

In the case of Pregnancies, the range is wide from 0 to 20.

It would be nice to categorize it for analysis.

In the case of Age, the proportion of people in their 20s and 30s is high, but it can be seen that the incidence rate increases after the age of 30.

Insulin is generally expected to be closely related to diabetes, but the graph does not show this,

because missing Insulin values are recorded as 0 rather than NaN.

In [None]:
# Boolean flag: True for rows with more than 6 pregnancies.
df['Pregnancies_high'] = df['Pregnancies'].gt(6)

# Outcome counts for each value of the flag.
sns.countplot(x = 'Pregnancies_high', hue = 'Outcome', data = df)
plt.xlabel('Pregnancies > 6')
plt.title('Diabetes , Pregnancies', size = 15)
plt.show()

### Replace Insulin

In [None]:
# Zeros in Insulin stand for missing measurements, so convert them to NaN.
# The result is assigned back to the column: calling
# replace(..., inplace=True) on the df['Insulin'] selection modifies a
# temporary object under pandas Copy-on-Write and would leave df unchanged.
df['Insulin'] = df['Insulin'].replace(0, np.nan)

# Mean and median Insulin per outcome class, computed on the non-missing values.
df.groupby('Outcome')['Insulin'].agg(['mean', 'median'])

People with diabetes have higher insulin levels than those who do not.

In [None]:
# Fill each missing Insulin value with the median Insulin of the row's own
# Outcome group. Assigned back to the column rather than using
# fillna(..., inplace=True) on the df['Insulin'] selection, which does not
# update df under pandas Copy-on-Write.
# NOTE(review): the fill value is derived from Outcome, the column later used
# as the prediction target, so the imputed feature carries label information
# into both the train and test rows.
df['Insulin'] = df['Insulin'].fillna(df.groupby('Outcome')['Insulin'].transform('median'))

### Correlation

In [None]:
# Pairwise correlation of every column, shown as an annotated heatmap on a
# fixed -1..1 colour scale.
df_matrix = df
df_corr = df_matrix.corr()

fig, ax = plt.subplots(figsize = (12, 6))
sns.heatmap(df_corr, vmin = -1, vmax = 1, annot = True, cmap = 'coolwarm', ax = ax)
plt.show()

Outcome and Glucose have the strongest correlation.

In [None]:
# lmplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=(10, 4)) only produced an extra empty figure.
# The same 10 x 4 size is requested through height and aspect instead
# (width = height * aspect).
sns.lmplot(data = df, x = 'Insulin', y = 'Glucose', hue = 'Outcome',
           height = 4, aspect = 2.5)
plt.title('Glucose, Insulin', size = 15)
plt.show()

Because the missing Insulin values were replaced with the median of each outcome group, many points are concentrated around 102 and 169.

In [None]:
# Flag rows where both Glucose (< 100) and Insulin (<= 102.5) are low.
glucose_is_low = df['Glucose'].lt(100)
insulin_is_low = df['Insulin'].le(102.5)
df['low_glu_insulin'] = glucose_is_low & insulin_is_low

# Cross-tabulate the flag against the target.
pd.crosstab(df['Outcome'], df['low_glu_insulin'])

### Outlier

In [None]:
plt.figure(figsize = (15,2))
# Pass the series as x= explicitly so the violin is drawn horizontally in the
# wide, short figure; recent seaborn releases treat a positional first
# argument as `data` rather than as the x variable.
sns.violinplot(x = df['Insulin'])

plt.title('Violinplot of Insulin')
plt.show()

In [None]:
# Keep only rows whose Insulin is below 600. .copy() makes the result an
# independent frame, so the columns added to df afterwards are not written
# into a slice of the unfiltered data.
df = df[df['Insulin'] < 600].copy()
df.head()

### Log

In [None]:
# log(1 + x) transform of Insulin; log1p is the dedicated NumPy routine for it.
df['Insulin_log'] = np.log1p(df['Insulin'])

# Density histogram with a KDE overlay: the replacement for the deprecated
# sns.distplot, which drew the same two layers.
plt.figure(figsize = (10,4))
sns.histplot(df['Insulin_log'], kde = True, stat = 'density')
plt.show()

### Split

In [None]:
from sklearn.model_selection import train_test_split

# Predictor columns: raw measurements plus the engineered flags and the
# log-transformed Insulin (raw Pregnancies and raw Insulin are left out).
feature_columns = [
    'Glucose', 'BloodPressure', 'SkinThickness',
    'BMI', 'DiabetesPedigreeFunction', 'Age',
    'Pregnancies_high', 'Insulin_log', 'low_glu_insulin',
]
X = df[feature_columns]
y = df['Outcome']

# 70 / 30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, random_state = 42
)

# Report the shapes of both partitions.
print('train 개수: ', X_train.shape, y_train.shape)
print('test 개수: ', X_test.shape, y_test.shape)

# 3. Modeling

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier

# Candidate classifiers, each constructed with the same fixed random_state.
estimators = [
    estimator_cls(random_state = 42)
    for estimator_cls in (
        DecisionTreeClassifier,
        RandomForestClassifier,
        GradientBoostingClassifier,
    )
]
estimators

In [None]:
# Seeded generator so the candidate hyper-parameter values are identical on
# every run; the unseeded np.random calls produced a different search grid
# each time the notebook was executed.
rng = np.random.RandomState(42)

max_depth = rng.randint(2, 20, 20)         # 20 integer depths in [2, 20)
max_features = rng.uniform(0.5, 1.0, 20)   # 20 feature fractions in [0.5, 1.0)

param_distributions = {
    'max_depth' : max_depth,
    'max_features' : max_features}
param_distributions

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# One row per estimator, in this order:
# [name, best_params, best_estimator, best CV score, test score, cv_results]
results = []

# Seeded generator so the n_estimators candidates are the same on every run.
search_rng = np.random.RandomState(42)

for estimator in estimators:
    # Work on a per-estimator copy of the shared search space. Adding
    # 'n_estimators' to param_distributions itself left the key in place, so
    # re-running this cell passed it to DecisionTreeClassifier, which has no
    # such parameter.
    search_space = dict(param_distributions)
    if estimator.__class__.__name__ != 'DecisionTreeClassifier':
        search_space['n_estimators'] = search_rng.randint(100, 1000, 10)

    clf = RandomizedSearchCV(estimator,
                             param_distributions = search_space,
                             n_iter = 100,
                             scoring = 'accuracy',
                             n_jobs = -1,
                             cv = 5,
                             verbose = 2,
                             random_state = 42)  # reproducible candidate sampling

    clf.fit(X_train, y_train)
    results.append([
        estimator.__class__.__name__,
        clf.best_params_,
        clf.best_estimator_,
        clf.best_score_,              # mean cross-validated accuracy of the best candidate
        clf.score(X_test, y_test),    # accuracy of the best candidate on the held-out split
        clf.cv_results_,
    ])

In [None]:
# Tabulate the search results, naming the columns by position.
cv_columns = ['model', 'best_params', 'best_estimator', 'train_score', 'test_score', 'cv_result']
df_cv = pd.DataFrame(results).set_axis(cv_columns, axis = 'columns')
df_cv

Wow! RandomForestClassifier is the best model!

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Pick the tuned random forest by its name instead of by row position 1, so
# the selection does not silently change if the estimator list is reordered.
is_random_forest = df_cv['model'] == 'RandomForestClassifier'
model = df_cv.loc[is_random_forest, 'best_estimator'].iloc[0]

# NOTE(review): with RandomizedSearchCV's default refit=True the stored
# best_estimator is already fitted on the training data, so this fit is
# likely redundant — confirm before removing.
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

In [None]:
# Confusion matrix of the selected model on the test split, wrapped in a
# DataFrame for display (scikit-learn puts true labels on rows and predicted
# labels on columns).
pd.DataFrame(confusion_matrix(y_test, y_predict))

In [None]:
# Text report of per-class metrics for the predictions currently held in y_predict.
print(classification_report(y_test, y_predict))

In [None]:
# LightGBM with fixed hyper-parameters (no search), for comparison.
lgbm_params = {'n_estimators': 100, 'num_leaves': 61, 'random_state': 42}
lgbm = LGBMClassifier(**lgbm_params)
lgbm.fit(X_train, y_train)

# y_predict now holds the LightGBM predictions, replacing the earlier ones.
y_predict = lgbm.predict(X_test)

pd.DataFrame(confusion_matrix(y_test, y_predict))

In [None]:
# Classification report for the LightGBM predictions held in y_predict;
# stored in lgbm_report and printed.
lgbm_report = classification_report(y_test, y_predict)
print(lgbm_report)

In [None]:
# Feature importances reported by the selected model, one bar per training column.
feature_importance = model.feature_importances_

fig, ax = plt.subplots(figsize = (10, 4))
sns.barplot(x = feature_importance, y = X_train.columns, ax = ax)
ax.set_title('Feature Importance', size = 15)
plt.show()

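We can see that insulin has a great deal of influence. Note that the missing Insulin values were filled with outcome-specific medians, so part of this importance may reflect information taken from the target.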

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Keep only column 1 of predict_proba (the score for the second class) for
# each model; these scores feed the ROC curves and AUC values below.
pred_proba = model.predict_proba(X_test)[:, 1] 
pred_proba_lgbm = lgbm.predict_proba(X_test)[:,1]

def roc_curve_plot(y_test, pred_proba):
    """Draw the ROC curves of both models on the current matplotlib axes.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    pred_proba : array-like
        Positive-class scores, plotted as the 'Random Forest' curve.

    Notes
    -----
    The 'LightGBM' curve is computed from the notebook-level variable
    ``pred_proba_lgbm``, which must be defined before this function is called.
    The legend is not drawn here; call ``plt.legend()`` afterwards.
    """
    # The thresholds returned by roc_curve are not needed for plotting.
    fprs, tprs, _ = roc_curve(y_test, pred_proba)
    fprs2, tprs2, _ = roc_curve(y_test, pred_proba_lgbm)
    plt.plot(fprs, tprs, label = 'Random Forest')
    plt.plot(fprs2, tprs2, label = 'LightGBM')
    plt.plot([0,1],[0,1], 'k--', label = "Random")
    plt.xlim(0,1); plt.ylim(0,1)
    start, end = plt.xlim()
    # linspace includes the right edge, so the 1.0 tick is drawn as well
    # (np.arange(start, end, 0.1) stopped at 0.9).
    plt.xticks(np.round(np.linspace(start, end, 11), 2))
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    
# Draw the ROC curves, then add the legend for the labels set inside
# roc_curve_plot and render the figure.
roc_curve_plot(y_test, pred_proba)
plt.legend()
plt.show()

In [None]:
# ROC AUC on the test split for each model's positive-class scores.
for auc_label, scores in (('Random Forest AUC: ', pred_proba),
                          ('LightGBM AUC: ', pred_proba_lgbm)):
    print(auc_label, roc_auc_score(y_test, scores))