In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sn # data visualization
from matplotlib import pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Data

In [None]:
# Load the Pima Indians diabetes table into a DataFrame.
data_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/pima-indians-diabetes-database/diabetes.csv",
)

In [None]:
# Preview the first five rows of the raw table.
data_df.head(n=5)

In [None]:
# Print a summary of the raw table (columns, dtypes, non-null counts).
# NOTE: a later cell treats 0 as "missing" in several columns, so the
# non-null counts shown here do not reveal those gaps.
data_df.info()

# Statistical Analysis

In [None]:
# One histogram per column of the raw (unbalanced, unimputed) data.
data_df.hist(figsize=(15, 15))

In [None]:
# Balance the two Outcome classes by down-sampling each to the size of the
# smaller one, so neither class dominates training or the F1 scores.
#
# Changes from the earlier version of this cell:
#   * classes are selected explicitly instead of tuple-unpacking a groupby
#     and indexing `[1]`, which silently depended on there being exactly two
#     groups in sorted order;
#   * the per-class sample size is derived from the data instead of the
#     hard-coded 268;
#   * sampling is seeded so "Restart & Run All" reproduces the same subset.
# Assumes Outcome is coded 0 (negative) / 1 (positive).
data_df_neg = data_df[data_df['Outcome'] == 0]
data_df_pos = data_df[data_df['Outcome'] == 1]

n_per_class = min(len(data_df_neg), len(data_df_pos))

data_df_filtered = pd.concat([
    data_df_neg.sample(n_per_class, random_state=42),
    data_df_pos.sample(n_per_class, random_state=42),
])

In [None]:
# In these measurement columns a value of 0 is treated as "not recorded".
# Convert those zeros to NaN, then impute: mean for Glucose and BloodPressure,
# median for SkinThickness, Insulin and BMI.
#
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# A single DataFrame-level `fillna` replaces the per-column
# `df[col].fillna(..., inplace=True)` calls: that pattern is chained
# assignment, which raises warnings in pandas 2.x and no longer updates the
# parent frame under Copy-on-Write.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data_df_filtered[zero_as_missing_cols] = (
    data_df_filtered[zero_as_missing_cols].replace(0, np.nan)
)

# Fill values are computed after the zero -> NaN conversion so the zeros do
# not drag the statistics down (mean/median skip NaN by default).
data_df_filtered = data_df_filtered.fillna({
    'Glucose': data_df_filtered['Glucose'].mean(),
    'BloodPressure': data_df_filtered['BloodPressure'].mean(),
    'SkinThickness': data_df_filtered['SkinThickness'].median(),
    'Insulin': data_df_filtered['Insulin'].median(),
    'BMI': data_df_filtered['BMI'].median(),
})

In [None]:
# One histogram per column of the balanced, imputed data.
data_df_filtered.hist(figsize=(15, 15))

In [None]:
# Summary statistics for every numeric column of the balanced, imputed data.
data_df_filtered.describe()

In [None]:
# Pairwise Pearson correlation between all columns (Pearson is the default).
data_df_filtered.corr(method='pearson')

# Data Visualization

In [None]:
# Joint KDE of Glucose vs BMI, one density per Outcome class.
sn.jointplot(
    x='Glucose',
    y='BMI',
    hue='Outcome',
    kind='kde',
    data=data_df_filtered,
)

In [None]:
# Glucose distribution per Outcome class, with a KDE curve overlaid.
sn.histplot(
    x='Glucose',
    hue='Outcome',
    kde=True,
    data=data_df_filtered,
)

In [None]:
# Glucose vs BMI scatter coloured by the true Outcome.
sn.scatterplot(
    x='Glucose',
    y='BMI',
    hue='Outcome',
    data=data_df_filtered,
)

# Transform Data

In [None]:
# Names of the feature columns fed to the models (despite the name, these are
# the inputs, not the target). A smaller subset that could be used instead:
# Glucose, BloodPressure, Insulin, BMI, Age.
LABELS = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'Insulin',
    'BMI',
    'Age',
    'DiabetesPedigreeFunction',
    'SkinThickness',
]

In [None]:
# Build the feature matrix / target, split into train and test, then
# standardise the features.
#
# Changes from the earlier version of this cell:
#   * `train_size=0.2` trained on only 20% of the rows and evaluated on 80%;
#     the split is now the conventional 80% train / 20% test;
#   * the scaler is fit on the training rows only, so test-set statistics do
#     not leak into the preprocessing;
#   * the split is seeded and stratified on the target for reproducibility.
X_raw = data_df_filtered[LABELS].values
y = data_df_filtered['Outcome']

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, shuffle=True, stratify=y, random_state=42
)

scaler = StandardScaler()
scaler.fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# `X` stays the scaled version of *all* rows (same row order as
# data_df_filtered) because later cells predict on it to fill dataframe
# columns for plotting.
X = scaler.transform(X_raw)

# Machine Learning
## kMeans

In [None]:
# Unsupervised baseline: cluster the training features into two groups and
# use the cluster assignment as a class prediction.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42).fit(X_train)

# KMeans cluster ids are arbitrary: cluster 0 is not necessarily Outcome 0.
# Scoring raw cluster ids against Outcome can therefore report the F1 of the
# inverted labelling. Map each cluster to the majority Outcome among its
# training members before predicting/scoring.
# Assumes Outcome is coded 0/1 (the mean of a cluster is its positive rate).
y_train_values = np.asarray(y_train)
cluster_to_outcome = np.array([
    int(y_train_values[kmeans.labels_ == cluster_id].mean() >= 0.5)
    for cluster_id in range(kmeans.n_clusters)
])

data_df_filtered['kMeansPrediction'] = cluster_to_outcome[kmeans.predict(X)]
sn.scatterplot(data=data_df_filtered, x='Glucose', y='BMI', hue='kMeansPrediction')
print("kMeans f1 score:\t\t", f1_score(y_test, cluster_to_outcome[kmeans.predict(X_test)]))

## Random Forest

In [None]:
# Hyperparameter search
param_score = []
param_arr = []
for num_estimators in range(100,300,10):
    clf = RandomForestClassifier(num_estimators)
    clf.fit(X_train[0:50],y_train[0:50]) # train
    param_score.append(clf.score(X_train,y_train))
    param_arr.append(num_estimators)
best_estimator = max(param_arr)
sn.scatterplot(x=param_arr,y=param_score)

In [None]:
# Fit the final Random Forest on the full training split using the tree count
# chosen by the search above; seeded so re-runs give the same model.
clf = RandomForestClassifier(n_estimators=best_estimator, random_state=42)
clf.fit(X_train, y_train)  # train

# Column 1 of predict_proba corresponds to clf.classes_[1], i.e. the
# probability of the *positive* outcome (the earlier comment said "negative").
# NOTE: these columns cover every row of data_df_filtered, training rows
# included, so they are for visualisation, not for scoring.
data_df_filtered['randomForestProba'] = clf.predict_proba(X)[:, 1]
data_df_filtered['randomForestPrediction'] = clf.predict(X)
sn.scatterplot(data=data_df_filtered, x='Glucose', y='BMI', hue='randomForestPrediction')
print("Random Forest f1 score:\t\t", f1_score(y_test, clf.predict(X_test)))

## Boosting

In [None]:
# Feature names for the stacked model: the original features plus the Random
# Forest's positive-class probability.
LABELS2 = LABELS + ['randomForestProba']

# Full-dataset input for the boosted model.
# Fix: this used to be `data_df_filtered[LABELS2].values`, which holds the
# *unscaled* feature columns, while the boosted model is trained on
# standardised features plus the forest probability. Build X2 from the scaled
# matrix `X` instead so it lives in the same feature space as the training
# data. Rows of `X` are in the same order as data_df_filtered.
X2 = np.concatenate(
    (X, data_df_filtered['randomForestProba'].values.reshape(-1, 1)),
    axis=1,
)

In [None]:
# Append the Random Forest's class-1 probability as an extra, final feature
# column to the train and test matrices used by the boosted model.
# NOTE(review): the forest was fit on X_train, so its probabilities on the
# training rows are likely optimistic — consider out-of-fold predictions.
rf_train_proba = clf.predict_proba(X_train)[:, 1][:, np.newaxis]
X_train_boosted = np.concatenate((X_train, rf_train_proba), axis=1)

rf_test_proba = clf.predict_proba(X_test)[:, 1][:, np.newaxis]
X_test_boosted = np.concatenate((X_test, rf_test_proba), axis=1)

# Show the first stacked training row as a quick sanity check.
X_train_boosted[0]

In [None]:
# AdaBoost with the Random Forest as its base learner, trained on the stacked
# training matrix (features + forest probability).
boost = AdaBoostClassifier(clf).fit(X_train_boosted, y_train)

# Predict for every row so the result can be plotted against the dataframe.
data_df_filtered['BoostPrediction'] = boost.predict(X2)
sn.scatterplot(
    x='Glucose',
    y='BMI',
    hue='BoostPrediction',
    data=data_df_filtered,
)

# Score on the held-out test rows only.
boost_test_pred = boost.predict(X_test_boosted)
print("Boosting f1 score:\t", f1_score(y_test, boost_test_pred))

## Predictions

In [None]:
# Preview the first five rows, including the prediction columns added above.
data_df_filtered.head(n=5)

## Visualize the error region

In [None]:
# Flag each row by whether the boosted model's prediction matches the true
# Outcome, then colour the Glucose/BMI scatter by that flag.
data_df_filtered['predictedCorrectly'] = (
    data_df_filtered['BoostPrediction'].eq(data_df_filtered['Outcome'])
)
sn.scatterplot(
    x='Glucose',
    y='BMI',
    hue='predictedCorrectly',
    data=data_df_filtered,
)
# The errors tend to fall where the two classes were seen to overlap.

In [None]:
# Per-class precision / recall / F1 of the boosted model on the test split.
boost_report = classification_report(y_test, boost.predict(X_test_boosted))
print(boost_report)