In [None]:
# Kaggle mounts input data read-only under "../input/".
# Running this cell (click run or press Shift+Enter) prints the path of every file found there.

import os

for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Up to 20GB can be written to the current directory (/kaggle/working/); it is preserved as output when a version is created with "Save & Run All".
# Temporary files can also be written to /kaggle/temp/, but they are not saved outside the current session.

Goal: Exploratory data analysis (EDA) of the Pima Indians Diabetes Database before predicting the onset of diabetes based on diagnostic measures.

Data Variables:
* Pregnancies: Number of times pregnant
* Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
* BloodPressure: Diastolic blood pressure (mm Hg)
* SkinThickness: Triceps skin fold thickness (mm)
* Insulin: 2-Hour serum insulin (mu U/ml)
* BMI: Body mass index (weight in kg/(height in m)^2)
* DiabetesPedigreeFunction: Diabetes pedigree function
* Age (years)
* Outcome: Class variable (0 or 1)

In [None]:
# import libraries
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Basic EDA

In [None]:
# Load the Pima Indians Diabetes dataset and preview its first rows
file = "/kaggle/input/pima-indians-diabetes-database/diabetes.csv"
diabetes = pd.read_csv(filepath_or_buffer=file)
diabetes.head(n=5)

In [None]:
# Dimensions of the loaded frame as (rows, columns)
diabetes.shape

In [None]:
# Summary statistics for every column; include="all" asks pandas to describe all dtypes, not just numeric ones
diabetes.describe(include="all")

# 2. Checking and handling missing data

In [None]:
# Count the null (NaN) entries in each column
diabetes.isna().sum()

In [None]:
# Count how many entries in each column are exactly zero
diabetes.eq(0).sum(axis=0)

In [None]:
# Replace zeros with np.nan in columns 1-5 so they are treated as missing values.
# (Assumes the CSV column order matches the variable list: Glucose ... BMI - TODO confirm.)
# The result is assigned back to the frame: calling replace(..., inplace=True) on a
# column selection operates on an intermediate object and is not guaranteed to
# update `diabetes` (pandas warns about it, and it is a no-op under Copy-on-Write).
zero_as_missing_cols = diabetes.columns[1:6]
diabetes[zero_as_missing_cols] = diabetes[zero_as_missing_cols].replace(0, np.nan)

import missingno as msno
# Check the missing-data type (MCAR, MAR or MNAR): sort rows by Insulin and plot the
# nullity matrix. The sorted frame gets its own name so the built-in `sorted`
# function is not shadowed for the rest of the notebook.
diabetes_by_insulin = diabetes.sort_values("Insulin")
msno.matrix(diabetes_by_insulin)

In [None]:
### Check whether missingness is correlated between variables
# msno.heatmap(diabetes.sort_values("Insulin"))
# msno.dendrogram(diabetes.sort_values("Insulin")) <-- The dendrogram allows you to more fully correlate variable completion

In [None]:
# Glucose and BMI's missing data appear to be missing completely at random,
# so replace their null values with the column mean.
# fillna on the selected columns is assigned back to the frame; the previous
# `diabetes[col].replace(np.nan, ..., inplace=True)` acted on a column selection
# and is not guaranteed to modify `diabetes`.
mean_imputed_cols = diabetes.columns[[1, 5]]
diabetes[mean_imputed_cols] = diabetes[mean_imputed_cols].fillna(diabetes[mean_imputed_cols].mean())

# 'BloodPressure', 'SkinThickness', 'Insulin' are missing big portions of their data points,
# so the mean might not be the best method for these variables.
# Impute with the most frequent value instead.
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy="most_frequent")
columns = ['BloodPressure', 'SkinThickness', 'Insulin']
# SimpleImputer fills each column independently, so one call on the three-column
# selection gives the same values as fitting column by column, without reshaping.
diabetes[columns] = imp.fit_transform(diabetes[columns])


In [None]:
# Column dtypes and non-null counts, to confirm the imputation above left no missing values
diabetes.info()

In [None]:
# Add range (bin) columns for Glucose, BMI and Age.
#
# pd.cut is used instead of np.select with string choices: np.select falls back to
# an integer default (0) for rows that match no condition, and recent NumPy versions
# refuse to combine that integer with a string choice list (TypeError). pd.cut also
# returns an ordered categorical directly, so no separate astype step is needed.
# Values that fall outside every bin become NaN.

# Labels for each bin, in ascending order
Glucose_values = ["0-50", "51-100", "101-150", "151-200"]
BMI_values = ["0-20", "21-40", "41-60", "61-80"]
Age_values = ["20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80-89"]

# Bin edges. Glucose/BMI bins are closed on the right: (-inf, 50], (50, 100], ...
# which reproduces the original "<= 50", "> 50 and <= 100", ... conditions.
Glucose_edges = [-np.inf, 50, 100, 150, 200]
BMI_edges = [-np.inf, 20, 40, 60, 80]
# Age bins are closed on the left: [20, 30), [30, 40), ... matching ">= 20 and < 30", ...
Age_edges = [20, 30, 40, 50, 60, 70, 80, 90]

# Create the ordered categorical range columns
diabetes["GlucoseRange"] = pd.cut(diabetes["Glucose"], bins=Glucose_edges, labels=Glucose_values, right=True)
diabetes["BMIRange"] = pd.cut(diabetes["BMI"], bins=BMI_edges, labels=BMI_values, right=True)
diabetes["AgeRange"] = pd.cut(diabetes["Age"], bins=Age_edges, labels=Age_values, right=False)

# Print the column dtypes to confirm the new range columns are categorical
print(diabetes.dtypes)

# 3. EDA: Visualization

In [None]:
# Plot the distribution of each variable as a grid of histograms on a 10x10 inch figure.
diabetes.hist(figsize=(10,10))

In [None]:
# Check the correlation between the numeric variables.
# The categorical *Range columns are still part of `diabetes` at this point and
# cannot be converted to floats, so corr() is restricted to numeric columns
# (pandas >= 2.0 raises on non-numeric columns instead of silently dropping them).
matrix = diabetes.select_dtypes(include="number").corr()
# Mask the upper triangle (including the diagonal): the matrix is symmetric
mask = np.triu(np.ones_like(matrix, dtype=bool))
cmap = sns.diverging_palette(220, 25, s=80, n=9, as_cmap=True, center="light")
plt.figure(figsize=(8, 6))
# `linewidths` is the keyword documented by seaborn for the width of the cell borders
sns.heatmap(matrix, mask=mask, annot=True, cmap=cmap, square=True, fmt='.2f', linewidths=.2, center=0, vmin=-0.15, vmax=0.55)

In [None]:
# Cross tab of Glucose range vs BMI range: each cell sums Outcome for that
# combination, normalised over the whole table, with "Total" margins.
cross = pd.crosstab(
    index=diabetes["GlucoseRange"],
    columns=diabetes["BMIRange"],
    values=diabetes["Outcome"],
    aggfunc="sum",
    margins=True,
    margins_name="Total",
    normalize="all",
)
sns.heatmap(data=cross, cmap="BuGn", annot=True, cbar=False)

# Keep the y tick labels horizontal so they stay readable
plt.yticks(rotation=0)

In [None]:
# Cross tab of Glucose range vs Age range: each cell sums Outcome for that
# combination, normalised over the whole table, with "Total" margins.
cross = pd.crosstab(
    index=diabetes["GlucoseRange"],
    columns=diabetes["AgeRange"],
    values=diabetes["Outcome"],
    aggfunc="sum",
    margins=True,
    margins_name="Total",
    normalize="all",
)
sns.heatmap(data=cross, cmap="BuGn", annot=True, cbar=False)

# Keep the y tick labels horizontal so they stay readable
plt.yticks(rotation=0)

In [None]:
# Add Gaussian noise (jitter, sd = 2) to SkinThickness, Insulin and BloodPressure so
# that points sharing the same value do not sit exactly on top of each other.
# The generator is seeded so the figure is identical on every run.
JITTER_SEED = 21
rng = np.random.default_rng(JITTER_SEED)
SkinThickness_jitter = diabetes.SkinThickness + rng.normal(0, 2, len(diabetes.SkinThickness))
insulin_jitter = diabetes.Insulin + rng.normal(0, 2, len(diabetes.Insulin))
BloodPressure_jitter = diabetes.BloodPressure + rng.normal(0, 2, len(diabetes.BloodPressure))

# Plot multiple subplots to validate the correlations seen in the heatmap
fig, axs = plt.subplots(3, 2, figsize=(10,10))
axs[0, 0].plot(diabetes.Age, diabetes.Pregnancies, marker="o", linestyle="", markersize=1.2, alpha=.8)
axs[0, 0].set_title('Age vs Pregnancies')
axs[0, 1].plot(diabetes.BMI, SkinThickness_jitter, marker="o", linestyle="", markersize=1.1, alpha=.9)
axs[0, 1].set_title('BMI vs SkinThickness')
axs[1, 0].plot(diabetes.Glucose, insulin_jitter, marker="o", linestyle="", markersize=1.1, alpha=.9)
axs[1, 0].set_title('Glucose vs Insulin')
axs[1, 1].plot(diabetes.BMI, BloodPressure_jitter, marker="o", linestyle="", markersize=1.2, alpha=.8)
axs[1, 1].set_title('BMI vs BloodPressure')
axs[2, 0].plot(insulin_jitter, SkinThickness_jitter, marker="o", linestyle="", markersize=1.2, alpha=.8)
axs[2, 0].set_title('Insulin vs SkinThickness')
# Only five of the six panels are used; hide the empty one
axs[2, 1].axis("off")
fig.tight_layout()

In [None]:
# Remove the helper range columns before the numeric analysis that follows.
# errors="ignore" makes the cell safe to re-run: without it a second execution
# raises KeyError because the columns are already gone.
diabetes = diabetes.drop(columns=['GlucoseRange', 'BMIRange', 'AgeRange'], errors="ignore")

In [None]:
# Outcome vs other features: one box plot per column for the first eight columns,
# laid out on a 3x3 grid.
plt.figure(figsize=(15, 15))
for plotnumber, col in enumerate(diabetes.columns[0:8], start=1):
    ax = plt.subplot(3, 3, plotnumber)
    sns.boxplot(x="Outcome", y=col, data=diabetes)
    plt.ylabel(col, fontsize=15)
plt.show()

In [None]:
# Handle outliers with the Z score: keep only rows where every numeric column
# lies within `threshold` standard deviations of its column mean.
from scipy import stats
threshold = 3
# Restrict to numeric columns so the cell also works if non-numeric columns are
# still present; compute on a float array so `z` is a plain ndarray.
numeric_values = diabetes.select_dtypes(include="number").to_numpy(dtype=float)
z = np.abs(stats.zscore(numeric_values))
# Use the named threshold (previously the literal 3 was repeated here)
diabetes_cleaned = diabetes[(z < threshold).all(axis=1)]
# Compare the shape after and before removing outliers
print(diabetes_cleaned.shape)
print(diabetes.shape)

### Put it all together: pairplot

In [None]:
# Pairwise relationships between the selected features, coloured by Outcome
sns.pairplot(
    data=diabetes,
    vars=["Glucose", "BMI", "BloodPressure", "Insulin", "Pregnancies", "Age"],
    hue="Outcome",
)

# 4. Build models

In [None]:
# Separate the cleaned dataset into the target vector and the feature matrix
y = diabetes_cleaned["Outcome"].to_numpy()
X = diabetes_cleaned.drop(columns=["Outcome"]).to_numpy()

In [None]:
# First model: k-nearest neighbours (KNN)
# Import libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

# Set up the individual pipeline components
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
knn = KNeighborsClassifier()

# Pipeline steps: impute -> standardise -> classify.
# The `knn` instance created above is the one placed in the pipeline
# (previously it was created but a second, separate classifier was used).
steps = [('imputation', imp),
         ("scaler", StandardScaler()),
         ("knn", knn)
        ]
pipeline = Pipeline(steps)

# Split the dataset into training and testing sets (70/30, stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=21)

# Candidate values for n_neighbors: 1..49
parameters = {"knn__n_neighbors": np.arange(1, 50)}

# Search for the best n_neighbors with 5-fold cross-validation on the training set
knn_cv = GridSearchCV(pipeline, param_grid=parameters, cv=5)
knn_cv.fit(X_train, y_train)
print("The best param: " + str(knn_cv.best_params_))  # an earlier run reported 'n_neighbors': 17

# Evaluate the best model on the held-out test set.
# No `scoring` was passed to GridSearchCV, so score() is the classifier's default
# metric (mean accuracy), not R squared.
y_pred = knn_cv.predict(X_test)
score = knn_cv.score(X_test, y_test)
print("The Knn score is " + str(score))

# Confusion matrix and per-class precision/recall/F1
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
# NOTE: everything in this cell is wrapped in a bare triple-quoted string, so none
# of it executes; it is kept only as a record of alternatives that were tried
# (feature scaling, a scatter-matrix plot, and IQR-based outlier removal).
"""
# Scale features
from sklearn.preprocessing import scale
X_scaled = scale(X)

#Visual EDA, similar to pairplot
#_ = pd.plotting.scatter_matrix(X, c=y, figsize=[15,15], s=150, marker=".", alpha=0.2)

# Handle outliers with IQR
q1 = diabetes.iloc[:, 1:8].quantile(0.25)
q3 = diabetes.iloc[:, 1:8].quantile(0.75)
IQR = q3 - q1
print(IQR)
outliers = (diabetes < q1 - 1.5 * IQR) | (diabetes > q3 + 1.5 * IQR)
diabetes_new2 = diabetes[~outliers.any(axis=1)]
print(diabetes_new2.shape)
print(diabetes.shape)
"""