In [None]:
# Input data files are available in the read-only "../input/" directory.
# Running this cell (click run or press Shift+Enter) prints the path of every file under the input directory.

import os

for folder, _subfolders, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# Up to 20GB can be written to the current directory (/kaggle/working/); it is preserved as output when a version is created with "Save & Run All".
# Temporary files can also be written to /kaggle/temp/, but they are not saved outside of the current session.

Goal: Exploratory data analysis (EDA) of the Pima Indians Diabetes Database before predicting the onset of diabetes based on diagnostic measures.

Data Variables:
* Pregnancies: Number of times pregnant
* Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
* BloodPressure: Diastolic blood pressure (mm Hg)
* SkinThickness: Triceps skin fold thickness (mm)
* Insulin: 2-Hour serum insulin (mu U/ml)
* BMI: Body mass index (weight in kg/(height in m)^2)
* DiabetesPedigreeFunction: Diabetes pedigree function
* Age: Age (years)
* Outcome: Class variable (0 or 1)

In [None]:
# import libraries
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, plot_confusion_matrix, plot_roc_curve, confusion_matrix, classification_report

# 1. Basic EDA

In [None]:
# Read the Kaggle copy of the dataset into `diabetes` and preview the first rows.
file = "/kaggle/input/pima-indians-diabetes-database/diabetes.csv"
diabetes = pd.read_csv(file)
diabetes.head()

In [None]:
# Dimensions of the frame as (rows, columns).
diabetes.shape

In [None]:
# Summary statistics for every column (include="all" also covers non-numeric columns, if any).
diabetes.describe(include="all")

# 2. Checking and handling missing data

In [None]:
# Count explicit nulls (NaN) per column; zero entries are counted separately in the next cell.
diabetes.isnull().sum()

In [None]:
# Count the zero entries in each column
diabetes.eq(0).sum(axis=0)

In [None]:
# Treat zeros in the columns at positions 1-5 as missing values and convert
# them to NaN.
# The replaced block is assigned back to the frame instead of calling
# `replace(..., inplace=True)` on a column selected from it: that chained
# in-place form is flagged by pandas and does not update `diabetes` when
# Copy-on-Write is enabled.
zero_as_missing_cols = list(diabetes.columns[1:6])
diabetes[zero_as_missing_cols] = diabetes[zero_as_missing_cols].replace(0, np.nan)

import missingno as msno

# Check the missing-data pattern (MCAR, MAR or MNAR): sorting by Insulin groups
# the rows with missing Insulin together (NaN sorts last), so the nullity matrix
# shows whether other columns are missing on the same rows.
# The sorted copy has its own name; binding it to `sorted` would shadow the
# built-in `sorted()` for every later cell.
diabetes_by_insulin = diabetes.sort_values("Insulin")
msno.matrix(diabetes_by_insulin)


In [None]:
### check whether missingness is correlated between variables
# msno.heatmap(diabetes.sort_values("Insulin"))
# msno.dendrogram(diabetes.sort_values("Insulin")) <-- The dendrogram allows you to more fully correlate variable completion

In [None]:
# Mean of every column within each Outcome class.
mean_table = diabetes.groupby("Outcome").mean()
mean_table

In [None]:
# Median of every column within each Outcome class.
median_table = diabetes.groupby("Outcome").median()
median_table

In [None]:
# Fill each missing measurement with the median of its column within the same
# Outcome class (values are read from `median_table`).
# NOTE(review): the fill value depends on Outcome, the label the models later in
# this notebook are trained to predict, so the imputed features carry target
# information — confirm this is acceptable.
col_names = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI']
for col in col_names:
    is_missing = diabetes[col].isnull()
    for outcome_class in (0, 1):
        rows_to_fill = (diabetes.Outcome == outcome_class) & is_missing
        diabetes.loc[rows_to_fill, col] = median_table.loc[outcome_class, col]

In [None]:
# Show non-null counts and dtypes per column to check the result of the imputation.
diabetes.info()

In [None]:
# Add ordered range (bin) columns for Glucose, BMI and Age.
# `pd.cut` replaces hand-written boolean masks fed to `np.select`:
#   * `np.select` was given string choices together with its implicit integer
#     default (0), a mix that newer NumPy releases can refuse to promote to a
#     common dtype;
#   * values outside every bin become NaN directly instead of passing through
#     a "0" placeholder string.
# Bin edges and labels are the same as before.

# Labels for each bin, in ascending order
Glucose_values = ["0-50", "51-100","101-150","151-200"]
BMI_values = ["0-20", "21-40","41-60","61-80"]
Age_values = ["20-29","30-39","40-49","50-59","60-69","70-79","80-89"]

# Bin edges. Glucose and BMI bins are right-closed, e.g. (50, 100], and their
# first bin has no lower bound; Age bins are left-closed, e.g. [20, 30).
Glucose_edges = [-np.inf, 50, 100, 150, 200]
BMI_edges = [-np.inf, 20, 40, 60, 80]
Age_edges = [20, 30, 40, 50, 60, 70, 80, 90]

# Ordered categorical dtypes so the ranges sort and plot in ascending order
Glucose_level = CategoricalDtype(categories=Glucose_values, ordered=True)
BMI_level = CategoricalDtype(categories=BMI_values, ordered=True)
Age_level = CategoricalDtype(categories=Age_values, ordered=True)

# Map every row to its range label
diabetes["GlucoseRange"] = pd.cut(
    diabetes["Glucose"], bins=Glucose_edges, labels=Glucose_values, right=True
).astype(Glucose_level)
diabetes["BMIRange"] = pd.cut(
    diabetes["BMI"], bins=BMI_edges, labels=BMI_values, right=True
).astype(BMI_level)
diabetes["AgeRange"] = pd.cut(
    diabetes["Age"], bins=Age_edges, labels=Age_values, right=False
).astype(Age_level)

# Show the column dtypes, including the three new categorical columns
print(diabetes.dtypes)

# 3. EDA: Visualization

In [None]:
# plot a histogram of each variable
def get_histograms(df, cols=1):
    """Draw one histogram per column of `df` on a grid of subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted; each column is passed to `Axes.hist`.
    cols : int, default 1
        Number of subplot columns. The number of rows is derived from it.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`.
    """
    n_vars = len(df.columns)
    if n_vars == 0:
        # Nothing to plot, and a grid with zero rows cannot be created.
        return

    rows = math.ceil(n_vars / cols)
    fig, ax = plt.subplots(
        figsize=(cols * 4, rows * 4),
        nrows=rows,
        ncols=cols,
        squeeze=False,  # always return a 2-D array, even for a 1x1 grid
    )
    ax = ax.flatten()  # flatten so the subplots can be indexed in one loop
    for i, col in enumerate(df.columns):
        ax[i].hist(df[col], color="#45ADA8", alpha=1)
        ax[i].set_title(str(col), fontsize=14)

    # Hide grid cells that have no variable to show
    for unused_ax in ax[n_vars:]:
        unused_ax.set_visible(False)

    fig.suptitle("Histograms for all variables in the Data", fontsize=16)
    fig.tight_layout()
    # Applied after tight_layout so the reserved top margin is not overwritten
    fig.subplots_adjust(bottom=0, top=0.8)

    plt.show()

get_histograms(diabetes.iloc[:, 0:9], cols=3)

In [None]:
# check correlation between different variables
def corr_heat(df, title = None):
    """Plot the correlation matrix of `df` as a lower-triangle heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Only its numeric and boolean columns are correlated.
    title : str, optional
        Figure title; omitted when falsy.

    Returns
    -------
    None
        Draws on a new 8x6 inch figure.
    """
    plt.figure(figsize=(8, 6))
    sns.set(font_scale=1)
    # Restrict to numeric/bool columns: `DataFrame.corr()` raises on recent
    # pandas when the frame contains categorical or string columns (the frame
    # passed below still holds the *Range categoricals).
    matrix = df.select_dtypes(include=["number", "bool"]).corr()
    # Mask the upper triangle and the diagonal, which only repeat information
    mask = np.triu(np.ones_like(matrix, dtype=bool))
    cmap = sns.diverging_palette(220, 15, s=80, n=9, as_cmap=True, center="light")

    with sns.axes_style("white"):
        sns.heatmap(matrix, mask=mask, annot=True, cmap=cmap, square=True, fmt='.2f',
                    linewidths=.6, center=0, vmin=-0.15, vmax=0.65)
    if title:
        plt.title(f"\n{title}\n", fontsize=14)

corr_heat(diabetes, "Diabetes Data: Variable correlations" )

In [None]:
# Plot a cross tab to check how Glucose and BMI level relate to Outcome
def cross(col1, col2, value_col):
    """Draw a heatmap of `value_col` summed over every (col1, col2) combination.

    The table is built with `pd.crosstab(..., aggfunc="sum", normalize='all',
    margins=True)`, so each cell is that combination's sum of `value_col`
    divided by the grand total, and a "Total" row and column are included.

    Parameters
    ----------
    col1 : pandas.Series
        Values that form the rows of the table (heatmap y axis).
    col2 : pandas.Series
        Values that form the columns of the table (heatmap x axis).
    value_col : pandas.Series
        Values aggregated inside each cell.

    Returns
    -------
    None
    """
    table = pd.crosstab(col1, col2, values=value_col, aggfunc="sum",
                        margins=True, margins_name="Total", normalize='all')
    ax = sns.heatmap(table, cmap="BuGn", annot=True, cbar=False)
    # Label through the Axes. Assigning to `plt.xlabel` / `plt.ylabel` replaces
    # the pyplot functions and sets no label. Rows (col1) are on the y axis.
    x_name = getattr(col2, "name", None)
    y_name = getattr(col1, "name", None)
    if x_name is not None:
        ax.set_xlabel(x_name)
    if y_name is not None:
        ax.set_ylabel(y_name)
    plt.yticks(rotation=0)

cross(diabetes.GlucoseRange, diabetes.AgeRange, diabetes.Outcome)



In [None]:
# Same cross-tab heatmap for Glucose range against BMI range.
cross(diabetes.GlucoseRange, diabetes.BMIRange, diabetes.Outcome)

In [None]:
# Add Gaussian jitter (mean 0, sd 2) so that overlapping points spread out.
# The generator is seeded so the figure is identical on every run; drawing from
# the unseeded global NumPy state gives a different picture each time.
jitter_rng = np.random.default_rng(21)
n_obs = len(diabetes)
SkinThickness_jitter = diabetes.SkinThickness + jitter_rng.normal(0, 2, n_obs)
insulin_jitter = diabetes.Insulin + jitter_rng.normal(0, 2, n_obs)
BloodPressure_jitter = diabetes.BloodPressure + jitter_rng.normal(0, 2, n_obs)

# One entry per panel: (x values, y values, x label, y label, marker size)
scatter_panels = [
    (diabetes.Age, diabetes.Pregnancies, "Age", "Pregnancies", 1.2),
    (diabetes.BMI, SkinThickness_jitter, "BMI", "SkinThickness", 1.1),
    (diabetes.Glucose, insulin_jitter, "Glucose", "Insulin", 1.1),
    (diabetes.BMI, BloodPressure_jitter, "BMI", "BloodPressure", 1.2),
    (SkinThickness_jitter, insulin_jitter, "SkinThickness", "Insulin", 1.2),
]

# plot multiple subplots to validate correlations
fig, axs = plt.subplots(3, 2, figsize=(10,10))
panel_axes = axs.flatten()
for panel_ax, (x_vals, y_vals, x_name, y_name, size) in zip(panel_axes, scatter_panels):
    panel_ax.plot(x_vals, y_vals, marker="o", linestyle="", markersize=size, alpha=.7)
    panel_ax.set_title(f"{x_name} vs {y_name}")
    panel_ax.set_xlabel(x_name)
    panel_ax.set_ylabel(y_name)
# The 3x2 grid has six cells but only five plots; hide the leftover axes.
for spare_ax in panel_axes[len(scatter_panels):]:
    spare_ax.set_visible(False)
fig.tight_layout()

In [None]:
# Remove the range columns used by the cross-tab heatmaps so that only the
# original columns remain for the steps below.
# errors="ignore" keeps the cell re-runnable: because `diabetes` is reassigned,
# a second run would otherwise raise KeyError on the already-removed columns.
diabetes = diabetes.drop(
    columns=['GlucoseRange', 'BMIRange', 'AgeRange'],
    errors="ignore",
)

In [None]:
# Box plot of each of the first eight columns, split by Outcome
plt.figure(figsize = (15, 15))
for plotnumber, col in enumerate(diabetes.columns[0:8], start=1):
    ax = plt.subplot(3, 3, plotnumber)
    sns.boxplot(x="Outcome", y=col, data=diabetes, ax=ax)
plt.show()

In [None]:
# Handle outliers with Z score
from scipy import stats

# A row is kept only if every feature lies strictly within `threshold`
# standard deviations of its column mean.
threshold = 3

# Score the feature columns only: Outcome is the class label, not a
# measurement, and must not be able to remove rows.
feature_cols = diabetes.columns.drop("Outcome")
z = np.abs(stats.zscore(diabetes[feature_cols].to_numpy(dtype=float)))

within_limits = (z < threshold).all(axis=1)
# Report a count rather than printing the full z-score array
print("rows with at least one |z| >= threshold:", int((~within_limits).sum()))

diabetes_cleaned = diabetes[within_limits]
print(diabetes_cleaned.shape)
print(diabetes.shape)

In [None]:
# Same box plots, drawn from the outlier-filtered frame
plt.figure(figsize = (15, 15))
feature_names = diabetes_cleaned.columns[0:8]
for plotnumber, col in enumerate(feature_names, start=1):
    ax = plt.subplot(3, 3, plotnumber)
    sns.boxplot(x="Outcome", y=col, data=diabetes_cleaned, ax=ax)
plt.show()

In [None]:
#sns.pairplot(diabetes, vars = ['Glucose', 'BMI',"BloodPressure", "Insulin", "Pregnancies","Age"], hue="Outcome")

# 4. Build models

In [None]:
# Separate the cleaned data into a feature matrix and a target vector
target_column = "Outcome"
X = diabetes_cleaned.drop(columns=target_column).values
y = diabetes_cleaned[target_column].values
# Hold out 30% for testing; stratify so both parts keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, stratify=y, random_state=21
)

In [None]:
def Model(modelname, model):
    """Fit a scaler + classifier pipeline and report its results on the test split.

    Reads X_train, y_train, X_test and y_test from the notebook's global scope.

    Parameters
    ----------
    modelname : str
        Name given to the classifier step of the pipeline.
    model : estimator
        Classifier placed after the StandardScaler step.

    Returns
    -------
    None
        Prints the value of `pipeline.score` on the test split and the
        classification report, then draws the confusion matrix and ROC curve.
    """
    pipeline = Pipeline([("scaler", StandardScaler()), (modelname, model)])
    pipeline.fit(X_train, y_train)

    score = pipeline.score(X_test, y_test)
    y_pred = pipeline.predict(X_test)
    print('Testing Score \n',score)
    print(classification_report(y_test, y_pred))

    # The confusion matrix is only drawn, so no separate array is computed.
    # NOTE(review): plot_confusion_matrix / plot_roc_curve are not available in
    # newer scikit-learn releases (replaced by ConfusionMatrixDisplay.from_estimator
    # and RocCurveDisplay.from_estimator) — confirm the installed version.
    plot_confusion_matrix(pipeline,X_test,y_test,cmap='OrRd')
    plot_roc_curve(pipeline, X_test, y_test)

In [None]:
# Tune n_neighbors for KNN with a 5-fold cross-validated grid search
knn = KNeighborsClassifier()

# Pipeline: scale the features, then classify with the default KNN above
steps = [("scaler", StandardScaler()), ("knn", knn)]
pipeline = Pipeline(steps)

# Candidate values: n_neighbors from 1 to 49
parameters = {"knn__n_neighbors": np.arange(1, 50)}

# The search is fitted on the training split only
knn_cv = GridSearchCV(pipeline, param_grid=parameters, cv=5)
knn_cv.fit(X_train, y_train)
print(f"The best param: {knn_cv.best_params_}")

# Evaluate the best estimator on the test split (score of a classifier, not R square)
#y_pred = knn_cv.predict(X_test)
#score = knn_cv.score(X_test, y_test)

In [None]:
# KNN with n_neighbors=13 — presumably the value reported by the grid search above; TODO confirm it still matches knn_cv.best_params_.
Model("knn", KNeighborsClassifier(13))

In [None]:
# Logistic regression with its default constructor arguments.
Model("logreg", LogisticRegression())

In [None]:
# Decision tree limited to depth 3, with a fixed random_state.
Model("dt", DecisionTreeClassifier(max_depth=3, random_state=1))

In [None]:
# Archived experiments (feature scaling, IQR outlier filter, dummy-value fill for
# a missingness scatter plot). Everything below sits inside a string literal, so
# none of it is executed.
"""
# Scale features
from sklearn.preprocessing import scale
X_scaled = scale(X)

#Visual EDA, similar to pairplot
#_ = pd.plotting.scatter_matrix(X, c=y, figsize=[15,15], s=150, marker=".", alpha=0.2)

# Handle outliers with IQR
q1 = diabetes.iloc[:, 1:8].quantile(0.25)
q3 = diabetes.iloc[:, 1:8].quantile(0.75)
IQR = q3 - q1
print(IQR)
outliers = (diabetes < q1 - 1.5 * IQR) | (diabetes > q3 + 1.5 * IQR)
diabetes_new2 = diabetes[~outliers.any(axis=1)]
print(diabetes_new2.shape)
print(diabetes.shape)

#Fill dummy values
from numpy.random import rand
def fill_dummy_values(df, scaling_factor=0.1):
  df_dummy = df.copy(deep=True)
  for col_name in df_dummy:
    col = df_dummy[col_name] 
    col_null = col.isnull()
    # Calculate number of missing values in column 
    num_nulls = col_null.sum()
    # Calculate column range
    col_range = col.max() - col.min()
    # Scale the random values to scaling_factor times col_range
    dummy_values = (rand(num_nulls) - 2) * col_range * scaling_factor + col.min()
    col[col_null] = dummy_values
  return df_dummy

# Fill dummy values in diabetes_dummy
diabetes_dummy = fill_dummy_values(diabetes)
# Sum the nullity of Insulin and SkinThickness
nullity = diabetes['Insulin'].isnull() + diabetes['SkinThickness'].isnull()

# Create a scatter plot of Insulin and SkinThickness
sns.scatterplot(diabetes_dummy.Insulin, diabetes_dummy.SkinThickness, hue=nullity)
"""

In [None]:
# Archived alternative: most-frequent imputation for three columns. The code is
# inside a string literal, so it is not executed.
"""
#'BloodPressure', 'SkinThickness', 'Insulin' are missing big portions of its data points, mean might not be the best method for these variables.
# Impute with the most frequent value
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy ="most_frequent")
columns = ['BloodPressure', 'SkinThickness', 'Insulin']
for col in columns:
    diabetes[col] = imp.fit_transform(diabetes[col].values.reshape(-1,1))
"""