In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Diabetes Prediction with CART Algorithm

**Diabetes** is a group of metabolic disorders in which blood sugar levels remain high over a prolonged period. Symptoms of high blood sugar include frequent urination, increased thirst, and increased hunger. If left untreated, diabetes can cause many complications. Acute complications can include diabetic ketoacidosis, hyperosmolar hyperglycemic state, or death. Serious long-term complications include cardiovascular disease, stroke, chronic kidney disease, foot ulcers, and damage to the eyes.

## Data Set and Story

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

* Pregnancies: Number of times pregnant
* Glucose: Glucose
* BloodPressure: Blood pressure 
* SkinThickness: Triceps skin fold thickness
* Insulin: Insulin
* BMI: Body mass index 
* DiabetesPedigreeFunction: Diabetes pedigree function
* Age: Age (years)
* Outcome: Whether the patient has diabetes (1) or not (0); this is our target

In [None]:
!pip install skompiler

In [None]:
# Importing the libraries necessary for the exercise.
import warnings
import pandas as pd
import numpy as np
from skompiler import skompile
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier, export_graphviz, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.exceptions import ConvergenceWarning
# Silence FutureWarning and ConvergenceWarning messages for the rest of the session.
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", ConvergenceWarning)


In [None]:
# Load the Pima Indians Diabetes data set from the Kaggle input directory.
df = pd.read_csv("../input/pima-indians-diabetes-database/diabetes.csv")

# Preview the first five rows.
df.head(n=5)

In [None]:
# Print a concise summary of the data set (columns, non-null counts and dtypes).
df.info()

## EDA

In [None]:
# Descriptive statistics, transposed so that each variable is shown on its own row.
df.describe().transpose()

The table above is the output of the `describe()` function for our data set. In this table, "min" shows the smallest value of each variable.

We see that the smallest values of the Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin and BMI variables are 0. Except for Pregnancies, a value of 0 is not possible for any of these variables.

Not all 0 values in the "Pima Indians Diabetes Database" are real measurements: missing values were also recorded as 0.

In [None]:
# Count the missing values per column before the zeros are converted to NaN.
df.isna().sum()

In [None]:
# A zero in these measurements is not a valid reading, so it is treated as a missing value.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0 and raises AttributeError there.
zero_as_missing_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[zero_as_missing_cols] = df[zero_as_missing_cols].replace(0, np.nan)

In [None]:
# Count how many values per column are now NaN after replacing the zeros.
df.isna().sum()

In [None]:
# This function separates the variables of the data set into categorical and numerical groups.
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Group the columns of ``dataframe`` into categorical and numerical sets.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        A non-object column with fewer than ``cat_th`` distinct values is
        treated as categorical ("numeric but categorical").
    car_th : int, default 20
        An object column with more than ``car_th`` distinct values is treated
        as cardinal and left out of the categorical list.

    Returns
    -------
    tuple of lists
        ``(cat_cols, cat_but_car, num_cols, num_but_cat)``.
    """
    object_cols = []
    cat_but_car = []
    num_but_cat = []
    num_cols = []

    # Classify each column in a single pass over the frame.
    for col in dataframe.columns:
        distinct_count = dataframe[col].nunique()
        if dataframe[col].dtypes == "O":
            object_cols.append(col)
            if distinct_count > car_th:
                cat_but_car.append(col)
        elif distinct_count < cat_th:
            num_but_cat.append(col)
        else:
            num_cols.append(col)

    # Categorical = object columns plus low-cardinality numerics, minus cardinal columns.
    cat_cols = [col for col in object_cols + num_but_cat if col not in cat_but_car]

    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, cat_but_car, num_cols, num_but_cat


In [None]:
# Classify the columns of df; num_cols is used below for the outlier steps.
cat_cols, cat_but_car, num_cols, num_but_cat = grab_col_names(df)

## Outliers

In [None]:
# Setting an upper and lower limit for outliers
def outlier_thresholds(dataframe, variable, q1=0.25, q3=0.75, whisker=1.5):
    """Return the lower and upper outlier limits of a column.

    The limits are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR`` where
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column.
    variable : str
        Name of the column to evaluate.
    q1, q3 : float, default 0.25 and 0.75
        Quantiles used as the lower and upper reference points.
    whisker : float, default 1.5
        Multiplier applied to the inter-quantile range.

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.
    """
    quartile1 = dataframe[variable].quantile(q1)
    quartile3 = dataframe[variable].quantile(q3)
    interquartile_range = quartile3 - quartile1
    low_limit = quartile1 - whisker * interquartile_range
    up_limit = quartile3 + whisker * interquartile_range
    return low_limit, up_limit

In [None]:
# The function that examines whether there is an outlier according to the threshold values we have determined.
def check_outlier(dataframe, col_name):
    """Tell whether a column contains at least one outlier.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column.
    col_name : str
        Name of the column to check.

    Returns
    -------
    bool
        True when any value lies outside the limits returned by
        ``outlier_thresholds``, otherwise False.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    is_outlier = (dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)
    # Test the mask itself: testing the *values* of the filtered rows would
    # report False for an outlier row whose cells are all zero/False.
    return bool(is_outlier.any())

In [None]:
# Report, for every numerical column, whether check_outlier finds values outside its limits.
for col in num_cols:
    print(col, check_outlier(df, col))

In [None]:
# Replacing outliers with upper and lower limit
def replace_with_thresholds(dataframe, variable):
    """Cap a column, in place, at its lower and upper outlier limits.

    Values above the upper limit are set to the upper limit and values below
    the lower limit are set to the lower limit; missing values are left as is.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that is modified.
    variable : str
        Name of the column to cap.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    # clip applies both limits; the original only applied the upper one.
    dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)

In [None]:
# Apply the threshold replacement to every numerical column.
for col in num_cols:
    replace_with_thresholds(df, col)

## Missing Values


In [None]:
# Missing values per column that still have to be filled.
df.isna().sum()

In [None]:
# Aggregate the columns per Outcome group (pivot_table's default aggregation is the mean).
# NOTE(review): `df` itself is passed as the first positional argument (`values`);
# presumably `df.pivot_table(index="Outcome")` was intended — confirm the output is
# identical before changing it.
df.pivot_table(df, index=["Outcome"])

When the variables are examined separately for Outcome = 1 and Outcome = 0, differences between the two groups are visible. Missing values should therefore be filled separately within each Outcome group.

In [None]:
# Fill each column's missing values with the median of the same Outcome group.
for outcome_class in (0, 1):
    in_class = df["Outcome"] == outcome_class
    for col in df.columns:
        class_median = df[in_class][col].median()
        df.loc[in_class & (df[col].isnull()), col] = class_median

## Feature Engineering

In [None]:

# BMI categories. Lower edges are inclusive and the last bin is open-ended, so
# values exactly on a boundary (18.5, 25, 30) and values of 40 or more no longer
# fall between bins and end up as NaN.
df.loc[(df["BMI"] < 18.5), "NEW_BMI_CAT"] = "Underweight"
df.loc[(df["BMI"] >= 18.5) & (df["BMI"] < 25), "NEW_BMI_CAT"] = "Normal"
df.loc[(df["BMI"] >= 25) & (df["BMI"] < 30), "NEW_BMI_CAT"] = "Overweight"
df.loc[(df["BMI"] >= 30), "NEW_BMI_CAT"] = "Obese"

# Glucose categories, with the same boundary handling (70, 99 and 126 are now
# assigned to a bin, and the top bin has no upper cap).
df.loc[(df["Glucose"] < 70), "NEW_GLUCOSE_CAT"] = "Low"
df.loc[(df["Glucose"] >= 70) & (df["Glucose"] < 99), "NEW_GLUCOSE_CAT"] = "Normal"
df.loc[(df["Glucose"] >= 99) & (df["Glucose"] < 126), "NEW_GLUCOSE_CAT"] = "Secret"
df.loc[(df["Glucose"] >= 126), "NEW_GLUCOSE_CAT"] = "High"

# Skin-fold thickness split at 30.
df.loc[df['SkinThickness'] < 30, "NEW_SKIN_THICKNESS"] = "Normal"
df.loc[df['SkinThickness'] >= 30, "NEW_SKIN_THICKNESS"] = "HighFat"

# Number of pregnancies: none, 1-4, more than 4.
df.loc[df['Pregnancies'] == 0, "NEW_PREGNANCIES"] = "NoPregnancy"
df.loc[((df['Pregnancies'] > 0) & (df['Pregnancies'] <= 4)), "NEW_PREGNANCIES"] = "StdPregnancy"
df.loc[(df['Pregnancies'] > 4), "NEW_PREGNANCIES"] = "OverPregnancy"

# Combined skin-thickness / blood-pressure level: both low, both high, or mixed.
df.loc[(df['SkinThickness'] < 30) & (df['BloodPressure'] < 80), "NEW_CIRCULATION_LEVEL"] = "Normal"
df.loc[(df['SkinThickness'] >= 30) & (df['BloodPressure'] >= 80), "NEW_CIRCULATION_LEVEL"] = "CircularAtHighRisk"
df.loc[((df['SkinThickness'] < 30) & (df['BloodPressure'] >= 80))
       | ((df['SkinThickness'] >= 30) & (df['BloodPressure'] < 80)), "NEW_CIRCULATION_LEVEL"] = "CircularAtMediumRisk"

# Numeric interaction features (products of two existing columns).
df["Pre_Age_Cat"] = df["Age"] * df["Pregnancies"]

df["Ins_Glu_Cat"] = df["Glucose"] * df["Insulin"]

## Label Encoding

In [None]:
def label_encoder(dataframe, binary_col):
    """Replace a column of ``dataframe`` with its integer label encoding.

    A fresh ``LabelEncoder`` is fitted on the column ``binary_col`` and the
    encoded values are written back to the same column. The (modified) frame
    is returned.
    """
    encoded_values = preprocessing.LabelEncoder().fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded_values
    return dataframe

In [None]:
# Object-dtype columns that hold exactly two distinct values.
binary_cols = [
    name
    for name in df.columns
    if df[name].dtypes == "O" and len(df[name].unique()) == 2
]

In [None]:
# Label-encode only the string (object-dtype) columns. Looping over every
# column also re-coded the continuous features and the target as arbitrary
# integer ranks, which destroys their original values.
# All object columns (not just the binary ones) are encoded so that the frame
# is fully numeric for the model.
object_cols = [col for col in df.columns if df[col].dtypes == "O"]
for col in object_cols:
    label_encoder(df, col)

## One-Hot Encoding

In [None]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """Return a copy of ``dataframe`` with ``categorical_cols`` one-hot encoded.

    Thin wrapper around ``pd.get_dummies``; ``drop_first`` is forwarded as is.
    The input frame is not modified, so the caller must keep the return value.
    """
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)

In [None]:
# Columns with more than 2 and at most 10 distinct values are one-hot encoded.
ohe_cols = [
    name
    for name in df.columns
    if 2 < len(df[name].unique()) <= 10
]

In [None]:
# one_hot_encoder returns a new frame, so the result must be assigned back;
# otherwise the encoding is only displayed and never reaches the model.
df = one_hot_encoder(df, ohe_cols, drop_first=True)
df.head()

## Model

In [None]:
# Separate the target from the features and hold out 30% of the rows for testing.
X = df.drop(columns=["Outcome"])
y = df["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=17
)

In [None]:
# Fit an untuned decision tree on the training split with a fixed random_state.
cart_model = DecisionTreeClassifier(random_state=17).fit(X_train, y_train)

In [None]:
# Candidate hyper-parameters: tree depth 1-10 and the minimum number of samples to split a node.
cart_params = dict(max_depth=range(1, 11), min_samples_split=[2, 3, 4])

# Search the whole grid with 10-fold cross-validation.
cart_cv = GridSearchCV(
    estimator=cart_model,
    param_grid=cart_params,
    cv=10,
    n_jobs=-1,
    verbose=True,
)
cart_cv.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination selected by the grid search.
cart_cv.best_params_

In [None]:
# Refit a tree with the best hyper-parameters found by the grid search.
# random_state is passed again so that the tuned model is reproducible and uses
# the same seed as the estimator that was evaluated during the search.
cart_tuned = DecisionTreeClassifier(random_state=17, **cart_cv.best_params_).fit(X_train, y_train)

In [None]:
# Evaluate the tuned tree on the hold-out set.
y_prob = cart_tuned.predict_proba(X_test)[:, 1]  # second column of the probability matrix
y_pred = cart_tuned.predict(X_test)

# Per-class precision/recall/F1, followed by the ROC AUC of the predicted probabilities.
print(classification_report(y_test, y_pred))
roc_auc_score(y_test, y_prob)