In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from scipy import stats
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score


**Diabetes Dataset**

Pregnancies: Number of times pregnant

Glucose: Plasma glucose concentration over 2 hours in an oral glucose tolerance test

BloodPressure: Diastolic blood pressure (mm Hg)

SkinThickness: Triceps skin fold thickness (mm)

Insulin: 2-Hour serum insulin (mu U/ml)

BMI: Body mass index (weight in kg / (height in m)²)

DiabetesPedigreeFunction: Diabetes pedigree function (a function which scores likelihood of diabetes based on family history)

Age: Age (years)

Outcome: Class variable (0 if non-diabetic, 1 if diabetic)

In [None]:
# Read the raw diabetes CSV from the Kaggle input directory.
data = pd.read_csv(filepath_or_buffer="../input/pima-indians-diabetes-database/diabetes.csv")

In [None]:
# Work on an independent copy so the raw frame stays available unchanged.
df = data.copy(deep=True)

In [None]:
def check_df(dataframe):
    """Print a quick structural summary of ``dataframe``.

    Prints, in order: the shape, the column dtypes, the first and last
    five rows, the per-column null counts and a table of selected
    quantiles (0, 5, 50, 95, 99 and 100 %).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head())
    print("##################### Tail #####################")
    print(dataframe.tail())
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    # Quantiles are only defined for numeric data. pandas >= 2.0 no longer
    # drops non-numeric columns silently and raises instead, so select the
    # numeric columns explicitly before computing the table.
    numeric_part = dataframe.select_dtypes(include="number")
    print(numeric_part.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

In [None]:
# Print the structural summary of the working frame.
check_df(dataframe=df)

In [None]:
# Rich display of the first five rows.
df.head(n=5)

In [None]:
# Pairwise Pearson correlation between all columns.
df.corr(method="pearson")


## **Keşifsel Veri Analizi (Exploratory Data Analysis) - EDA**

In [None]:
# Class balance of the target variable.
sns.countplot(data=df, x="Outcome", palette="BuPu_d")

In [None]:
# Annotated correlation heatmap on a fixed [-1, 1] colour scale.
correlations = df.corr()
cmap = sns.diverging_palette(h_neg=0, h_pos=230, s=90, l=60, as_cmap=True)

plt.figure(figsize=(9, 9))
sns.heatmap(
    correlations,
    vmin=-1,
    vmax=1,
    cmap=cmap,
    annot=True,
    fmt='.2f',
    square=True,
    linewidths=.8,
    cbar_kws={"shrink": .8},
)

plt.show()

In [None]:
def column_dplots(dataframe=None):
    """Plot the distributions of the five measurement columns side by side.

    One density histogram with a KDE overlay is drawn per column, in the
    order Insulin, SkinThickness, BloodPressure, Glucose, BMI.

    Parameters
    ----------
    dataframe : pandas.DataFrame, optional
        Frame holding the five columns. When omitted, the notebook-level
        ``df`` is used, so existing ``column_dplots()`` calls keep working.
    """
    if dataframe is None:
        dataframe = df

    # (column, colour) per panel; None keeps seaborn's default colour.
    panels = [
        ("Insulin", "purple"),
        ("SkinThickness", None),
        ("BloodPressure", "purple"),
        ("Glucose", None),
        ("BMI", None),
    ]

    f, ax = plt.subplots(1, len(panels), figsize=(27, 7))

    for axis, (column, color) in zip(ax, panels):
        # sns.distplot is deprecated; this is the documented histplot
        # equivalent (density histogram plus KDE with the same cut).
        sns.histplot(
            dataframe[column],
            kde=True,
            stat="density",
            kde_kws=dict(cut=3),
            color=color,
            ax=axis,
        )
        # Label after plotting so the large font is what ends up on the axis.
        axis.set_xlabel(column, fontsize=20)

    plt.show()

column_dplots()

In [None]:
def outlier_plots():
    """Draw one box plot per measurement column of the notebook-level ``df``.

    Panels are drawn in the order Insulin, SkinThickness, BloodPressure,
    Glucose, BMI, and each panel is labelled with the column it shows.
    """
    columns = ["Insulin", "SkinThickness", "BloodPressure", "Glucose", "BMI"]

    f, ax = plt.subplots(1, len(columns), figsize=(25, 6))
    for axis, column in zip(ax, columns):
        # Plot and label from the same name so the two cannot drift apart
        # (previously the BMI and Glucose panels carried each other's label).
        sns.boxplot(data=df[column], ax=axis)
        axis.set_xlabel(column, fontsize=20)
    plt.show()

outlier_plots()

## **DATA PREPROCESSING**

### **UNDERSTANDING FEATURES**

In [None]:

def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Group the columns of ``dataframe`` into categorical and numeric lists.

    * ``num_but_cat``: non-object columns with fewer than ``cat_th``
      distinct values.
    * ``cat_but_car``: object columns with more than ``car_th`` distinct
      values.
    * ``cat_cols``: object columns plus ``num_but_cat``, minus
      ``cat_but_car``.
    * ``num_cols``: non-object columns that are not in ``num_but_cat``.

    A short report of the grouping is printed.

    Returns
    -------
    tuple of list
        ``(cat_cols, cat_but_car, num_cols, num_but_cat)``.
    """
    # Split once by dtype, keeping the frame's column order in both lists.
    object_cols, other_cols = [], []
    for name in dataframe.columns:
        if dataframe[name].dtypes == "O":
            object_cols.append(name)
        else:
            other_cols.append(name)

    num_but_cat = [name for name in other_cols if dataframe[name].nunique() < cat_th]
    cat_but_car = [name for name in object_cols if dataframe[name].nunique() > car_th]

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [name for name in other_cols if name not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    print(f" categoric: {cat_cols}, cat_but_car: {cat_but_car}, numeric: {num_cols}, num_but_cat: {num_but_cat}")
    # cat_cols + num_cols + cat_but_car = number of variables.
    # num_but_cat is already contained in cat_cols, so those three lists
    # together cover every column; num_but_cat is returned for reporting only.

    return cat_cols, cat_but_car, num_cols, num_but_cat



In [None]:
# Print the column grouping report and display the returned lists.
grab_col_names(dataframe=df)

In [None]:
# Keep the four column groups for the preprocessing steps that follow.
(cat_cols, cat_but_car, num_cols, num_but_cat) = grab_col_names(dataframe=df)

### **MISSING VALUE HANDLING**


*   replaced zeros(missing values) with NaN 
*   imputed NaN values



In [None]:
zero_list = ["Glucose","BloodPressure","SkinThickness","Insulin","BMI"]

# This notebook treats a zero in these columns as a missing value, so it is
# turned into NaN for the imputation steps. Assigning the replaced block back
# is used instead of a per-column `replace(..., inplace=True)`, which is
# chained assignment (a no-op under pandas Copy-on-Write). `np.nan` is the
# spelling that still exists in NumPy 2.0 (`np.NaN` was removed).
df[zero_list] = df[zero_list].replace(0, np.nan)

In [None]:
# Missing values per column.
df.isna().sum()

In [None]:
# Fill value per column: mean for BloodPressure and Glucose, median for Insulin.
# NOTE(review): the statistics are computed on the full frame, i.e. before the
# train/test split further down — confirm this is intended.
fill_values = {
    "BloodPressure": df["BloodPressure"].mean(),
    "Insulin": df["Insulin"].median(),
    "Glucose": df["Glucose"].mean(),
}
for column, value in fill_values.items():
    df[column] = df[column].fillna(value)


In [None]:
# BMI and SkinThickness are imputed jointly from their 5 nearest neighbours.
knn_columns = ["BMI", "SkinThickness"]
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
df[knn_columns] = imputer.fit_transform(df[knn_columns])


In [None]:
# Display the frame after the imputation cells above.
df

In [None]:
# Missing values per column after imputation.
df.isna().sum()

In [None]:
# Re-plot the five distributions now that missing values have been imputed.
# column_dplots is already defined in the EDA section above and reads the
# current notebook-level `df`, so it is called again here instead of being
# redefined with an identical body.
column_dplots()

### **OUTLIER HANDLING**

In [None]:
def outlier_analysis(dataframe, numerical_columns):
    '''
    Report and cap outliers of the given numerical columns.

    For every column the 1st and 99th percentiles are used as lower and
    upper limits. The number of values outside the limits is printed, and
    those values are clamped to the limits in a copy of the frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is not modified.
    numerical_columns : iterable of str
        Names of the columns to analyse.

    Returns
    -------
    (pandas.DataFrame, dict)
        The capped copy, and a mapping ``column -> (lower_limit, upper_limit)``.
    '''
    df_without_outliers = dataframe.copy()
    outlier_boundries_dict = {}
    for col in numerical_columns:
        print(f"-----{col}------")
        lower_limit = dataframe[col].quantile(q=0.01)
        upper_limit = dataframe[col].quantile(q=0.99)

        is_outlier = (dataframe[col] < lower_limit) | (dataframe[col] > upper_limit)
        number_of_outliers = int(is_outlier.sum())
        if number_of_outliers > 0:
            print(f"{number_of_outliers} outliers are found for {col} feature. \n")
        outlier_boundries_dict[col] = (lower_limit, upper_limit)

        # Cap using the frame that was passed in. The previous version built
        # its masks from the notebook-level `df`, so any other input was
        # capped with the wrong rows (or failed when `df` did not exist).
        # clip() also avoids writing float limits into integer columns.
        df_without_outliers[col] = dataframe[col].clip(lower=lower_limit, upper=upper_limit)

    return df_without_outliers, outlier_boundries_dict

In [None]:
# outlier_analysis returns a capped copy plus the per-column limits and does
# not modify `df`. Keep both results instead of discarding them, and display
# only the limits rather than dumping the whole capped frame as cell output.
df_capped, outlier_limits = outlier_analysis(df, num_cols)
outlier_limits

In [None]:
# Limits for SkinThickness: the minimum (0th percentile) and the 99.9th percentile.
skin_thickness = df["SkinThickness"]
lower_limit = skin_thickness.quantile(q=0.0)
upper_limit = skin_thickness.quantile(q=0.999)

outside_limits = (skin_thickness < lower_limit) | (skin_thickness > upper_limit)
number_of_outliers = len(df[outside_limits])

print(lower_limit)
print(upper_limit)
f"{number_of_outliers}"


In [None]:
# Keep only the rows whose SkinThickness lies within [lower_limit, upper_limit].
# .copy() makes the result an independent frame: the feature-engineering cells
# below add columns with df.loc[...] = ..., which on a filtered view triggers
# SettingWithCopyWarning and may not write where expected.
df = df[~((df["SkinThickness"] < lower_limit) | (df["SkinThickness"] > upper_limit))].copy()

### **FEATURE ENGINEERING**

In [None]:
# Display the frame after the SkinThickness row filter above.
df

**AGE**

In [None]:
# They live 69.5 years on average.

# Age bands, applied in this order: <=18, (18, 30], (30, 50), >=50.
age = df['Age']
age_bands = [
    ('Kid', age <= 18),
    ('Young', (age > 18) & (age <= 30)),
    ('Mature', (age > 30) & (age < 50)),
    ('Senior', age >= 50),
]
for label, rows in age_bands:
    df.loc[rows, 'AGE_CAT'] = label

df['AGE_CAT'].value_counts()

**GLUCOSE**

In [None]:
# Glucose bands, applied in this order: <=70, (70, 100], (100, 125], >125.
glucose = df['Glucose']
glucose_bands = [
    ('HIPOGLISEMI', glucose <= 70),
    ('NORMAL', (glucose > 70) & (glucose <= 100)),
    ('PREDIABETES', (glucose > 100) & (glucose <= 125)),
    ('DIABETES', glucose > 125),
]
for label, rows in glucose_bands:
    df.loc[rows, 'GLUCOSE_CAT'] = label

df['GLUCOSE_CAT'].value_counts()


In [None]:
# Display the frame with the AGE_CAT and GLUCOSE_CAT columns added above.
df

**SCALING**

In [None]:
# Scale every numeric column with its own, independently fitted RobustScaler.
# NOTE(review): the scalers are fitted on the full frame, before the
# train/test split below, and re-running this cell rescales already-scaled
# values — confirm both are acceptable.
for column in num_cols:
    df[column] = RobustScaler().fit_transform(df[[column]])

### **ENCODING**

In [None]:
# Rows per age band before encoding.
df.AGE_CAT.value_counts()

In [None]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """Return a copy of ``dataframe`` with ``categorical_cols`` one-hot encoded.

    Thin wrapper around ``pandas.get_dummies``; ``drop_first`` is passed
    through unchanged. The input frame is not modified.
    """
    return pd.get_dummies(
        dataframe,
        columns=categorical_cols,
        drop_first=drop_first,
    )

In [None]:
# One-hot encode the two engineered category columns.
df = one_hot_encoder(df, categorical_cols=["AGE_CAT", "GLUCOSE_CAT"])

### **MODELING**

In [None]:
# Features are every column except the target.
X = df.drop(columns=["Outcome"])
y = df["Outcome"]

In [None]:
# 80/20 hold-out split with a fixed seed.
split_options = {"test_size": 0.20, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Search grid for the decision tree.
cart_params = dict(
    max_depth=range(1, 10),
    min_samples_split=[3, 4],
)

In [None]:

# 10-fold grid search over cart_params on the training split.
cart_model = DecisionTreeClassifier(random_state=17)

cart_cv = GridSearchCV(
    estimator=cart_model,
    param_grid=cart_params,
    cv=10,
    n_jobs=-1,
    verbose=True,
)
cart_cv.fit(X_train, y_train)

cart_cv.best_params_

In [None]:
# Refit a tree with the best grid-search parameters on the training split.
# random_state=17 matches the estimator used inside the grid search; without
# it the final tree is not reproducible and can differ from the model the
# search actually evaluated.
cart_tuned = DecisionTreeClassifier(**cart_cv.best_params_, random_state=17).fit(X_train, y_train)

In [None]:
# The classifier is named `xgb_model` rather than `xgb`: the import cell binds
# `xgb` to the xgboost module, and reusing that name for the fitted model
# would silently replace the module for every cell run afterwards.
xgb_model = XGBClassifier(n_estimators=500)
xgb_model.fit(X_train, y_train)
preds = xgb_model.predict(X_test)
# Share of correct test predictions, in percent.
acc_xgb = (preds == y_test).sum().astype(float) / len(preds)*100

f"XGBoost's prediction accuracy is: {acc_xgb}"

**Tahmin**

In [None]:
# Hard labels and positive-class probabilities from the tuned tree.
y_pred = cart_tuned.predict(X_test)
class_probabilities = cart_tuned.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

print(classification_report(y_true=y_test, y_pred=y_pred))
roc_auc_score(y_true=y_test, y_score=y_prob)

In [None]:

def plot_confusion_matrix(y_test, y_pred):
    """Show the confusion matrix as an annotated heatmap.

    The accuracy, rounded to two decimals, is written into the title.
    """
    matrix = confusion_matrix(y_test, y_pred)
    accuracy = round(accuracy_score(y_test, y_pred), 2)

    sns.heatmap(matrix, annot=True, fmt=".0f")
    plt.xlabel('y_pred')
    plt.ylabel('y')
    plt.title('Accuracy Score: {0}'.format(accuracy), size=10)
    plt.show()

plot_confusion_matrix(y_test, y_pred)

In [None]:
# Per-class precision / recall / F1 of the tuned tree on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay.from_estimator
# is its replacement (available since 1.0). Both names come from the
# sklearn.metrics star import. Fall back to the old helper on versions that
# predate from_estimator.
if hasattr(RocCurveDisplay, "from_estimator"):
    RocCurveDisplay.from_estimator(cart_tuned, X_test, y_test)
else:
    plot_roc_curve(cart_tuned, X_test, y_test)
plt.title('ROC Curve')
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], 'r--')
plt.show()