In [None]:
#import libraries
import numpy as np
import pandas as pd 
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from pandas_profiling import ProfileReport
from sklearn.feature_selection import RFECV
import operator
import warnings
# Ignore every warning raised from here on. NOTE(review): this also hides
# deprecation notices from the libraries imported above.
warnings.simplefilter(action = "ignore") 

In [None]:
# Read the diabetes CSV from the Kaggle input directory and preview the first rows.
DATA_PATH = "/kaggle/input/pima-indians-diabetes-database/diabetes.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Summary statistics with extra percentiles, transposed so each column becomes a row.
percentiles = [0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]
df.describe(percentiles=percentiles).T

In [None]:
# Build the automated profiling report (progress bar disabled) and render it as widgets.
profile = ProfileReport(
    df,
    title="Pandas Profiling Report",
    progress_bar=False,
)
profile.to_widgets()

In [None]:
# Pairwise correlation matrix of the columns.
df.corr()

In [None]:
# Annotated heatmap of the pairwise correlations of the data set.
corr_matrix = df.corr()
f, ax = plt.subplots(figsize=[15, 10])
sns.heatmap(
    corr_matrix,
    annot=True,      # write the coefficient into every cell
    fmt=".2f",
    cmap="magma",
    ax=ax,
)
ax.set_title("Correlation Matrix", fontsize=20)
plt.show()

In [None]:
 # Pairwise relationship grid of all columns, coloured by the Outcome class.
 g = sns.pairplot(data=df, hue="Outcome", palette="husl")

In [None]:
# Class balance of the target, shown as a bar chart (left) and a pie chart (right).
#
# value_counts() orders by frequency, so the hard-coded "No"/"Yes" labels were
# only right while class 0 happened to be the majority. sort_index() pins the
# order to Outcome 0, 1 so the labels always match the classes.
outcome_counts = df.Outcome.value_counts().sort_index()
class_labels = ("No", "Yes")
class_colors = (sns.color_palette()[0], sns.color_palette()[2])

f, ax = plt.subplots(1, 2, figsize = (15, 7))
f.suptitle("Diabetes?", fontsize = 18.)

outcome_counts.plot.bar(ax = ax[0], rot = 0, color = class_colors)
ax[0].set(xticklabels = list(class_labels))

outcome_counts.plot.pie(
    labels = class_labels,
    autopct = "%.2f%%",
    label = "",
    fontsize = 13.,
    ax = ax[1],
    colors = class_colors,
    wedgeprops = {"linewidth": 1.5, "edgecolor": "#F7F7F7"},
)
# Recolour text items 1 and 3 of the pie axes (presumably the two percentage
# annotations, which sit on top of the coloured wedges).
for text_idx in (1, 3):
    ax[1].texts[text_idx].set_color("#F7F7F7")

plt.show()

In [None]:
# One distribution plot (20 bins) per predictor, laid out on a 4x2 grid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) when the environment is upgraded.
fig, ax = plt.subplots(4, 2, figsize=(16, 16))

# Row-major panel order: (0,0), (0,1), (1,0), ...
panel_columns = [
    "Age", "Pregnancies",
    "Glucose", "BloodPressure",
    "SkinThickness", "Insulin",
    "DiabetesPedigreeFunction", "BMI",
]
for panel, column in zip(ax.flat, panel_columns):
    sns.distplot(df[column], bins=20, ax=panel)

# Data Preprocessing

In [None]:
# Number of missing (NaN) values per column.
df.isnull().sum()

In [None]:
# Treat a 0 in these measurement columns as a missing value (presumably the
# data set encodes "not measured" as 0 — verify against the data description).
# np.nan is used instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
zero_as_missing = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)

In [None]:
# Missing values per column, re-counted after the zero-to-NaN replacement.
df.isnull().sum()

In [None]:
def median_target(var, data=None):
    """Return the median of ``var`` within each ``Outcome`` class.

    Rows where ``var`` is null are excluded before grouping.

    Parameters
    ----------
    var : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame holding ``var`` and an ``Outcome`` column. Defaults to the
        notebook-level ``df``, which keeps existing ``median_target(var)``
        calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per ``Outcome`` value (as the index) and a single column
        named ``var`` holding the class median.
    """
    if data is None:
        data = df  # fall back to the notebook's global frame
    observed = data.loc[data[var].notnull(), [var, 'Outcome']]
    return observed.groupby(['Outcome'])[[var]].median()

In [None]:
# Print the per-class medians of every column that now contains missing values.
features = ["Insulin","Glucose","SkinThickness",'BloodPressure','BMI']
for column in features:
    class_medians = median_target(column)
    print(class_medians)

In [None]:
# Fill each missing value with the median of its column among rows of the same
# Outcome class.
#
# The per-class medians are computed once per feature and broadcast back to
# every row with transform(), so nothing is recomputed and the number of
# Outcome classes is not hard-coded.
# NOTE(review): the imputation uses the target (Outcome), so the label leaks
# into the features when the model is later cross-validated on these rows.
features = ["Insulin","Glucose","SkinThickness",'BloodPressure','BMI']
for feature in features:
    class_median = df.groupby("Outcome")[feature].transform("median")
    df[feature] = df[feature].fillna(class_median)

In [None]:
# Missing values per column, re-counted after the class-wise median imputation.
df.isnull().sum()

In [None]:
# Report, per numeric column, whether any value lies above the upper fence
# Q3 + 1.5 * (Q3 - Q1), with Q1/Q3 taken at the 5th/95th percentiles.
for feature in df.select_dtypes(include=['float',"int"]):

    Q1 = df[feature].quantile(0.05)
    Q3 = df[feature].quantile(0.95)
    IQR = Q3-Q1
    upper = Q3 + 1.5*IQR

    # Test the boolean mask itself. The previous df[mask].any(axis=None) asked
    # whether any *cell* of the selected rows was truthy, which only gave the
    # right answer because those rows hold non-zero values.
    has_upper_outliers = (df[feature] > upper).any()
    print(feature, "yes" if has_upper_outliers else "no")

In [None]:
# Cap Insulin at the upper fence built from its 5th and 95th percentiles.
insulin = df["Insulin"]
Q1, Q3 = insulin.quantile(0.05), insulin.quantile(0.95)
IQR = Q3 - Q1
upper = Q3 + 1.5 * IQR
above_fence = insulin > upper
df.loc[above_fence, "Insulin"] = upper

In [None]:
# Cap SkinThickness at the upper fence built from its 5th and 95th percentiles.
skin = df["SkinThickness"]
Q1 = skin.quantile(0.05)
Q3 = skin.quantile(0.95)
IQR = Q3 - Q1
upper = Q3 + 1.5 * IQR
too_high = skin > upper
df.loc[too_high, "SkinThickness"] = upper

In [None]:
# Interaction features: element-wise products of Insulin with SkinThickness and of Glucose with Insulin.
df["InsSki"] = df["Insulin"].mul(df["SkinThickness"])
df["GluIns"] = df["Glucose"].mul(df["Insulin"])

In [None]:
# Cap both interaction features at their own upper fence
# (Q3 + 1.5 * IQR with Q1/Q3 at the 5th/95th percentiles), one column at a time.
for interaction in ("InsSki", "GluIns"):
    Q1 = df[interaction].quantile(0.05)
    Q3 = df[interaction].quantile(0.95)
    IQR = Q3 - Q1
    upper = Q3 + 1.5 * IQR
    df.loc[df[interaction] > upper, interaction] = upper


In [None]:
# Remove the SkinThickness and BloodPressure columns from the frame in place.
df.drop(columns=["SkinThickness", "BloodPressure"], inplace=True)

In [None]:
# Detect multivariate outliers across all columns with the Local Outlier Factor (LOF) method.
from sklearn.neighbors import LocalOutlierFactor

# NOTE(review): df still contains the Outcome column here, so the target takes
# part in the neighbour search — confirm this is intended.
lof = LocalOutlierFactor(n_neighbors=20)
lof.fit_predict(df)

In [None]:
# Per-row LOF scores from the fitted detector; show the 30 smallest values.
df_scores = lof.negative_outlier_factor_
np.sort(df_scores)[:30]

In [None]:
# Threshold = the 9th-smallest LOF score (index 8 of the ascending sort).
sorted_scores = np.sort(df_scores)
th = sorted_scores[8]
th

In [None]:
# Keep only the rows whose LOF score is strictly above the threshold; rows at
# or below it (the lowest scores) are dropped.
keep_mask = df_scores > th
df = df[keep_mask]
df.shape

# First results

In [None]:
# Separate the target from the predictors, and keep the predictor column names
# and row index so the scaled array can be turned back into a labelled frame.
y = df["Outcome"]
X = df.drop(columns=["Outcome"])
cols, index = X.columns, X.index

In [None]:
# Min-max scale every predictor and rebuild a DataFrame with the original labels.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# each validation fold influences the scaling — consider a Pipeline.
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=cols, index=index)

In [None]:
# Train/validation scores of all models: candidate estimators for the baseline comparison.
#
# The estimators that accept random_state are seeded (same seed as the tuning
# sections) so the baseline scores are repeatable across runs.
models = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=12345)),
    ('RF', RandomForestClassifier(random_state=12345)),
    ('SVM', SVC(gamma='auto')),
    # Labelled 'XGB', but this is scikit-learn's GradientBoostingClassifier.
    ('XGB', GradientBoostingClassifier(random_state=12345)),
    ("LightGBM", LGBMClassifier(random_state=12345)),
]

# Accumulators filled by the evaluation loop: per-model fold scores and labels.
results = []
names = []

In [None]:
# Evaluate every candidate model with 10-fold cross-validated accuracy.
#
# The accumulators are reset here so re-running this cell does not append
# duplicate entries to the boxplot.
results = []
names = []

for name, model in models:
    # The former KFold(n_splits=10, random_state=12345) was never passed to
    # cross_val_score, and scikit-learn >= 0.24 raises ValueError when
    # random_state is set without shuffle=True. It is removed; cv=10 keeps the
    # splitting that was actually in effect.
    cv_results = cross_val_score(model, X, y, cv = 10, scoring= "accuracy")
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# Boxplot comparing the fold-score distribution of each algorithm.
fig = plt.figure(figsize=(15,10))
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

# Hyperparameter tuning

## RF

In [None]:
# Search grid for the random forest: 3 values per hyperparameter (81 combinations).
rf_params = dict(
    n_estimators=[100, 200, 500],
    max_features=[3, 5, 8],
    min_samples_split=[2, 5, 8],
    max_depth=[3, 5, 8],
)

In [None]:
# Base random forest handed to the grid search, with a fixed seed.
rf_model = RandomForestClassifier(random_state = 12345)

In [None]:
# Exhaustive grid search over rf_params with 10-fold CV, using all CPU cores.
gs_cv = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_params,
    cv=10,
    n_jobs=-1,
    verbose=2,
)
gs_cv = gs_cv.fit(X, y)

In [None]:
# Best hyperparameter combination found by the random-forest grid search.
gs_cv.best_params_

In [None]:
# Fresh random forest configured with the best grid-search parameters and the same seed.
rf_tuned = RandomForestClassifier(**gs_cv.best_params_,random_state=12345)

In [None]:
# Fit the tuned forest on all rows; the importance plot below reads its feature_importances_.
rf_tuned = rf_tuned.fit(X,y)

In [None]:
# Mean score of the tuned forest across 10 cross-validation folds.
cross_val_score(rf_tuned, X, y, cv = 10).mean()

In [None]:
# Bar chart of the tuned random forest's feature importances, largest first.
# The labels are kept in Turkish: "Variable importance scores" / "Variables" /
# "Variable importance levels".
importances = pd.Series(rf_tuned.feature_importances_, index=X.columns)
feature_imp = importances.sort_values(ascending=False)

ax = sns.barplot(x=feature_imp, y=feature_imp.index)
ax.set_xlabel('Değişken Önem Skorları')
ax.set_ylabel('Değişkenler')
ax.set_title("Değişken Önem Düzeyleri")
plt.show()

## Gradient Boosting (scikit-learn `GradientBoostingClassifier`)

In [None]:
# Base gradient-boosting model for the grid search (scikit-learn's
# GradientBoostingClassifier, despite the variable name), with a fixed seed.
xgb = GradientBoostingClassifier(random_state=12345)

In [None]:
# Search grid for gradient boosting: 3 values per hyperparameter (243 combinations).
xgb_params = dict(
    n_estimators=[100, 500, 1000],
    subsample=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.05],
    min_samples_split=[2, 5, 10],
)

In [None]:
# Exhaustive grid search over xgb_params with 10-fold CV, using all CPU cores.
xgb_cv_model = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_params,
    cv=10,
    n_jobs=-1,
    verbose=2,
)
xgb_cv_model = xgb_cv_model.fit(X, y)

In [None]:
# Best hyperparameter combination found by the gradient-boosting grid search.
xgb_cv_model.best_params_

In [None]:
# Fresh gradient-boosting model configured with the best grid-search parameters and the same seed.
xgb_tuned = GradientBoostingClassifier(**xgb_cv_model.best_params_,random_state=12345)

In [None]:
# Fit the tuned model on all rows; the importance plot below reads its feature_importances_.
xgb_tuned = xgb_tuned.fit(X,y)

In [None]:
# Mean score of the tuned gradient-boosting model across 10 cross-validation folds.
cross_val_score(xgb_tuned, X, y, cv = 10).mean()

In [None]:
# Bar chart of the tuned gradient-boosting model's feature importances, largest first.
# The labels are kept in Turkish: "Variable importance scores" / "Variables" /
# "Variable importance levels".
importances = pd.Series(xgb_tuned.feature_importances_, index=X.columns)
feature_imp = importances.sort_values(ascending=False)

ax = sns.barplot(x=feature_imp, y=feature_imp.index)
ax.set_xlabel('Değişken Önem Skorları')
ax.set_ylabel('Değişkenler')
ax.set_title("Değişken Önem Düzeyleri")
plt.show()

## LGBM tuning

In [None]:
# Base LightGBM classifier handed to the grid search, with a fixed seed.
lgbm = LGBMClassifier(random_state = 12345)

In [None]:
# Search grid for LightGBM: 3 values per hyperparameter (27 combinations).
lgbm_params = dict(
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 500, 1000],
    max_depth=[3, 5, 8],
)

In [None]:
# Exhaustive grid search over lgbm_params with 10-fold CV, using all CPU cores.
# This rebinds gs_cv, which previously held the random-forest search.
gs_cv = GridSearchCV(
    estimator=lgbm,
    param_grid=lgbm_params,
    cv=10,
    n_jobs=-1,
    verbose=2,
)
gs_cv = gs_cv.fit(X, y)

In [None]:
# Best hyperparameter combination found by the LightGBM grid search.
gs_cv.best_params_

In [None]:
# Refit LightGBM on all rows with the best grid-search parameters.
# random_state matches the seed used for the search (and for rf_tuned /
# xgb_tuned), so the tuned model is reproducible like the other two.
lgbm_tuned = LGBMClassifier(**gs_cv.best_params_, random_state=12345).fit(X,y)

In [None]:
# Mean score of the tuned LightGBM model across 10 cross-validation folds.
cross_val_score(lgbm_tuned, X, y, cv = 10).mean()

In [None]:
# Bar chart of the tuned LightGBM model's feature importances, largest first.
# The labels are kept in Turkish: "Variable importance scores" / "Variables" /
# "Variable importance levels".
importances = pd.Series(lgbm_tuned.feature_importances_, index=X.columns)
feature_imp = importances.sort_values(ascending=False)

ax = sns.barplot(x=feature_imp, y=feature_imp.index)
ax.set_xlabel('Değişken Önem Skorları')
ax.set_ylabel('Değişkenler')
ax.set_title("Değişken Önem Düzeyleri")
plt.show()