In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# removed the old names; use whichever name this installation provides.
plt.style.use("seaborn-v0_8-whitegrid" if "seaborn-v0_8-whitegrid" in plt.style.available else "seaborn-whitegrid")
import pandas_profiling as pp 

import seaborn as sns

from collections import Counter

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the diabetes CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/diabetes/diabetes.csv")

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Number of missing (NaN) values per column.
df.isnull().sum()

In [None]:
# Summary statistics, transposed so that each row describes one column.
df.describe().T

# Data Visualization

* You can use pandas-profiling to analyze the data.


In [None]:
# Build a profiling report of the whole DataFrame.
# (pandas_profiling is already imported in the first cell; this re-import is redundant but harmless.)
import pandas_profiling as pp 
profile_df = pp.ProfileReport(df)

In [None]:
# Display the profiling report built in the previous cell.
profile_df

* You can also use visualization tools (such as seaborn and matplotlib) to analyze the data.

In [None]:
# Histogram of every column (50 bins each); the trailing ';' suppresses the axes repr.
df.hist(figsize=(10, 10), bins=50, xlabelsize=5, ylabelsize=5);

In [None]:
# Count of rows per Outcome value, to show the class balance of the target.
sns.catplot(x="Outcome",data=df, kind="count");

In [None]:
# One kernel-density panel per column. The grid is sized from the number of
# columns instead of a fixed 6x5 layout, which left most panels empty and
# shrank the useful ones.
n_panel_cols = 3
n_panel_rows = -(-df.shape[1] // n_panel_cols)  # ceiling division
df.plot(kind="density", layout=(n_panel_rows, n_panel_cols), subplots=True, sharex=False, sharey=False, figsize=(15,15));
plt.tight_layout()

In [None]:
# Pairwise scatter plots of all columns with a fitted regression line in each panel.
sns.pairplot(df, kind = "reg")

In [None]:
# Pairwise correlation matrix of the columns.
df_corr = df.corr()

In [None]:
# Heatmap of the correlation matrix, with thin lines separating the cells.
sns.heatmap(df_corr, linewidths = 1);

In [None]:
# NOTE(review): df_corr is the correlation matrix produced by df.corr(), so this
# pairplot plots correlation coefficients against each other rather than the raw
# measurements — confirm this is intended.
sns.pairplot(df_corr, kind = "reg");

* As can be seen in the heatmap and pairplot, there is no strong correlation between the features and the outcome.

# Model Selection

# Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score,roc_curve

In [None]:
# Features: every column except the target "Outcome"; target: the "Outcome" column.
X = df.drop(["Outcome"], axis = 1)
y = df["Outcome"]

# Positional alternative (assumes "Outcome" is the 9th column — TODO confirm):
#X = df.iloc[:, 0:8]
#y = df.iloc[:, 8]

In [None]:
# Hold out 30% of the rows as a test set; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.30, 
                                                    random_state = 42)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the liblinear solver, fitted on the training split.
log = LogisticRegression(solver="liblinear")
log.fit(X_train, y_train)
log_model = log  # fit() returns the estimator itself, so both names refer to the fitted model
log_model

In [None]:
# Predicted class labels of the logistic regression on the held-out test features.
y_pred = log_model.predict(X_test)

In [None]:

# Print the confusion matrix explicitly: a bare expression that is not the last
# line of a cell is evaluated but never displayed.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
# Hold-out accuracy of the fitted model. It is printed because only the last
# expression of a cell is displayed automatically.
print("Test accuracy:", accuracy_score(y_test, log_model.predict(X_test)))

# Mean 10-fold cross-validation score. Note that cross_val_score refits clones of
# the estimator on folds of the data it is given, which here is the test split.
cross_val_score(log_model, X_test, y_test, cv = 10).mean()

In [None]:
# Probability of the positive class on the test set. The AUC must be computed from
# these scores (not from hard 0/1 predictions) so it matches the plotted curve.
test_proba = log_model.predict_proba(X_test)[:,1]
logit_roc_auc = roc_auc_score(y_test, test_proba)

fpr, tpr, thresholds = roc_curve(y_test, test_proba)
plt.figure()
plt.plot(fpr, tpr, label='AUC (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive ')
plt.ylabel('True Positive ')
plt.title('ROC')
plt.legend(loc="lower right")  # without a legend the AUC label is never shown
plt.show()

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, fitted on the training split.
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_model = nb  # fit() returns the estimator itself
nb_model


In [None]:
# Test-set predictions of the naive Bayes model and their accuracy.
y_pred = nb_model.predict(X_test)
accuracy_score(y_test, y_pred)


In [None]:
# Mean 10-fold cross-validation score for naive Bayes; the folds are drawn from the test split.
cross_val_score(nb_model, X_test, y_test, cv = 10).mean()

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours baseline with default settings, fitted on the training split.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_model = knn  # fit() returns the estimator itself
knn_model



In [None]:
# Test-set predictions of the baseline KNN model and their accuracy.
y_pred = knn_model.predict(X_test)
accuracy_score(y_test, y_pred)



In [None]:
# Grid-search the neighbour count (1..19) with 10-fold cross-validation on the training split.
knn_params = dict(n_neighbors=np.arange(1, 20))
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, param_grid=knn_params, cv=10)
knn_cv.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score and the parameter setting that achieved it.
best_knn_score = knn_cv.best_score_
best_knn_params = knn_cv.best_params_
print(f"Best KNN score:{best_knn_score!s}")
print(f"Best KNN parameter: {best_knn_params!s}")

In [None]:
# Refit KNN with the parameters selected by the grid search rather than a
# hard-coded n_neighbors, so this cell stays correct when the data or grid change.
knn = KNeighborsClassifier(**knn_cv.best_params_)
knn_tuned = knn.fit(X_train, y_train)

In [None]:
# Test-set predictions of the tuned KNN model and their accuracy.
y_pred = knn_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Before/after comparison for KNN. The accuracies are computed from the fitted
# models instead of being typed in by hand, so the table cannot go stale.
d = {'Accuracy in KNN before GridSearchCV ': [accuracy_score(y_test, knn_model.predict(X_test))],
     'Accuracy in KNN After GridSearchCV': [accuracy_score(y_test, knn_tuned.predict(X_test))]}
knn_data = pd.DataFrame(data=d)
knn_data

# SVM - Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier baseline, fitted on the training split.
svm_model = SVC(kernel="linear")
svm_model.fit(X_train, y_train)

# Accuracy of the baseline on the held-out test set.
y_pred = svm_model.predict(X_test)
accuracy_score(y_test, y_pred)



In [None]:
# Grid-search the regularisation strength C (1..9) for a linear SVC using
# 10-fold cross-validation on the training split, with all available cores.
svc_params = dict(C=np.arange(1, 10))
svc = SVC(kernel="linear")
svc_cv_model = GridSearchCV(estimator=svc, param_grid=svc_params, cv=10, n_jobs=-1, verbose=2)
svc_cv_model.fit(X_train, y_train)


In [None]:
# Show the parameter setting selected by the SVC grid search.
print("Best Params: " + str(svc_cv_model.best_params_))

In [None]:
# Refit the linear SVC with the C selected by the grid search rather than a
# hard-coded value, so the "tuned" model always reflects the search result.
svc_tuned = SVC(kernel = "linear", **svc_cv_model.best_params_).fit(X_train, y_train)

# Accuracy of the tuned model on the held-out test set.
y_pred = svc_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Print the confusion matrix explicitly: a bare expression that is not the last
# line of a cell is evaluated but never displayed.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Before/after comparison for the SVM. The accuracies are computed from the fitted
# models instead of being typed in by hand, so the table cannot go stale.
d = {'Accuracy in SVM before GridSearchCV ': [accuracy_score(y_test, svm_model.predict(X_test))],
     'Accuracy in SVM After GridSearchCV': [accuracy_score(y_test, svc_tuned.predict(X_test))]}
svm_data = pd.DataFrame(data=d)
svm_data

# Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default hyper-parameters. random_state is fixed
# because the forest is randomised; without a seed the accuracy differs on every run.
rf_model = RandomForestClassifier(random_state = 42).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Hyper-parameter grid for the random forest (3 x 3 x 3 x 3 = 81 combinations).
rf_params = {"max_depth": [2,5,8],
            "max_features": [2,5,8],
            "n_estimators": [10,500,1000],
            "min_samples_split": [2,5,10]}

# Use a separate name for the search template (as the knn/svc/gbm cells do) so the
# fitted baseline `rf_model` is not replaced by an unfitted estimator. The seed
# makes the cross-validation scores repeatable.
rf = RandomForestClassifier(random_state = 42)

# 10-fold cross-validated grid search on the training split, using all available cores.
rf_cv_model = GridSearchCV(rf, 
                           rf_params, 
                           cv = 10, 
                           n_jobs = -1, 
                           verbose = 2) 

rf_cv_model.fit(X_train, y_train)

In [None]:
# Show the parameter setting selected by the random-forest grid search.
print("Best Params: " + str(rf_cv_model.best_params_))

In [None]:
# Build the tuned forest from the grid-search result rather than hard-coded
# values, so it always matches the parameters printed above. random_state is
# fixed so the reported accuracy is reproducible.
rf_tuned = RandomForestClassifier(**rf_cv_model.best_params_,
                                  random_state = 42)

In [None]:
# Fit the tuned forest on the training split and measure accuracy on the test set.
rf_tuned.fit(X_train, y_train)
y_pred = rf_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Print the confusion matrix explicitly: a bare expression that is not the last
# line of a cell is evaluated but never displayed.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Feature importances of the tuned random forest, expressed as percentages and
# drawn as a horizontal bar chart sorted from least to most important.
rf_importance_pct = rf_tuned.feature_importances_ * 100
Importance = pd.DataFrame({"Importance": rf_importance_pct}, index=X_train.columns)

rf_ranked = Importance.sort_values(by="Importance", axis=0, ascending=True)
rf_ranked.plot(kind="barh", color="r");



In [None]:
# Before/after comparison table for the random forest.
# NOTE(review): both accuracies are hard-coded literals, not computed from the
# models fitted in this notebook, so they will not update when it is re-run.
d = {'Accuracy in RF before GridSearchCV ': [0.97], 'Accuracy in RF After GridSearchCV': [0.92]}
rf_data = pd.DataFrame(data=d)
rf_data

# GBM

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosting baseline with default hyper-parameters, fitted on the training split.
gbm_model = GradientBoostingClassifier()
gbm_model.fit(X_train, y_train)

# Accuracy of the baseline on the held-out test set.
y_pred = gbm_model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Hyper-parameter grid for gradient boosting. n_estimators previously listed 100
# twice ([100, 500, 100]), which repeated a third of the search for no gain; the
# last value is set to 1000 to mirror the random-forest grid.
gbm_params = {"learning_rate" : [0.001, 0.01, 0.1, 0.05],
             "n_estimators": [100,500,1000],
             "max_depth": [3,5,10],
             "min_samples_split": [2,5,10]}

gbm = GradientBoostingClassifier()

# 10-fold cross-validated grid search on the training split, using all available cores.
gbm_cv = GridSearchCV(gbm, gbm_params, cv = 10, n_jobs = -1, verbose = 2)
gbm_cv.fit(X_train, y_train)

In [None]:
# Show the parameter setting selected by the gradient-boosting grid search.
print("Best Params: " + str(gbm_cv.best_params_))

In [None]:
# Build the tuned model from the grid-search result rather than hard-coded
# values, so it always matches the parameters printed above.
gbm = GradientBoostingClassifier(**gbm_cv.best_params_)

gbm_tuned =  gbm.fit(X_train,y_train)

In [None]:
# Test-set predictions of the tuned gradient-boosting model and their accuracy.
y_pred = gbm_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Print the confusion matrix explicitly: a bare expression that is not the last
# line of a cell is evaluated but never displayed.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Feature importances of the tuned gradient-boosting model as percentages,
# plotted as a horizontal bar chart in ascending order.
Importance = pd.DataFrame(
    data={"Importance": gbm_tuned.feature_importances_ * 100},
    index=X_train.columns,
)

Importance.sort_values("Importance").plot.barh(color="r");

In [None]:
# Before/after comparison for gradient boosting. The accuracies are computed from
# the fitted models instead of being typed in by hand, so the table cannot go stale.
d = {'Accuracy in GBM before GridSearchCV ': [accuracy_score(y_test, gbm_model.predict(X_test))],
     'Accuracy in GBM After GridSearchCV': [accuracy_score(y_test, gbm_tuned.predict(X_test))]}
gbm_data = pd.DataFrame(data=d)
gbm_data

In [None]:
# The final fitted model of each family, compared on the same held-out test set.
models = [knn_tuned, log_model, svc_tuned, nb_model, rf_tuned, gbm_tuned]

separator = "-" * 28
for model in models:
    name = type(model).__name__
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(separator)
    print(f"{name}:")
    print(f"Accuracy: {accuracy:.4%}")

In [None]:
# Collect one record per model and build the DataFrame once. DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
records = []

for model in models:
    name = model.__class__.__name__
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    records.append({"Models": name, "Accuracy": accuracy*100})

results = pd.DataFrame(records, columns= ["Models","Accuracy"])

# Horizontal bars: one per model, length = test accuracy in percent.
sns.barplot(x= 'Accuracy', y = 'Models', data=results, color="r")
plt.xlabel('Accuracy %')
plt.title('accuracy rate of models'); 

# Suggestions

* The outcome classes are imbalanced (0: 1316, 1: 684), which can affect model results. You can use *from sklearn.utils import class_weight* to compensate for the imbalanced distribution.
  
* There are too many zeros in the dataset, especially in columns such as Insulin, Glucose and BMI. If these values are zero, you are probably dead. :) So you can find the zeros in these columns and drop those rows. After doing this, you can fit the models again.

* You can apply a standardscaler to the data before modeling.

* You can examine how all these changes affect the results.

In [None]:
# Count of rows per Outcome value, illustrating the class imbalance mentioned above.
sns.catplot(x="Outcome",data=df, kind="count");

In [None]:
# Exact number of rows for each Outcome value.
df["Outcome"].value_counts()

In [None]:
# Frequency of each distinct Insulin value (used to see how many zeros there are).
df["Insulin"].value_counts() 

In [None]:
# Frequency of each distinct BMI value (used to see how many zeros there are).
df["BMI"].value_counts()