### Data Description
##### Pregnancies: Number of times pregnant
##### Glucose: Plasma glucose concentration a 2 hours in an oral glucose tolerance test
##### BloodPressure: Diastolic blood pressure (mm Hg)
##### SkinThickness: Triceps skin fold thickness (mm)
##### Insulin: 2-Hour serum insulin (mu U/ml)
##### BMI: Body mass index (weight in kg/(height in m)^2)
##### DiabetesPedigreeFunction: Diabetes pedigree function
##### Age: Age (years)
##### Outcome: Class variable (0 or 1)

# Installing


In [None]:
import warnings
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, \
    classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

warnings.simplefilter(action="ignore")

In [None]:
# Load the diabetes data set (CSV) into a DataFrame; the relative path expects
# the file to live under ../input/diabetes-data-set/.
df=pd.read_csv("../input/diabetes-data-set/diabetes.csv")

Since the number of observations is low, we will evaluate the models with the Cross Validation (CV) method instead of splitting the data set into "train" and "test" parts with the holdout method.

In [None]:
df.shape


In [None]:
df["Outcome"].value_counts() * 100 / len(df) # percentage share of each class of the dependent variable

In [None]:
def outcome_agg(col, dataframe=None, target="Outcome"):
    """Print the per-class mean of every requested column.

    Args:
        col: Iterable of column names; a single column name given as a
            string is accepted as well.
        dataframe: DataFrame to aggregate. When omitted, the notebook-level
            ``df`` is used, which is what the original one-argument call did.
        target: Name of the column to group by.
    """
    if dataframe is None:
        dataframe = df
    # A bare string would otherwise be iterated character by character.
    if isinstance(col, str):
        col = [col]
    for name in col:
        print(dataframe.groupby(target).agg({name: "mean"}))
    

In [None]:
def get_cols(df, target):
    """Return the column labels of ``df`` without ``target``, in their original order."""
    return [name for name in df.columns if name != target]

In [None]:
var_names=get_cols(df, "Outcome")
var_names

In [None]:
outcome_agg(var_names)

In [None]:
def get_cols2(df, target):
    """Collect every column label of ``df`` that differs from ``target``.

    The labels are returned as a list in the order they appear in
    ``df.columns``.
    """
    kept = []
    for label in df.columns:
        if label == target:
            continue
        kept.append(label)
    return kept

In [None]:
get_cols2(df, "Outcome")

In [None]:
df.groupby("Outcome").agg({"Pregnancies": "mean"})

In [None]:
df.groupby("Outcome").agg({"Glucose": "mean"})

In [None]:
df.groupby("Outcome").agg({"BloodPressure": "mean"})

In [None]:
df.groupby("Outcome").agg({"SkinThickness": "mean"})

In [None]:
df.groupby("Outcome").agg({"Insulin": "mean"})

In [None]:
df.groupby("Outcome").agg({"BMI": "mean"})

In [None]:
df.groupby("Outcome").agg({"DiabetesPedigreeFunction": "mean"})

In [None]:
df.groupby("Outcome").agg({"Age": "mean"})

In [None]:
df.describe([0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]).T


We are looking at the description of numeric variables.

In [None]:
sns.countplot(x='Outcome', data=df)
plt.show()

In [None]:
df["Outcome"].value_counts().plot.pie(autopct = "%.1f");

df["Outcome"].value_counts().plot.pie(autopct = "%.1f") is an alternative visualization of the same class counts.

In [None]:
df.describe([0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]).T

Blood pressure cannot be "0", so be careful: such zeros are really missing measurements. We need a new feature...

In [None]:
df.info()

In [None]:
df.Outcome.unique()

In [None]:
df.Outcome.value_counts()

# Data Visualization

In [None]:
plt.figure(figsize=(6,4))
sns.heatmap(df.corr(),cmap='Blues',annot=False);

If we change the 0 values to NaN before looking at the correlation, we find a more significant correlation.

In [None]:
# A value of 0 in Glucose, BloodPressure, SkinThickness, Insulin or BMI stands
# for a missing measurement, so replace it with NaN.
# The result is assigned back to the frame instead of calling
# Series.replace(..., inplace=True) on a column selection, which is not
# guaranteed to update df under pandas copy-on-write. np.nan is used because
# the np.NaN alias was removed in NumPy 2.0.
cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[cols] = df[cols].replace(0, np.nan)

In [None]:
# Outcome correlation matrix
k = 9  # number of variables shown in the heatmap
# Pick the k variables with the largest correlation to Outcome; nlargest
# returns them ordered from the strongest correlation downwards.
cols = df.corr().nlargest(k, 'Outcome')['Outcome'].index
# Pairwise correlations restricted to (and ordered by) the selected variables.
cm = df[cols].corr()
plt.figure(figsize=(10,6))
sns.heatmap(cm, annot=True, cmap = 'viridis');

In [None]:
# see how the data is distributed.
df.hist(figsize = (20,20));

In [None]:
def c_dis_plot(df, cols):
    """Plot the density of each column with its mean, median and mode marked.

    One figure is shown per column: a kernel density estimate of the
    column's values plus vertical lines for the mean (red), the median
    (blue) and the first mode (green).

    Args:
        df: DataFrame holding the columns to plot.
        cols: Iterable of column names in ``df``.
    """
    for col in cols:
        # Missing values are dropped once so that all three statistics and
        # the density are computed on the same observations. np.median would
        # return NaN as soon as a single NaN is present, and the median line
        # would silently not be drawn.
        values = df[col].dropna()
        if values.empty:
            # Nothing to plot (and mode()[0] would fail) for an all-NaN column.
            continue
        # kdeplot replaces the deprecated distplot(..., hist=False).
        sns.kdeplot(values)
        plt.axvline(values.mean(), color='r', label='mean')
        plt.axvline(values.median(), color='b', label='median')
        plt.axvline(values.mode()[0], color='g', label='mode')
        plt.legend()
        plt.show()
        

In [None]:
c_dis_plot(df, var_names)

In [None]:
# Scatter plots showing how pairs of variables relate to each other, coloured
# by Outcome (fit_reg=False turns the regression line off).
# x, y and data are passed by keyword because seaborn >= 0.12 no longer
# accepts them positionally.
scatter_pairs = [
    ("BloodPressure", "Glucose"),
    ("Glucose", "SkinThickness"),
    ("Glucose", "Insulin"),
    ("Glucose", "BMI"),
    ("Glucose", "Age"),
    ("Glucose", "DiabetesPedigreeFunction"),
    ("Insulin", "BloodPressure"),
    ("Age", "BloodPressure"),
    ("BMI", "SkinThickness"),
]
for x_var, y_var in scatter_pairs:
    sns.lmplot(x=x_var, y=y_var, data=df, hue='Outcome', fit_reg=False, height=5)

# DATA PREPROCESSING

In [None]:
# For the variables whose minimum is zero, those zero observations really stand
# for missing values (NaN); Pregnancies is the exception, where 0 is valid.
df.describe([0.05,0.25,0.50,0.75,0.90,0.95,0.99]).T

### Missing Values

In [None]:
# A value of 0 in Glucose, BloodPressure, SkinThickness, Insulin or BMI stands
# for a missing measurement, so replace it with NaN.
# The result is assigned back to the frame instead of calling
# Series.replace(..., inplace=True) on a column selection, which is not
# guaranteed to update df under pandas copy-on-write. np.nan is used because
# the np.NaN alias was removed in NumPy 2.0.
cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[cols] = df[cols].replace(0, np.nan)

In [None]:
# now we can see missing values
df.isnull().sum()

In [None]:
# Fill each NaN with the median of the same column, computed only over the
# rows that share the row's Outcome class.
# NOTE(review): the fill value depends on the target label and is computed on
# the whole data set before the cross-validation further down, so the CV
# scores may be optimistic — confirm this is acceptable.
for col in df.columns:
    # Rows of class 0 with a missing value get the class-0 median of the column.
    df.loc[(df["Outcome"]==0) & (df[col].isnull()),col] = df.loc[(df["Outcome"]==0), col].median()
    # Rows of class 1 with a missing value get the class-1 median of the column.
    df.loc[(df["Outcome"]==1) & (df[col].isnull()),col] = df.loc[(df["Outcome"]==1), col].median()

By looking at the distribution graphs above (93), a more accurate decision can be made about filling empty values with median, mode or mean.

In [None]:
df.isnull().sum()

### Outliers

In [None]:
def outlier_thresholds(dataframe, variable, low_quantile=0.10, up_quantile=0.90):
    """Return the ``(low_limit, up_limit)`` outlier fences of a column.

    The fences are placed 1.5 times the inter-quantile range below the lower
    quantile and above the upper quantile.

    Args:
        dataframe: DataFrame that holds the column.
        variable: Name of the column to analyse.
        low_quantile: Lower quantile in [0, 1]. The default 0.10 keeps the
            original behaviour (note: the 10th percentile, not the first
            quartile).
        up_quantile: Upper quantile in [0, 1]. The default 0.90 keeps the
            original behaviour.

    Returns:
        Tuple ``(low_limit, up_limit)``.
    """
    lower_q = dataframe[variable].quantile(low_quantile)
    upper_q = dataframe[variable].quantile(up_quantile)
    spread = upper_q - lower_q
    up_limit = upper_q + 1.5 * spread
    low_limit = lower_q - 1.5 * spread
    return low_limit, up_limit

In [None]:
def has_outliers(dataframe, variable):
    """Report whether a column has values outside its outlier fences.

    Prints ``<variable> yes`` or ``<variable> no`` (exactly one of them) and
    returns the same answer as a bool.

    Args:
        dataframe: DataFrame that holds the column.
        variable: Name of the column to check.

    Returns:
        True when at least one value is below the lower fence or above the
        upper fence returned by ``outlier_thresholds``, otherwise False.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    values = dataframe[variable]
    # Test the boolean mask itself: checking the truthiness of the filtered
    # rows would miss outlier rows whose cell values are all zero.
    found = bool(((values < low_limit) | (values > up_limit)).any())
    # The original printed "no" unconditionally, even right after "yes".
    print(variable, "yes" if found else "no")
    return found

In [None]:
for col in df.columns:
    has_outliers(df, col)

In [None]:
def replace_with_thresholds(dataframe, numeric_columns):
    """Cap the listed columns at their outlier fences, modifying ``dataframe`` in place.

    For every column, values below the lower fence from
    ``outlier_thresholds`` are set to that fence and values above the upper
    fence are set to the upper fence. Nothing is returned.
    """
    for name in numeric_columns:
        lower, upper = outlier_thresholds(dataframe, name)
        below = dataframe[name] < lower
        dataframe.loc[below, name] = lower
        above = dataframe[name] > upper
        dataframe.loc[above, name] = upper

In [None]:
replace_with_thresholds(df, df.columns)

In [None]:
for col in df.columns:
    has_outliers(df, col)

In [None]:
df.describe([0.05,0.25,0.50,0.75,0.90,0.95,0.99]).T

### FEATURE ENGINEERING

In [None]:
# Bucket Glucose into two labelled ranges: (0, 139] -> "Normal" and
# (139, 200] -> "Prediabetes". pd.cut intervals are right-inclusive by default
# and values outside the bins become NaN.
df['New_Glucose_Class'] = pd.cut(x=df['Glucose'], bins=[0,139,200],labels = ["Normal","Prediabetes"])

In [None]:
# Bucket BMI into four labelled ranges with the edges 18.5, 24.9 and 29.9
# (right-inclusive intervals).
# NOTE(review): the label "Healty" looks like a typo for "Healthy"; it is kept
# because the label text becomes part of the one-hot column names.
df['New_BMI_Range'] = pd.cut(x=df['BMI'], bins=[0,18.5,24.9,29.9,100],labels = ["Underweight","Healty","Overweight","Obese"])

In [None]:
# Bucket BloodPressure into (0, 79] -> "Normal", (79, 89] -> "HS1" and
# (89, 123] -> "HS2"; values outside these bins become NaN.
df['New_BloodPressure'] = pd.cut(x=df['BloodPressure'], bins=[0,79,89,123],labels = ["Normal","HS1","HS2"])

In [None]:
# Binary flag: 1 when SkinThickness is at most 18.0, otherwise 0.
df['New_SkinThickness'] = df['SkinThickness'].apply(lambda x: 1 if x <= 18.0 else 0)

In [None]:
df.head()

### Categorical Variables

In [None]:
def one_hot_encoder(dataframe, categorical_columns, nan_as_category=False):
    """One-hot encode the given columns and report which columns were added.

    The first level of every encoded column is dropped (``drop_first=True``).

    Args:
        dataframe: Input DataFrame; it is not modified.
        categorical_columns: Column names to encode.
        nan_as_category: Forwarded to ``pd.get_dummies`` as ``dummy_na``.

    Returns:
        Tuple ``(encoded_dataframe, new_column_names)`` where the second item
        lists the columns that were not present in the input.
    """
    before = list(dataframe.columns)
    encoded = pd.get_dummies(
        dataframe,
        columns=categorical_columns,
        dummy_na=nan_as_category,
        drop_first=True,
    )
    added = []
    for name in encoded.columns:
        if name not in before:
            added.append(name)
    return encoded, added

In [None]:
# Treat every column with at most 10 distinct values as categorical (unique()
# counts NaN as a value), leaving the target column out.
categorical_columns = [col for col in df.columns
                           if len(df[col].unique()) <= 10
                      and col != "Outcome"]
categorical_columns

In [None]:
df, new_cols_ohe = one_hot_encoder(df,categorical_columns)
new_cols_ohe

In [None]:
df.head()

### Standardization

* ! Robust scaling is less susceptible to outliers: z = (x - median(x)) / (q3 - q1)


In [None]:
def robust_scaler(variable):
    """Scale a Series as ``(x - median) / (upper quantile - lower quantile)``.

    The inter-quartile range (25%-75%) is used as the scale. When that range
    is exactly zero the function falls back to the 5%-95% range and then to
    the 10%-99% range. If every range is zero the values are only centred on
    the median, which avoids dividing by zero.

    Args:
        variable: Numeric pandas Series.

    Returns:
        Series of scaled values rounded to 3 decimals.
    """
    centered = variable - variable.median()
    for low_q, high_q in ((0.25, 0.75), (0.05, 0.95), (0.10, 0.99)):
        spread = variable.quantile(high_q) - variable.quantile(low_q)
        # Compare the range itself with zero. The original tested
        # int(range) == 0, which truncates every range below 1 (e.g. 0.4)
        # to zero and silently switched to the fallback quantiles.
        if spread != 0:
            return (centered / spread).round(3)
    return centered.round(3)

In [None]:
# Non-object columns with fewer than 10 distinct values behave like
# categories and are left unscaled.
like_num = [col for col in df.columns
            if df[col].dtypes != 'O' and df[col].nunique() < 10]
# Scale everything except the one-hot columns, the target and the
# category-like columns. The target is excluded with an equality test: the
# original `col not in "Outcome"` was a substring test and would also skip
# any column whose name happens to be a substring of "Outcome".
cols_need_scale = [col for col in df.columns
                   if col not in new_cols_ohe
                   and col != "Outcome"
                   and col not in like_num]

for col in cols_need_scale:
    df[col] = robust_scaler(df[col])

In [None]:
df.head()

In [None]:
# see how the data is distributed.
df.hist(figsize = (20,20));

In [None]:
df.info()

# MODELLING

In [None]:
X = df.drop("Outcome",axis=1)
y = df["Outcome"]

In [None]:
import warnings
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import RobustScaler
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, \
    classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)
warnings.simplefilter(action="ignore")

In [None]:
# Candidate classifiers, all with their default hyper-parameters
# (SVC with gamma='auto'). The SVC entry used to be labelled 'SVR'.
models = [('LR', LogisticRegression()),
          ('KNN', KNeighborsClassifier()),
          ('CART', DecisionTreeClassifier()),
          ('RF', RandomForestClassifier()),
          ('SVC', SVC(gamma='auto')),
          ('XGBM', XGBClassifier()),
          ('GB', GradientBoostingClassifier()),
          ("LightGBM", LGBMClassifier())]

# evaluate each model in turn
results = []
names = []

# One seeded splitter shared by every model, so all of them are scored on the
# same 10 folds. KFold accepts random_state only together with shuffle=True
# (current scikit-learn raises a ValueError otherwise), and the splitter is
# now actually passed to cross_val_score instead of the bare cv=10.
kfold = KFold(n_splits=10, shuffle=True, random_state=123456)

for name, model in models:
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    # Mean accuracy and its standard deviation over the folds.
    print(f"{name}: {cv_results.mean():f} ({cv_results.std():f})")

In [None]:
# Let's tune the best-scoring models, starting with gradient boosting.
# GBM
gbm_model = GradientBoostingClassifier()
# Model Tuning: 3 * 4 * 3 * 3 = 108 parameter combinations, each scored with
# 10-fold cross-validation.
gbm_params = {"learning_rate": [0.01, 0.1, 0.001],
               "max_depth": [3,5, 8, 10],
               "n_estimators": [200, 500, 1000],
               "subsample": [1, 0.5, 0.8]}
# n_jobs=-1 runs the search on all available processors.
gbm_cv_model = GridSearchCV(gbm_model,
                            gbm_params,
                            cv=10,
                            n_jobs=-1,
                            verbose=2).fit(X, y)
# This is a bare expression in the middle of the cell, so a notebook does not
# display it; wrap it in print() to see the selected parameters.
gbm_cv_model.best_params_
# Final Model: refit on the full data with the best parameters found.
gbm_tuned = GradientBoostingClassifier(**gbm_cv_model.best_params_).fit(X,y)

In [None]:
# LightGBM: exhaustive 10-fold grid search, then a refit on the full data.
lgb_model = LGBMClassifier()
# Search space for the tuning step.
lgbm_params = dict(
    learning_rate=[0.01, 0.5, 1],
    n_estimators=[200, 500, 1000],
    max_depth=[6, 8, 10],
    colsample_bytree=[1, 0.5, 0.4],
)
lgbm_cv_model = GridSearchCV(lgb_model, lgbm_params, cv=10, n_jobs=-1, verbose=2)
lgbm_cv_model.fit(X, y)
lgbm_cv_model.best_params_
# Final model built from the winning parameter combination.
lgbm_tuned = LGBMClassifier(**lgbm_cv_model.best_params_)
lgbm_tuned.fit(X, y)

In [None]:
# Random Forests:
rf_model = RandomForestClassifier()
# Model Tuning: 3 * 3 * 3 * 3 = 81 parameter combinations, each scored with
# 10-fold cross-validation.
rf_params = {"max_depth": [5,10,None],
            "max_features": [2,5,10],
            "n_estimators": [100, 500, 900],
            "min_samples_split": [2,10,30]}
# n_jobs=-1 runs the search on all available processors.
rf_cv_model = GridSearchCV(rf_model, 
                           rf_params, 
                           cv=10, 
                           n_jobs=-1, 
                           verbose=2).fit(X, y)
# Bare expression in the middle of the cell: a notebook does not display it.
rf_cv_model.best_params_
# Final Model: refit on the full data with the best parameters found.
rf_tuned = RandomForestClassifier(**rf_cv_model.best_params_).fit(X, y)

In [None]:
# evaluate each tuned model in turn
models = [('RF', rf_tuned),
          ('GBM', gbm_tuned)]
results = []
names = []
# One seeded splitter shared by both models. KFold accepts random_state only
# together with shuffle=True (current scikit-learn raises a ValueError
# otherwise), and the splitter is now actually passed to cross_val_score.
kfold = KFold(n_splits=10, shuffle=True, random_state=123456)
for name, model in models:
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    # Mean accuracy and its standard deviation over the folds.
    print(f"{name}: {cv_results.mean():f} ({cv_results.std():f})")

In [None]:
# 10-fold accuracy of the tuned GBM. KFold accepts random_state only together
# with shuffle=True (current scikit-learn raises a ValueError otherwise), and
# the splitter is now actually passed to cross_val_score.
kfold = KFold(n_splits=10, shuffle=True, random_state=123456)
cv_results = cross_val_score(gbm_tuned, X, y, cv=kfold, scoring="accuracy")
# Mean accuracy and its standard deviation over the folds.
print(f"gbm: {cv_results.mean():f} ({cv_results.std():f})")

In [None]:
log_model = LogisticRegression().fit(X, y)


In [None]:
log_model.intercept_


In [None]:
log_model.coef_

log(p / (1 - p)) = 7.7029389 + 1.17252354e-01*Pregnancies + 3.36001406e-02*Glucose - 1.40872987e-02*BloodPressure + ... etc., where p is the predicted probability of class "1".

Note: Logistic regression differs from classical linear regression here. We cannot interpret these coefficients as direct changes in the target; each coefficient is a change in the log-odds, so we interpret e^coefficient instead.

Let's predict the logistic regression model!

In [None]:
log_model.predict(X)

In [None]:
log_model.predict(X)[0:10]


In [None]:
y[0:10]

In [None]:
log_model.predict_proba(X)[0:10]


These are predicted probabilities, not predicted classes. In the first row, 0.28902396 is the probability of class "0" and 0.71097604 is the probability of class "1" ...

In [None]:
y_pred = log_model.predict(X)


We recorded the predicted values as y_pred. Now we're bringing in the accuracy values. Real value / predicted values

In [None]:
accuracy_score(y, y_pred)

Prediction success is 0.78.

Now let's test the data set with the 10-fold cross-validation (CV) method, so let's divide the data by 10, build a model with 9 and test it with 1. 

In [None]:
cross_val_score(log_model, X, y, cv=10)

We got 10 scores, one for each cross-validation fold. Now let's add .mean() to see the average score ...

In [None]:
cross_val_score(log_model, X, y, cv=10).mean()

In [None]:
print(classification_report(y, y_pred))

We used the Cross Validation method for logistic regression, and if we use it for all other models as well, we can compare the models on the same metric. However, when we have plenty of data, splitting it into "train" and "test" sets with the holdout method lets us measure the predictive power more accurately, because the model then has to predict test data it has never seen.

When the classes are unbalanced, values such as "support", "precision" and "recall" become important when deciding what to do. Sometimes the negative class and sometimes the positive class will be the important one ... look at this!! ....

"macro avg" and "weighted avg" values should have gotten this value because of there are two classes !! I guess, it will increase when there are more classes ...


### Visualization   
 "roc_auc_score" and "roc_curve" are another metric for classification problems.

In [None]:
# Predicted probability of class 1 for every row.
proba_pos = log_model.predict_proba(X)[:, 1]
# The AUC is computed from the same probabilities as the ROC curve.
# Using the hard 0/1 predictions (as before) gives the area of a curve with a
# single threshold, which does not match the curve that is plotted.
logit_roc_auc = roc_auc_score(y, proba_pos)
fpr, tpr, thresholds = roc_curve(y, proba_pos)
plt.figure()


In [None]:
plt.plot(fpr, tpr, label='AUC (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')




Above, we tried to understand the predictive power of the model by looking at the accuracy scores. Here, we are looking at the AUC score together with the ROC curve. This score is the area under the curve; the dashed diagonal line is the reference for a random classifier.

In [None]:
# Predicted probability of class 1 for every row.
proba_pos = log_model.predict_proba(X)[:, 1]
# The AUC is computed from the same probabilities as the ROC curve.
# Using the hard 0/1 predictions (as before) gives the area of a curve with a
# single threshold, which does not match the curve that is plotted.
logit_roc_auc = roc_auc_score(y, proba_pos)
fpr, tpr, thresholds = roc_curve(y, proba_pos)
plt.figure()
plt.plot(fpr, tpr, label='AUC (area = %0.2f)' % logit_roc_auc)
# Dashed diagonal: reference line of a random classifier.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# The figure is written to disk before plt.show().
plt.savefig('Log_ROC')
plt.show()


In [None]:
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

In [None]:
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()


## Random Forest


In [None]:
rf_model = RandomForestClassifier(random_state=12345).fit(X, y)


In [None]:

cross_val_score(rf_model, X, y, cv=10).mean()



We looked cross validation with all data

In [None]:
rf_params = {"n_estimators": [200, 500],
             "max_features": [5, 7],
             "min_samples_split": [5, 10],
             "max_depth": [5, None]}



Reminder: Since we use CV method in logistic regression above, we also use CV here. Since there is little data, we did not separate it as 'test-train'. The best way is to separate the data set called "test" and  "train", to apply CV to the train set, to test it with the test set.

In [None]:
rf_model = RandomForestClassifier(random_state=12345)

We created an empty model, put it in GridSearchCV, leave the model to GridSearchCV to test it ...!

In [None]:
gs_cv = GridSearchCV(rf_model,
                     rf_params,
                     cv=10,
                     n_jobs=-1,
                     verbose=2).fit(X, y)

gs_cv.best_params_

In [None]:
rf_tuned = RandomForestClassifier(**gs_cv.best_params_)

In [None]:
cross_val_score(rf_tuned, X, y, cv=10).mean()

# Light GBM

In [None]:
lgbm = LGBMClassifier(random_state=12345)


In [None]:
cross_val_score(lgbm, X, y, cv=10).mean()

In [None]:
lgbm_params = {"learning_rate": [0.01],
               "n_estimators": [100],
               "max_depth": [3, 5]}

In [None]:
gs_cv = GridSearchCV(lgbm,         # Try all the above parameters. Whichever parameters give the best results,  
                     lgbm_params,  # fit the model with those parameters.
                     cv=5,
                     n_jobs=-1,
                     verbose=2).fit(X, y)

We re-entered this Lightgbm and looked at our CV error again.

lgbm_tuned = LGBMClassifier(**gs_cv.best_params_).fit(X, y)
cross_val_score(lgbm_tuned, X, y, cv=10).mean()

In [None]:
lgbm_tuned = LGBMClassifier(**gs_cv.best_params_).fit(X, y)
cross_val_score(lgbm_tuned, X, y, cv=10).mean()

In [None]:
rf_model = RandomForestClassifier(random_state=12345).fit(X, y)




In [None]:
cross_val_score(rf_model, X, y, cv=10).mean()



In [None]:
rf_params = {"n_estimators": [200, 500],
             "max_features": [5, 7],
             "min_samples_split": [5, 10],
             "max_depth": [5, None]}

rf_model = RandomForestClassifier(random_state=12345)





In [None]:
gs_cv = GridSearchCV(rf_model,
                     rf_params,
                     cv=10,
                     n_jobs=-1,
                     verbose=2).fit(X, y)

gs_cv.best_params_

In [None]:
rf_tuned = RandomForestClassifier(**gs_cv.best_params_)
cross_val_score(rf_tuned, X, y, cv=10).mean()

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from yellowbrick.cluster import KElbowVisualizer
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram

import warnings

In [None]:
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Rescale every column to the [0, 1] range before clustering.
sc = MinMaxScaler((0, 1))
# fit_transform returns a NumPy array, so from here on `df` is an array and no
# longer a DataFrame.
# NOTE(review): df still seems to contain the Outcome column at this point, so
# the target takes part in the clustering — confirm this is intended.
df = sc.fit_transform(df)
df[0:5]

In [None]:
kmeans = KMeans(n_clusters=4)
k_fit = kmeans.fit(df)
k_fit

In [None]:
dir(k_fit)

In [None]:
k_fit.n_clusters
k_fit.cluster_centers_
k_fit.labels_
df[0:5]

In [None]:
# Fit K-Means with two clusters and keep the cluster label of every row.
k_means = KMeans(n_clusters=2).fit(df)
kumeler = k_means.labels_  # "kumeler" is Turkish for "clusters"
type(df)
# Wrap the data in a DataFrame so the plotting cells can index it with .iloc.
df = pd.DataFrame(df)

In [None]:
plt.scatter(df.iloc[:, 0],
            df.iloc[:, 1],
            c=kumeler,
            s=50,
            cmap="viridis")
plt.show()

In [None]:
# Cluster centres of the fitted model ("merkezler" is Turkish for "centers").
merkezler = k_means.cluster_centers_

# Observations on the first two columns, coloured by cluster label.
plt.scatter(df.iloc[:, 0],
            df.iloc[:, 1],
            c=kumeler,
            s=50,
            cmap="viridis")

# Overlay the centres (same two coordinates) as large semi-transparent dots.
plt.scatter(merkezler[:, 0],
            merkezler[:, 1],
            c="black",
            s=200,
            alpha=0.5)
plt.show()

In [None]:
kmeans = KMeans()
ssd = []
K = range(1, 30)

In [None]:
# Fit K-Means for every candidate k and record inertia_ (scikit-learn's
# within-cluster sum of squared distances) for the elbow plot.
for k in K:
    kmeans = KMeans(n_clusters=k).fit(df)
    ssd.append(kmeans.inertia_)

ssd


In [None]:
# Elbow plot: number of clusters on the x axis, inertia on the y axis.
plt.plot(K, ssd, "bx-")
# The original put the description of the y values on the x axis and left
# the y axis unlabelled.
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Distance Residual Sums for different K Values")
plt.title("Elbow Method for Optimum Number of Clusters")
plt.show()


In [None]:
kmeans = KMeans()
visu = KElbowVisualizer(kmeans, k=(2, 20))
visu.fit(df)
visu.show();

In [None]:
kmeans = KMeans(n_clusters=5).fit(df)
kumeler = kmeans.labels_