In [None]:
!pip install seaborn==0.11.1

# Import libraries and read data

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

In [None]:
# Load the diabetes CSV from the Kaggle input directory and preview the first rows.
csv_path = '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
data = pd.read_csv(csv_path)
data.head()

## Column info
1. Pregnancies: Number of times pregnant
2. Glucose: Glucose (also called dextrose) is one of a group of carbohydrates known as simple sugars
3. BloodPressure: Blood pressure
4. SkinThickness: Thickness of the skin
5. Insulin: Insulin is a peptide hormone that controls glucose
6. BMI: Body Mass Index (BMI) is a person's weight in kilograms divided by the square of their height in meters
7. DiabetesPedigreeFunction: Diabetes pedigree function (a function which scores likelihood of diabetes based on family history)
8. Age: Age of the person
9. Outcome: Whether the person has diabetes (1) or not (0)

### Check datatype

In [None]:
# Summarise the columns of `data`: dtypes and non-null counts.
data.info()

### Check for invalid values

In [None]:
# Descriptive statistics of the raw columns, used to spot implausible values (e.g. a minimum of 0).
data.describe()

In [None]:
# Feature columns to inspect; this list is reused by the plotting cells below.
cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Pregnancies']

# Report how many rows hold an exact zero in each of those columns.
for name in cols:
    zero_count = (data[name] == 0).sum()
    print(f"Total zero value of {name}: {zero_count}")

### Replace zero values with the column average

In [None]:
# Columns in which zeros are replaced by the mean of the column's non-zero values.
non_zero_columns = ['BloodPressure', 'BMI', 'Glucose']
for col in non_zero_columns:
    # The mean is taken over the non-zero rows only, so the zeros being
    # replaced do not drag the fill value down.
    observed = data.loc[data[col] != 0, col]
    data[col] = data[col].replace(0, observed.mean())

In [None]:
# Displays the Outcome column with 1/0 mapped to True/False.
# NOTE(review): the result is not assigned back, so `data['Outcome']` itself
# keeps its original 0/1 values - confirm whether that is intended.
data['Outcome'].replace({1:True, 0:False})

In [None]:
sns.set(font_scale=1.2)
plt.figure(figsize=(30,15))

# One histogram (with KDE) per feature in `cols`, split by Outcome.
# The leftover debug print of each column name was removed; every panel is
# already labelled through its x-axis.
for index, column in enumerate(cols, 1):
    ax = plt.subplot(3, 5, index)
    sns.histplot(data=data, x=column, hue='Outcome', kde=True, ax=ax)

In [None]:
sns.set(font_scale=1.2)
plt.figure(figsize=(30, 15))

# For each feature in `cols`, draw a swarm plot and a box plot per Outcome on the same axes.
for position, feature in enumerate(cols, start=1):
    axis = plt.subplot(3, 4, position)
    sns.swarmplot(x='Outcome', y=feature, data=data, ax=axis)
    sns.boxplot(x='Outcome', y=feature, data=data, ax=axis)

In [None]:
# Descriptive statistics again, now that zeros in BloodPressure, BMI and Glucose have been replaced.
data.describe()

## Feature selection

### Find highly correlated features

In [None]:
# Pairwise correlation between the columns of `data`, displayed as a table.
corr = data.corr()
corr

In [None]:
# Heatmap of the correlation matrix on a diverging palette centred at 0,
# with the colour range clipped to [-0.6, 0.6].
heatmap_options = dict(
    cmap='RdYlGn',
    vmin=-.6,
    vmax=.6,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
sns.heatmap(data.corr(), **heatmap_options)

In [None]:
# Predictor columns used by the model, and the label column to predict.
x_col = ['Glucose', 'BMI', 'Pregnancies']
x, y = data[x_col], data['Outcome']

# Machine Learning with Logistic Regression

#### Split the dataset: 75% for training, 25% for validation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state makes the split repeatable.
split_parts = train_test_split(x, y, test_size=0.25, random_state=17)
x_train, x_test, y_train, y_test = split_parts


### Train the model

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the training split, then predict labels for the held-out rows.
model = LogisticRegression(random_state=0).fit(x_train, y_train)
predict = model.predict(x_test)

# Metrics

### Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels and columns the predicted labels, so the flattened
# order of the 2x2 matrix is TN, FP, FN, TP - matching `group_names`.
matrix = confusion_matrix(y_test, predict)
cell_values = matrix.ravel()
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
group_counts = [f'{count:0.0f}' for count in cell_values]
group_percentages = [f"{share:.2%}" for share in cell_values / matrix.sum()]

# Each cell annotation is "name / count / share of all test rows" on three lines.
labels = [
    "\n".join(parts)
    for parts in zip(group_names, group_counts, group_percentages)
]
labels = np.asarray(labels).reshape(2, 2)
sns.heatmap(matrix, annot=labels, fmt='', cmap='Blues')

### Classification Report

In [None]:

from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred). Passing them the other way
# round swaps precision and recall for each class, so the true labels go first.
print(classification_report(y_test, model.predict(x_test)))

### ROC and AUC 

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Probability of the positive class for every test row. Both the AUC and the
# ROC curve must be built from these continuous scores: computing the AUC from
# hard 0/1 predictions only evaluates a single threshold, so the value in the
# legend would not describe the plotted curve.
positive_proba = model.predict_proba(x_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, positive_proba)
fpr, tpr, thresholds = roc_curve(y_test, positive_proba)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: the ROC of a classifier that guesses at random.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()


### Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validate the model on the training split with cv=5 and print the per-fold scores.
scores = cross_val_score(estimator=model, X=x_train, y=y_train, cv=5)
print('Cross-Validation Accuracy Scores', scores)

### Lower the decision threshold to increase recall (TP / (TP + FN))

In [None]:
# Label a row positive when its predicted probability of class 1 reaches 0.11
# (instead of the default 0.5 used by model.predict).
decisions = (model.predict_proba(x_test)[:, 1] >= .11).astype(int)

# classification_report expects (y_true, y_pred); with the arguments reversed
# the precision and recall columns are swapped.
print(classification_report(y_test, decisions))

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are true labels and columns
# are predicted labels. Only in that orientation does the flattened matrix read
# TN, FP, FN, TP as `group_names` assumes; with the arguments reversed the
# False Pos / False Neg annotations land on each other's cells.
matrix = confusion_matrix(y_test, decisions)
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ['{0:0.0f}'.format(value) for value in
                matrix.flatten()]
# Share of all test rows that falls in each cell.
group_percentages = ["{0:.2%}".format(value) for value in
                     matrix.flatten()/np.sum(matrix)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
          zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(matrix, annot=labels, fmt='', cmap='Blues')

In [None]:
sns.set(font_scale=1.2)
plt.figure(figsize=(30, 15))

# One histogram (with KDE) per model feature on the test split, coloured by the thresholded decision.
for position, feature in enumerate(x_col, start=1):
    axis = plt.subplot(3, 5, position)
    sns.histplot(x=x_test[feature], hue=decisions, kde=True, ax=axis)