In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# What we will see in this notebook:
1. Analysis of the dataset to understand each independent variable and its relationship with the target variable
2. Necessary preprocessing of the data
3. Application of several algorithms and selection of the best among them


# load libraries and read data

In [None]:
import warnings  
# Silences every warning for the rest of the session: no category or module
# filter is given, so deprecation and convergence warnings are hidden as well.
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the diabetes CSV from the Kaggle input directory and display the frame.
DATA_PATH = '../input/pima-indians-diabetes-database/diabetes.csv'
data = pd.read_csv(DATA_PATH)
data

**Check for null values**

In [None]:
# Per-column count of missing (NaN) entries; `isna` is the alias of `isnull`.
data.isna().sum()

# Univariate Analysis of Data

In [None]:
# Frequency of each pregnancy count. The column is passed by keyword because
# newer seaborn releases treat the first positional argument as `data`, not `x`.
sns.countplot(x='Pregnancies', data=data)

In [None]:
# Distribution of Glucose as a density histogram with a KDE overlay.
# `distplot` is deprecated (and removed in recent seaborn); this is its replacement.
sns.histplot(data=data, x='Glucose', kde=True, stat='density')

**Most observations in the Glucose column lie between 100 and 150**

In [None]:
# Distribution of BloodPressure as a density histogram with a KDE overlay.
# `distplot` is deprecated (and removed in recent seaborn); this is its replacement.
sns.histplot(data=data, x='BloodPressure', kde=True, stat='density')

**Blood pressure (this column records diastolic pressure) is also mostly within the normal range**

In [None]:
# Distribution of SkinThickness as a density histogram with a KDE overlay.
# `distplot` is deprecated (and removed in recent seaborn); this is its replacement.
sns.histplot(data=data, x='SkinThickness', kde=True, stat='density')

In [None]:
# Distribution of Insulin as a density histogram with a KDE overlay.
# `distplot` is deprecated (and removed in recent seaborn); this is its replacement.
sns.histplot(data=data, x='Insulin', kde=True, stat='density')

**Insulin column seems to be skewed. We need to check it further..**

In [None]:
# Distribution of BMI as a density histogram with a KDE overlay.
# `distplot` is deprecated (and removed in recent seaborn); this is its replacement.
sns.histplot(data=data, x='BMI', kde=True, stat='density')

In [None]:
# Distribution of DiabetesPedigreeFunction as a density histogram with a KDE overlay.
# `distplot` is deprecated (and removed in recent seaborn); this is its replacement.
sns.histplot(data=data, x='DiabetesPedigreeFunction', kde=True, stat='density')

**This column also seems to be skewed**

In [None]:
# Distribution of Age as a density histogram with a KDE overlay.
# `distplot` is deprecated (and removed in recent seaborn); this is its replacement.
sns.histplot(data=data, x='Age', kde=True, stat='density')

In [None]:
# Class balance of the target. The column is passed by keyword because newer
# seaborn releases treat the first positional argument as `data`, not `x`.
sns.countplot(x='Outcome', data=data)

**Here we see imbalance in the Target variable and we need to treat it before applying any algorithm**

# Relationship between variables

In [None]:
# One point per row: Pregnancies grouped by Outcome.
sns.swarmplot(data=data, x="Outcome", y="Pregnancies")

In [None]:
# One point per row: Age grouped by Outcome.
sns.swarmplot(data=data, x="Outcome", y="Age")

In [None]:
# Scatter plus per-class linear fit: Glucose against Insulin, coloured by Outcome.
sns.lmplot(data=data, x='Insulin', y='Glucose', hue='Outcome')

**If normal levels of Insulin are maintained in blood then glucose level is also low**

In [None]:
# Scatter plus per-class linear fit: Glucose against DiabetesPedigreeFunction.
sns.lmplot(data=data, x='DiabetesPedigreeFunction', y='Glucose', hue='Outcome')

**A family history of diabetes can lead to a rise in glucose levels, but not necessarily. This can be seen in the plot above**

In [None]:
# Scatter plus per-class linear fit: Glucose against Pregnancies.
sns.lmplot(data=data, x='Pregnancies', y='Glucose', hue='Outcome')

In [None]:
# Scatter plus per-class linear fit: Glucose against BloodPressure.
sns.lmplot(data=data, x='BloodPressure', y='Glucose', hue='Outcome')

**Long-term high blood pressure can be associated with an increased glucose level**

In [None]:
# Scatter plus per-class linear fit: BMI against SkinThickness.
sns.lmplot(data=data, x='SkinThickness', y='BMI', hue='Outcome')

In [None]:
# Scatter plus per-class linear fit: Insulin against SkinThickness.
sns.lmplot(data=data, x='SkinThickness', y='Insulin', hue='Outcome')

**Skin thickness tends to be higher when the insulin level is higher**

# What do the relationships tell us?
1. High levels of insulin in the blood after 2 hours indicate that glucose levels are high, which can lead to diabetes
2. A family history does not necessarily lead to diabetes, but the person can be at risk
3. High blood pressure over the long term can also be associated with an increase in glucose level, as we can observe from the graph
4. Higher skin thickness goes along with higher BMI, and the body in turn becomes tolerant to insulin action, so the effect of insulin on the glucose level can decrease. In the long term this can lead to diabetes, which is why it is said that BMI should be kept in check.
5. Insulin levels are higher in cases where skin thickness is higher

In [None]:
# Pairwise Pearson correlations between the columns, drawn as an annotated
# heatmap on a 16x10 inch figure.
pearson_corr = data.corr(method='pearson')
plt.figure(figsize=(16, 10))
sns.heatmap(pearson_corr, annot=True)

# Building models

**Splitting into features and labels**

In [None]:
# Target column.
y = data['Outcome']
# Feature frame = everything except the target. `drop` returns a new frame here;
# the previous `inplace=True` call returned None (so X was None) and also removed
# 'Outcome' from `data`, which made this cell fail when run a second time.
X = data.drop(columns=['Outcome'])

In [None]:
# Explicit list (and order) of the eight predictor columns fed to the models.
features = [
    'Age',
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
]
X = data[features]

**Splitting of data into train and test**

In [None]:
from sklearn.model_selection import train_test_split

# 75/25 hold-out split.
# random_state makes the split (and every score below) reproducible across runs;
# stratify=y keeps the class ratio of the imbalanced target equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=42, stratify=y
)

**Logistic Regression (Baseline Model)**

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier. The features are unscaled, so the default limit of 100
# solver iterations may be too low to converge (and the resulting warning is
# hidden by the global warnings filter); allow more iterations.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

In [None]:
# Hard class predictions of the baseline model on the held-out split.
y_pred = log_reg.predict(X=X_test)
y_pred

In [None]:
from sklearn.metrics import (
    f1_score, roc_auc_score, accuracy_score, confusion_matrix,
    precision_recall_curve, auc, roc_curve, recall_score, classification_report,
)

# Print the report directly. Assigning the result to `classification_report`
# would replace the imported function with a string and break any later call.
print(classification_report(y_test, y_pred))

**so we get an accuracy of around 76% with precision of positive case as 72%. let's see if we can improve it**

**Since target variable is imbalanced, Applying smote for its treatment**

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class in the TRAINING split only; seeded for
# reproducibility.
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# The test split must keep its real class distribution: resampling it would
# score the models on synthetic rows. The *_smote names are kept only so the
# code that reads them keeps working; they are the untouched test data.
x_test_smote, y_test_smote = X_test, y_test

In [None]:
# From here on the working train/test names refer to the *_smote variables.
X_train, y_train = x_train_smote, y_train_smote
X_test, y_test = x_test_smote, y_test_smote

**Again predicting results after treatment of imbalance**

In [None]:
# Refit on the current (resampled) training data before predicting. Without
# this the predictions come from the model fitted before SMOTE, so the
# imbalance treatment has no effect on the model being evaluated.
log_reg.fit(X_train, y_train)
pred1 = log_reg.predict(X_test)
pred1

In [None]:
from sklearn.metrics import (
    f1_score, roc_auc_score, accuracy_score, confusion_matrix,
    precision_recall_curve, auc, roc_curve, recall_score, classification_report,
)

# Print the report directly. Assigning the result to `classification_report`
# would replace the imported function with a string and break any later call.
print(classification_report(y_test, pred1))

**Although accuracy is a little lower than that of the baseline model, precision has improved considerably. For this type of dataset, where we need to be precise about early detection of positive cases, accuracy becomes less important. Thus we need to understand what the purpose of building the model is.**

In [None]:
# Confusion matrix of the logistic-regression predictions (rows: true class,
# columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=pred1)
cm

In [None]:
# Unpack the matrix row by row: [0,0], [0,1], [1,0], [1,1].
# NOTE(review): assumes `cm` is the 2x2 binary confusion matrix - confirm.
tn, fp, fn, tp = cm.ravel()

# Derive the usual metrics by hand from the four counts.
total = tp + fp + tn + fn
accuracy = (tp + tn) / total
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1score = 2 * precision * recall / (precision + recall)
print(f1score)

**ROC curve**

In [None]:
# Score = predicted probability of the positive class (column 1 of predict_proba).
predicted_probab = log_reg.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, predicted_probab)
roc_auc = roc_auc_score(y_test, predicted_probab)

# Chance-level diagonal drawn directly instead of deriving it from a ROC curve
# over constant scores; everything goes through the single `plt` alias.
plt.plot([0, 1], [0, 1], linestyle='--', color='blue', label='Chance')
plt.plot(fpr, tpr, marker='.', color='red',
         label='Logistic Regression (AUC = {:.3f})'.format(roc_auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve - Logistic Regression')
plt.legend()
plt.show()


**XGBoost classifier**

In [None]:
import xgboost as xgb

# Gradient-boosted classifier with library defaults: fit on the training
# split, then predict hard labels for the test split.
model = xgb.XGBClassifier()
model.fit(X=X_train, y=y_train)
y_pred1 = model.predict(X_test)


In [None]:
from sklearn.metrics import (
    f1_score, roc_auc_score, accuracy_score, confusion_matrix,
    precision_recall_curve, auc, roc_curve, recall_score, classification_report,
)

# Print the report directly. Assigning the result to `classification_report`
# would replace the imported function with a string and break any later call.
print(classification_report(y_test, y_pred1))

In [None]:
# ROC AUC needs a continuous score. Hard 0/1 labels collapse the curve to a
# single threshold, so use the predicted probability of the positive class.
roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

**This model is not performing well as compared to logistic regression**

In [None]:
# Score = predicted probability of the positive class (column 1 of predict_proba).
predicted_probab = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, predicted_probab)
roc_auc = roc_auc_score(y_test, predicted_probab)

# Chance-level diagonal drawn directly instead of deriving it from a ROC curve
# over constant scores; everything goes through the single `plt` alias.
plt.plot([0, 1], [0, 1], linestyle='--', color='blue', label='Chance')
plt.plot(fpr, tpr, marker='.', color='red',
         label='XGB (AUC = {:.3f})'.format(roc_auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve - XGB')
plt.legend()
plt.show()


**Decision Tree Classifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-criterion tree limited to three leaf nodes, with a fixed seed.
dt_clf = DecisionTreeClassifier(
    criterion='entropy',
    max_leaf_nodes=3,
    random_state=0,
)
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)

In [None]:
from sklearn.metrics import (
    f1_score, roc_auc_score, accuracy_score, confusion_matrix,
    precision_recall_curve, auc, roc_curve, recall_score, classification_report,
)

# Print the report directly. Assigning the result to `classification_report`
# would replace the imported function with a string and break any later call.
print(classification_report(y_test, dt_pred))

In [None]:
# ROC AUC from the predicted probability of the positive class rather than
# from hard labels. Stored as `dt_auc` so the `auc` function imported from
# sklearn.metrics is not overwritten by a float.
dt_auc = roc_auc_score(y_test, dt_clf.predict_proba(X_test)[:, 1])
dt_auc

**This model also does not perform well as compared to logistic regression**

In [None]:

# Score = predicted probability of the positive class (column 1 of predict_proba).
predicted_probab = dt_clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, predicted_probab)
roc_auc = roc_auc_score(y_test, predicted_probab)

# Chance-level diagonal drawn directly instead of deriving it from a ROC curve
# over constant scores; everything goes through the single `plt` alias.
plt.plot([0, 1], [0, 1], linestyle='--', color='blue', label='Chance')
plt.plot(fpr, tpr, marker='.', color='red',
         label='Decision Tree (AUC = {:.3f})'.format(roc_auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve - Decision Tree')
plt.legend()
plt.show()


**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters. The seed fixes the bootstrap
# samples and feature sub-sampling so the reported scores are reproducible.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

In [None]:
# Hard labels for the classification report.
rfc_predict = rfc.predict(X_test)
# ROC AUC from the predicted probability of the positive class, not from the
# hard labels, so the score reflects the ranking across all thresholds.
roc_auc_score(y_test, rfc.predict_proba(X_test)[:, 1])

In [None]:
from sklearn.metrics import (
    f1_score, roc_auc_score, accuracy_score, confusion_matrix,
    precision_recall_curve, auc, roc_curve, recall_score, classification_report,
)

# Print the report directly. Assigning the result to `classification_report`
# would replace the imported function with a string and break any later call.
print(classification_report(y_test, rfc_predict))

In [None]:
# Random forest ROC curve.
# Score = predicted probability of the positive class (column 1 of predict_proba).
predicted_probab = rfc.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, predicted_probab)
roc_auc = roc_auc_score(y_test, predicted_probab)

# Chance-level diagonal drawn directly instead of deriving it from a ROC curve
# over constant scores; everything goes through the single `plt` alias.
plt.plot([0, 1], [0, 1], linestyle='--', color='blue', label='Chance')
plt.plot(fpr, tpr, marker='.', color='red',
         label='Random Forest (AUC = {:.3f})'.format(roc_auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve - Random Forest')
plt.legend()
plt.show()



**From the models applied above, we see that Random Forest performs best, as its F1 score is the highest among all the models.**
**For this type of dataset we need the model to identify positive cases well, so accuracy becomes a metric of secondary importance.**

**The performance of models can further be improved by Hyperparameter Tuning and we might get other model which can perform better than Logistic Regression**

**Work in progress....**

**If you find the notebook useful then provide feedback and do suggest for any sort of improvements which can be made. I am a beginner and would like to get any suggestions for further enhancement of my knowledge**