In [None]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the plotting and modelling cells below - consider narrowing the filter.
warnings.simplefilter('ignore')

### Importing required libraries -

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

### Loading the dataset - 

In [None]:
# Location of the CSV, relative to the notebook's working directory.
filepath = '../input/pima-indians-diabetes-database/diabetes.csv'
# Read the file into a DataFrame; the cells below all work off `data`.
data = pd.read_csv(filepath)

### Performing EDA -

In [None]:
data.sample(10)

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.describe().T

### Checking the outcome labels - 

In [None]:
data['Outcome'].value_counts()

In [None]:
# Bar chart of how many rows fall into each 'Outcome' class.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=data, x='Outcome', ax=ax)
plt.show()

### Checking for null values - 

In [None]:
data.isnull().sum().any()

### Checking for duplicate rows -

In [None]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
duplicate_rows = data.loc[data.duplicated()]
len(duplicate_rows)

### Checking the distribution of data -

In [None]:
data.hist(figsize=(12, 10))
plt.show()

### Checking Correlation between Dependent & Independent variables -

In [None]:
# Spearman rank correlation between every pair of columns.
corr = data.corr(method='spearman')

# The matrix is symmetric, so hide the diagonal and the upper triangle.
mask = np.triu(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(10, 8))
cormat = sns.heatmap(corr, mask=mask, linewidths=1, annot=True, fmt=".2f", ax=ax)
cormat.set_title('Correlation Matrix')
plt.show()

In [None]:
sns.set(style="ticks", color_codes=True)
sns.pairplot(data)
plt.show()

### Outlier Detection & Removal -

In [None]:
def diagnostic_plot(data, col):
    """Draw a histogram, a normal Q-Q plot and a boxplot of one column side by side.

    Parameters
    ----------
    data : DataFrame-like
        Container that yields the values to plot when indexed as ``data[col]``.
    col : str
        Name of the column to inspect.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    fig, (ax_hist, ax_qq, ax_box) = plt.subplots(1, 3, figsize=(15, 3))
    values = data[col]

    # sns.distplot is deprecated; histplot with a KDE overlay on a density
    # scale gives the same histogram-plus-curve view.
    sns.histplot(values, bins=10, kde=True, stat='density', ax=ax_hist)
    ax_hist.set_title('Histogram')

    # probplot draws onto the given Axes; the title is set afterwards so it
    # is the one that ends up on the panel.
    stats.probplot(values, dist='norm', fit=True, plot=ax_qq)
    ax_qq.set_title('Q-Q Plot')

    sns.boxplot(y=values, ax=ax_box)
    ax_box.set_title('Boxplot')

    plt.show()

Checking the 'Pregnancies' column -

In [None]:
data['Pregnancies'].value_counts()

In [None]:
# Treat pregnancy counts above the 95th percentile as outliers and drop them.
# NOTE: `data` is overwritten, so re-running this cell recomputes the
# percentile on the already-filtered frame and can drop further rows.
max_threshold = data['Pregnancies'].quantile(0.95)
data = data[data['Pregnancies']<=max_threshold]

Checking the 'Age' column -

In [None]:
# Range check on 'Age': show both extremes of the column.
age_values = data['Age']
print("Maximum Age is: {}".format(age_values.max()))
print("Minimum Age is: {}".format(age_values.min()))

Checking the 'Glucose' column -

In [None]:
diagnostic_plot(data, 'Glucose')

In [None]:
# Keep only rows whose 'Glucose' reading is at least 25.
data = data.loc[data['Glucose'].ge(25)]

Checking the 'BloodPressure' column -

In [None]:
diagnostic_plot(data, 'BloodPressure')

In [None]:
# Discard rows where 'BloodPressure' is recorded as 0.
data = data.loc[data['BloodPressure'].ne(0)]

Checking the 'BMI' column -

In [None]:
diagnostic_plot(data, 'BMI')

In [None]:
# Keep rows with 'BMI' strictly between 10 and 50.
data = data.loc[data['BMI'].gt(10) & data['BMI'].lt(50)]

Checking the 'Insulin' column -

In [None]:
data['Insulin'].value_counts().sort_index(ascending=False)

In [None]:
# Keep rows with 'Insulin' in the closed range [15, 600].
# NOTE(review): rows with an Insulin value of 0 are removed by this as well -
# confirm how many rows that costs before modelling.
data = data.loc[(data['Insulin'] >= 15) & (data['Insulin'] <= 600)]

Checking the 'SkinThickness' column -

In [None]:
diagnostic_plot(data, 'SkinThickness')

In [None]:
# Keep rows with 'SkinThickness' below 60.
data = data.loc[data['SkinThickness'].lt(60)]

Checking the 'DiabetesPedigreeFunction' column -

In [None]:
diagnostic_plot(data, 'DiabetesPedigreeFunction')

In [None]:
data.shape

### Separating Dependent and Independent features -

In [None]:
# Target vector first, then every remaining column as the feature matrix.
y = data['Outcome']
X = data.drop(columns='Outcome')

### Performing train-test split -

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the partition repeatable.
# NOTE(review): the split is not stratified on y - consider stratify=y if the
# classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

### Scaling the data -

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits (both names are rebound to the scaled arrays).
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

### Checking the accuracies for 3 different models -

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import Pipeline

# One single-step pipeline per candidate model. The tree gets a fixed seed so
# that the "best classifier" verdict does not change between runs.
pipeline_lr = Pipeline([('lr', LogisticRegression())])
pipeline_svc = Pipeline([('svc', SVC())])
pipeline_dt = Pipeline([('dt', DecisionTreeClassifier(random_state=0))])

pipelines = [pipeline_lr, pipeline_svc, pipeline_dt]
# Position in `pipelines` -> display name.
pipe_dict = {0: 'Logistic Regression', 1: 'Support Vector Classifier', 2: 'Decision Tree Classifier'}

for pipe in pipelines:
    pipe.fit(X_train, y_train)

# Score every fitted pipeline exactly once and reuse the numbers below.
test_scores = [model.score(X_test, y_test) for model in pipelines]

for i, score in enumerate(test_scores):
    print("{} - Test Accuracy: {}".format(pipe_dict[i], score))

# max() returns the first index on ties, the same winner a strict '>' scan picks.
best_clf = max(range(len(pipelines)), key=test_scores.__getitem__)
best_acc = test_scores[best_clf]
best_pipeline = pipelines[best_clf]
print("Classifier with best accuracy is {}". format(pipe_dict[best_clf]))

### Importing Performance Metrics for Classification -

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, auc

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings, fitted on the training split
# (X_train was replaced by its standardised version in the scaling cell).
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
# Accuracy on the data the model was fitted on versus the held-out split.
# For a classifier, .score() is the accuracy of its own predictions.
print("Train accuracy :{}".format(lr.score(X_train, y_train)))
print("Test accuracy :{}".format(lr.score(X_test, y_test)))

In [None]:
y_pred_lr = lr.predict(X_test)

In [None]:
y_pred_proba_lr = lr.predict_proba(X_test)[:, 1]

In [None]:
# fmt='d' prints the cell counts as integers; the default annotation format
# ('.2g') would show any count of 100 or more in scientific notation.
sns.heatmap(confusion_matrix(y_test, y_pred_lr), annot=True, fmt='d')
# confusion_matrix puts true labels on rows and predicted labels on columns.
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
print("Classification Report")
print(classification_report(y_test, y_pred_lr))

In [None]:
print("AUC Score: {}".format(roc_auc_score(y_test, y_pred_proba_lr)))

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# X is the unscaled feature frame, so the scaler goes inside the
# cross-validated estimator: each fold is standardised from its own training
# part only, matching how `lr` was fitted on scaled data above.
lr_acc = np.mean(cross_val_score(make_pipeline(StandardScaler(), lr), X, y, cv=10, scoring='accuracy'))
print("Cross Validation accuracy: {}".format(lr_acc))

## Support Vector Classifier

In [None]:
from sklearn.svm import SVC
# probability=True enables predict_proba through an internal cross-validated
# calibration that shuffles the data; random_state pins that shuffle so the
# probabilities (and the AUC / ROC curve derived from them) are repeatable.
svc = SVC(probability=True, random_state=0)
svc.fit(X_train, y_train)

In [None]:
# Accuracy on the data the model was fitted on versus the held-out split.
# For a classifier, .score() is the accuracy of its own predictions.
print("Train accuracy :{}".format(svc.score(X_train, y_train)))
print("Test accuracy :{}".format(svc.score(X_test, y_test)))

In [None]:
y_pred_svc = svc.predict(X_test)

In [None]:
y_pred_proba_svc = svc.predict_proba(X_test)[:, 1]

In [None]:
# fmt='d' prints the cell counts as integers; the default annotation format
# ('.2g') would show any count of 100 or more in scientific notation.
sns.heatmap(confusion_matrix(y_test, y_pred_svc), annot=True, fmt='d')
# confusion_matrix puts true labels on rows and predicted labels on columns.
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
print("Classification Report")
print(classification_report(y_test, y_pred_svc))

In [None]:
print("AUC Score: {}".format(roc_auc_score(y_test, y_pred_proba_svc)))

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# X is the unscaled feature frame, so the scaler goes inside the
# cross-validated estimator: each fold is standardised from its own training
# part only, matching how `svc` was fitted on scaled data above.
svc_acc = np.mean(cross_val_score(make_pipeline(StandardScaler(), svc), X, y, cv=10, scoring='accuracy'))
print("Cross Validation accuracy: {}".format(svc_acc))

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fixed seed: the tree permutes the candidate features at every split, so an
# unseeded tree can come out differently from one run to the next.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)

In [None]:
# Accuracy on the data the model was fitted on versus the held-out split.
# For a classifier, .score() is the accuracy of its own predictions.
print("Train accuracy :{}".format(dt.score(X_train, y_train)))
print("Test accuracy :{}".format(dt.score(X_test, y_test)))

In [None]:
y_pred_dt = dt.predict(X_test)

In [None]:
y_pred_proba_dt = dt.predict_proba(X_test)[:, 1]

In [None]:
# fmt='d' prints the cell counts as integers; the default annotation format
# ('.2g') would show any count of 100 or more in scientific notation.
sns.heatmap(confusion_matrix(y_test, y_pred_dt), annot=True, fmt='d')
# confusion_matrix puts true labels on rows and predicted labels on columns.
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
print("Classification Report")
print(classification_report(y_test, y_pred_dt))

In [None]:
print("AUC Score: {}".format(roc_auc_score(y_test, y_pred_proba_dt)))

In [None]:
from sklearn.model_selection import cross_val_score

# Mean accuracy of the decision tree over 10 cross-validation folds of (X, y).
dt_acc = cross_val_score(dt, X, y, cv=10, scoring='accuracy').mean()
print("Cross Validation accuracy: {}".format(dt_acc))

### Plotting ROC Curve for all 3 models to compare their performance -

In [None]:
# False-positive / true-positive rates over all thresholds, one set per model,
# computed from the positive-class probabilities on the test split.
fpr_lr, tpr_lr, threshold_lr = roc_curve(y_test, y_pred_proba_lr)
fpr_svc, tpr_svc, threshold_svc = roc_curve(y_test, y_pred_proba_svc)
fpr_dt, tpr_dt, threshold_dt = roc_curve(y_test, y_pred_proba_dt)

# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in matplotlib 3.6
# and the old name was later removed; use whichever this install provides.
whitegrid_style = 'seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
plt.style.use(whitegrid_style)

plt.figure(figsize=(8, 5))
plt.plot(fpr_lr, tpr_lr, label="Logistic Regression")
plt.plot(fpr_svc, tpr_svc, label="Support Vector Classifier")
plt.plot(fpr_dt, tpr_dt, label="Decision Tree Classifier")

plt.legend(loc='lower right', frameon=True)
plt.title("ROC Curve")
plt.ylabel("TPR")
plt.xlabel("FPR")
plt.show()

### Tuning the Hyperparameter 'C' for Logistic Regression -

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# X is unscaled, so standardisation happens inside the searched estimator and
# is re-fitted on each fold's training part. Parameters of the model step are
# addressed with the 'lr__' prefix, so best_params_ reports 'lr__C'.
params = {'lr__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

grid = GridSearchCV(estimator=Pipeline([('scaler', StandardScaler()), ('lr', lr)]),
                    param_grid=params, cv=10, scoring='accuracy', n_jobs=-1)
grid.fit(X,y)

In [None]:
grid.best_params_

In [None]:
grid.best_score_

### Tuning the Hyperparameters 'C', 'kernel' & 'degree' for Support Vector Classifier -

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# X is unscaled, so standardisation happens inside the searched estimator and
# is re-fitted on each fold's training part. Parameters of the model step are
# addressed with the 'svc__' prefix, so best_params_ reports prefixed keys.
params = { 'svc__C': [1, 10, 100, 1000],
           'svc__kernel': ['poly'],
           'svc__degree': [2, 3, 4],
         }

# The search samples a subset of the 12 combinations (n_iter defaults to 10);
# random_state fixes which ones are tried so the result is repeatable.
# NOTE(review): the name `random` shadows the stdlib module of the same name;
# it is kept because the following cells read random.best_params_ / best_score_.
random = RandomizedSearchCV(Pipeline([('scaler', StandardScaler()), ('svc', svc)]),
                            param_distributions=params, cv=10, scoring='accuracy',
                            n_jobs=-1, random_state=0)
random.fit(X,y)

In [None]:
random.best_params_

In [None]:
random.best_score_

### Tuning the Hyperparameters 'max_depth', 'min_samples_split' & 'min_samples_leaf' for Decision Tree -

In [None]:
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV

params = { 'max_depth': [3, 4, 5],
           'min_samples_split': [2, 3],
           'min_samples_leaf': [1, 2, 3]}

# Search over a seeded copy of `dt`: without a fixed random_state the tree can
# differ between fits, which makes best_params_ / best_score_ unstable.
grid = GridSearchCV(estimator=clone(dt).set_params(random_state=0), param_grid=params,
                    cv=10, scoring='accuracy', n_jobs=-1)
grid.fit(X,y)

In [None]:
grid.best_params_

In [None]:
grid.best_score_

## Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier
# (name, estimator) pairs for the three models built in the sections above.
classifiers = [('Logistic Regression', lr), ('Support Vector Classifier', svc), ('Decision Tree Classifier', dt)]
# Soft voting: the ensemble combines the members' predicted class
# probabilities rather than their hard labels.
vc = VotingClassifier(estimators=classifiers, voting='soft')
vc.fit(X_train, y_train)

In [None]:
y_pred_vc = vc.predict(X_test)

In [None]:
y_pred_proba_vc = vc.predict_proba(X_test)[:, 1]

In [None]:
# fmt='d' prints the cell counts as integers; the default annotation format
# ('.2g') would show any count of 100 or more in scientific notation.
sns.heatmap(confusion_matrix(y_test, y_pred_vc), annot=True, fmt='d')
# confusion_matrix puts true labels on rows and predicted labels on columns.
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
print("Classification Report")
print(classification_report(y_test, y_pred_vc))

In [None]:
print("AUC Score: {}".format(roc_auc_score(y_test, y_pred_proba_vc)))

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# X is the unscaled feature frame, so the scaler goes inside the
# cross-validated estimator: each fold is standardised from its own training
# part only, matching how `vc` was fitted on scaled data above.
vc_acc = np.mean(cross_val_score(make_pipeline(StandardScaler(), vc), X, y, cv=10, scoring='accuracy'))
print("Cross Validation accuracy: {}".format(vc_acc))

> If you find this notebook useful then please provide your valuable feedback.
 
> Suggestions of any kind are welcome.
 
> Don't forget to upvote if you like my work.