In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for data visualization purposes
import seaborn as sns # for statistical data visualization
%matplotlib inline

**Importing the Dataset**

In [None]:
# Load the diabetes CSV from the Kaggle input directory.
df = pd.read_csv("../input/pima-indians-diabetes-database/diabetes.csv")

# Preview only the first rows instead of rendering the whole frame.
df.head()

**Exploratory Data Analysis**

In [None]:
# (rows, columns) of the loaded frame
df.shape

In [None]:
# Number of rows per value of the target column `Outcome`
df['Outcome'].value_counts()

The counts show 268 rows with `Outcome` = 1; all remaining rows have `Outcome` = 0.

**View summary of dataset**

In [None]:
# Call the method: a bare `df.info` only shows the bound-method repr,
# not the column / dtype / non-null summary.
df.info()

**Checking for categorical variables**

In [None]:
# Columns stored with the object dtype are treated as categorical.
categorical = [col for col in df.columns if df[col].dtype == 'O']

print(f'There are {len(categorical)} categorical variables\n')

print(f'The categorical variables are :\n\n {categorical}')

**Checking for null values**

In [None]:
# Count of null entries per column.
# NOTE(review): this only detects NaN/None; if missing measurements are
# encoded as 0 in this dataset they will not show up here — confirm.
df.isnull().sum()

**Explore Numerical Variables**

In [None]:
# Every column that is not object-typed is treated as numerical.
numerical = [col for col in df.columns if df[col].dtype != 'O']

print(f'There are {len(numerical)} numerical variables\n')

print(f'The numerical variables are : {numerical}')

In [None]:
# First rows of the numerical columns only
df[numerical].head()

**Declare feature vector and target variable**

In [None]:
# Features: every column except the target.
X = df.drop(columns=['Outcome'])

# Target: the `Outcome` column.
y = df['Outcome']

**Split data into separate training and test set**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Shapes of the training and test feature frames after the split
X_train.shape, X_test.shape

**Feature Engineering**

In [None]:
# Data type of each feature column in the training set
X_train.dtypes


In [None]:
# First rows of the training features
X_train.head()

In [None]:
# First rows of the test features
X_test.head()

**Model Training** (Gaussian Naive Bayes; no feature scaling is applied)

In [None]:
from sklearn.naive_bayes import GaussianNB


# instantiate the model with its default settings (no arguments passed)
gnb = GaussianNB()


# fit the model on the training split; the fitted estimator is the cell output
gnb.fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out test features
y_pred = gnb.predict(X_test)

# Display the predictions as the cell output
y_pred

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
print(f'Model accuracy score: {accuracy_score(y_test, y_pred):0.4f}')


**Check for overfitting and underfitting**

In [None]:
# Accuracy on the data the model was fitted on ...
print(f'Training set score: {gnb.score(X_train, y_train):.4f}')

# ... versus accuracy on the held-out data.
print(f'Test set score: {gnb.score(X_test, y_test):.4f}')

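Since the training and test scores are similar, there is no sign of overfitting. Similar scores alone do not rule out underfitting, which would show up as both scores being low.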

**Checking for Outcome Counts**

In [None]:
# Class distribution of the true labels in the test set
y_test.value_counts()

**Confusion Matrix**

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the actual classes and columns the predicted classes; the label
# order is pinned to [0, 1], so the layout is [[TN, FP], [FN, TP]].
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

print('Confusion matrix\n\n', cm)

# The positive class is Outcome == 1, so TP is the bottom-right cell and
# TN the top-left one (the original cell had these two swapped).
print('\nTrue Positives(TP) = ', cm[1,1])

print('\nTrue Negatives(TN) = ', cm[0,0])

print('\nFalse Positives(FP) = ', cm[0,1])

print('\nFalse Negatives(FN) = ', cm[1,0])

In [None]:
# scikit-learn's confusion matrix has actual classes on the rows and predicted
# classes on the columns, both ordered 0 then 1, so label the frame to match.
cm_matrix = pd.DataFrame(
    data=cm,
    index=['Actual Negative:0', 'Actual Positive:1'],
    columns=['Predict Negative:0', 'Predict Positive:1'],
)

sns.heatmap(cm_matrix, annot=True, fmt='d')

**Classification metrics**

In [None]:
from sklearn.metrics import classification_report

# Text report of per-class metrics comparing true and predicted test labels
print(classification_report(y_test, y_pred))

**Classification accuracy**

In [None]:
# scikit-learn lays the binary confusion matrix out as [[TN, FP], [FN, TP]]
# (rows = actual, columns = predicted, classes ordered 0 then 1). With
# Outcome == 1 as the positive class, TP is cm[1,1] and TN is cm[0,0].
# These names are reused by the precision / recall / specificity cells.
TP = cm[1,1]
TN = cm[0,0]
FP = cm[0,1]
FN = cm[1,0]

# Share of all test rows that were classified correctly.
classification_accuracy = (TP + TN) / float(TP + TN + FP + FN)

print('Classification accuracy : {0:0.4f}'.format(classification_accuracy))

**Precision**

In [None]:
# Precision = TP / (TP + FP), from the counts assigned in the accuracy cell.
predicted_positive = float(TP + FP)
precision = TP / predicted_positive

print(f'Precision : {precision:0.4f}')

**Recall**

In [None]:
# Recall (sensitivity) = TP / (TP + FN), from the counts assigned in the accuracy cell.
actual_positive = float(TP + FN)
recall = TP / actual_positive

print(f'Recall or Sensitivity : {recall:0.4f}')


**Specificity**

In [None]:
# Specificity = TN / (TN + FP), from the counts assigned in the accuracy cell.
actual_negative = TN + FP
specificity = TN / actual_negative

print(f'Specificity : {specificity:0.4f}')

**Calculate class probabilities**

In [None]:
# Class-membership probabilities for the first ten test rows.
y_pred_prob = gnb.predict_proba(X_test)[:10]

y_pred_prob

In [None]:
# Second column of predict_proba: probability of the second class, i.e. Outcome == 1.
y_pred1 = gnb.predict_proba(X_test)[:, 1]

plt.rcParams['font.size'] = 12
plt.hist(y_pred1, bins=10, color='blue')

# Labels now describe this dataset; the original text referred to
# "salaries >50K", left over from a different dataset.
plt.title('Histogram of predicted probabilities of diabetes (Outcome = 1)')
plt.xlim(0, 1)
plt.xlabel('Predicted probability of diabetes (Outcome = 1)')
plt.ylabel('Frequency')
plt.show()

**ROC - AUC**

In [None]:
from sklearn.metrics import roc_curve

# ROC points computed from the predicted probability of Outcome == 1.
fpr, tpr, thresholds = roc_curve(y_test, y_pred1)

# Set the font size before drawing so it applies to this figure's text.
plt.rcParams['font.size'] = 12

plt.figure(figsize=(6,4))

plt.plot(fpr, tpr, linewidth=2)

# Diagonal reference line for a classifier with no discriminative power.
plt.plot([0,1], [0,1], 'k--')

# Title now names the actual task; the original said "Predicting Salaries".
plt.title('ROC curve for Gaussian Naive Bayes Classifier for Predicting Diabetes')

plt.xlabel('False Positive Rate (1 - Specificity)')

plt.ylabel('True Positive Rate (Sensitivity)')

plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve on the test set, from the predicted probabilities.
ROC_AUC = roc_auc_score(y_test, y_pred1)

print(f'ROC AUC : {ROC_AUC:.4f}')

In [None]:
from sklearn.model_selection import cross_val_score

# Mean ROC AUC over 5 cross-validation folds of the training split.
Cross_validated_ROC_AUC = cross_val_score(
    gnb,
    X_train,
    y_train,
    cv=5,
    scoring='roc_auc',
).mean()

print(f'Cross validated ROC AUC : {Cross_validated_ROC_AUC:.4f}')

**k-Fold Cross Validation**

In [None]:
from sklearn.model_selection import cross_val_score

# Accuracy of the classifier on each of 10 folds of the training split.
scores = cross_val_score(
    gnb,
    X_train,
    y_train,
    cv=10,
    scoring='accuracy',
)

print(f'Cross-validation scores:{scores}')

In [None]:
# Mean of the per-fold accuracies computed in the previous cell.
print(f'Average cross-validation score: {scores.mean():.4f}')