### Importing python libraries

In [1]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model  import Ridge,Lasso,RidgeCV, LassoCV, ElasticNet, ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

### Importing dataset

In [2]:
data = pd.read_csv("diabetes.csv") # Reading the Data
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


### Data Preprocessing

In [4]:
# replacing zero values with the mean of the column
data['BMI'] = data['BMI'].replace(0,data['BMI'].mean())
data['BloodPressure'] = data['BloodPressure'].replace(0,data['BloodPressure'].mean())
data['Glucose'] = data['Glucose'].replace(0,data['Glucose'].mean())
data['Insulin'] = data['Insulin'].replace(0,data['Insulin'].mean())
data['SkinThickness'] = data['SkinThickness'].replace(0,data['SkinThickness'].mean())

In [5]:
# Handling the Outliers
q = data['Pregnancies'].quantile(0.98)
# we are removing the top 2% data from the Pregnancies column
data_cleaned = data[data['Pregnancies']<q]
q = data_cleaned['BMI'].quantile(0.99)
# we are removing the top 1% data from the BMI column
data_cleaned  = data_cleaned[data_cleaned['BMI']<q]
q = data_cleaned['SkinThickness'].quantile(0.99)
# we are removing the top 1% data from the SkinThickness column
data_cleaned  = data_cleaned[data_cleaned['SkinThickness']<q]
q = data_cleaned['Insulin'].quantile(0.95)
# we are removing the top 5% data from the Insulin column
data_cleaned  = data_cleaned[data_cleaned['Insulin']<q]
q = data_cleaned['DiabetesPedigreeFunction'].quantile(0.99)
# we are removing the top 1% data from the DiabetesPedigreeFunction column
data_cleaned  = data_cleaned[data_cleaned['DiabetesPedigreeFunction']<q]
q = data_cleaned['Age'].quantile(0.99)
# we are removing the top 1% data from the Age column
data_cleaned  = data_cleaned[data_cleaned['Age']<q]

### Train-test Split

In [6]:
X = data.drop(columns = ['Outcome'])
y = data['Outcome']

# we need to scale our data as well
scalar = StandardScaler()
X_scaled = scalar.fit_transform(X)

x_train,x_test,y_train,y_test = train_test_split(X_scaled,y, test_size= 0.25, random_state = 355)

## Naive Bayes Model


### Model Training

In [7]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [8]:
model.fit(x_train,y_train)

GaussianNB()

### Model Prediction

In [10]:
y_pred = model.predict(x_test)

In [11]:
print(accuracy_score(y_test, y_pred))

0.7864583333333334


### Confusion Matrix

In [13]:
conf_mat = confusion_matrix(y_test,y_pred)
true_positive = conf_mat[0][0]
false_positive = conf_mat[0][1]
false_negative = conf_mat[1][0]
true_negative = conf_mat[1][1]
conf_mat

array([[109,  16],
       [ 25,  42]], dtype=int64)

### Accuracy

In [14]:
Accuracy = (true_positive + true_negative) / (true_positive +false_positive + false_negative + true_negative)
Accuracy

0.7864583333333334

### Precision 

In [15]:
Precision = true_positive/(true_positive+false_positive)
Precision

0.872

### Recall

In [16]:
Recall = true_positive/(true_positive+false_negative)
Recall

0.8134328358208955

## Using the model

### 1

In [45]:
[x_test[8]]

[array([-0.84488505,  0.01046794,  1.46557097,  2.53440011,  1.08944444,
         2.51047245, -0.44358386, -0.19067191])]

In [46]:
y[8]

1

In [47]:
model.predict([x_test[8]])

array([1], dtype=int64)

### 2

In [48]:
x_test[13]

array([ 2.42174604,  0.96390741, -0.18622389,  1.39153921,  1.63771517,
        1.36069517,  0.81580563,  0.40494237])

In [49]:
y[13]

1

In [50]:
model.predict([x_test[8]])

array([1], dtype=int64)

### 3

In [51]:
x_test[134]

array([-1.14185152, -0.91009431, -0.260103  , -0.6306536 , -0.41776815,
       -0.06669102, -0.65197205, -0.70119842])

In [52]:
y[134]

0

In [53]:
model.predict([x_test[134]])

array([0], dtype=int64)