In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
#If we observe above in the min the minumum values of some parameters are 0.0000 like for gulcose,bloodpressure etc...
#so, we replace those values with mean of its column..
df['Glucose'] = df['Glucose'].replace(0,df['Glucose'].mean())
df['BloodPressure'] = df['BloodPressure'].replace(0,df['BloodPressure'].mean())
df['SkinThickness'] = df['SkinThickness'].replace(0,df['SkinThickness'].mean())
df['Insulin'] = df['Insulin'].replace(0,df['Insulin'].mean())
df['BMI'] = df['BMI'].replace(0,df['BMI'].mean())

In [4]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.681605,72.254807,26.606479,118.660163,32.450805,0.471876,33.240885,0.348958
std,3.369578,30.436016,12.115932,9.631241,93.080358,6.875374,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,20.536458,79.799479,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,79.799479,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
#Independent and dependent features
x = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [6]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.33,random_state=80)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(514, 8)
(254, 8)
(514,)
(254,)


In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [8]:
parameters = {
    'C' : [0.1,1,10,100,200],
    'gamma' : [1,0.1,0.01,0.001,0.0001],
    'kernel' : ['linear','rbf','poly','sigmoid']
    }

In [9]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train_scaled,y_train)

In [10]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
ypred = svc.predict(x_test_scaled)
print(accuracy_score(ypred,y_test))

0.7677165354330708


In [11]:
model_svc = RandomizedSearchCV(svc,param_distributions=parameters,cv=10,scoring='accuracy')
model_svc.fit(x_train_scaled,y_train)

In [12]:
model_svc.best_params_

{'kernel': 'sigmoid', 'gamma': 0.01, 'C': 1}

In [15]:
svc = SVC(kernel = 'sigmoid',gamma = 0.01,C = 1)
svc.fit(x_train_scaled,y_train)
y_pred = svc.predict(x_test_scaled)
print(accuracy_score(y_pred,y_test))
print(confusion_matrix(y_pred,y_test))
print(classification_report(y_pred,y_test))

0.7519685039370079
[[146  42]
 [ 21  45]]
              precision    recall  f1-score   support

           0       0.87      0.78      0.82       188
           1       0.52      0.68      0.59        66

    accuracy                           0.75       254
   macro avg       0.70      0.73      0.71       254
weighted avg       0.78      0.75      0.76       254



# The Accuracy of Support Vector Classifier is : 76.77%

# Naive Bayers Algorithm

In [16]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_train_scaled,y_train)

In [17]:
y_pred1 = gnb.predict(x_test_scaled)
print(accuracy_score(y_pred1,y_test))
print(confusion_matrix(y_pred1,y_test))
print(classification_report(y_pred1,y_test))

0.7204724409448819
[[136  40]
 [ 31  47]]
              precision    recall  f1-score   support

           0       0.81      0.77      0.79       176
           1       0.54      0.60      0.57        78

    accuracy                           0.72       254
   macro avg       0.68      0.69      0.68       254
weighted avg       0.73      0.72      0.72       254



In [18]:
parameters1 = {
    'priors': [None],
    'var_smoothing': [0.00000001, 0.000000001, 0.00000001]
}

In [20]:
random_gnb = RandomizedSearchCV(gnb,param_distributions=parameters1,cv=9,scoring = 'accuracy',verbose = 3)
random_gnb.fit(x_train_scaled,y_train)

Fitting 9 folds for each of 3 candidates, totalling 27 fits
[CV 1/9] END ..priors=None, var_smoothing=1e-08;, score=0.810 total time=   0.0s
[CV 2/9] END ..priors=None, var_smoothing=1e-08;, score=0.719 total time=   0.0s
[CV 3/9] END ..priors=None, var_smoothing=1e-08;, score=0.737 total time=   0.0s
[CV 4/9] END ..priors=None, var_smoothing=1e-08;, score=0.702 total time=   0.0s
[CV 5/9] END ..priors=None, var_smoothing=1e-08;, score=0.649 total time=   0.0s
[CV 6/9] END ..priors=None, var_smoothing=1e-08;, score=0.667 total time=   0.0s
[CV 7/9] END ..priors=None, var_smoothing=1e-08;, score=0.825 total time=   0.0s
[CV 8/9] END ..priors=None, var_smoothing=1e-08;, score=0.789 total time=   0.0s
[CV 9/9] END ..priors=None, var_smoothing=1e-08;, score=0.860 total time=   0.0s
[CV 1/9] END ..priors=None, var_smoothing=1e-09;, score=0.810 total time=   0.0s
[CV 2/9] END ..priors=None, var_smoothing=1e-09;, score=0.719 total time=   0.0s
[CV 3/9] END ..priors=None, var_smoothing=1e-09;,

In [31]:
y_pred2 = random_gnb.predict(x_test_scaled)
print(accuracy_score(y_pred2,y_test))

0.7204724409448819


In [28]:
from sklearn.model_selection import GridSearchCV
grid_gnb = GridSearchCV(gnb,param_grid=parameters1,cv=9,scoring = 'accuracy',verbose = 3)
grid_gnb.fit(x_train_scaled,y_train)

Fitting 9 folds for each of 3 candidates, totalling 27 fits
[CV 1/9] END ..priors=None, var_smoothing=1e-08;, score=0.810 total time=   0.0s
[CV 2/9] END ..priors=None, var_smoothing=1e-08;, score=0.719 total time=   0.0s
[CV 3/9] END ..priors=None, var_smoothing=1e-08;, score=0.737 total time=   0.0s
[CV 4/9] END ..priors=None, var_smoothing=1e-08;, score=0.702 total time=   0.0s
[CV 5/9] END ..priors=None, var_smoothing=1e-08;, score=0.649 total time=   0.0s
[CV 6/9] END ..priors=None, var_smoothing=1e-08;, score=0.667 total time=   0.0s
[CV 7/9] END ..priors=None, var_smoothing=1e-08;, score=0.825 total time=   0.0s
[CV 8/9] END ..priors=None, var_smoothing=1e-08;, score=0.789 total time=   0.0s
[CV 9/9] END ..priors=None, var_smoothing=1e-08;, score=0.860 total time=   0.0s
[CV 1/9] END ..priors=None, var_smoothing=1e-09;, score=0.810 total time=   0.0s
[CV 2/9] END ..priors=None, var_smoothing=1e-09;, score=0.719 total time=   0.0s
[CV 3/9] END ..priors=None, var_smoothing=1e-09;,

In [32]:
y_pred3 = grid_gnb.predict(x_test_scaled)
print(accuracy_score(y_pred3,y_test))

0.7204724409448819


# Accuracy of Naive Bayes : 72.04%

# Accuracy of All Models are : 

# LogisticRegression - 76.77%
# Decision Tree Classifier - 74.80%
# Naive Bayes : 72.04%
# SVC - 76.77%