In [4]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
from sklearn.preprocessing import Binarizer,Normalizer
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

data = pd.read_csv("D:\\dataset.csv")

In [5]:
#This is the data after K-means clustering
print(data)

     Pregnancies_binarized   Glucose  BloodPressure  SkinThickness   Insulin  \
0                        0  0.512563       0.285714       0.147130  0.079085   
1                        1  0.542714       0.387755       0.271739  0.050481   
2                        1  0.557789       0.387755       0.147130  0.079085   
3                        0  0.688442       0.448980       0.076087  0.161058   
4                        1  0.487437       0.469388       0.086957  0.079085   
5                        1  0.472362       0.418367       0.163043  0.079085   
6                        1  0.562814       0.520408       0.271739  0.079085   
7                        1  0.542714       0.408163       0.147130  0.079085   
8                        1  0.447236       0.428571       0.173913  0.096154   
9                        1  0.527638       0.346939       0.147130  0.079085   
10                       1  0.457286       0.408163       0.184783  0.079085   
11                       1  0.582915    

## Splitting preprocessed data into train and test

In [6]:
X = data[['Pregnancies_binarized','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']]
y = data['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=data['Outcome'],test_size=0.25, random_state=42)

## Naive bayes on preporcessed data

In [7]:
gnb = GaussianNB()
gnb.fit(X_train,y_train)
y_pred = gnb.predict(X_test)

# Print results
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          X_test.shape[0],
          (y_test != y_pred).sum(),
          100*(1-(y_test!= y_pred).sum()/X_test.shape[0])
))

Number of mislabeled points out of a total 192 points : 51, performance 73.44%


## Logistic Regression on preprocessed data

In [8]:
logreg = LogisticRegression().fit(X_train, y_train)
print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))

Training set score: 0.766
Test set score: 0.745




## Testing accuracy after applying only K-means 

In [9]:
# from sklearn.cluster import KMeans
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import confusion_matrix
print(accuracy_score(data['Outcome'], data['cluster_0_1']))
print(confusion_matrix(data['Outcome'], data['cluster_0_1']))

0.72265625
[[371 129]
 [ 84 184]]


## Pulling correctly classified data after K-means

In [10]:
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
new_list = list()
newDF = pd.DataFrame()
for (idx,row) in data.iterrows():
    if row.Outcome == row.cluster_0_1:
        new_list.append(row)
        #print(new_list)
        d = pd.DataFrame(new_list)

In [11]:
#This is the correctly classified data
# d

In [12]:
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import classification_report

## Applying logistic regression on Correctly classified data

In [13]:
# Splitting Data into train and test for logistic regression
X = d[['Pregnancies_binarized','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']]
y = d['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=d['Outcome'],test_size=0.25, random_state=42)

In [14]:
# from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression().fit(X_train, y_train)
print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))

Training set score: 0.962
Test set score: 0.957




In [15]:
#Printing the confusion matrix and final accuracy score
print(confusion_matrix(y_test,logreg.predict(X_test)))
print(accuracy_score(y_test,logreg.predict(X_test)))


[[93  0]
 [ 6 40]]
0.9568345323741008


## Printing the classification report After logistic Regression

In [16]:
print(classification_report(y_test,logreg.predict(X_test)))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        93
           1       1.00      0.87      0.93        46

   micro avg       0.96      0.96      0.96       139
   macro avg       0.97      0.93      0.95       139
weighted avg       0.96      0.96      0.96       139



## Performing 10-fold cross validation on correctly classified data

In [17]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kf = KFold(n_splits=10) 
# kfold = KFold(n_splits=10, random_state=10) 
score = cross_val_score(logreg, X, y, cv=kf, scoring='accuracy').mean()
print(score)

0.9527597402597403




## Applying Naive Baye's on correctly classified Data

In [18]:
gnb = GaussianNB()
gnb.fit(X_train,y_train)
y_pred = gnb.predict(X_test)

# Print results
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          X_test.shape[0],
          (y_test != y_pred).sum(),
          100*(1-(y_test!= y_pred).sum()/X_test.shape[0])
))

Number of mislabeled points out of a total 139 points : 3, performance 97.84%


In [19]:
print(classification_report(y_test,gnb.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98        93
           1       0.96      0.98      0.97        46

   micro avg       0.98      0.98      0.98       139
   macro avg       0.97      0.98      0.98       139
weighted avg       0.98      0.98      0.98       139



In [22]:
print("ID    ", "Prediction_Value")
print(y_test)

ID     Prediction_Value
434    0
466    1
53     0
250    0
514    0
179    0
613    1
227    0
601    1
280    1
358    0
76     0
619    1
472    1
117    0
317    0
387    0
302    0
13     0
41     0
506    1
492    1
295    0
304    0
490    1
191    0
255    0
504    1
134    1
151    0
      ..
196    0
133    1
207    0
607    0
21     0
349    0
616    1
640    1
57     0
88     0
565    0
263    0
500    0
69     0
435    0
229    0
603    1
224    0
63     0
260    0
628    1
488    0
587    1
67     0
14     0
684    1
445    0
407    0
493    1
576    1
Name: Outcome, Length: 139, dtype: int64


## 