In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

In [None]:
"""Attributes:

Dataset information:

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases.
Several constraints were placed on the selection of these instances from a larger database. 
In particular, all patients here are females at least 21 years old of Pima Indian heritage.


Pregnancies: Number of times pregnant

Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test

BloodPressure: Diastolic blood pressure (mm Hg)

SkinThickness: Triceps skin fold thickness (mm)

Insulin: 2-Hour serum insulin (mu U/ml)

BMI: Body mass index (weight in kg/(height in m)^2)

DiabetesPedigreeFunction: Diabetes pedigree function

Age: Age (years)

Outcome: Class variable (0 or 1)

"""

In [None]:
# Read the Pima Indians diabetes CSV from the Kaggle input directory into a DataFrame.
diabetesDF = pd.read_csv(
    '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
)
# Show the first rows as a quick sanity check of the load.
diabetesDF.head()

In [None]:
# Print pandas' built-in summary of the loaded frame (DataFrame.info).
diabetesDF.info()

In [None]:
# Pairwise correlation between every pair of columns (Pearson, the pandas default).
corr = diabetesDF.corr(method='pearson')
# Display the correlation table.
corr

In [None]:
# Render the correlation matrix as a heatmap, labelling both axes with the column names.
sns.heatmap(data=corr, xticklabels=corr.columns, yticklabels=corr.columns)

In [None]:
# The dataset holds 768 patient records, split by row position (no shuffling):
#   rows   0-649 -> training   (650 records)
#   rows 650-749 -> testing    (100 records)
#   rows 750-767 -> final check (18 records)
dfTrain = diabetesDF.iloc[:650]
dfTest = diabetesDF.iloc[650:750]
dfCheck = diabetesDF.iloc[750:]

In [None]:
# Separate the 'Outcome' label from the feature columns and convert both to numpy
# arrays so they can be fed to the scikit-learn estimators.
# NOTE: the label column is dropped with the `columns=` keyword. The previous positional
# form drop('Outcome', 1) was deprecated in pandas 1.3 and raises TypeError in pandas >= 2.0.
trainLabel = np.asarray(dfTrain['Outcome'])
trainData = np.asarray(dfTrain.drop(columns='Outcome'))
testLabel = np.asarray(dfTest['Outcome'])
testData = np.asarray(dfTest.drop(columns='Outcome'))


In [None]:
# Standardise each feature column: subtract the training-set mean and divide by the
# training-set standard deviation. The test split is scaled with the *training*
# statistics so no information from the test rows leaks into the scaling.
# Caution: trainData/testData are overwritten, so re-running this cell on its own
# would standardise already-standardised data.
means, stds = np.mean(trainData, axis=0), np.std(trainData, axis=0)

trainData, testData = (trainData - means) / stds, (testData - means) / stds

In [None]:
# Train a support vector classifier (SVC with its default settings) on the training data.
diabetesCheck = SVC().fit(trainData, trainLabel)

# Score the fitted classifier on the test split and report the result as a percentage.
accuracy = diabetesCheck.score(testData, testLabel)
print("accuracy = ", accuracy * 100, "%")

In [None]:
# Replace the previous model with a logistic regression (default settings) fitted on
# the same training data; later cells read its coefficients and predictions.
diabetesCheck = LogisticRegression().fit(trainData, trainLabel)

# Score on the test split and report the result as a percentage.
accuracy = diabetesCheck.score(testData, testLabel)
print("accuracy = ", accuracy * 100, "%")

In [None]:
# Collect the first row of the fitted model's coefficient matrix into a plain list.
coeff = [weight for weight in diabetesCheck.coef_[0]]
coeff

In [None]:
# Feature names in column order, i.e. every training column except the 'Outcome' label.
# The label is dropped with the `columns=` keyword: the positional-axis form
# drop('Outcome', 1) was deprecated in pandas 1.3 and raises TypeError in pandas >= 2.0.
labels = list(dfTrain.drop(columns='Outcome').columns)
labels

In [None]:
# Pair every feature name with its model coefficient, ordered from the most negative
# to the most positive weight.
features = pd.DataFrame({'Features': labels, 'importance': coeff})
features = features.sort_values(by='importance', ascending=True)

# Flag the sign of each weight; it selects the bar colour below.
features['positive'] = features['importance'] > 0
features = features.set_index('Features')

# Horizontal bar chart of the weights: blue for positive, red for zero or negative.
features['importance'].plot(
    kind='barh',
    figsize=(11, 6),
    color=features['positive'].map({True: 'blue', False: 'red'}),
)
plt.xlabel('Importance')

In [None]:
# Re-score the model currently bound to diabetesCheck on the test split and print the
# result as a percentage. (No model is loaded from disk in this notebook; this uses the
# in-memory estimator fitted in an earlier cell.)
accuracyModel = diabetesCheck.score(testData,testLabel)
print("accuracy = ",accuracyModel * 100,"%")

In [None]:
# Preview the first rows of the held-out "check" slice (rows 750 onward of diabetesDF).
dfCheck.head()

In [None]:
# Use the first record of the held-out check slice as a single-sample demo input.
sampleData = dfCheck[:1]

# Drop the label by name; the positional-axis form drop('Outcome', 1) raises
# TypeError in pandas >= 2.0.
sampleDataFeatures = np.asarray(sampleData.drop(columns='Outcome'))

# The classifiers were fitted on standardised features, so the sample must be scaled
# with the same training-set means/stds; raw values would be on a different scale
# from what the model was trained on.
sampleDataFeatures = (sampleDataFeatures - means) / stds
sampleDataFeatures

In [None]:
# Ask the current diabetesCheck model for the predicted class of the single sample...
prediction = diabetesCheck.predict(sampleDataFeatures)
# ...and for the per-class probabilities behind that prediction.
predictionProbab = diabetesCheck.predict_proba(sampleDataFeatures)
# Print both results.
print('prediction : ',prediction)
print('prediction Probabability : ', predictionProbab)

In [None]:
# Display the predicted label array as the cell output.
prediction

In [None]:
# Fit a k-nearest-neighbours classifier (k = 11) on the training data,
# then predict labels for the test split.
knnmodel = KNeighborsClassifier(n_neighbors=11).fit(trainData, trainLabel)
knnpred = knnmodel.predict(testData)

In [None]:
# Score the KNN model on the test split and print the result as a percentage.
print('accuracy = ', knnmodel.score(testData, testLabel) * 100)

In [None]:
# Confusion matrix for the KNN predictions on the test split.
# scikit-learn's signature is confusion_matrix(y_true, y_pred): the ground-truth labels
# go first, so rows are the actual classes and columns are the predicted classes.
# Passing the predictions first (as before) yields the transposed matrix.
confusion_matrix(testLabel, knnpred)