## Importing the libraries

In [154]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport

## Importing the dataset

In [155]:
!ls

 output.html  'Pima Indian Diabetes Database.ipynb'


In [156]:

url = "https://raw.githubusercontent.com/VishwajeetEkal/LP-II-Mini-Project/main/diabetes.csv"
dataset = pd.read_csv(url)

diabetes = dataset


#EDA using Pandas Profiling
# file = ProfileReport(dataset)
# file.to_file(output_file='output.html')

## Imputation


In [157]:
#Replacing the zero-values for Blood Pressure
df1 = dataset.loc[dataset['Outcome'] == 1]
df2 = dataset.loc[dataset['Outcome'] == 0]

df1 = df1.replace({'BloodPressure':0}, np.median(df1['BloodPressure']))
df2 = df2.replace({'BloodPressure':0}, np.median(df2['BloodPressure']))


In [158]:
#Replacing the zero-values for BMI
df1 = df1.replace({'BMI':0}, np.median(df1['BMI']))
df2 = df2.replace({'BMI':0}, np.median(df2['BMI']))


In [159]:
#Replacing the zero-values for Insulin
df1 = df1.replace({'Insulin':0}, np.median(df1['Insulin']))
df2 = df2.replace({'Insulin':0}, np.median(df2['Insulin']))


In [160]:
#Replacing the zero-values for SkinThickness
df1 = df1.replace({'SkinThickness':0}, np.median(df1['SkinThickness']))
df2 = df2.replace({'SkinThickness':0}, np.median(df2['SkinThickness']))

dataframe = [df1, df2]
dataset = pd.concat(dataframe)

## Feature Scaling

In [161]:
y = dataset.Outcome
x = dataset.drop('Outcome', axis = 1)
columns = x.columns

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(x)

data_x = pd.DataFrame(X, columns = columns)

## Splitting the dataset into the Training set and Test set 

In [162]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_x, y, test_size = 0.15, random_state = 45)


In [163]:
from imblearn.over_sampling import SMOTE

smt = SMOTE()

X_train, y_train = smt.fit_sample(X_train, y_train)

np.bincount(y_train)

array([430, 430])

## Training the Naive Bayes model on the Training set

In [164]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

## Predicting the Test set Results 

In [165]:
y_pred = classifier.predict(X_test)
print('Accuracy of NB classifier on test set: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of NB classifier on test set: 0.74


## Making the Confusion Matrix 

In [166]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[54 16]
 [14 32]]


In [167]:
import sklearn.metrics as metrics

print(metrics.f1_score(y_test, y_pred, average="macro"))
print(metrics.precision_score(y_test, y_pred, average="macro"))
print(metrics.recall_score(y_test, y_pred, average="macro"))

0.7317298797409806
0.7303921568627451
0.7335403726708074


## Training the K-NN model on the Training set 

In [168]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

## Predicting the Test set Results 

In [169]:
y_pred = classifier.predict(X_test)
print('Accuracy of K-NN model on test set: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of K-NN model on test set: 0.74


## Making the Confusion Matrix 

In [170]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[54 16]
 [14 32]]


In [171]:
import sklearn.metrics as metrics

print(metrics.f1_score(y_test, y_pred, average="macro"))
print(metrics.precision_score(y_test, y_pred, average="macro"))
print(metrics.recall_score(y_test, y_pred, average="macro"))

0.7317298797409806
0.7303921568627451
0.7335403726708074


## Training the SVM model on the Training set 

In [172]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

SVC(kernel='linear', random_state=0)

## Predicting the Test set Results 

In [173]:
y_pred = classifier.predict(X_test)
print('Accuracy of SVM(linear) model on test set: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of SVM(linear) model on test set: 0.73


## Making the Confusion Matrix 

In [174]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[52 18]
 [13 33]]


In [175]:
import sklearn.metrics as metrics

print(metrics.f1_score(y_test, y_pred, average="macro"))
print(metrics.precision_score(y_test, y_pred, average="macro"))
print(metrics.recall_score(y_test, y_pred, average="macro"))

0.7253913707521955
0.723529411764706
0.7301242236024845


## Training Random Forest Classifer on Training Set

In [176]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=300, bootstrap = True, max_features = 'sqrt')
classifier.fit(X_train, y_train)


RandomForestClassifier(max_features='sqrt', n_estimators=300)

## Predicting Test Set Results 

In [177]:
y_pred = classifier.predict(X_test)
print('Accuracy of Random Forest on test set: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of Random Forest on test set: 0.84


In [178]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[63  7]
 [11 35]]


In [179]:
import sklearn.metrics as metrics

print(metrics.f1_score(y_test, y_pred, average="macro"))
print(metrics.precision_score(y_test, y_pred, average="macro"))
print(metrics.recall_score(y_test, y_pred, average="macro"))

0.8352272727272727
0.8423423423423424
0.8304347826086957
