In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Blanket filter: no category is given, so every warning raised after this
# point (including library deprecation notices) is suppressed.
warnings.filterwarnings('ignore')

In [None]:
# Path of the diabetes CSV, relative to the notebook's working directory.
DATA_PATH = '../input/pima-indians-diabetes-database/diabetes.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

In [None]:
# (rows, columns) of the raw frame, before any filtering.
df.shape

In [None]:
# True if at least one cell in the frame is NaN/None.
# Zeros used as placeholders are not detected by this check.
df.isna().to_numpy().any()

In [None]:
# One histogram per column, drawn on the raw data (before preprocessing).
df.hist(figsize=(20,20))

In [None]:
# Number of rows in each Outcome class (class balance).
df.groupby('Outcome').size()

In [None]:
# Bar chart of the row count for each Outcome value.
sns.countplot(x='Outcome', data=df)

In [None]:
# Box-and-whisker plot of every column on one set of axes.
df.plot.box(figsize=(20, 10))
plt.show()

In [None]:
# Keep only rows with SkinThickness below 80 and Insulin at most 600.
skin_in_range = df['SkinThickness'] < 80
insulin_in_range = df['Insulin'] <= 600
df = df[skin_in_range & insulin_in_range]
df.shape

In [None]:
# Pairwise column correlations, drawn as an annotated heatmap.
plt.figure(figsize=(20, 10))
corrmat = df.corr()
sns.heatmap(data=corrmat, annot=True, cmap='coolwarm')

In [None]:
# Pairwise column correlations as a table.
df.corr()

In [None]:
# Report how many rows hold a 0 in each column; a 0 is treated here as a
# "missing value" placeholder.
# NOTE(review): 0 pregnancies is presumably a genuine value rather than a
# missing one - confirm before acting on that count.
print("total number of rows : {0}".format(len(df)))

zero_count_labels = [
    ('pregnancies', 'Pregnancies'),
    ('glucose', 'Glucose'),
    ('bp', 'BloodPressure'),
    ('skinthickness', 'SkinThickness'),
    ('insulin', 'Insulin'),
    ('bmi', 'BMI'),
    ('diabetespedigree', 'DiabetesPedigreeFunction'),
    ('age', 'Age'),
]
for label, column in zero_count_labels:
    zero_rows = df.loc[df[column] == 0]
    print("number of missing {0}: {1}".format(label, len(zero_rows)))

In [None]:
# Measurement columns in which a 0 is treated as "value not recorded".
zero_as_missing = ['Insulin', 'Glucose', 'BMI', 'BloodPressure', 'SkinThickness']

# Cast to float first so a fractional mean can be stored without a dtype
# clash. astype also returns a new frame, which detaches df from the filtered
# slice it was derived from.
df = df.astype({column: 'float64' for column in zero_as_missing})

for column in zero_as_missing:
    is_missing = df[column] == 0
    # Average only the recorded values: including the 0 placeholders in the
    # mean would drag the fill value towards zero.
    df.loc[is_missing, column] = df.loc[~is_missing, column].mean()

In [None]:
# Preview the first rows after the zero placeholders were replaced.
df.head()

In [None]:
# Grid of pairwise plots for the columns, coloured by Outcome.
sns.pairplot(df, hue='Outcome')

In [None]:
# One histogram per column, drawn after outlier filtering and zero imputation
# (compare with the histograms of the raw data).
df.hist(figsize=(20,20))

In [None]:
# Divide every column by its own maximum. The Outcome column is scaled too.
# NOTE(review): the maxima are taken over all rows, i.e. before the
# train/test split - confirm this is intended.
column_max = df.max()
df = df.div(column_max, axis='columns')
df.head()

In [None]:
# Features are every column except the last; the target is the last column.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X.head(10)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Test accuracy of each classifier, appended by the model cells below in the
# order they run. Re-running a model cell appends a second entry, so re-run
# this cell first to reset the list.
l=[]

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Linear-kernel support vector classifier; fit() returns the fitted estimator.
classifier = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Fraction of test rows predicted correctly, printed as a percentage.
acc = accuracy_score(y_test, y_pred)
print('SVM:', acc * 100)
l.append(acc)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are actual classes and columns are predicted classes, for the
# predictions currently held in y_pred.
cm = confusion_matrix(y_test, y_pred)
# fmt='d' prints the cell counts as plain integers; the default '.2g' format
# renders any count of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(cm, annot=True, fmt='d', cmap='YlGnBu')
plt.title('Confusion Matrix')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic regression with library defaults; fit() returns the fitted estimator.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Fraction of test rows predicted correctly, printed as a percentage.
acc = accuracy_score(y_test, y_pred)
print('Logistic Regression:', acc * 100)
l.append(acc)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are actual classes and columns are predicted classes, for the
# predictions currently held in y_pred.
cm = confusion_matrix(y_test, y_pred)
# fmt='d' prints the cell counts as plain integers; the default '.2g' format
# renders any count of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(cm, annot=True, fmt='d', cmap='YlGnBu')
plt.title('Confusion Matrix')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
from sklearn.metrics import classification_report as cr

# Text report of per-class metrics for the predictions currently in y_pred.
report = cr(y_test, y_pred)
print(report)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-criterion decision tree; fit() returns the fitted estimator.
# accuracy_score is the name imported by an earlier model cell.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Fraction of test rows predicted correctly, printed as a percentage.
acc = accuracy_score(y_test, y_pred)
print('Decision Tree:', acc * 100)
l.append(acc)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with library defaults; fit() returns the fitted estimator.
classifier = GaussianNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Fraction of test rows predicted correctly, printed as a percentage.
acc = accuracy_score(y_test, y_pred)
print('Naive Bayes:', acc * 100)
l.append(acc)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 30 entropy-criterion trees with a fixed seed.
classifier = RandomForestClassifier(
    n_estimators=30,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Fraction of test rows predicted correctly, printed as a percentage.
acc = accuracy_score(y_test, y_pred)
print('Random Forest:',acc * 100)
l.append(acc)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours vote; Minkowski metric with p=2.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Fraction of test rows predicted correctly, printed as a percentage.
acc = accuracy_score(y_test, y_pred)
print('Knn:',acc * 100)
l.append(acc)

In [None]:
# Accuracies collected so far, in the order the model cells were run.
l

In [None]:
# Display names, listed in the same order the model cells append to `l`;
# the i-th name is paired with the i-th accuracy.
y_axis = [
    'Support Vector Classifier',
    'Logistic Regression',
    'Decision Tree Classifier',
    'Gaussian Naive Bayes',
    'Random Forest Classifier',
    'K-Neighbors Classifier',
]
x_axis = l

# Horizontal bars: one per classifier, length equal to its test accuracy.
sns.barplot(x=x_axis, y=y_axis)
plt.xlabel('Accuracy')

Conclusion: among the six classifiers compared, Logistic Regression achieved the best test-set accuracy (82.35 % in the recorded run).