In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Read the Pima Indians diabetes CSV from the relative input directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer = '../input/pima-indians-diabetes-database/diabetes.csv')

In [None]:
# Show the first five rows (the default of head) as the cell output.
data.head(n = 5)

# Replacing zero values with null values since these features should not have zero values

In [None]:
# Measurements for which a recorded 0 is treated as "missing" rather than a real value.
zero_invalid_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[zero_invalid_cols] = data[zero_invalid_cols].replace(0, np.nan)

# Replacing null values with means of each feature

In [None]:
# Fill the missing values of each listed column with that column's mean
# (Series/DataFrame.mean skips NaN by default, so the mean is over observed values).
# The result is assigned back to `data` instead of calling
# `data[col].fillna(..., inplace = True)`: that form is chained assignment and, with
# pandas copy-on-write, only modifies a temporary object and leaves `data` unchanged.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split performed later in the notebook - confirm this is acceptable.
imputed_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[imputed_cols] = data[imputed_cols].fillna(data[imputed_cols].mean())

In [None]:
# Pairwise correlations: printed as text, then drawn as an annotated heatmap.
corr_matrix = data.corr()
print(corr_matrix)
plt.figure(figsize=(20,15))
sns.heatmap(data = corr_matrix, annot = True)
plt.show()

In [None]:
# Grid of pairwise scatter plots for every column of the frame.
scatter = pd.plotting.scatter_matrix(frame = data, figsize = (20, 20))

In [None]:
# Same pairwise view through seaborn, with points coloured by the Outcome column.
scatter_2 = sns.pairplot(data = data, hue = 'Outcome')

# Assign X and y values

In [None]:
# All columns except the last form the predictors; the last column is the label.
predictor_frame, label_series = data.iloc[:, :-1], data.iloc[:, -1]
X, y = predictor_frame.values, label_series.values

# Splitting dataset into training set and test set

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train, X_test, y_train, y_test = split_parts

# Feature scaling

In [None]:
# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# KNN Classifier

## Grid Search to find best hyper parameters for KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Exhaustive 10-fold cross-validated search over neighbour count and distance metric.
# The neighbour grid starts at 1: n_neighbors = 0 is not a valid setting, so those
# candidates could never be fitted and only wasted search time.
parameters = {'n_neighbors': list(range(1,51)), 'metric':['minkowski', 'euclidean', 'manhattan', 'chebyshev']}
grid_search = GridSearchCV(estimator = KNeighborsClassifier(), param_grid = parameters, scoring = 'accuracy', cv = 10, n_jobs = -1)
grid_search.fit(X_train, y_train)
# best_score_ is the mean cross-validated accuracy of the best candidate.
print('KNN Classifier Grid Search Best Accuracy = {:.2f}%'.format(grid_search.best_score_ *100))
print('KNN Classifier Best Parameters:', grid_search.best_params_)

## Training KNN Classifier

In [None]:
# Fit KNN with fixed hyper-parameters (presumably taken from an earlier grid-search
# run - verify) and predict the held-out test rows.
KNN = KNeighborsClassifier(n_neighbors = 31, metric = 'minkowski').fit(X_train, y_train)
KNN_y_pred = KNN.predict(X_test)

## Computing TP, TN, FP, FN and Accuracy of KNN Classifier Model

In [None]:
# Test-set confusion matrix and accuracy for the fitted KNN model.
KNN_confusion = confusion_matrix(y_test, KNN_y_pred)
KNN_test_accuracy = accuracy_score(y_test, KNN_y_pred)
print('k-Nearest Neighbors Confusion Matrix:', KNN_confusion)
print('k-Nearest Neighbors Model Accuracy = {:.2f}%'.format(KNN_test_accuracy*100))

## k-Fold Cross-Validation

In [None]:
# 10-fold cross-validation on the training set; KNN_score keeps the rounded mean
# for the model comparison at the end of the notebook.
KNN_scores = cross_val_score(KNN, X_train, y_train, cv = 10)
KNN_mean, KNN_std = KNN_scores.mean(), KNN_scores.std()
print('k-Nearest Neighbors Accuracies:', KNN_scores)
print('k-Nearest Neighbors Mean Accuracy = {:.2f}%'.format(KNN_mean*100))
KNN_score = round(KNN_mean, 4)
print('k-Nearest Neighbors Standard Deviation = {:.2f}%'.format(KNN_std*100))

# Logistic Regression

## Grid Search to find best hyper parameters for Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Coarse search: integer C from 1 to 10 crossed with every listed solver,
# scored by 10-fold cross-validated accuracy.
C = [*range(1, 11)]
parameters = {
    'C': C,
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}
grid_search = GridSearchCV(
    estimator = LogisticRegression(random_state = 0),
    param_grid = parameters,
    scoring = 'accuracy',
    cv = 10,
    n_jobs = -1,
)
grid_search.fit(X_train, y_train)
print('Logistic Regression Grid Search Best Accuracy = {:.2f}%'.format(grid_search.best_score_ *100))
print('Logistic Regression Best Parameters:', grid_search.best_params_)

In [None]:
# Finer search over small C values. The first grid point (C = 0) is dropped with
# [1:] because the inverse regularisation strength must be strictly positive; every
# other candidate is identical to the original grid.
C = np.arange(0, 1.1, 0.001)[1:] #narrowing possible values of C
parameters = {'C': C, 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
grid_search = GridSearchCV(estimator = LogisticRegression(random_state = 0), param_grid = parameters, scoring = 'accuracy', cv = 10, n_jobs = -1)
grid_search.fit(X_train, y_train)
print('Logistic Regression Grid Search Best Accuracy = {:.5f}%'.format(grid_search.best_score_ *100))
print('Logistic Regression Best Parameters:', grid_search.best_params_)

In [None]:
# Fit logistic regression with fixed hyper-parameters (presumably from an earlier
# grid-search run - verify), report test-set metrics, then 10-fold CV on the
# training set. LR_score keeps the rounded CV mean for the final comparison.
LR = LogisticRegression(C = 0.005, solver = 'liblinear', random_state = 0).fit(X_train, y_train)
LR_y_pred = LR.predict(X_test)
LR_confusion = confusion_matrix(y_test, LR_y_pred)
LR_test_accuracy = accuracy_score(y_test, LR_y_pred)
print('Logistic Regression Confusion Matrix:', LR_confusion)
print('Logistic Regression Model Accuracy = {:.2f}%'.format(LR_test_accuracy*100))
LR_scores = cross_val_score(LR, X_train, y_train, cv = 10)
LR_mean, LR_std = LR_scores.mean(), LR_scores.std()
print('Logistic Regression Scores:', LR_scores)
print('Logistic Regression Mean Accuracy = {:.2f}%'.format(LR_mean*100))
LR_score = round(LR_mean, 4)
print('Logistic Regression Standard Deviation = {:.2f}%'.format(LR_std*100))

# Support Vector Machine

## Grid Search to find best hyper parameters for SVM Classifier

In [None]:
from sklearn.svm import SVC
# Coarse search for the linear-kernel SVM: integer C from 1 to 10,
# scored by 10-fold cross-validated accuracy.
C = [*range(1, 11)]
parameters = {'C': C}
grid_search = GridSearchCV(
    estimator = SVC(kernel = 'linear', random_state = 0),
    param_grid = parameters,
    scoring = 'accuracy',
    cv = 10,
    n_jobs = -1,
)
grid_search.fit(X_train, y_train)
print('SVM Grid Search Best Accuracy = {:.2f}%'.format(grid_search.best_score_ *100))
print('SVM Best Parameters:', grid_search.best_params_)

In [None]:
# Finer search over small C values for the linear-kernel SVM. The first grid point
# (C = 0) is dropped with [1:] because the regularisation parameter must be strictly
# positive; every other candidate is identical to the original grid.
C = np.arange(0, 2.1, 0.001)[1:] #narrowing possible values of C
parameters = {'C': C}
grid_search = GridSearchCV(estimator = SVC(kernel = 'linear', random_state = 0), param_grid = parameters, scoring = 'accuracy', cv = 10, n_jobs = -1)
grid_search.fit(X_train, y_train)
print('SVM Grid Search Best Accuracy = {:.2f}%'.format(grid_search.best_score_ *100))
print('SVM Best Parameters:', grid_search.best_params_)

## Training SVM Classifier

In [None]:
# Fit the linear SVM with fixed hyper-parameters (presumably from an earlier
# grid-search run - verify), report test-set metrics, then 10-fold CV on the
# training set. svc_score keeps the rounded CV mean for the final comparison.
svc = SVC(C = 0.007, kernel = 'linear', random_state = 0).fit(X_train, y_train)
svc_y_pred = svc.predict(X_test)
svc_confusion = confusion_matrix(y_test, svc_y_pred)
svc_test_accuracy = accuracy_score(y_test, svc_y_pred)
print('SVM Confusion Matrix:', svc_confusion)
print('Support Vector Machine Model Accuracy = {:.2f}%'.format(svc_test_accuracy*100))
svc_scores = cross_val_score(svc, X_train, y_train, cv = 10)
svc_mean, svc_std = svc_scores.mean(), svc_scores.std()
print('Support Vector Machine Scores:', svc_scores)
print('Support Vector Machine Mean Accuracy = {:.2f}%'.format(svc_mean*100))
svc_score = round(svc_mean, 4)
print('Support Vector Machine Standard Deviation = {:.2f}%'.format(svc_std*100))

# Kernel Support Vector Machine

## Grid Search to find best hyper parameters for Kernel SVM Classifier

In [None]:
# Coarse search for the kernel SVM: gamma in {'scale', 'auto', 1..50}, C in 1..10,
# rbf and sigmoid kernels, scored by 10-fold cross-validated accuracy.
gamma = ['scale', 'auto', *range(1, 51)]
C = [*range(1, 11)]
parameters = {
    'C': C,
    'kernel': ['rbf', 'sigmoid'],
    'gamma': gamma,
}
grid_search = GridSearchCV(
    estimator = SVC(random_state = 0),
    param_grid = parameters,
    scoring = 'accuracy',
    cv = 10,
    n_jobs = -1,
)
grid_search.fit(X_train, y_train)
print('Kernel SVM Grid Search Best Accuracy = {:.2f}%'.format(grid_search.best_score_ *100))
print('Kernel SVM Best Parameters:', grid_search.best_params_)

In [None]:
# Second, narrower kernel-SVM search. The leading 0 of both numeric grids is dropped
# with [1:]: C must be strictly positive, and gamma = 0 makes the rbf/sigmoid kernel
# ignore the inputs entirely. All remaining candidates match the original grid.
gamma = ['scale', 'auto'] + list(np.arange(0, 2.1, 0.1)[1:])
C = np.arange(0, 1.1, 0.1)[1:]
parameters = {'C': C, 'kernel': ['rbf', 'sigmoid'], 'gamma': gamma}
grid_search = GridSearchCV(estimator = SVC(random_state = 0), param_grid = parameters, scoring = 'accuracy', cv = 10, n_jobs = -1)
grid_search.fit(X_train, y_train)
print('Kernel SVM Grid Search Best Accuracy = {:.2f}%'.format(grid_search.best_score_ *100))
print('Kernel SVM Best Parameters:', grid_search.best_params_)

In [None]:
# Third, finest kernel-SVM search. The leading 0 of both numeric grids is dropped
# with [1:]: C must be strictly positive, and gamma = 0 makes the rbf/sigmoid kernel
# ignore the inputs entirely. All remaining candidates match the original grid.
gamma = ['scale', 'auto'] + list(np.arange(0, 0.5, 0.01)[1:])
C = np.arange(0, 0.5, 0.01)[1:]
parameters = {'C': C, 'kernel': ['rbf', 'sigmoid'], 'gamma': gamma}
grid_search = GridSearchCV(estimator = SVC(random_state = 0), param_grid = parameters, scoring = 'accuracy', cv = 10, n_jobs = -1)
grid_search.fit(X_train, y_train)
print('Kernel SVM Grid Search Best Accuracy = {:.2f}%'.format(grid_search.best_score_ *100))
print('Kernel SVM Best Parameters:', grid_search.best_params_)

## Training Kernel SVM Classifier

In [None]:
# Fit the sigmoid-kernel SVM with fixed hyper-parameters (presumably from an earlier
# grid-search run - verify), report test-set metrics, then 10-fold CV on the
# training set. ksvc_score keeps the rounded CV mean for the final comparison.
ksvc = SVC(C = 0.09, gamma = 0.07, kernel = 'sigmoid', random_state = 0).fit(X_train, y_train)
ksvc_y_pred = ksvc.predict(X_test)
ksvc_confusion = confusion_matrix(y_test, ksvc_y_pred)
ksvc_test_accuracy = accuracy_score(y_test, ksvc_y_pred)
print('Kernel SVM Confusion Matrix:', ksvc_confusion)
print('Kernel Support Vector Machine Model Accuracy = {:.2f}%'.format(ksvc_test_accuracy*100))
ksvc_scores = cross_val_score(ksvc, X_train, y_train, cv = 10)
ksvc_mean, ksvc_std = ksvc_scores.mean(), ksvc_scores.std()
print('Kernel Support Vector Machine Scores:', ksvc_scores)
print('Kernel Support Vector Machine Mean Accuracy = {:.2f}%'.format(ksvc_mean*100))
ksvc_score = round(ksvc_mean, 4)
print('Kernel Support Vector Machine Standard Deviation = {:.2f}%'.format(ksvc_std*100))

# Naive Bayes

## Training Naive Bayes Model

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes with default settings: test-set metrics, then 10-fold CV on
# the training set. gnb_score keeps the rounded CV mean for the final comparison.
gnb = GaussianNB().fit(X_train, y_train)
gnb_y_pred = gnb.predict(X_test)
gnb_confusion = confusion_matrix(y_test, gnb_y_pred)
gnb_test_accuracy = accuracy_score(y_test, gnb_y_pred)
print('Naive Bayes Confusion Matrix:', gnb_confusion)
print('Naive Bayes Model Accuracy = {:.2f}%'.format(gnb_test_accuracy*100))
gnb_scores = cross_val_score(gnb, X_train, y_train, cv = 10)
gnb_mean, gnb_std = gnb_scores.mean(), gnb_scores.std()
print('Naive Bayes Scores:', gnb_scores)
print('Naive Bayes Mean Accuracy = {:.2f}%'.format(gnb_mean*100))
gnb_score = round(gnb_mean, 4)
print('Naive Bayes Standard Deviation = {:.2f}%'.format(gnb_std*100))

# Decision Tree Classifier

## Grid Search to find best hyper parameters

In [None]:
from sklearn.tree import DecisionTreeClassifier
# 10-fold cross-validated search over split criterion and max_features.
# The integer grid starts at 1 because max_features = 0 is not a valid feature count.
# 'auto' is omitted: for classifiers it was an alias of 'sqrt' (already in the grid)
# and is no longer accepted by current scikit-learn releases.
max_feat = list(range(1,9)) + ['sqrt', 'log2']
parameters = {'criterion':['gini', 'entropy'], 'max_features': max_feat}
grid_search = GridSearchCV(estimator = DecisionTreeClassifier(random_state = 0), param_grid = parameters, scoring = 'accuracy', cv = 10, n_jobs = -1)
grid_search.fit(X_train, y_train)
print('Decision Tree Grid Search Best Accuracy =  {:.2f}%'.format(grid_search.best_score_ *100))
print('Decision Tree Best Parameters:', grid_search.best_params_)

## Training Decision Tree Classifier

In [None]:
# Fit the decision tree with fixed hyper-parameters (presumably from an earlier
# grid-search run - verify), report test-set metrics, then 10-fold CV on the
# training set. dt_score keeps the rounded CV mean for the final comparison.
dt = DecisionTreeClassifier(criterion = 'entropy', max_features = 5, random_state = 0).fit(X_train, y_train)
dt_y_pred = dt.predict(X_test)
dt_confusion = confusion_matrix(y_test, dt_y_pred)
dt_test_accuracy = accuracy_score(y_test, dt_y_pred)
print('Decision Tree Confusion Matrix:', dt_confusion)
print('Decision Tree Model Accuracy = {:.2f}%'.format(dt_test_accuracy*100))
dt_scores = cross_val_score(dt, X_train, y_train, cv = 10)
dt_mean, dt_std = dt_scores.mean(), dt_scores.std()
print('Decision Tree Scores:', dt_scores)
print('Decision Tree Mean Accuracy = {:.2f}%'.format(dt_mean*100))
dt_score = round(dt_mean, 4)
print('Decision Tree Standard Deviation = {:.2f}%'.format(dt_std*100))

# Random Forest Classifier

## GridSearch to find best hyper parameters

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 10-fold cross-validated search over forest size (1-30 trees), split criterion and
# max_features. 'auto' is omitted from max_features: for classifiers it was an alias
# of 'sqrt' (already in the grid) and is no longer accepted by current scikit-learn
# releases, so it only added redundant or failing candidates.
max_feat = list(range(1, 9)) + ['sqrt', 'log2']
parameters = {'n_estimators':list(range(1,31)), 'criterion':['gini', 'entropy'], 'max_features': max_feat}
grid_search = GridSearchCV(estimator = RandomForestClassifier(random_state = 0), param_grid = parameters, scoring = 'accuracy', cv = 10, n_jobs = -1)
grid_search.fit(X_train, y_train)
print('Random Forest Grid Search Best Accuracy = {:.2f}%'.format(grid_search.best_score_ *100))
print('Random Forest Best Parameters:', grid_search.best_params_)

In [None]:
# Same random-forest search widened to 1-50 trees. 'auto' is omitted from
# max_features: for classifiers it was an alias of 'sqrt' (already in the grid) and
# is no longer accepted by current scikit-learn releases.
max_feat = list(range(1, 9)) + ['sqrt', 'log2']
parameters = {'n_estimators':list(range(1, 51)), 'criterion':['gini', 'entropy'], 'max_features': max_feat}
grid_search = GridSearchCV(estimator = RandomForestClassifier(random_state = 0), param_grid = parameters, scoring = 'accuracy', cv = 10, n_jobs = -1)
grid_search.fit(X_train, y_train)
print('Random Forest Grid Search Best Accuracy = {:.2f}%'.format(grid_search.best_score_ *100))
print('Random Forest Best Parameters:', grid_search.best_params_)

## Training Random Forest Classifier

In [None]:
# Fit the random forest with fixed hyper-parameters (presumably from an earlier
# grid-search run - verify), report test-set metrics, then 10-fold CV on the
# training set. rf_score keeps the rounded CV mean for the final comparison.
rf = RandomForestClassifier(n_estimators = 30, criterion = 'gini', max_features = 3, random_state = 0).fit(X_train, y_train)
rf_y_pred = rf.predict(X_test)
rf_confusion = confusion_matrix(y_test, rf_y_pred)
rf_test_accuracy = accuracy_score(y_test, rf_y_pred)
print('Random Forest Confusion Matrix:', rf_confusion)
print('Random Forest Model Accuracy = {:.2f}%'.format(rf_test_accuracy*100))
rf_scores = cross_val_score(rf, X_train, y_train, cv = 10)
rf_mean, rf_std = rf_scores.mean(), rf_scores.std()
print('Random Forest Scores:', rf_scores)
print('Random Forest Mean Accuracy = {:.2f}%'.format(rf_mean*100))
rf_score = round(rf_mean, 4)
print('Random Forest Standard Deviation = {:.2f}%'.format(rf_std*100))

# XGBoost Classifier

## Grid Search to find best hyper parameters

In [None]:
from xgboost import XGBClassifier
# 10-fold cross-validated search over booster type, gamma (0-10) and tree depth (1-6).
parameters = {
    'booster': ['gbtree', 'dart'],
    'gamma': [*range(0, 11)],
    'max_depth': [*range(1, 7)],
}
grid_search = GridSearchCV(
    estimator = XGBClassifier(random_state = 0),
    param_grid = parameters,
    scoring = 'accuracy',
    cv = 10,
    n_jobs = -1,
)
grid_search.fit(X_train, y_train)
print('XGBoost Grid Search Best Accuracy = {:.2f}%'.format(grid_search.best_score_ *100))
print('XGBoost Best Parameters:', grid_search.best_params_)

## Train XGBoost Classifier

In [None]:
# Fit XGBoost with fixed hyper-parameters (presumably from an earlier grid-search
# run - verify), report test-set metrics, then 10-fold CV on the training set.
# xgb_score keeps the rounded CV mean for the final comparison.
xgb = XGBClassifier(booster = 'gbtree', gamma = 1, max_depth = 3, random_state = 0)
xgb.fit(X_train, y_train)
xgb_y_pred = xgb.predict(X_test)
xgb_confusion = confusion_matrix(y_test, xgb_y_pred)
xgb_test_accuracy = accuracy_score(y_test, xgb_y_pred)
print('XGBoost Confusion Matrix:', xgb_confusion)
print('XGBoost Model Accuracy = {:.2f}%'.format(xgb_test_accuracy*100))
xgb_scores = cross_val_score(xgb, X_train, y_train, cv = 10)
xgb_mean, xgb_std = xgb_scores.mean(), xgb_scores.std()
print('XGBoost Scores:', xgb_scores)
print('XGBoost Mean Accuracy = {:.2f}%'.format(xgb_mean*100))
xgb_score = round(xgb_mean, 4)
print('XGBoost Standard Deviation = {:.2f}%'.format(xgb_std*100))

# CatBoost Classifier

## Grid Search to find best hyper parameters

In [None]:
from catboost import CatBoostClassifier
# 10-fold cross-validated search over number of trees (1-50) and tree depth (1-6).
parameters = {
    'n_estimators': [*range(1, 51)],
    'max_depth': [*range(1, 7)],
}
grid_search = GridSearchCV(
    estimator = CatBoostClassifier(random_state = 0),
    param_grid = parameters,
    scoring = 'accuracy',
    cv = 10,
    n_jobs = -1,
)
grid_search.fit(X_train, y_train)
print('CatBoost Grid Search Best Accuracy = {:.2f}%'.format(grid_search.best_score_ *100))
print('CatBoost Best Parameters:', grid_search.best_params_)

## Training CatBoost Classifier

In [None]:
# Fit CatBoost with fixed hyper-parameters (presumably from an earlier grid-search
# run - verify), report test-set metrics, then 10-fold CV on the training set.
cb = CatBoostClassifier(n_estimators = 18, max_depth = 6, random_state = 0)
cb.fit(X_train, y_train)
# Predict with the CatBoost model itself. The original called xgb.predict here, so
# the confusion matrix and accuracy printed as "CatBoost" were really XGBoost's.
cb_y_pred = cb.predict(X_test)
print('CatBoost Confusion Matrix:', confusion_matrix(y_test, cb_y_pred))
print('CatBoost Model Accuracy = {:.2f}%'.format(accuracy_score(y_test, cb_y_pred)*100))
cb_scores = cross_val_score(estimator = cb, X = X_train, y = y_train, cv = 10)
print('CatBoost Scores:', cb_scores)
print('CatBoost Mean Accuracy = {:.2f}%'.format(cb_scores.mean()*100))
# Rounded CV mean used by the model comparison at the end of the notebook.
cb_score = round(cb_scores.mean(), 4)
print('CatBoost Standard Deviation = {:.2f}%'.format(cb_scores.std()*100))

# Finding the most accurate model

In [None]:
# Gather the rounded mean CV accuracies in the order KNN, LR, SVC, KSVC, NB, DT, RF,
# XGB, CB; the list is printed after the first entry and again once complete.
accuracies = [KNN_score]
print(accuracies)
accuracies += [LR_score, svc_score, ksvc_score, gnb_score, dt_score, rf_score, xgb_score, cb_score]
print(accuracies)

In [None]:
# Map each model label to its mean CV accuracy and report the label with the highest
# score (on ties the earliest label in `models` wins).
models = ['KNN', 'LR', 'SVC', 'KSVC', 'NB', 'DT', 'RF', 'XGB', 'CB']
score_dict = {label: accuracy for label, accuracy in zip(models, accuracies)}
print(score_dict)
best_model = max(score_dict, key = lambda label: score_dict[label])
print('Best Model is', best_model)

## The model which achieved the highest mean accuracy of 77.52% is the CatBoost Classifier.

# Final Model

In [None]:
# Final model: refit CatBoost with the fixed hyper-parameters, report test-set
# metrics, then 10-fold CV on the training set.
cb = CatBoostClassifier(n_estimators = 18, max_depth = 6, random_state = 0)
cb.fit(X_train, y_train)
# Predict with the CatBoost model itself. The original called xgb.predict here, so
# the "final model" test metrics were really those of the XGBoost classifier.
cb_y_pred = cb.predict(X_test)
print('CatBoost Confusion Matrix:', confusion_matrix(y_test, cb_y_pred))
print('CatBoost Model Accuracy = {:.2f}%'.format(accuracy_score(y_test, cb_y_pred)*100))
cb_scores = cross_val_score(estimator = cb, X = X_train, y = y_train, cv = 10)
print('CatBoost Scores:', cb_scores)
print('CatBoost Mean Accuracy = {:.2f}%'.format(cb_scores.mean()*100))
cb_score = round(cb_scores.mean(), 4)
print('CatBoost Standard Deviation = {:.2f}%'.format(cb_scores.std()*100))