In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
plt.style.use("Solarize_Light2")

In [None]:
df = pd.read_csv("../input/pima-indians-diabetes-database/diabetes.csv")

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
df.info()

In [None]:
df.describe(percentiles = [0.01, 0.1, 0.25, 0.90, 0.99])

In [None]:
from pandas.plotting import parallel_coordinates
from pandas.plotting import andrews_curves

In [None]:
# Parallel-coordinates view of every column, coloured by the "Outcome" class.
# The axes occupies the top slot of a 2x1 grid on an 8x5 inch figure.
fig = plt.figure(figsize=(8, 5))
ax = fig.add_subplot(2, 1, 1)
parallel_coordinates(df, "Outcome", ax=ax)
plt.xticks(rotation="vertical")
plt.show()

In [None]:
from pandas.plotting import radviz
plt.figure(figsize=(8,6))
radviz(df, class_column = "Outcome", colormap="Set1", alpha = 0.5)
plt.show()

In [None]:
plt.figure(figsize=(12,10))
sns.pairplot(df, hue = "Outcome", palette = "Set1")

In [None]:
df.head()

In [None]:
sns.countplot(df["Outcome"])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X = df.iloc[:, :8]
Y = df.iloc[:, 8]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state = 0, test_size=0.20)

In [None]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score, recall_score, precision_score, f1_score

In [None]:
def get_model_evaluate(model, X_train, X_test, y_train, y_test):
    """Print an evaluation summary of a fitted classifier on both data splits.

    For the training split and then the test split, this prints the confusion
    matrix, the TP/TN/FP/FN counts, accuracy, accuracy error (1 - accuracy),
    sensitivity, specificity, false/true positive rates, precision, F1 and the
    classification report.

    The confusion matrix is indexed as a 2x2 array with class 1 as the
    positive class (``conf[1, 1]`` is read as TP), so the function is meant
    for binary targets.

    Parameters
    ----------
    model : object
        Already-fitted estimator exposing ``predict``.
    X_train, X_test : array-like
        Feature matrices passed to ``model.predict``.
    y_train, y_test : array-like
        True labels matching ``X_train`` / ``X_test``.

    Returns
    -------
    None
        All results are written to standard output.
    """
    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    print("**********************Training Results:*************************\n")

    # Classification report, printed at the end of the section.
    clf_report = pd.DataFrame(classification_report(y_train, y_train_pred, output_dict = True))

    print("Confusion Matrix:\n")

    conf = confusion_matrix(y_train, y_train_pred)

    print(conf)

    # Rows are true classes, columns are predicted classes.
    TP = conf[1,1]
    TN = conf[0,0]
    FP = conf[0,1]
    FN = conf[1,0]

    print("\n")

    print("TP: %s, TN: %s, FP: %s, FN: %s \n" %(TP, TN, FP, FN))

    # Accuracy is computed once and reused for the error below.
    train_accuracy = accuracy_score(y_train, y_train_pred)

    print("Accuracy Score:%s \n" % train_accuracy)

    print("Accuracy Error: %s \n" % (1 - train_accuracy))

    print("Sensitivity Score: %s \n" % (recall_score(y_train, y_train_pred)))

    print("Specificity Score: %s \n" % (TN/(TN+FP)))

    print("False Positive Rate: %s \n" % (FP/float(TN+FP)))

    print("True Positive Rate: %s \n" %(TP/float(TP+FN)))

    print("Precision Score: %s \n" % precision_score(y_train, y_train_pred))

    print("F1 Score: %s \n" % f1_score(y_train, y_train_pred))

    print("**Classification Report** \n")

    print(clf_report)


    print("\n**************************Testing Results:********************************\n")

    clf_report = pd.DataFrame(classification_report(y_test, y_test_pred, output_dict = True))

    print("Confusion Matrix: \n")

    conf = confusion_matrix(y_test, y_test_pred)

    print(conf)

    TP = conf[1,1]
    TN = conf[0,0]
    FP = conf[0,1]
    FN = conf[1,0]

    print("\nTP: %s, TN: %s, FP: %s, FN: %s \n" %(TP, TN, FP, FN))

    test_accuracy = accuracy_score(y_test, y_test_pred)

    print("Accuracy Score: %s \n" %(test_accuracy))

    # The error must come from the TEST accuracy; deriving it from the
    # training labels would report the training error in this section.
    print("Accuracy Error: %s \n" % (1 - test_accuracy))

    print("Sensitivity Score: %s \n" % (recall_score(y_test, y_test_pred)))

    print("Specificity Score: %s \n" % (TN/(TN+FP)))

    print("False Positive Rate: %s \n" %(FP/float(TN+FP)))

    print("True Positive Rate: %s \n" %(TP/float(TP+FN)))

    print("Precision Score: %s \n" % precision_score(y_test, y_test_pred))

    print("F1 Score: %s \n" %f1_score(y_test, y_test_pred))

    print("**Classification Report** \n")

    print(clf_report)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lg = LogisticRegression(random_state = 0, verbose = 0, n_jobs = -1)

In [None]:
lg.fit(X_train, y_train)

In [None]:
get_model_evaluate(lg, X_train, X_test, y_train, y_test)

In [None]:
scores = {
    
    "Logistic Regression":{
        
        "Train": accuracy_score(y_train, lg.predict(X_train)),
        "Test": accuracy_score(y_test, lg.predict(X_test))
        
    }
}

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
tree = DecisionTreeClassifier()

In [None]:
bagging_clf = BaggingClassifier(base_estimator = tree, n_estimators=1500, random_state = 42)

In [None]:
bagging_clf.fit(X_train, y_train)

In [None]:
get_model_evaluate(bagging_clf, X_train, X_test, y_train, y_test)

In [None]:
scores["Bagging Score"] = {
        
        "Train": accuracy_score(y_train, bagging_clf.predict(X_train)),
        "Test": accuracy_score(y_test, bagging_clf.predict(X_test))
        
    }

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_clf = RandomForestClassifier(random_state = 42, n_estimators = 1000)

In [None]:
rf_clf.fit(X_train, y_train)

In [None]:
get_model_evaluate(rf_clf, X_train, X_test, y_train, y_test)

In [None]:
scores["Random Forest"] = {
    
    "Train": accuracy_score(y_train, rf_clf.predict(X_train)),
    "Test": accuracy_score(y_test, rf_clf.predict(X_test))
}

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
ex_tree = ExtraTreesClassifier(n_estimators = 1000, max_features = 7, random_state = 42)

In [None]:
ex_tree.fit(X_train, y_train)

In [None]:
get_model_evaluate(ex_tree, X_train, X_test, y_train, y_test)

In [None]:
scores["Extra Tree Classifier"] = {
    
    "Train": accuracy_score(y_train, ex_tree.predict(X_train)),
    "Test": accuracy_score(y_test, ex_tree.predict(X_test))
}

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
ada_boost_clf = AdaBoostClassifier(n_estimators = 30)

In [None]:
ada_boost_clf.fit(X_train, y_train)

In [None]:
get_model_evaluate(ada_boost_clf, X_train, X_test, y_train, y_test)

In [None]:
scores["AdaBoost Classifier"] = {
    
    "Train": accuracy_score(y_train, ada_boost_clf.predict(X_train)),
    "Test": accuracy_score(y_test, ada_boost_clf.predict(X_test))
}

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
grad_boost = GradientBoostingClassifier(n_estimators = 100, random_state = 42)

In [None]:
grad_boost.fit(X_train, y_train)

In [None]:
get_model_evaluate(grad_boost, X_train, X_test, y_train, y_test)

In [None]:
scores["Gradient Boosting Classifier"] = {
    
    "Train": accuracy_score(y_train, grad_boost.predict(X_train)),
    "Test": accuracy_score(y_test, grad_boost.predict(X_test))
}

In [None]:
scores

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
nb = GaussianNB()

In [None]:
nb.fit(X_train, y_train)

In [None]:
get_model_evaluate(nb, X_train, X_test, y_train, y_test)

In [None]:
scores["Naive Bayes Score"] = {
    
    "Train": accuracy_score(y_train, nb.predict(X_train)),
    "Test": accuracy_score(y_test, nb.predict(X_test))
}

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
sgd = SGDClassifier(random_state = 42)

In [None]:
sgd.fit(X_train, y_train)

In [None]:
get_model_evaluate(sgd, X_train, X_test, y_train, y_test)

In [None]:
scores["SGD Classifier"] = {
    
    "Train": accuracy_score(y_train, sgd.predict(X_train)),
    "Test": accuracy_score(y_test, sgd.predict(X_test))
}

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier()

In [None]:
knn.fit(X_train, y_train)

In [None]:
get_model_evaluate(knn, X_train, X_test, y_train, y_test)

In [None]:
scores["KNN Score"] = {
    
    "Train": accuracy_score(y_train, knn.predict(X_train)),
    "Test": accuracy_score(y_test, knn.predict(X_test))
}

In [None]:
scores

In [None]:
from sklearn.svm import SVC

In [None]:
svm = SVC(kernel = "linear", random_state = 42)

In [None]:
svm.fit(X_train, y_train)

In [None]:
get_model_evaluate(svm, X_train, X_test, y_train, y_test)

In [None]:
scores["SVM Score"] = {
    
    "Train": accuracy_score(y_train, svm.predict(X_train)),
    "Test": accuracy_score(y_test, svm.predict(X_test))
}

In [None]:
scores = pd.DataFrame(scores).T

In [None]:
scores

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
mlp_clf = MLPClassifier(random_state = 0)

In [None]:
mlp_clf.fit(X_train, y_train)

In [None]:
get_model_evaluate(mlp_clf, X_train, X_test, y_train, y_test)

In [None]:
mlp = {
    
    "MLPClassifier":{
    
    "Train": accuracy_score(y_train, mlp_clf.predict(X_train)),
    "Test": accuracy_score(y_test, mlp_clf.predict(X_test))
        
    }
}

In [None]:
mlp = pd.DataFrame(mlp).T

In [None]:
scores = pd.concat([scores, mlp])

In [None]:
# Compare train and test accuracy of every model on ONE figure with two
# stacked panels. Opening a separate figure per panel while addressing slots
# 211 / 212 would leave half of each figure blank.
fig, (ax_train, ax_test) = plt.subplots(2, 1, figsize=(8, 12))

panels = (
    (ax_train, "Train", "Training Model Scores", None),
    (ax_test, "Test", "Testing Model Scores", "green"),
)
for ax, split, title, bar_color in panels:
    ax.barh(scores.index, scores[split], color=bar_color)
    ax.set_title(title)
    ax.set_xlabel("Accuracy")
    # Annotate each bar with its value, rounded so the label stays readable.
    for position, value in enumerate(scores[split]):
        ax.text(value, position, f"{value:.3f}", va="center")

fig.tight_layout()
plt.show()

Based on the results above, logistic regression performs well, so we will use it going forward and tune its hyperparameters.

In [None]:
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

In [None]:
# Search space for the logistic-regression hyperparameter search.
random_search = {

    # Only the L2 penalty is listed: the single solver below ("lbfgs") does not
    # support "l1" or "elasticnet", so those candidates can never be fitted and
    # only waste search iterations. The unpenalised option is left out because
    # its spelling differs between scikit-learn versions ("none" vs None).
    "penalty": ["l2"],
    "tol": [0.000001, 0.00001, 0.0001, 0.001],
    "C": [0.1, 0.5, 0.7, 0.01, 1.0],
    "fit_intercept": [True, False],
    "solver": ["lbfgs"],
    "max_iter": [100, 150, 200, 500, 1000],
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn
    # releases - confirm against the installed version before relying on it.
    "multi_class": ['auto', 'ovr', 'multinomial']
}

In [None]:
lg_clf = LogisticRegression()

In [None]:
model = RandomizedSearchCV(estimator = lg_clf, param_distributions = random_search, n_iter = 100, scoring = "accuracy", verbose = 0, n_jobs = -1, random_state = 0)

In [None]:
model.fit(X_train, y_train)

In [None]:
model.best_estimator_

In [None]:
lg_model = model.best_estimator_.predict(X_test)

In [None]:
print(confusion_matrix(y_test, lg_model))

In [None]:
print(accuracy_score(y_test, lg_model))

In [None]:
lg_model = pd.Series(lg_model)

In [None]:
y_test = y_test.reset_index().drop("index", axis = 1)

In [None]:
lg_model = pd.DataFrame(lg_model, columns = ["Predicted"])

In [None]:
pred_actual = pd.concat([y_test, lg_model], axis = 1)

In [None]:
X_test = X_test.reset_index().drop("index", axis = 1)

In [None]:
final_df = pd.concat([X_test, pred_actual], axis = 1)

In [None]:
# Class counts of the true labels next to those of the predictions.
fig, (ax_actual, ax_predicted) = plt.subplots(1, 2, figsize=(12, 4))

# sort_index() keeps the classes in the same left-to-right order in both
# panels; value_counts() alone orders bars by frequency, which may differ.
final_df["Outcome"].value_counts().sort_index().plot(kind = "bar", ax = ax_actual)
ax_actual.set_title("Actual")

final_df["Predicted"].value_counts().sort_index().plot(kind = "bar", color = "orange", ax = ax_predicted)
ax_predicted.set_title("Predicted")

plt.show()

In [None]:
final_df

In [None]:
plt.figure(figsize=(14,6))
plt.subplot(121)
sns.scatterplot(x = "Glucose", y = "BMI", hue = "Predicted", data = final_df, palette = "Set1")
plt.subplot(122)
sns.scatterplot(x = "Glucose", y = "BMI", hue = "Outcome", data = final_df, palette = "viridis")

**If you liked this notebook, don't forget to upvote!**