In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_recall_fscore_support, f1_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pickle

In [42]:
# Restore the cached train/test split: features first, then labels.
# NOTE(review): pickle.load can execute arbitrary code, so these files should
# only ever be ones produced by this project.
with open("pickle/X_train.pkl", mode="rb") as f:
    X_train = pickle.load(f)
with open("pickle/X_test.pkl", mode="rb") as f:
    X_test = pickle.load(f)

with open("pickle/y_train.pkl", mode="rb") as f:
    y_train = pickle.load(f)
with open("pickle/y_test.pkl", mode="rb") as f:
    y_test = pickle.load(f)


In [43]:
# Count missing values per feature column of the test split.
X_test.isna().sum()

Pregnancies                 0
BloodPressure               0
SkinThickness               0
Insulin                     0
DiabetesPedigreeFunction    0
Age                         0
BMI_Normal                  0
BMI_Obesity I               0
BMI_Obesity II              0
BMI_Obesity III             0
BMI_Overweight              0
BMI_Underweight             0
Glucose_High Glucose        0
Glucose_Low Glucose         0
Glucose_Normal              0
Glucose_Prediabetic         0
dtype: int64

In [44]:
# Baseline classifier: logistic regression with the library's default settings.
model_1 = LogisticRegression()
model_1.fit(X=X_train, y=y_train)

In [45]:
# Predict on the held-out split with the baseline model.
y_pred_model_1 = model_1.predict(X_test)

# Headline scores, scaled from fractions to percentages.
recall = 100 * recall_score(y_true=y_test, y_pred=y_pred_model_1)
precision = 100 * precision_score(y_true=y_test, y_pred=y_pred_model_1)
f1 = 100 * f1_score(y_true=y_test, y_pred=y_pred_model_1)
acc = 100 * accuracy_score(y_true=y_test, y_pred=y_pred_model_1)

print(
    f"Logistic Regression model recall: {recall:.2f}%",
    f"Logistic Regression model precision : {precision :.2f}%",
    f"Logistic Regression model f1: {f1:.2f}%",
    f"Logistic Regression model acc: {acc:.2f}%",
    sep="\n",
)

Logistic Regression model recall: 78.95%
Logistic Regression model precision : 74.26%
Logistic Regression model f1: 76.53%
Logistic Regression model acc: 82.90%


In [46]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix for the baseline model (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_model_1)
print("Confusion Matrix:", cm, sep="\n")

# Per-class precision / recall / F1 breakdown.
print(
    "\nClassification Report:",
    classification_report(y_true=y_test, y_pred=y_pred_model_1),
    sep="\n",
)

Confusion Matrix:
[[148  26]
 [ 20  75]]

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.85      0.87       174
           1       0.74      0.79      0.77        95

    accuracy                           0.83       269
   macro avg       0.81      0.82      0.82       269
weighted avg       0.83      0.83      0.83       269



In [51]:
# Persist the fitted baseline logistic-regression model to disk.
with open("models/model_1.pkl", mode="wb") as file:
    pickle.dump(obj=model_1, file=file)


In [47]:
# Logistic-regression search space: one sub-grid per (penalty, solvers) pairing,
# each explored at the same three iteration budgets.
_max_iter_options = (100, 1000, 2500)
param_grid = [
    {'penalty': [penalty], 'solver': list(solvers), 'max_iter': list(_max_iter_options)}
    for penalty, solvers in (
        ('l2', ('lbfgs', 'newton-cg', 'sag')),
        ('l1', ('liblinear', 'saga')),
        ('l2', ('liblinear', 'saga')),
    )
]


In [48]:
# Tune logistic regression over `param_grid` with 3-fold cross-validation.
# GridSearchCV is already imported in the notebook's first cell.
model_2 = LogisticRegression()
clf = GridSearchCV(
    estimator=model_2,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=True,
)
best_clf = clf.fit(X_train, y_train)  # fit() hands back the search object itself

# Display the winning estimator found by the search.
best_clf.best_estimator_

Fitting 3 folds for each of 21 candidates, totalling 63 fits


In [49]:
# Evaluate the best logistic regression found by the grid search.
best_lr = clf.best_estimator_
y_pred_model_2 = best_lr.predict(X_test)

cm_2 = confusion_matrix(y_true=y_test, y_pred=y_pred_model_2)
print("Confusion Matrix:", cm_2, sep="\n")

# Per-class precision / recall / F1 breakdown.
print(
    "\nClassification Report:",
    classification_report(y_true=y_test, y_pred=y_pred_model_2),
    sep="\n",
)


Confusion Matrix:
[[149  25]
 [ 20  75]]

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.86      0.87       174
           1       0.75      0.79      0.77        95

    accuracy                           0.83       269
   macro avg       0.82      0.82      0.82       269
weighted avg       0.84      0.83      0.83       269



In [50]:
# Persist the tuned logistic regression.
# GridSearchCV fits clones of the estimator it is given, so `model_2` itself is
# never fitted; pickling it would store an untrained model. The trained winner
# is exposed as `clf.best_estimator_`.
with open("models/lr_model_2.pkl", "wb") as file:
    pickle.dump(clf.best_estimator_, file)


In [None]:
# NOTE(review): this cell repeats the logistic-regression grid search already
# run earlier in the notebook (same estimator, grid and CV settings), and it
# rebinds `model_2`, `clf` and `best_clf`.
model_2 = LogisticRegression()
clf = GridSearchCV(model_2, param_grid=param_grid, cv=3, n_jobs=-1, verbose=True)

best_clf = clf.fit(X_train, y_train)
best_clf.best_estimator_

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# First k-nearest-neighbours attempt, using 13 neighbours.
model_knn = KNeighborsClassifier(n_neighbors=13)
model_knn.fit(X=X_train, y=y_train)
knn_pred = model_knn.predict(X_test)

# Held-out evaluation: confusion matrix followed by the per-class report.
print(
    confusion_matrix(y_true=y_test, y_pred=knn_pred),
    classification_report(y_true=y_test, y_pred=knn_pred),
    sep="\n",
)

In [None]:
from sklearn.model_selection import cross_val_score

# Estimate the misclassification rate of KNN for each candidate k using 5-fold
# cross-validation on the training split only. Scoring candidates on X_test
# would pick k using the same data the final evaluation relies on, which makes
# the reported test metrics optimistically biased.
k_values = range(1, 40)
error_rate = [
    1 - cross_val_score(
        KNeighborsClassifier(n_neighbors=k),
        X_train,
        y_train,
        cv=5,
        scoring="accuracy",
    ).mean()
    for k in k_values
]

plt.figure(figsize=(10, 6))
plt.plot(k_values, error_rate, color='blue',
         linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)

plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.show()

In [None]:
# NOW WITH K = 10
# Rebinds `model_knn` to a classifier that uses 10 neighbours.
model_knn = KNeighborsClassifier(n_neighbors=10)
model_knn.fit(X=X_train, y=y_train)
y_pred_knn = model_knn.predict(X_test)

print(
    'WITH K = 10',
    'Confusion Matrix',
    confusion_matrix(y_true=y_test, y_pred=y_pred_knn),
    'Classification Report',
    classification_report(y_true=y_test, y_pred=y_pred_knn),
    sep='\n',
)

In [None]:
# Persist the most recently fitted KNN classifier to disk.
with open("models/model_knn.pkl", mode="wb") as file:
    pickle.dump(obj=model_knn, file=file)


In [None]:
# decision tree
from sklearn.tree import DecisionTreeClassifier

# Baseline tree with default hyperparameters. The seed is fixed (same value as
# the tuned tree, `model_dt_2`) so that re-running the notebook reproduces the
# same tree and metrics; an unseeded tree can differ between runs.
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_test)

# Held-out evaluation: confusion matrix followed by the per-class report.
print(confusion_matrix(y_test, y_pred_dt))
print('Classification Report')
print(classification_report(y_test, y_pred_dt))

In [None]:
# Decision-tree search space (Gini criterion only).
# Note: this rebinds `param_grid`, replacing the logistic-regression grid.
param_grid = {
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 4, 7],
    'criterion': ['gini'],
}

# 5-fold cross-validated grid search over a seeded decision tree.
model_dt_2 = DecisionTreeClassifier(random_state=42)
grid = GridSearchCV(model_dt_2, param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

# Predict through the search object and report held-out performance.
y_pred_dt_2 = grid.predict(X_test)
print(
    confusion_matrix(y_true=y_test, y_pred=y_pred_dt_2),
    'Classification Report',
    classification_report(y_true=y_test, y_pred=y_pred_dt_2),
    sep='\n',
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 50 trees with a fixed seed.
rt = RandomForestClassifier(n_estimators=50, random_state=54)
rt.fit(X=X_train, y=y_train)
y_pred_rt = rt.predict(X_test)

# Held-out evaluation: confusion matrix followed by the per-class report.
print(
    confusion_matrix(y_true=y_test, y_pred=y_pred_rt),
    'Classification Report',
    classification_report(y_true=y_test, y_pred=y_pred_rt),
    sep='\n',
)


In [None]:
# Persist the fitted random forest to disk.
with open("models/model_rt.pkl", mode="wb") as file:
    pickle.dump(obj=rt, file=file)


In [None]:
## Model  Comparison
from sklearn import metrics

# Already-fitted estimators to compare on the held-out split.
# GridSearchCV trains clones of the estimator it is given, so the tuned
# decision tree is `grid.best_estimator_`; `model_dt_2` is only the untrained
# prototype and refitting it would score a tree with default hyperparameters.
models = [
    {'label': 'LR', 'model': model_1},
    {'label': 'DT', 'model': grid.best_estimator_},
    {'label': 'KNN', 'model': model_knn},
    {'label': 'RF', 'model': rt},
]

plt.figure(figsize=(8, 5))
for m in models:
    # Positive-class probabilities drive both the curve and its area, so the
    # number in the legend matches the plotted line. Scoring hard 0/1
    # predictions would instead give the area of a single-threshold curve.
    y_score = m['model'].predict_proba(X_test)[:, 1]
    fpr, tpr = metrics.roc_curve(y_test, y_score)[:2]
    roc_auc = metrics.roc_auc_score(y_test, y_score)
    plt.plot(fpr, tpr, label='%s - ROC (area = %0.2f)' % (m['label'], roc_auc))

# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.01, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1 - Specificity (False Positive Rate)', fontsize=12)
plt.ylabel('Sensitivity (True Positive Rate)', fontsize=12)
plt.title('ROC - Diabetes Prediction', fontsize=12)
plt.legend(loc="lower right", fontsize=12)
# Write real PNG data so the file contents match the .png extension.
plt.savefig("images/roc_diabetes.png", format='png', dpi=400, bbox_inches='tight')
plt.show()
