MODEL SELECTION: SCIKIT-LEARN

Imports

In [53]:
import sys
sys.setrecursionlimit(5000) # no RecursionError: maximum recursion depth exceeded
import pandas as pd
from sklearn.model_selection import train_test_split
#__STOCHASTIC GRADIENT DESCENT___#
from sklearn.linear_model import SGDClassifier 
# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from imblearn.combine import SMOTEENN
# from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report

Reading the data

In [41]:
# Column names applied to the CSV via read_csv's `names` argument
colums = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]
df = pd.read_csv('Data/Diabetes.csv', names=colums)

# Preview the first 5 rows
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


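Data looks good: no null values. Note that some columns (e.g. SkinThickness, Insulin) contain zeros in the rows shown above, which may stand for missing measurements.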

Now train test split

In [42]:
# Feature matrix: every column except the target
X = df.drop('Outcome', axis=1)
# Target vector: the binary Outcome label we are trying to predict
Y = df['Outcome']

# 80/20 train/test split; the fixed random_state keeps the split reproducible
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
print(f"Training Data Shape: {X_Train.shape}")
print(f"Testing Data Shape: {X_Test.shape}")

Training Data Shape: (614, 8)
Testing Data Shape: (154, 8)


Perfect 80/20 split. 8 Features as usual.

Logistic regression since outcome is binary (1 or 0) 

Using Stochastic Gradient Descent 
​


In [None]:
#_______REDACTED____________#
#Get another model#

# Rescale the features with MinMaxScaler (feature scaling matters for gradient descent).
# The scaler is fitted on the training split only and then reused, unchanged, on the test split.
scaler = MinMaxScaler()
X_Train_Scaled = scaler.fit_transform(X_Train)
X_Test_Scaled = scaler.transform(X_Test)

# Rebalance the training classes with SMOTEENN (combined over- and under-sampling).
# Only the training data is resampled; the test split is left untouched.
smote_enn = SMOTEENN(random_state=42)
X_Train_Resampled, Y_Train_Resampled = smote_enn.fit_resample(X_Train_Scaled, Y_Train)

# Baseline linear classifier trained with stochastic gradient descent.
# Note: loss='modified_huber' is NOT logistic regression; 'log_loss' (one of the
# candidates in the search below) is the logistic-regression loss.
SGD = SGDClassifier(
    penalty='l2',
    loss='modified_huber',
    learning_rate='adaptive',
    eta0=0.01,                # initial step size
    alpha=0.0001,             # regularisation strength
    class_weight='balanced',  # Keep this if you need to handle imbalanced data
    max_iter=1000,            # upper bound on training epochs
    random_state=42,          # fixed seed for reproducibility
    n_iter_no_change=5,
    early_stopping=True
)

# Fit the baseline so `SGD` itself is a usable, fitted model
SGD.fit(X_Train_Resampled, Y_Train_Resampled)

# Hyperparameter search space
param_grid = {
    'loss': ['hinge', 'log_loss', 'modified_huber'],
    'learning_rate': ['constant', 'adaptive'],
    'eta0': [0.001, 0.01],
    'alpha': [0.0001, 0.001],
    'penalty': ['l2', 'l1'],
}

# The grid is finite, so never ask for more draws than there are combinations;
# otherwise scikit-learn warns and silently clamps n_iter to the grid size.
n_candidates = 1
for candidate_values in param_grid.values():
    n_candidates *= len(candidate_values)

# NOTE(review): the search cross-validates on data that was already resampled, so
# synthetic samples can end up in both the training and validation folds and the CV
# F1 may be optimistic. Consider resampling inside an imblearn Pipeline per fold.
RandomizedSearchCV_search = RandomizedSearchCV(
    SGD,
    param_grid,
    n_iter=min(50, n_candidates),
    cv=3,
    scoring='f1',
    n_jobs=2,
    random_state=42,
)
RandomizedSearchCV_search.fit(X_Train_Resampled, Y_Train_Resampled)
print(f"Best parameters: {RandomizedSearchCV_search.best_params_}")

# Evaluate the tuned model (refit on the full resampled training set) on the held-out test split
best_model = RandomizedSearchCV_search.best_estimator_
Y_Pred = best_model.predict(X_Test_Scaled)
print(classification_report(Y_Test, Y_Pred))

# Accuracy of the same tuned model, so it always agrees with the report above
# (previously this scored the untuned baseline `SGD`).
accuracy = best_model.score(X_Test_Scaled, Y_Test)
print(f"Accuracy: {accuracy * 100: .2f} ")



Best parameters: {'penalty': 'l2', 'loss': 'modified_huber', 'learning_rate': 'adaptive', 'eta0': 0.01, 'alpha': 0.0001}
              precision    recall  f1-score   support

           0       0.85      0.64      0.73        99
           1       0.55      0.80      0.65        55

    accuracy                           0.69       154
   macro avg       0.70      0.72      0.69       154
weighted avg       0.74      0.69      0.70       154

Accuracy:  69.48 


Using a different model

In [60]:
#Still picking