# Comparing the performance of 3 liberaries using basic logistic regression models

### 1- siket-learn model

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Load data
data = pd.read_csv('/Users/shahadaleissa/hyper_code/Dataset/cleaned_hypertension_data.csv')

# Split data into features and target
X = data.drop('Class', axis=1)
y = data['Class']



# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Create Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions and accuracy
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Scikit-learn Accuracy:", accuracy)


Scikit-learn Accuracy: 0.735


In [14]:
# Grid search cross validation
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
import numpy as np
cv=KFold(n_splits=5,random_state=42,shuffle=True)
grid={"C":np.logspace(-3,3,7), "penalty":["l2"]}# l1 lasso l2 ridge
logreg=LogisticRegression()
logreg_cv=GridSearchCV(logreg,grid,cv=cv)
logreg_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'C': 0.1, 'penalty': 'l2'}
accuracy : 0.697857142857143


In [17]:
#best model
best_model=LogisticRegression(C=0.1,penalty="l2")
best_model.fit(X_train,y_train)
print("best model accuracy:",best_model.score(X_test,y_test))

best model accuracy: 0.735


### 2- statsmodels model

In [69]:
import statsmodels.api as sm

# Add constant to features
X = sm.add_constant(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create Logistic Regression model
model = sm.Logit(y_train, X_train)
result = model.fit()

# Predictions and accuracy
predictions = result.predict(X_test)
predictions = round(predictions)
accuracy = accuracy_score(y_test, predictions)
print("Statsmodels Accuracy:", accuracy)


Optimization terminated successfully.
         Current function value: 0.538264
         Iterations 6
Statsmodels Accuracy: 0.735


### 3- tensorflow model

In [70]:
import tensorflow as tf
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Convert data to TensorFlow tensors
X_train = tf.convert_to_tensor(X_train.values, dtype=tf.float32)
X_test = tf.convert_to_tensor(X_test.values, dtype=tf.float32)
y_train = tf.convert_to_tensor(y_train.values, dtype=tf.float32)
y_test = tf.convert_to_tensor(y_test.values, dtype=tf.float32)

# Create Logistic Regression model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=100, verbose=0)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("TensorFlow Accuracy:", accuracy)


TensorFlow Accuracy: 0.4833333194255829


based on the 3 comparisons statsmodels had the best performance so i will proceed with that library

techniques like hyper paramater tuning or data transformation had no impact on the accuracy of the model

## trying different resampling techniques SMOTE showed better results

In [71]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Resampling techniques
techniques = ['oversample', 'smote', 'undersample']
accuracy_results = {}

for technique in techniques:
    # Resampling
    if technique == 'oversample':
        resampler = RandomOverSampler(random_state=42)
    elif technique == 'smote':
        resampler = SMOTE(random_state=42)
    elif technique == 'undersample':
        resampler = RandomUnderSampler(random_state=42)

    X_resampled, y_resampled = resampler.fit_resample(X_train, y_train)

    # Create and fit the logistic regression model
    X_train_const = sm.add_constant(X_resampled)
    X_test_const = sm.add_constant(X_test)
    model = sm.Logit(y_resampled, X_train_const)
    result = model.fit(disp=0)

    # Predict and calculate accuracy
    predictions = result.predict(X_test_const)
    predictions = (predictions > 0.5).astype(int)
    accuracy = accuracy_score(y_test, predictions)

    accuracy_results[technique] = accuracy
    print(f"Accuracy with {technique}: {accuracy}")

# Output the best technique based on accuracy
best_technique = max(accuracy_results, key=accuracy_results.get)
print(f"Best technique: {best_technique} with an accuracy of {accuracy_results[best_technique]}")


Accuracy with oversample: 0.7316666666666667
Accuracy with smote: 0.7383333333333333
Accuracy with undersample: 0.7366666666666667
Best technique: smote with an accuracy of 0.7383333333333333


### statemodels model with SMOTE and stratified cross validation

In [72]:

# Function for 10-fold stratified cross-validation with SMOTE
def stratified_cv_smote(X, y):
    skf = StratifiedKFold(n_splits=10)
    best_fold_metrics = None
    highest_accuracy = 0
    fold_number = 1
    best_fold_number = 0

    # Calculate accuracy after SMOTE and before cross-validation
    smote = SMOTE(random_state=42)
    X_smote, y_smote = smote.fit_resample(X, y)
    model = sm.Logit(y_smote, sm.add_constant(X_smote))
    result = model.fit(disp=0)
    predictions_smote = result.predict(sm.add_constant(X_smote))
    predictions_smote = (predictions_smote > 0.5).astype(int)
    accuracy_smote = accuracy_score(y_smote, predictions_smote)
    print(f"Accuracy after SMOTE (before CV): {accuracy_smote}")

    for train_index, test_index in skf.split(X, y):
        X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

        # Apply SMOTE
        X_train_smote, y_train_smote = smote.fit_resample(X_train_fold, y_train_fold)

        # Fit model
        X_train_smote_const = sm.add_constant(X_train_smote)
        X_test_smote_const = sm.add_constant(X_test_fold)
        model = sm.Logit(y_train_smote, X_train_smote_const)
        result = model.fit(disp=0)

        # Predict and calculate metrics
        predictions = result.predict(X_test_smote_const)
        predictions = (predictions > 0.5).astype(int)
        accuracy = accuracy_score(y_test_fold, predictions)

        if accuracy > highest_accuracy:
            highest_accuracy = accuracy
            best_fold_number = fold_number
            best_fold_metrics = {
                "accuracy": accuracy,
                "precision": precision_score(y_test_fold, predictions),
                "recall": recall_score(y_test_fold, predictions),
                "f1": f1_score(y_test_fold, predictions),
                "confusion_matrix": confusion_matrix(y_test_fold, predictions)
            }

        fold_number += 1

    #print(f"Highest accuracy was in fold {best_fold_number}: {highest_accuracy}")
    return best_fold_metrics

# Apply 10-fold stratified cross-validation with SMOTE
results = stratified_cv_smote(X, y)

# Print out the results
print(f"Final Accuracy (after CV): {results['accuracy']}")
print(f"Precision: {results['precision']}")
print(f"Recall: {results['recall']}")
print(f"F1-Score: {results['f1']}")
print(f"Confusion Matrix:\n{results['confusion_matrix']}")

Accuracy after SMOTE (before CV): 0.7161895360315893
Final Accuracy (after CV): 0.755
Precision: 0.7358490566037735
Recall: 0.7878787878787878
F1-Score: 0.7609756097560976
Confusion Matrix:
[[73 28]
 [21 78]]
