In [18]:
# Update this at the top of your modeling section
results = []

def store_result(branch, model_name, feature_strategy, tuning, accuracy, precision, recall, f1):
    results.append({
        'Branch': branch,
        'Model': model_name,
        'Feature Selection': feature_strategy,
        'Tuned': tuning,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1
    })

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

# Load data
df = pd.read_csv("nyc.csv")  
# Drop rows where AIDS_diagnosed is missing
df = df.dropna(subset=["AIDS_diagnosed"])

# Replace 'None' with np.nan in Concurrent_diagnosed
df['Concurrent_diagnosed'] = df['Concurrent_diagnosed'].replace('None', np.nan)
df['Concurrent_diagnosed'] = df['Concurrent_diagnosed'].fillna('No Other Disease')

# Fill missing or None values (mode for categorical, median for numeric)
for col in df.columns:
    if df[col].dtype == 'object':
        df[col].replace("None", np.nan, inplace=True)
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].median(), inplace=True)

# Convert boolean-like columns to 0/1
bool_columns = ['HIV_diagnosed', 'AIDS_diagnosed', 'Linked_to_Care_3mo', 'Death_Status']
for col in bool_columns:
    if df[col].dtype == 'object':
        df[col] = df[col].map({'No': 0, 'Yes': 1, 'Alive': 0, 'Deceased': 1, True: 1, False: 0})
    df[col] = df[col].astype(int)

# Outlier detection (optional)
from sklearn.ensemble import IsolationForest
iso = IsolationForest(contamination=0.01, random_state=42)
outliers = iso.fit_predict(df.select_dtypes(include=np.number))
df = df[outliers == 1]

# Split features and target
X = df.drop('AIDS_diagnosed', axis=1)
y = df['AIDS_diagnosed']

# Encode categorical variables
cat_columns = X.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in cat_columns:
    X[col] = le.fit_transform(X[col])

# === Handle class imbalance using SMOTE ===
smote = SMOTE(random_state=42)
X_bal, y_bal = smote.fit_resample(X, y)

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Standard scaling for most models
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_bal)
X = pd.DataFrame(X_scaled, columns=X.columns)
X_train, X_test, y_train, y_test = train_test_split(X, y_bal, test_size=0.2, random_state=42)

# MinMax scaling for chi-square feature selection
minmax_scaler = MinMaxScaler()
X_bal_minmax = minmax_scaler.fit_transform(X_bal)
X_minmax = pd.DataFrame(X_bal_minmax, columns=X.columns)
X_train_minmax, X_test_minmax, y_train_minmax, y_test_minmax = train_test_split(X_minmax, y_bal, test_size=0.2, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_bal)
X = pd.DataFrame(X_scaled, columns=X.columns)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_bal, test_size=0.2, random_state=42)

print("✅ Preprocessing complete.")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].replace("None", np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values 

✅ Preprocessing complete.


BRANCH A : TRAIN ON 80% TEST ON 20%

In [8]:
# Branch A: 80% train, 20% test
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(X_bal, y_bal, test_size=0.2, random_state=42)

# Standard scaling for most models
scaler_A = StandardScaler()
X_train_A_scaled = scaler_A.fit_transform(X_train_A)
X_test_A_scaled = scaler_A.transform(X_test_A)
X_train_A_scaled = pd.DataFrame(X_train_A_scaled, columns=X.columns)
X_test_A_scaled = pd.DataFrame(X_test_A_scaled, columns=X.columns)

# MinMax scaling for chi-square
minmax_scaler_A = MinMaxScaler()
X_train_A_minmax = minmax_scaler_A.fit_transform(X_train_A)
X_test_A_minmax = minmax_scaler_A.transform(X_test_A)
X_train_A_minmax = pd.DataFrame(X_train_A_minmax, columns=X.columns)
X_test_A_minmax = pd.DataFrame(X_test_A_minmax, columns=X.columns)

BRANCH B : TRAIN ON 20% TEST ON 80%

In [9]:
# Branch B: 20% train, 80% test
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split(X_bal, y_bal, test_size=0.8, random_state=42)

# Standard scaling for most models
scaler_B = StandardScaler()
X_train_B_scaled = scaler_B.fit_transform(X_train_B)
X_test_B_scaled = scaler_B.transform(X_test_B)
X_train_B_scaled = pd.DataFrame(X_train_B_scaled, columns=X.columns)
X_test_B_scaled = pd.DataFrame(X_test_B_scaled, columns=X.columns)

# MinMax scaling for chi-square
minmax_scaler_B = MinMaxScaler()
X_train_B_minmax = minmax_scaler_B.fit_transform(X_train_B)
X_test_B_minmax = minmax_scaler_B.transform(X_test_B)
X_train_B_minmax = pd.DataFrame(X_train_B_minmax, columns=X.columns)
X_test_B_minmax = pd.DataFrame(X_test_B_minmax, columns=X.columns)

## FEATURE SELECTION ARRAYS

In [10]:
#Chi-square (use MinMax scaled data)

from sklearn.feature_selection import SelectKBest, chi2

k = 10  # or any number you prefer

# Branch A
selector_A = SelectKBest(score_func=chi2, k=k)
X_train_A_chi = selector_A.fit_transform(X_train_A_minmax, y_train_A)
X_test_A_chi = selector_A.transform(X_test_A_minmax)

# Branch B
selector_B = SelectKBest(score_func=chi2, k=k)
X_train_B_chi = selector_B.fit_transform(X_train_B_minmax, y_train_B)
X_test_B_chi = selector_B.transform(X_test_B_minmax)

In [11]:
# Correlation (Use Standard scaled data)

correlations_A = X_train_A_scaled.corrwith(pd.Series(y_train_A)).abs()
top_k_A = correlations_A.sort_values(ascending=False).head(k).index.tolist()
X_train_A_corr = X_train_A_scaled[top_k_A]
X_test_A_corr = X_test_A_scaled[top_k_A]

# Branch B
correlations_B = X_train_B_scaled.corrwith(pd.Series(y_train_B)).abs()
top_k_B = correlations_B.sort_values(ascending=False).head(k).index.tolist()
X_train_B_corr = X_train_B_scaled[top_k_B]
X_test_B_corr = X_test_B_scaled[top_k_B]

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]


## SVM BRANCH A (80/20)

In [None]:
#SVM BRANCH A NO FEATURE SELECTION
#NOT TUNED

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

svm_A = SVC(random_state=42)
svm_A.fit(X_train_A_scaled, y_train_A)
y_pred_svm_A = svm_A.predict(X_test_A_scaled)
print("SVM Branch A (No Feature Selection, Not Tuned) Accuracy:", accuracy_score(y_test_A, y_pred_svm_A))
print(classification_report(y_test_A, y_pred_svm_A))
store_result('A','SVM', 'None', 'No', accuracy_score(y_test_A, y_pred_svm_A),
             precision_score(y_test_A, y_pred_svm_A, average='weighted', zero_division=0),
             recall_score(y_test_A, y_pred_svm_A, average='weighted', zero_division=0),
             f1_score(y_test_A, y_pred_svm_A, average='weighted', zero_division=0))

SVM Branch A (No Feature Selection, Not Tuned) Accuracy: 0.6902324461038178
              precision    recall  f1-score   support

           0       0.67      0.76      0.71      4154
           1       0.72      0.62      0.67      4149

    accuracy                           0.69      8303
   macro avg       0.69      0.69      0.69      8303
weighted avg       0.69      0.69      0.69      8303



In [None]:
#TUNED
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
svm_A_tuned = GridSearchCV(SVC(random_state=42), param_grid, cv=3, scoring='accuracy')
svm_A_tuned.fit(X_train_A_scaled, y_train_A)
y_pred_svm_A_tuned = svm_A_tuned.predict(X_test_A_scaled)
print("SVM Branch A (No Feature Selection, Tuned) Best Params:", svm_A_tuned.best_params_)
print("Accuracy:", accuracy_score(y_test_A, y_pred_svm_A_tuned))
print(classification_report(y_test_A, y_pred_svm_A_tuned))
store_result('A','SVM', 'None', 'Yes', accuracy_score(y_test_A, y_pred_svm_A_tuned),
             precision_score(y_test_A, y_pred_svm_A_tuned, average='weighted', zero_division=0),
             recall_score(y_test_A, y_pred_svm_A_tuned, average='weighted', zero_division=0),
             f1_score(y_test_A, y_pred_svm_A_tuned, average='weighted', zero_division=0))

In [None]:
# SVM BRANCH A CHI SQUARE FEATURE SELECTION
# NOT TUNED

svm_A_chi = SVC(random_state=42)
svm_A_chi.fit(X_train_A_chi, y_train_A)
y_pred_svm_A_chi = svm_A_chi.predict(X_test_A_chi)
print("SVM Branch A (Chi-square, Not Tuned) Accuracy:", accuracy_score(y_test_A, y_pred_svm_A_chi))
print(classification_report(y_test_A, y_pred_svm_A_chi))
store_result('A''SVM', 'Chi-square', 'No', accuracy_score(y_test_A, y_pred_svm_A_chi),
             precision_score(y_test_A, y_pred_svm_A_chi, average='weighted', zero_division=0),
             recall_score(y_test_A, y_pred_svm_A_chi, average='weighted', zero_division=0),
             f1_score(y_test_A, y_pred_svm_A_chi, average='weighted', zero_division=0))

In [None]:
#TUNED
svm_A_chi_tuned = GridSearchCV(SVC(random_state=42), param_grid, cv=3, scoring='accuracy')
svm_A_chi_tuned.fit(X_train_A_chi, y_train_A)
y_pred_svm_A_chi_tuned = svm_A_chi_tuned.predict(X_test_A_chi)
print("SVM Branch A (Chi-square, Tuned) Best Params:", svm_A_chi_tuned.best_params_)
print("Accuracy:", accuracy_score(y_test_A, y_pred_svm_A_chi_tuned))
print(classification_report(y_test_A, y_pred_svm_A_chi_tuned))
store_result('A','SVM', 'Chi-square', 'Yes', accuracy_score(y_test_A, y_pred_svm_A_chi_tuned),
             precision_score(y_test_A, y_pred_svm_A_chi_tuned, average='weighted', zero_division=0),
             recall_score(y_test_A, y_pred_svm_A_chi_tuned, average='weighted', zero_division=0),
             f1_score(y_test_A, y_pred_svm_A_chi_tuned, average='weighted', zero_division=0))


In [None]:
# SVM BRANCH A CORRELATION FEATURE SELECTION
# NOT TUNED
svm_A_corr = SVC(random_state=42)
svm_A_corr.fit(X_train_A_corr, y_train_A)
y_pred_svm_A_corr = svm_A_corr.predict(X_test_A_corr)
print("SVM Branch A (Correlation, Not Tuned) Accuracy:", accuracy_score(y_test_A, y_pred_svm_A_corr))
print(classification_report(y_test_A, y_pred_svm_A_corr))
store_result('A','SVM', 'Correlation', 'No', accuracy_score(y_test_A, y_pred_svm_A_corr),
             precision_score(y_test_A, y_pred_svm_A_corr, average='weighted', zero_division=0),
             recall_score(y_test_A, y_pred_svm_A_corr, average='weighted', zero_division=0),
             f1_score(y_test_A, y_pred_svm_A_corr, average='weighted', zero_division=0))


In [None]:
#TUNED
svm_A_corr_tuned = GridSearchCV(SVC(random_state=42), param_grid, cv=3, scoring='accuracy')
svm_A_corr_tuned.fit(X_train_A_corr, y_train_A)
y_pred_svm_A_corr_tuned = svm_A_corr_tuned.predict(X_test_A_corr)
print("SVM Branch A (Correlation, Tuned) Best Params:", svm_A_corr_tuned.best_params_)
print("Accuracy:", accuracy_score(y_test_A, y_pred_svm_A_corr_tuned))
print(classification_report(y_test_A, y_pred_svm_A_corr_tuned))
store_result('A','SVM', 'Correlation', 'Yes', accuracy_score(y_test_A, y_pred_svm_A_corr_tuned),
             precision_score(y_test_A, y_pred_svm_A_corr_tuned, average='weighted', zero_division=0),
             recall_score(y_test_A, y_pred_svm_A_corr_tuned, average='weighted', zero_division=0),
             f1_score(y_test_A, y_pred_svm_A_corr_tuned, average='weighted', zero_division=0))

## SVM BRANCH B (20/80)

In [None]:
#NO FEATURE SELECTION
#NOT TUNED

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

svm_B = SVC(random_state=42)
svm_B.fit(X_train_B_scaled, y_train_B)
y_pred_svm_B = svm_B.predict(X_test_B_scaled)
print("SVM Branch B (No Feature Selection, Not Tuned) Accuracy:", accuracy_score(y_test_B, y_pred_svm_B))
print(classification_report(y_test_B, y_pred_svm_B))
store_result('B','SVM', 'None', 'No', accuracy_score(y_test_B, y_pred_svm_B),
             precision_score(y_test_B, y_pred_svm_B, average='weighted', zero_division=0),
             recall_score(y_test_B, y_pred_svm_B, average='weighted', zero_division=0),
             f1_score(y_test_B, y_pred_svm_B, average='weighted', zero_division=0))


In [None]:
#TUNED
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
svm_B_tuned = GridSearchCV(SVC(random_state=42), param_grid, cv=3, scoring='accuracy')
svm_B_tuned.fit(X_train_B_scaled, y_train_B)
y_pred_svm_B_tuned = svm_B_tuned.predict(X_test_B_scaled)
print("SVM Branch B (No Feature Selection, Tuned) Best Params:", svm_B_tuned.best_params_)
print("Accuracy:", accuracy_score(y_test_B, y_pred_svm_B_tuned))
print(classification_report(y_test_B, y_pred_svm_B_tuned))
store_result('B','SVM', 'None', 'Yes', accuracy_score(y_test_B, y_pred_svm_B_tuned),
             precision_score(y_test_B, y_pred_svm_B_tuned, average='weighted', zero_division=0),
             recall_score(y_test_B, y_pred_svm_B_tuned, average='weighted', zero_division=0),
             f1_score(y_test_B, y_pred_svm_B_tuned, average='weighted', zero_division=0))

In [None]:
#CHI SQUARE FEATURE SELECTION
#NOT TUNED

svm_B_chi = SVC(random_state=42)
svm_B_chi.fit(X_train_B_chi, y_train_B)
y_pred_svm_B_chi = svm_B_chi.predict(X_test_B_chi)
print("SVM Branch B (Chi-square, Not Tuned) Accuracy:", accuracy_score(y_test_B, y_pred_svm_B_chi))
print(classification_report(y_test_B, y_pred_svm_B_chi))
store_result('B','SVM', 'Chi-square', 'No', accuracy_score(y_test_B, y_pred_svm_B_chi),
             precision_score(y_test_B, y_pred_svm_B_chi, average='weighted', zero_division=0),
             recall_score(y_test_B, y_pred_svm_B_chi, average='weighted', zero_division=0),
             f1_score(y_test_B, y_pred_svm_B_chi, average='weighted', zero_division=0))


In [None]:
#TUNED
svm_B_chi_tuned = GridSearchCV(SVC(random_state=42), param_grid, cv=3, scoring='accuracy')
svm_B_chi_tuned.fit(X_train_B_chi, y_train_B)
y_pred_svm_B_chi_tuned = svm_B_chi_tuned.predict(X_test_B_chi)
print("SVM Branch B (Chi-square, Tuned) Best Params:", svm_B_chi_tuned.best_params_)
print("Accuracy:", accuracy_score(y_test_B, y_pred_svm_B_chi_tuned))
print(classification_report(y_test_B, y_pred_svm_B_chi_tuned))
store_result('B','SVM', 'Chi-square', 'Yes', accuracy_score(y_test_B, y_pred_svm_B_chi_tuned),
             precision_score(y_test_B, y_pred_svm_B_chi_tuned, average='weighted', zero_division=0),
             recall_score(y_test_B, y_pred_svm_B_chi_tuned, average='weighted', zero_division=0),
             f1_score(y_test_B, y_pred_svm_B_chi_tuned, average='weighted', zero_division=0))

In [None]:
#CORRELATION FEATURE SELECTION
#NOT TUNED
svm_B_corr = SVC(random_state=42)
svm_B_corr.fit(X_train_B_corr, y_train_B)
y_pred_svm_B_corr = svm_B_corr.predict(X_test_B_corr)
print("SVM Branch B (Correlation, Not Tuned) Accuracy:", accuracy_score(y_test_B, y_pred_svm_B_corr))
print(classification_report(y_test_B, y_pred_svm_B_corr))
store_result('B','SVM', 'Correlation', 'No', accuracy_score(y_test_B, y_pred_svm_B_corr),
             precision_score(y_test_B, y_pred_svm_B_corr, average='weighted', zero_division=0),
             recall_score(y_test_B, y_pred_svm_B_corr, average='weighted', zero_division=0),
             f1_score(y_test_B, y_pred_svm_B_corr, average='weighted', zero_division=0))

In [None]:
#TUNED
svm_B_corr_tuned = GridSearchCV(SVC(random_state=42), param_grid, cv=3, scoring='accuracy')
svm_B_corr_tuned.fit(X_train_B_corr, y_train_B)
y_pred_svm_B_corr_tuned = svm_B_corr_tuned.predict(X_test_B_corr)
print("SVM Branch B (Correlation, Tuned) Best Params:", svm_B_corr_tuned.best_params_)
print("Accuracy:", accuracy_score(y_test_B, y_pred_svm_B_corr_tuned))
print(classification_report(y_test_B, y_pred_svm_B_corr_tuned))
store_result('B','SVM', 'Correlation', 'Yes', accuracy_score(y_test_B, y_pred_svm_B_corr_tuned),
             precision_score(y_test_B, y_pred_svm_B_corr_tuned, average='weighted', zero_division=0),
             recall_score(y_test_B, y_pred_svm_B_corr_tuned, average='weighted', zero_division=0),
             f1_score(y_test_B, y_pred_svm_B_corr_tuned, average='weighted', zero_division=0))


## DECISION TREE BRANCH A (80/20)

In [16]:
#NO FEATURE SELECTION

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

# NOT TUNED
dt_A = DecisionTreeClassifier(random_state=42)
dt_A.fit(X_train_A_scaled, y_train_A)
y_pred_dt_A = dt_A.predict(X_test_A_scaled)
print("Decision Tree Branch A (No Feature Selection, Not Tuned) Accuracy:", accuracy_score(y_test_A, y_pred_dt_A))
print(classification_report(y_test_A, y_pred_dt_A))
store_result('Decision Tree', 'None', 'No', accuracy_score(y_test_A, y_pred_dt_A),
             precision_score(y_test_A, y_pred_dt_A, average='weighted', zero_division=0),
             recall_score(y_test_A, y_pred_dt_A, average='weighted', zero_division=0),
             f1_score(y_test_A, y_pred_dt_A, average='weighted', zero_division=0))

# Tuned
param_grid_dt = {'max_depth': [3, 5, 10, None], 'min_samples_split': [2, 5, 10]}
from sklearn.model_selection import GridSearchCV
dt_A_tuned = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_dt, cv=3, scoring='accuracy')
dt_A_tuned.fit(X_train_A_scaled, y_train_A)
y_pred_dt_A_tuned = dt_A_tuned.predict(X_test_A_scaled)
print("Decision Tree Branch A (No Feature Selection, Tuned) Best Params:", dt_A_tuned.best_params_)
print("Accuracy:", accuracy_score(y_test_A, y_pred_dt_A_tuned))
print(classification_report(y_test_A, y_pred_dt_A_tuned))
store_result('A','Decision Tree', 'None', 'Yes', accuracy_score(y_test_A, y_pred_dt_A_tuned),
             precision_score(y_test_A, y_pred_dt_A_tuned, average='weighted', zero_division=0),
             recall_score(y_test_A, y_pred_dt_A_tuned, average='weighted', zero_division=0),
             f1_score(y_test_A, y_pred_dt_A_tuned, average='weighted', zero_division=0))

In [17]:
# CHI SQUARE FEATURE SELECTION

#NOT TUNED
dt_A_chi = DecisionTreeClassifier(random_state=42)
dt_A_chi.fit(X_train_A_chi, y_train_A)
y_pred_dt_A_chi = dt_A_chi.predict(X_test_A_chi)
print("Decision Tree Branch A (Chi-square, Not Tuned) Accuracy:", accuracy_score(y_test_A, y_pred_dt_A_chi))
print(classification_report(y_test_A, y_pred_dt_A_chi))
store_result('Decision Tree', 'Chi-square', 'No', accuracy_score(y_test_A, y_pred_dt_A_chi),
             precision_score(y_test_A, y_pred_dt_A_chi, average='weighted', zero_division=0),
             recall_score(y_test_A, y_pred_dt_A_chi, average='weighted', zero_division=0),
             f1_score(y_test_A, y_pred_dt_A_chi, average='weighted', zero_division=0))

# Tuned
dt_A_chi_tuned = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_dt, cv=3, scoring='accuracy')
dt_A_chi_tuned.fit(X_train_A_chi, y_train_A)
y_pred_dt_A_chi_tuned = dt_A_chi_tuned.predict(X_test_A_chi)
print("Decision Tree Branch A (Chi-square, Tuned) Best Params:", dt_A_chi_tuned.best_params_)
print("Accuracy:", accuracy_score(y_test_A, y_pred_dt_A_chi_tuned))
print(classification_report(y_test_A, y_pred_dt_A_chi_tuned))
store_result('A','Decision Tree', 'Chi-square', 'Yes', accuracy_score(y_test_A, y_pred_dt_A_chi_tuned),
             precision_score(y_test_A, y_pred_dt_A_chi_tuned, average='weighted', zero_division=0),
             recall_score(y_test_A, y_pred_dt_A_chi_tuned, average='weighted', zero_division=0),
             f1_score(y_test_A, y_pred_dt_A_chi_tuned, average='weighted', zero_division=0))

NameError: name 'DecisionTreeClassifier' is not defined

In [None]:
#CORRELATION FEATURE SELECTION
# Not Tuned
dt_A_corr = DecisionTreeClassifier(random_state=42)
dt_A_corr.fit(X_train_A_corr, y_train_A)
y_pred_dt_A_corr = dt_A_corr.predict(X_test_A_corr)
print("Decision Tree Branch A (Correlation, Not Tuned) Accuracy:", accuracy_score(y_test_A, y_pred_dt_A_corr))
print(classification_report(y_test_A, y_pred_dt_A_corr))
store_result('A','Decision Tree', 'Correlation', 'No', accuracy_score(y_test_A, y_pred_dt_A_corr),
             precision_score(y_test_A, y_pred_dt_A_corr, average='weighted', zero_division=0),
             recall_score(y_test_A, y_pred_dt_A_corr, average='weighted', zero_division=0),
             f1_score(y_test_A, y_pred_dt_A_corr, average='weighted', zero_division=0))

# Tuned
dt_A_corr_tuned = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_dt, cv=3, scoring='accuracy')
dt_A_corr_tuned.fit(X_train_A_corr, y_train_A)
y_pred_dt_A_corr_tuned = dt_A_corr_tuned.predict(X_test_A_corr)
print("Decision Tree Branch A (Correlation, Tuned) Best Params:", dt_A_corr_tuned.best_params_)
print("Accuracy:", accuracy_score(y_test_A, y_pred_dt_A_corr_tuned))
print(classification_report(y_test_A, y_pred_dt_A_corr_tuned))
store_result('Decision Tree', 'Correlation', 'Yes', accuracy_score(y_test_A, y_pred_dt_A_corr_tuned),
             precision_score(y_test_A, y_pred_dt_A_corr_tuned, average='weighted', zero_division=0),
             recall_score(y_test_A, y_pred_dt_A_corr_tuned, average='weighted', zero_division=0),
             f1_score(y_test_A, y_pred_dt_A_corr_tuned, average='weighted', zero_division=0))

## DECISION TREE BRANCH B (20/80)



In [None]:
#NO FEATURE SELECTION
#NOT TUNED

dt_B = DecisionTreeClassifier(random_state=42)
dt_B.fit(X_train_B_scaled, y_train_B)
y_pred_dt_B = dt_B.predict(X_test_B_scaled)
print("Decision Tree Branch B (No Feature Selection, Not Tuned) Accuracy:", accuracy_score(y_test_B, y_pred_dt_B))
print(classification_report(y_test_B, y_pred_dt_B))
store_result('B','Decision Tree', 'None', 'No', accuracy_score(y_test_B, y_pred_dt_B),
             precision_score(y_test_B, y_pred_dt_B, average='weighted', zero_division=0),
             recall_score(y_test_B, y_pred_dt_B, average='weighted', zero_division=0),
             f1_score(y_test_B, y_pred_dt_B, average='weighted', zero_division=0))


In [None]:
#TUNED
dt_B_tuned = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_dt, cv=3, scoring='accuracy')
dt_B_tuned.fit(X_train_B_scaled, y_train_B)
y_pred_dt_B_tuned = dt_B_tuned.predict(X_test_B_scaled)
print("Decision Tree Branch B (No Feature Selection, Tuned) Best Params:", dt_B_tuned.best_params_)
print("Accuracy:", accuracy_score(y_test_B, y_pred_dt_B_tuned))
print(classification_report(y_test_B, y_pred_dt_B_tuned))
store_result('B','Decision Tree', 'None', 'Yes', accuracy_score(y_test_B, y_pred_dt_B_tuned),
             precision_score(y_test_B, y_pred_dt_B_tuned, average='weighted', zero_division=0),
             recall_score(y_test_B, y_pred_dt_B_tuned, average='weighted', zero_division=0),
             f1_score(y_test_B, y_pred_dt_B_tuned, average='weighted', zero_division=0))

In [None]:
#CHI SQUARE FEATURE SELECTION
#NOT TUNED
dt_B_chi = DecisionTreeClassifier(random_state=42)
dt_B_chi.fit(X_train_B_chi, y_train_B)
y_pred_dt_B_chi = dt_B_chi.predict(X_test_B_chi)
print("Decision Tree Branch B (Chi-square, Not Tuned) Accuracy:", accuracy_score(y_test_B, y_pred_dt_B_chi))
print(classification_report(y_test_B, y_pred_dt_B_chi))
store_result('B','Decision Tree', 'Chi-square', 'No', accuracy_score(y_test_B, y_pred_dt_B_chi),
             precision_score(y_test_B, y_pred_dt_B_chi, average='weighted', zero_division=0),
             recall_score(y_test_B, y_pred_dt_B_chi, average='weighted', zero_division=0),
             f1_score(y_test_B, y_pred_dt_B_chi, average='weighted', zero_division=0))


In [None]:
#TUNED
dt_B_chi_tuned = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_dt, cv=3, scoring='accuracy')
dt_B_chi_tuned.fit(X_train_B_chi, y_train_B)
y_pred_dt_B_chi_tuned = dt_B_chi_tuned.predict(X_test_B_chi)
print("Decision Tree Branch B (Chi-square, Tuned) Best Params:", dt_B_chi_tuned.best_params_)
print("Accuracy:", accuracy_score(y_test_B, y_pred_dt_B_chi_tuned))
print(classification_report(y_test_B, y_pred_dt_B_chi_tuned))
store_result('B','Decision Tree', 'Chi-square', 'Yes', accuracy_score(y_test_B, y_pred_dt_B_chi_tuned),
             precision_score(y_test_B, y_pred_dt_B_chi_tuned, average='weighted', zero_division=0),
             recall_score(y_test_B, y_pred_dt_B_chi_tuned, average='weighted', zero_division=0),
             f1_score(y_test_B, y_pred_dt_B_chi_tuned, average='weighted', zero_division=0))

In [None]:
#CORRELATION FEATURE SELECTION
#NOT TUNED
dt_B_corr = DecisionTreeClassifier(random_state=42)
dt_B_corr.fit(X_train_B_corr, y_train_B)
y_pred_dt_B_corr = dt_B_corr.predict(X_test_B_corr)
print("Decision Tree Branch B (Correlation, Not Tuned) Accuracy:", accuracy_score(y_test_B, y_pred_dt_B_corr))
print(classification_report(y_test_B, y_pred_dt_B_corr))
store_result('B','Decision Tree', 'Correlation', 'No', accuracy_score(y_test_B, y_pred_dt_B_corr),
             precision_score(y_test_B, y_pred_dt_B_corr, average='weighted', zero_division=0),
             recall_score(y_test_B, y_pred_dt_B_corr, average='weighted', zero_division=0),
             f1_score(y_test_B, y_pred_dt_B_corr, average='weighted', zero_division=0))

In [None]:
#TUNED
dt_B_corr_tuned = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_dt, cv=3, scoring='accuracy')
dt_B_corr_tuned.fit(X_train_B_corr, y_train_B)
y_pred_dt_B_corr_tuned = dt_B_corr_tuned.predict(X_test_B_corr)
print("Decision Tree Branch B (Correlation, Tuned) Best Params:", dt_B_corr_tuned.best_params_)
print("Accuracy:", accuracy_score(y_test_B, y_pred_dt_B_corr_tuned))
print(classification_report(y_test_B, y_pred_dt_B_corr_tuned))
store_result('B','Decision Tree', 'Correlation', 'Yes', accuracy_score(y_test_B, y_pred_dt_B_corr_tuned),
             precision_score(y_test_B, y_pred_dt_B_corr_tuned, average='weighted', zero_division=0),
             recall_score(y_test_B, y_pred_dt_B_corr_tuned, average='weighted', zero_division=0),
             f1_score(y_test_B, y_pred_dt_B_corr_tuned, average='weighted', zero_division=0))

## XGBOOST BRANCH A (80/20)


In [None]:
#NO FEATURE SELECTION
from xgboost import XGBClassifier

# Not Tuned
xgb_A = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_A.fit(X_train_A_scaled, y_train_A)
y_pred_xgb_A = xgb_A.predict(X_test_A_scaled)
print("XGBoost Branch A (No Feature Selection, Not Tuned) Accuracy:", accuracy_score(y_test_A, y_pred_xgb_A))
print(classification_report(y_test_A, y_pred_xgb_A))
store_result('A','XGBoost', 'None', 'No', accuracy_score(y_test_A, y_pred_xgb_A),
             precision_score(y_test_A, y_pred_xgb_A, average='weighted', zero_division=0),
             recall_score(y_test_A, y_pred_xgb_A, average='weighted', zero_division=0),
             f1_score(y_test_A, y_pred_xgb_A, average='weighted', zero_division=0))

# Tuned
param_grid_xgb = {'max_depth': [3, 5, 10], 'learning_rate': [0.01, 0.1, 0.2], 'n_estimators': [100, 200]}
xgb_A_tuned = GridSearchCV(XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'), param_grid_xgb, cv=3, scoring='accuracy')
xgb_A_tuned.fit(X_train_A_scaled, y_train_A)
y_pred_xgb_A_tuned = xgb_A_tuned.predict(X_test_A_scaled)
print("XGBoost Branch A (No Feature Selection, Tuned) Best Params:", xgb_A_tuned.best_params_)
print("Accuracy:", accuracy_score(y_test_A, y_pred_xgb_A_tuned))
print(classification_report(y_test_A, y_pred_xgb_A_tuned))
store_result('A','XGBoost', 'None', 'Yes', accuracy_score(y_test_A, y_pred_xgb_A_tuned),
             precision_score(y_test_A, y_pred_xgb_A_tuned, average='weighted', zero_division=0),
             recall_score(y_test_A, y_pred_xgb_A_tuned, average='weighted', zero_division=0),
             f1_score(y_test_A, y_pred_xgb_A_tuned, average='weighted', zero_division=0))


In [None]:
# CHI SQUARE FEATURE SELECTION

# Not Tuned
xgb_A_chi = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_A_chi.fit(X_train_A_chi, y_train_A)
y_pred_xgb_A_chi = xgb_A_chi.predict(X_test_A_chi)
print("XGBoost Branch A (Chi-square, Not Tuned) Accuracy:", accuracy_score(y_test_A, y_pred_xgb_A_chi))
print(classification_report(y_test_A, y_pred_xgb_A_chi))
store_result('XGBoost', 'Chi-square', 'No', accuracy_score(y_test_A, y_pred_xgb_A_chi),
             precision_score(y_test_A, y_pred_xgb_A_chi, average='weighted', zero_division=0),
             recall_score(y_test_A, y_pred_xgb_A_chi, average='weighted', zero_division=0),
             f1_score(y_test_A, y_pred_xgb_A_chi, average='weighted', zero_division=0))

# Tuned
xgb_A_chi_tuned = GridSearchCV(XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'), param_grid_xgb, cv=3, scoring='accuracy')
xgb_A_chi_tuned.fit(X_train_A_chi, y_train_A)
y_pred_xgb_A_chi_tuned = xgb_A_chi_tuned.predict(X_test_A_chi)
print("XGBoost Branch A (Chi-square, Tuned) Best Params:", xgb_A_chi_tuned.best_params_)
print("Accuracy:", accuracy_score(y_test_A, y_pred_xgb_A_chi_tuned))
print(classification_report(y_test_A, y_pred_xgb_A_chi_tuned))
store_result('XGBoost', 'Chi-square', 'Yes', accuracy_score(y_test_A, y_pred_xgb_A_chi_tuned),
             precision_score(y_test_A, y_pred_xgb_A_chi_tuned, average='weighted', zero_division=0),
             recall_score(y_test_A, y_pred_xgb_A_chi_tuned, average='weighted', zero_division=0),
             f1_score(y_test_A, y_pred_xgb_A_chi_tuned, average='weighted', zero_division=0))


In [None]:
#CORRELATION FEATURE SELECTION
# Not Tuned
xgb_A_corr = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_A_corr.fit(X_train_A_corr, y_train_A)
y_pred_xgb_A_corr = xgb_A_corr.predict(X_test_A_corr)
print("XGBoost Branch A (Correlation, Not Tuned) Accuracy:", accuracy_score(y_test_A, y_pred_xgb_A_corr))
print(classification_report(y_test_A, y_pred_xgb_A_corr))
store_result('XGBoost', 'Correlation', 'No', accuracy_score(y_test_A, y_pred_xgb_A_corr),
             precision_score(y_test_A, y_pred_xgb_A_corr, average='weighted', zero_division=0),
             recall_score(y_test_A, y_pred_xgb_A_corr, average='weighted', zero_division=0),
             f1_score(y_test_A, y_pred_xgb_A_corr, average='weighted', zero_division=0))

# Tuned
xgb_A_corr_tuned = GridSearchCV(XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'), param_grid_xgb, cv=3, scoring='accuracy')
xgb_A_corr_tuned.fit(X_train_A_corr, y_train_A)
y_pred_xgb_A_corr_tuned = xgb_A_corr_tuned.predict(X_test_A_corr)
print("XGBoost Branch A (Correlation, Tuned) Best Params:", xgb_A_corr_tuned.best_params_)
print("Accuracy:", accuracy_score(y_test_A, y_pred_xgb_A_corr_tuned))
print(classification_report(y_test_A, y_pred_xgb_A_corr_tuned))
store_result('XGBoost', 'Correlation', 'Yes', accuracy_score(y_test_A, y_pred_xgb_A_corr_tuned),
             precision_score(y_test_A, y_pred_xgb_A_corr_tuned, average='weighted', zero_division=0),
             recall_score(y_test_A, y_pred_xgb_A_corr_tuned, average='weighted', zero_division=0),
             f1_score(y_test_A, y_pred_xgb_A_corr_tuned, average='weighted', zero_division=0))


## XGBOOST BRANCH B (20/80)



In [None]:
#FEATURE SELECTION
#NOT TUNED 
xgb_B = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_B.fit(X_train_B_scaled, y_train_B)
y_pred_xgb_B = xgb_B.predict(X_test_B_scaled)
print("XGBoost Branch B (No Feature Selection, Not Tuned) Accuracy:", accuracy_score(y_test_B, y_pred_xgb_B))
print(classification_report(y_test_B, y_pred_xgb_B))
store_result('B','XGBoost', 'None', 'No', accuracy_score(y_test_B, y_pred_xgb_B),
             precision_score(y_test_B, y_pred_xgb_B, average='weighted', zero_division=0),
             recall_score(y_test_B, y_pred_xgb_B, average='weighted', zero_division=0),
             f1_score(y_test_B, y_pred_xgb_B, average='weighted', zero_division=0))

In [None]:
#TUNED
xgb_B_tuned = GridSearchCV(XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'), param_grid_xgb, cv=3, scoring='accuracy')
xgb_B_tuned.fit(X_train_B_scaled, y_train_B)
y_pred_xgb_B_tuned = xgb_B_tuned.predict(X_test_B_scaled)
print("XGBoost Branch B (No Feature Selection, Tuned) Best Params:", xgb_B_tuned.best_params_)
print("Accuracy:", accuracy_score(y_test_B, y_pred_xgb_B_tuned))
print(classification_report(y_test_B, y_pred_xgb_B_tuned))
store_result('B','XGBoost', 'None', 'Yes', accuracy_score(y_test_B, y_pred_xgb_B_tuned),
             precision_score(y_test_B, y_pred_xgb_B_tuned, average='weighted', zero_division=0),
             recall_score(y_test_B, y_pred_xgb_B_tuned, average='weighted', zero_division=0),
             f1_score(y_test_B, y_pred_xgb_B_tuned, average='weighted', zero_division=0))

In [None]:
# CHI SQUARE FEATURE SELECTION
# NOT TUNED

xgb_B_chi = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_B_chi.fit(X_train_B_chi, y_train_B)
y_pred_xgb_B_chi = xgb_B_chi.predict(X_test_B_chi)
print("XGBoost Branch B (Chi-square, Not Tuned) Accuracy:", accuracy_score(y_test_B, y_pred_xgb_B_chi))
print(classification_report(y_test_B, y_pred_xgb_B_chi))
store_result('B','XGBoost', 'Chi-square', 'No', accuracy_score(y_test_B, y_pred_xgb_B_chi),
             precision_score(y_test_B, y_pred_xgb_B_chi, average='weighted', zero_division=0),
             recall_score(y_test_B, y_pred_xgb_B_chi, average='weighted', zero_division=0),
             f1_score(y_test_B, y_pred_xgb_B_chi, average='weighted', zero_division=0))

In [None]:
#TUNED
xgb_B_chi_tuned = GridSearchCV(XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'), param_grid_xgb, cv=3, scoring='accuracy')
xgb_B_chi_tuned.fit(X_train_B_chi, y_train_B)
y_pred_xgb_B_chi_tuned = xgb_B_chi_tuned.predict(X_test_B_chi)
print("XGBoost Branch B (Chi-square, Tuned) Best Params:", xgb_B_chi_tuned.best_params_)
print("Accuracy:", accuracy_score(y_test_B, y_pred_xgb_B_chi_tuned))
print(classification_report(y_test_B, y_pred_xgb_B_chi_tuned))
store_result('B','XGBoost', 'Chi-square', 'Yes', accuracy_score(y_test_B, y_pred_xgb_B_chi_tuned),
             precision_score(y_test_B, y_pred_xgb_B_chi_tuned, average='weighted', zero_division=0),
             recall_score(y_test_B, y_pred_xgb_B_chi_tuned, average='weighted', zero_division=0),
             f1_score(y_test_B, y_pred_xgb_B_chi_tuned, average='weighted', zero_division=0))

In [None]:
# CORRELATION FEATURE SELECTION
# NOT TUNED
xgb_B_corr = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_B_corr.fit(X_train_B_corr, y_train_B)
y_pred_xgb_B_corr = xgb_B_corr.predict(X_test_B_corr)
print("XGBoost Branch B (Correlation, Not Tuned) Accuracy:", accuracy_score(y_test_B, y_pred_xgb_B_corr))
print(classification_report(y_test_B, y_pred_xgb_B_corr))
store_result('B','XGBoost', 'Correlation', 'No', accuracy_score(y_test_B, y_pred_xgb_B_corr),
             precision_score(y_test_B, y_pred_xgb_B_corr, average='weighted', zero_division=0),
             recall_score(y_test_B, y_pred_xgb_B_corr, average='weighted', zero_division=0),
             f1_score(y_test_B, y_pred_xgb_B_corr, average='weighted', zero_division=0))


In [None]:
# TUNED
xgb_B_corr_tuned = GridSearchCV(XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'), param_grid_xgb, cv=3, scoring='accuracy')
xgb_B_corr_tuned.fit(X_train_B_corr, y_train_B)
y_pred_xgb_B_corr_tuned = xgb_B_corr_tuned.predict(X_test_B_corr)
print("XGBoost Branch B (Correlation, Tuned) Best Params:", xgb_B_corr_tuned.best_params_)
print("Accuracy:", accuracy_score(y_test_B, y_pred_xgb_B_corr_tuned))
print(classification_report(y_test_B, y_pred_xgb_B_corr_tuned))
store_result('B','XGBoost', 'Correlation', 'Yes', accuracy_score(y_test_B, y_pred_xgb_B_corr_tuned),
             precision_score(y_test_B, y_pred_xgb_B_corr_tuned, average='weighted', zero_division=0),
             recall_score(y_test_B, y_pred_xgb_B_corr_tuned, average='weighted', zero_division=0),
             f1_score(y_test_B, y_pred_xgb_B_corr_tuned, average='weighted', zero_division=0))


## EVALUATION COMPARISON TABLE

In [None]:
import pandas as pd

# Create DataFrame from results
results_df = pd.DataFrame(results)

# Sort for readability
results_df = results_df.sort_values(by=['Branch', 'Model', 'Feature Selection', 'Tuned'])

# Display the table
display(results_df)

# Optionally, highlight the best accuracy and F1-score in each branch/model
def highlight_best(s):
    is_max = s == s.max()
    return ['background-color: lightgreen' if v else '' for v in is_max]

# Highlight best per Branch+Model group
styled = results_df.style
for (branch, model), group in results_df.groupby(['Branch', 'Model']):
    idx = group['Accuracy'].idxmax()
    styled = styled.apply(lambda x: ['background-color: lightgreen' if i == idx else '' for i in x.index], subset=['Accuracy'])
    idx_f1 = group['F1-score'].idxmax()
    styled = styled.apply(lambda x: ['background-color: lightblue' if i == idx_f1 else '' for i in x.index], subset=['F1-score'])

