In [30]:
import pandas as pd
import numpy as np
import random
import os
from sklearn.model_selection import train_test_split

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
os.environ['PYTHONHASHSEED'] = str(RANDOM_SEED)

import tensorflow as tf
tf.random.set_seed(RANDOM_SEED)

X_scaled = pd.read_csv('../data/processed/X_scaled.csv')
y = pd.read_csv('../data/processed/y.csv')['outcome']

print(f"Loaded data shape: {X_scaled.shape}")
print(f"Target shape: {y.shape}")

Loaded data shape: (1176, 48)
Target shape: (1176,)


# 1. Feature Engineering

In [31]:
# Check for missing values and clean data
print(f"Missing values in y: {y.isna().sum()}")
print(f"Outcome distribution:\n{y.value_counts()}")
print(f"Outcome unique values: {y.unique()}")

# Remove rows with missing outcomes from BOTH X and y
mask = ~y.isna()
X_scaled_clean = X_scaled[mask]
y_clean = y[mask]

print(f"\nCleaned data shapes:")
print(f"X: {X_scaled_clean.shape}, y: {y_clean.shape}")


Missing values in y: 0
Outcome distribution:
outcome
0.0    1017
1.0     159
Name: count, dtype: int64
Outcome unique values: [0. 1.]

Cleaned data shapes:
X: (1176, 48), y: (1176,)


# 2. Model Development

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled_clean, y_clean, test_size=0.2, random_state=42, stratify=y_clean)

print(f"Train set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Train outcome distribution:\n{pd.Series(y_train).value_counts()}")
print(f"Test outcome distribution:\n{pd.Series(y_test).value_counts()}")


Train set: (940, 48)
Test set: (236, 48)
Train outcome distribution:
outcome
0.0    813
1.0    127
Name: count, dtype: int64
Test outcome distribution:
outcome
0.0    204
1.0     32
Name: count, dtype: int64


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

print("Training Logistic Regression...")
lr_model = LogisticRegression(max_iter=1000, random_state=42) 
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

print("\nLogistic Regression Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print(classification_report(y_test, y_pred_lr))

Training Logistic Regression...

Logistic Regression Results:
Accuracy: 0.8771
              precision    recall  f1-score   support

         0.0       0.90      0.97      0.93       204
         1.0       0.60      0.28      0.38        32

    accuracy                           0.88       236
   macro avg       0.75      0.63      0.66       236
weighted avg       0.86      0.88      0.86       236



In [None]:
from sklearn.ensemble import RandomForestClassifier

print("Training Random Forest...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("\nRandom Forest Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(classification_report(y_test, y_pred_rf))

Training Random Forest...

Random Forest Results:
Accuracy: 0.8771
              precision    recall  f1-score   support

         0.0       0.88      0.99      0.93       204
         1.0       0.71      0.16      0.26        32

    accuracy                           0.88       236
   macro avg       0.80      0.57      0.59       236
weighted avg       0.86      0.88      0.84       236



In [None]:
import xgboost as xgb

print("Training XGBoost...")
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

print("\nXGBoost Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print(classification_report(y_test, y_pred_xgb))

Training XGBoost...

XGBoost Results:
Accuracy: 0.8729
              precision    recall  f1-score   support

         0.0       0.89      0.97      0.93       204
         1.0       0.57      0.25      0.35        32

    accuracy                           0.87       236
   macro avg       0.73      0.61      0.64       236
weighted avg       0.85      0.87      0.85       236


XGBoost Results:
Accuracy: 0.8729
              precision    recall  f1-score   support

         0.0       0.89      0.97      0.93       204
         1.0       0.57      0.25      0.35        32

    accuracy                           0.87       236
   macro avg       0.73      0.61      0.64       236
weighted avg       0.85      0.87      0.85       236



In [None]:
n_classes = len(np.unique(y_train))
print(f"Number of classes: {n_classes}")

Number of classes: 2


In [41]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout

print("Training Neural Network...")

# two hidden layers with dropout to prevent overfitting
nn_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  
])
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=0)
y_pred_nn = (nn_model.predict(X_test, verbose=0) > 0.5).astype(int).flatten()

print("\nNeural Network Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_nn):.4f}")
print(classification_report(y_test, y_pred_nn))

Training Neural Network...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Neural Network Results:
Accuracy: 0.8856
              precision    recall  f1-score   support

         0.0       0.91      0.97      0.94       204
         1.0       0.63      0.38      0.47        32

    accuracy                           0.89       236
   macro avg       0.77      0.67      0.70       236
weighted avg       0.87      0.89      0.87       236



In [42]:
results = {
    'Logistic Regression': accuracy_score(y_test, y_pred_lr),
    'Random Forest': accuracy_score(y_test, y_pred_rf),
    'XGBoost': accuracy_score(y_test, y_pred_xgb),
    'Neural Network': accuracy_score(y_test, y_pred_nn)
}

print("\n" + "-"*50)
print("MODEL COMPARISON")
print("-"*50)
for model, acc in sorted(results.items(), key=lambda x: x[1], reverse=True):
    print(f"{model:20s}: {acc:.4f}")
print("-"*50)


--------------------------------------------------
MODEL COMPARISON
--------------------------------------------------
Neural Network      : 0.8856
Logistic Regression : 0.8771
Random Forest       : 0.8771
XGBoost             : 0.8729
--------------------------------------------------
