In [None]:
# Cell 1 — Imports (Machine Learning & Deep Learning)

import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Scikit-Learn: classic machine learning tools
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer

# Metrics (corrected and expanded)
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_auc_score,
    RocCurveDisplay,
    f1_score
)

# TensorFlow / Keras: deep learning framework
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Notebook-wide display defaults: show up to 50 DataFrame columns, white-grid plots
pd.options.display.max_columns = 50
sns.set_style("whitegrid")

print("All libraries imported successfully — TensorFlow/Keras and f1_score included!")

In [None]:
# Cell 2: Load CSV and get a quick overview
df = pd.read_csv('survey.csv')

# Show the dimensions of the dataset
print("Dataset shape:", df.shape)

# Display the first 6 rows
display(df.head(6))

# Print summary info about the dataset (column types, non-null counts, etc.).
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in display() — that would render a stray "None" below the report.
df.info()

In [None]:
# Cell 3: Exploratory Data Analysis (EDA)

# Column inventory
print("Columns in the dataset:", list(df.columns))

# Class balance of the target ('treatment'); NaN is counted as its own bucket
print("\nTarget value counts (treatment):")
print(df['treatment'].value_counts(dropna=False))

# Share of missing values per column, in percent, largest first
missing_pct = df.isna().mean().sort_values(ascending=False) * 100
print("\nColumns with missing values (%):")
display(missing_pct.loc[missing_pct > 0].round(2))

# Bar chart of the target classes, most frequent class first
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=df, x='treatment', order=df['treatment'].value_counts().index, ax=ax)
ax.set_title('Distribution of the target: treatment')
plt.show()

# Histogram of the non-missing ages
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df['Age'].dropna(), bins=30, ax=ax)
ax.set_title('Age distribution')
plt.show()

In [None]:
# Cell 4 — Basic data cleaning: age validation, gender normalization, and Yes/No encoding

# Work on a copy so the raw frame loaded from the CSV stays untouched
df2 = df.copy()

# Clean and validate the Age column: anything non-numeric becomes NaN
df2['Age'] = pd.to_numeric(df2['Age'], errors='coerce')
print("Before filtering: number of missing ages =", df2['Age'].isna().sum())

# Blank out unrealistic ages (outside 14–100, both bounds kept).
# Series.where() builds a new, properly upcast column. Writing NaN into an
# integer column through .loc relies on implicit upcasting, which pandas >= 2.1
# deprecates (FutureWarning "Setting an item of incompatible dtype").
# Ages that are already NaN fail between() and therefore stay NaN.
df2['Age'] = df2['Age'].where(df2['Age'].between(14, 100))
print("After filtering: number of missing ages =", df2['Age'].isna().sum())


# Collapse free-text gender answers into three buckets: Male, Female, Other
def clean_gender(x):
    """Map a raw gender answer to 'Male', 'Female' or 'Other'.

    Matching ignores letter case and surrounding whitespace. Missing values
    and every answer that is not one of the known spellings below (including
    trans, non-binary, etc.) are returned as 'Other'.
    """
    known_spellings = {
        # common male variations
        'Male': ('male', 'm', 'man', 'male-ish', 'maile', 'mal', 'cis male', 'male (cis)'),
        # common female variations
        'Female': ('female', 'f', 'woman', 'female (cis)', 'cis female'),
    }

    if not pd.isna(x):
        answer = str(x).strip().lower()
        for category, spellings in known_spellings.items():
            if answer in spellings:
                return category

    return 'Other'

df2['Gender_clean'] = df2['Gender'].apply(clean_gender)


# Encode Yes/No columns as 1/0
binary_cols = ['self_employed', 'family_history', 'treatment', 'remote_work', 'tech_company']

# Note: Series.map sends every value that is not exactly 'Yes' or 'No'
# (missing answers included) to NaN, so a column with gaps ends up as float.
# Cell 7 re-derives the target with pd.to_numeric and drops rows whose target is NaN.
#
# The former follow-up `fillna(df2[col]).infer_objects(copy=False)` filled each
# column with itself. That was a no-op kept quiet by a FutureWarning filter, so
# both are gone and the resulting values are unchanged.
for col in binary_cols:
    if col in df2.columns:
        df2[col] = df2[col].map({'Yes': 1, 'No': 0})   # map Yes/No to 1/0

print("Basic cleaning complete. Here's a sample:")
display(df2[['Age', 'Gender', 'Gender_clean', 'self_employed', 'family_history', 'treatment']].head())

In [None]:
# Cell 5 — Initial feature selection and simple feature creation

# Fresh copy so the cleaning results held in df2 stay intact
df3 = df2.copy()

# A binary "long_hours" feature (e.g. more than 50 working hours per week) would be
# a natural addition, but the dataset has no "hours" column, so it is not created.

# Features considered for the analysis and modeling steps
candidate_features = [
    'Age',
    'Gender_clean',
    'self_employed',
    'family_history',
    'work_interfere',     # categorical: Never, Rarely, Sometimes, Often
    'no_employees',       # categorical: company size
    'remote_work',
    'tech_company',
    'benefits',
    'care_options',
    'wellness_program',
    'seek_help',
    'anonymity'
]

# Drop every name that is not actually a column of the dataframe
available_columns = set(df3.columns)
candidate_features = [name for name in candidate_features if name in available_columns]
print("Candidate features being used:", candidate_features)

# Show the 10 most frequent values of every text column and of every column with
# fewer than 20 distinct values (missing entries are listed as 'NA')
for col in candidate_features:
    column = df3[col]
    if column.dtype != 'object' and column.nunique() >= 20:
        continue
    print(f"\nUnique values in '{col}':")
    print(column.fillna('NA').value_counts().head(10))

In [None]:
# Cell 6 — Preprocessing pipeline
# Corrected version of the cell that failed in notebook 1: the categorical branch
# now casts its values to str through a FunctionTransformer.

print("Defining the preprocessor (with the .astype(str) fix)...")

# Partition the candidate features: int64/float64 columns are numeric, everything
# else (and 'treatment', should it ever be listed) goes to the categorical branch
numeric_dtypes = ['int64', 'float64']
num_features = []
cat_features = []
for feature in candidate_features:
    if feature != 'treatment' and df3[feature].dtype in numeric_dtypes:
        num_features.append(feature)
    else:
        cat_features.append(feature)

print("Numerical features:", num_features)
print("Categorical features:", cat_features)

# Numeric branch: fill gaps with the median, then standardize
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'Missing', cast everything to
# str (mixed dtypes otherwise raise a TypeError in the encoder), then one-hot
# encode; categories unseen during fit are ignored at transform time
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('to_string', FunctionTransformer(lambda x: x.astype(str))),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each feature list through its branch; unlisted columns are dropped and
# the combined output is always dense
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features),
    ],
    remainder='drop',
    sparse_threshold=0,
)

print("Preprocessor (fixed) created.")

In [None]:
# Cell 7: Prepare X, y and create the Train/Test split (for K-Fold later)

print("Preparing X and y (with the robust pd.to_numeric fix)...")

# Feature matrix: an independent copy of the candidate columns
X = df3[candidate_features].copy()

# Target: translate any remaining 'Yes'/'No' labels, then coerce to numbers so
# that anything unparseable becomes NaN
y_temp = df3['treatment'].replace({'Yes': 1, 'No': 0})
y = pd.to_numeric(y_temp, errors='coerce')

# Keep only the rows whose target survived the conversion
mask = y.notna()
X, y = X[mask], y[mask].astype(int)

print(f"Shape after removing NaNs from target: X={X.shape}, y={y.shape}")

# Two sets are produced: Train_Full (80%) and Test (20%).
# The stratified test set is a true hold-out that is used only once, at the
# very end. Train_Full is what the K-Fold procedure later splits for its
# internal training/validation folds.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("\nShapes of the two sets (80/20 split):")
print(f"Train_Full: {X_train_full.shape}, {y_train_full.shape}  (used for K-Fold)")
print(f"Test:       {X_test.shape}, {y_test.shape}            (held out until the end)")

# Fraction of each class (0, 1) inside Train_Full
print("\nTrain_Full target distribution:", np.bincount(y_train_full) / len(y_train_full))


In [None]:
# Cell 8: Preprocess Data and Define the "Raw" Neural Network Functions (K-Fold Version)

# Apply the preprocessor
print("Applying the preprocessor (fit on Train_Full, transform on both sets)...")

# Learn the preprocessing statistics from the 80% training portion only, then
# push both Train_Full and Test through the fitted transformer
X_train_full_processed = preprocessor.fit(X_train_full).transform(X_train_full)
X_test_processed = preprocessor.transform(X_test)

# The hand-written network expects one column per sample: (features, samples)
X_train_full_raw, X_test_raw = X_train_full_processed.T, X_test_processed.T

# Labels become single-row matrices of shape (1, samples)
y_train_full_raw = y_train_full.values.reshape(1, -1)
y_test_raw = y_test.values.reshape(1, -1)

print(f"Raw format of X_train_full (features, samples): {X_train_full_raw.shape}")
print(f"Raw format of X_test (features, samples):      {X_test_raw.shape}")

# Helper functions (same logic as before)

def sigmoid(z):
    """Computes the sigmoid (logistic) activation function, 1 / (1 + exp(-z)).

    The evaluation is arranged so that exp() only ever receives non-positive
    arguments. The textbook form overflows (RuntimeWarning) for large
    negative z.

    Arguments:
    z -- scalar or array of pre-activation values

    Returns:
    Element-wise sigmoid of z, same shape as z (a NumPy scalar for scalar input)
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # always in [0, 1], so it cannot overflow
    # z >= 0: 1 / (1 + e^-z);   z < 0: e^z / (1 + e^z)  (the same function)
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # np.where yields a 0-d array for scalar input; unwrap it to a NumPy scalar
    return result[()] if result.ndim == 0 else result

def sigmoid_gradient(z):
    """Computes the derivative of the sigmoid, s(z) * (1 - s(z)) (for backpropagation)."""
    s = sigmoid(z)
    return s * (1 - s)

In [None]:
# Cell 9: Initialize Parameters

def initialize_parameters(n_x, n_h, n_y, seed=42):
    """
    Initializes the weight matrices (Thetas) with small random values and the
    biases with zeros.

    n_x: number of input neurons (your features)
    n_h: number of neurons in the hidden layer
    n_y: number of output neurons (in our case, this is 1)
    seed: seed of the private random generator (default 42, which reproduces
          the values previously obtained through np.random.seed(42))

    Returns a dictionary with
    W1 (n_h, n_x) and b1 (n_h, 1) -- Theta_1, input layer -> hidden layer
    W2 (n_y, n_h) and b2 (n_y, 1) -- Theta_2, hidden layer -> output layer
    """
    # A private generator keeps the initialization reproducible without
    # reseeding NumPy's global random state as a side effect of every call.
    rng = np.random.RandomState(seed)

    W1 = rng.randn(n_h, n_x) * 0.01   # small random values break symmetry
    b1 = np.zeros((n_h, 1))

    W2 = rng.randn(n_y, n_h) * 0.01
    b2 = np.zeros((n_y, 1))

    parameters = {"W1": W1, "b1": b1, "W2": W2, "b2": b2}

    return parameters

In [None]:
# Cell 10: Forward Propagation
def forward_propagation(X, parameters):
    """
    Runs one forward pass through the 2-layer network (tanh hidden layer,
    sigmoid output layer).

    Arguments:
    X -- input data of shape (n_x, m)
    parameters -- dictionary with the weights and biases W1, b1, W2, b2

    Returns:
    A2 -- output of the sigmoid activation (predictions)
    cache -- dictionary with Z1, A1, Z2 and A2, kept for backpropagation
    """
    W1, b1 = parameters['W1'], parameters['b1']
    W2, b2 = parameters['W2'], parameters['b2']

    cache = {}

    # Hidden layer: affine map followed by tanh
    cache["Z1"] = np.dot(W1, X) + b1
    cache["A1"] = np.tanh(cache["Z1"])

    # Output layer: affine map followed by sigmoid (binary classification)
    cache["Z2"] = np.dot(W2, cache["A1"]) + b2
    cache["A2"] = sigmoid(cache["Z2"])

    return cache["A2"], cache

In [None]:
# Cell 11: Compute Cost Function (with L2 Regularization)
def compute_cost(A2, Y, parameters, lambda_reg):
    """
    Computes the total cost for a 2-layer neural network,
    combining Binary Cross-Entropy loss with L2 regularization.

    Arguments:
    A2 -- predictions from the output layer (sigmoid), shape (1, m)
    Y -- true labels (0 or 1), shape (1, m)
    parameters -- dictionary containing network weights W1 and W2
    lambda_reg -- L2 regularization hyperparameter

    Returns:
    total_cost -- scalar value representing the total cost
    """

    m = Y.shape[1]  # number of examples

    # Retrieve weights for regularization
    W1 = parameters['W1']
    W2 = parameters['W2']

    # Keep probabilities strictly inside (0, 1): a saturated output of exactly
    # 0 or 1 would otherwise produce log(0) = -inf and 0 * -inf = NaN.
    eps = 1e-15
    A2_safe = np.clip(A2, eps, 1 - eps)

    # Binary Cross-Entropy cost
    logprobs = np.multiply(np.log(A2_safe), Y) + np.multiply(np.log(1 - A2_safe), 1 - Y)
    cross_entropy_cost = - (1 / m) * np.sum(logprobs)

    # L2 regularization cost (biases are not penalized)
    L2_cost = (lambda_reg / (2 * m)) * (np.sum(np.square(W1)) + np.sum(np.square(W2)))

    # Total cost = data loss + regularization loss
    total_cost = cross_entropy_cost + L2_cost

    # Ensure the cost is returned as a scalar
    return np.squeeze(total_cost)

In [None]:
# Cell 12: Backward Propagation
def backward_propagation(parameters, cache, X, Y, lambda_reg):
    """
    Implements the backpropagation algorithm for a 2-layer neural network
    (tanh hidden layer, sigmoid output, cross-entropy loss with L2 penalty).

    Arguments:
    parameters -- dictionary containing network weights W1 and W2
    cache -- dictionary containing the activations A1 and A2 from forward propagation
    X -- input data, shape (n_x, m)
    Y -- true labels, shape (1, m)
    lambda_reg -- L2 regularization hyperparameter

    Returns:
    grads -- dictionary containing gradients of weights and biases
             (dW1, db1, dW2, db2)
    """

    m = X.shape[1]  # number of examples

    # Retrieve weights
    W1 = parameters['W1']
    W2 = parameters['W2']

    # Only the activations are needed: the tanh derivative is expressed through
    # A1, so the pre-activation Z1 is never read and need not be in the cache.
    A1 = cache['A1']
    A2 = cache['A2']

    # Error at the output layer: derivative of the cost w.r.t. Z2
    # (sigmoid combined with cross-entropy collapses to A2 - Y)
    dZ2 = A2 - Y

    # Gradients for weights and biases between hidden and output layer
    dW2 = (1 / m) * np.dot(dZ2, A1.T) + (lambda_reg / m) * W2  # include L2 regularization
    db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)

    # Backpropagate the error to the hidden layer
    # derivative of tanh activation: 1 - A1^2
    dZ1 = np.dot(W2.T, dZ2) * (1 - np.power(A1, 2))

    # Gradients for weights and biases between input and hidden layer
    dW1 = (1 / m) * np.dot(dZ1, X.T) + (lambda_reg / m) * W1  # include L2 regularization
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)

    # Pack gradients into a dictionary
    grads = {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}

    return grads


In [None]:
# Cell 13: Update Parameters (Gradient Descent)
def update_parameters(parameters, grads, learning_rate):
    """
    Applies one plain gradient-descent step to every weight and bias.

    Arguments:
    parameters -- dictionary with the current W1, b1, W2, b2
    grads -- dictionary with the matching gradients dW1, db1, dW2, db2
    learning_rate -- step size for gradient descent updates

    Returns:
    parameters -- new dictionary with the updated weights and biases
                  (the input dictionary and its arrays are left untouched)
    """
    # theta <- theta - learning_rate * d(theta), for each parameter in turn
    return {
        name: parameters[name] - learning_rate * grads['d' + name]
        for name in ('W1', 'b1', 'W2', 'b2')
    }

In [None]:
# Cell 14: The Full Neural Network Model
def nn_model(X, Y, n_h, num_epochs, lambda_reg, learning_rate, print_cost=False):
    """
    Builds and trains a simple 2-layer neural network from scratch
    using full-batch gradient descent.

    Arguments:
    X -- input data, shape (n_x, m)
    Y -- true labels, shape (1, m)
    n_h -- number of neurons in the hidden layer
    num_epochs -- number of training iterations
    lambda_reg -- L2 regularization hyperparameter
    learning_rate -- learning rate for gradient descent
    print_cost -- if True, prints the cost every 100 epochs

    Returns:
    parameters -- learned weights and biases
    costs -- list of costs recorded during training (every 100 epochs,
             whether or not they are printed)
    """

    n_x = X.shape[0]  # number of input features
    n_y = Y.shape[0]  # number of outputs (1 for binary classification)
    costs = []

    # 1. Initialize parameters
    parameters = initialize_parameters(n_x, n_h, n_y)

    # 2. Training loop
    for i in range(num_epochs):

        # 3. Forward propagation
        A2, cache = forward_propagation(X, parameters)

        # 4. Record the regularized cost every 100 epochs. The cost is only
        #    needed for the learning curve, so it is not computed on the
        #    other epochs, and recording no longer depends on print_cost.
        if i % 100 == 0:
            cost = compute_cost(A2, Y, parameters, lambda_reg)
            if print_cost:
                print(f"Cost after epoch {i}: {cost}")
            costs.append(cost)

        # 5. Backward propagation
        grads = backward_propagation(parameters, cache, X, Y, lambda_reg)

        # 6. Update parameters using gradient descent
        parameters = update_parameters(parameters, grads, learning_rate)

    return parameters, costs


# Inference helper for a trained network
def predict(X, parameters):
    """
    Predicts binary labels with the trained network.

    Arguments:
    X -- input data, shape (n_x, m)
    parameters -- learned weights and biases

    Returns:
    predictions -- integer array of 0/1 labels; 1 where the output
                   probability is strictly greater than 0.5
    """
    probabilities = forward_propagation(X, parameters)[0]
    return (probabilities > 0.5).astype(int)

In [None]:
# Cell 15: Train the "Raw" MLP (K-Fold Version)

# Hyperparameters
n_h = 8               # hidden-layer width
num_epochs = 2000     # gradient-descent iterations
lambda_reg = 0.1      # L2 regularization strength
learning_rate = 0.01  # gradient-descent step size

print("Starting training of the 'raw' model on the full training set...")
start_time = time.time()

# Fit the network on the complete (80%) training set, printing the cost as it goes
parameters, costs = nn_model(
    X_train_full_raw, y_train_full_raw,
    n_h, num_epochs, lambda_reg, learning_rate,
    print_cost=True,
)

print(f"Training completed in {time.time() - start_time:.2f} seconds.")

# Learning curve: one recorded cost per 100 epochs
fig, ax = plt.subplots()
ax.plot(np.squeeze(costs))
ax.set_ylabel('Cost (Loss)')
ax.set_xlabel('Epochs (x100)')
ax.set_title(f"Learning Curve (Learning Rate = {learning_rate})")
plt.show()

In [None]:
# Cell 16: Evaluate the "Raw" MLP (K-Fold Version)
# Scores the network trained in Cell 15 (`parameters`) on the training set and on
# the held-out test set, then draws the confusion matrix for the test set.

# Make predictions on the TEST set
y_pred_raw_test = predict(X_test_raw, parameters)

# Make predictions on the full TRAIN set
y_pred_raw_train = predict(X_train_full_raw, parameters)  # updated to full train set

# predict() returns a (1, m) array; flatten to 1-D for compatibility with sklearn metrics
y_pred_flat_test = y_pred_raw_test.flatten()
y_pred_flat_train = y_pred_raw_train.flatten()

# Performance Diagnostics
print("\n--- Performance Diagnostics ---\n")

# Classification report on the TRAIN set (compare with the test report below
# to judge over- or under-fitting)
print("Classification Report (TRAIN_FULL):")
print(classification_report(y_train_full, y_pred_flat_train, digits=4))
print("---")

# Classification report on the TEST set
print("Classification Report (TEST):")
print(classification_report(y_test, y_pred_flat_test, digits=4))
print("---")

# Confusion Matrix for TEST set (rows: true class, columns: predicted class;
# label 0 is shown as 'No', label 1 as 'Yes')
print("\nConfusion Matrix (TEST):\n")
cm = confusion_matrix(y_test, y_pred_flat_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No', 'Yes'])
disp.plot(cmap=plt.cm.Blues, colorbar=False)
disp.ax_.grid(False)  # remove grid for cleaner plot
plt.show()


In [None]:
# Cell 17: ROC Curve for the "Raw" Model

# Get the predicted probabilities for the test set
# (forward_propagation returns the output-layer probabilities, A2)
# 'parameters' comes from Cell 15
# 'X_test_raw' comes from Cell 8
probs_raw_test, _ = forward_propagation(X_test_raw, parameters)
probs_raw_test = probs_raw_test.flatten()  # flatten for sklearn metrics

# Compute the AUC score (Area Under the Curve)
auc_raw_test = roc_auc_score(y_test, probs_raw_test)
print(f"AUC of the 'Raw' Model (Test Set): {auc_raw_test:.4f}")

# Plot the ROC Curve
plt.figure(figsize=(8, 6))
ax = plt.gca()  # get current axes

# RocCurveDisplay appends "(AUC = ...)" to `name` in the legend on its own, so
# the name must not contain the score as well (it used to be shown twice).
RocCurveDisplay.from_predictions(
    y_test,
    probs_raw_test,
    name="Raw Model",
    ax=ax
)

# Plot the random classifier line for reference
plt.plot([0, 1], [0, 1], 'r--', label='Random Classifier (AUC = 0.50)')

plt.title('ROC Curve - "Raw" Model (Test Set)')
plt.legend()
plt.show()

In [None]:
# Cell 18: The "Refined" Model (with Early Stopping)

# This is a modified copy of 'nn_model' (Cell 14)
# that includes Early Stopping.
def nn_model_refined(X_train, Y_train, X_val, Y_val, n_h, num_epochs, lambda_reg, learning_rate, patience=20):
    """
    Builds and trains a 2-layer neural network with Early Stopping.

    Arguments:
    X_train, Y_train -- training data and labels
    X_val, Y_val -- validation data and labels (for overfitting monitoring)
    n_h -- number of neurons in the hidden layer
    num_epochs -- maximum number of training iterations
    lambda_reg -- L2 regularization hyperparameter
    learning_rate -- step size for gradient descent
    patience -- number of epochs to wait without improvement before stopping

    Returns:
    best_params -- parameters (weights and biases) from the epoch with the
                   lowest validation cost; the initial parameters if the
                   validation cost never improved (e.g. num_epochs == 0 or a
                   NaN cost)
    costs_train -- list of training costs over epochs
    costs_val -- list of validation costs over epochs
    """

    n_x = X_train.shape[0]  # number of input features
    n_y = Y_train.shape[0]  # number of outputs
    costs_train = []
    costs_val = []  # store validation costs

    # Initialize parameters
    parameters = initialize_parameters(n_x, n_h, n_y)

    best_cost = float('inf')  # best validation cost so far
    patience_counter = 0
    # Start from a usable snapshot instead of an empty dict, so the function
    # always returns parameters that predict() can consume.
    best_params = {name: value.copy() for name, value in parameters.items()}

    for i in range(num_epochs):

        # Training step (the training cost is measured before the update)
        A2_train, cache_train = forward_propagation(X_train, parameters)
        cost_train = compute_cost(A2_train, Y_train, parameters, lambda_reg)
        grads = backward_propagation(parameters, cache_train, X_train, Y_train, lambda_reg)
        parameters = update_parameters(parameters, grads, learning_rate)

        # Validation step (measured after the update; compute_cost adds the
        # L2 term here as well)
        A2_val, _ = forward_propagation(X_val, parameters)
        cost_val = compute_cost(A2_val, Y_val, parameters, lambda_reg)

        costs_train.append(cost_train)
        costs_val.append(cost_val)

        # Early Stopping logic
        if cost_val < best_cost:
            best_cost = cost_val
            # Snapshot the arrays themselves so later updates cannot alter them
            best_params = {name: value.copy() for name, value in parameters.items()}
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"--- Early stopping triggered at epoch {i} ---")
            break

    return best_params, costs_train, costs_val

print("Function 'nn_model_refined' (with Early Stopping) defined.")

In [None]:
# Cell 19: Grid Search with K-Fold Cross-Validation
# For every hyperparameter combination, trains nn_model_refined on each of the
# 5 stratified folds of Train_Full, scores F1 on the fold's validation part,
# keeps the combination with the best mean F1, and finally retrains on all of
# Train_Full with nn_model.
#
# NOTE(review): X_train_full_raw comes from a preprocessor fitted on the whole
# of X_train_full (Cell 8), so every validation fold has already contributed to
# the imputation/scaling/encoding statistics.
# NOTE(review): the validation fold that drives early stopping is also the fold
# that is scored, so the fold F1 values are likely optimistic.

print("Starting Grid Search with K-Fold (K=5) for the refined model...")

# Define the hyperparameter grid
param_grid = {
    'n_h': [8, 16],              # hidden layer sizes to test
    'lambda_reg': [0.1, 0.5, 1.0], # L2 regularization values
    'learning_rate': [0.01]      # learning rate(s) to test
}

# Initialize K-Fold cross-validation (stratified, shuffled with a fixed seed so
# every hyperparameter combination sees the same folds)
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

best_score_f1 = -1  # below any possible F1, so the first combination always wins
best_hyperparams = {}
# (final model parameters will be trained at the end on all data)

start_time = time.time()

# Loop over hyperparameter combinations
# (the loop variable n_h rebinds the notebook-level n_h set in Cell 15)
for n_h in param_grid['n_h']:
    for lr in param_grid['learning_rate']:
        for lam in param_grid['lambda_reg']:

            print(f"\nTesting: n_h={n_h}, lr={lr}, lambda={lam}")
            fold_scores = []  # store F1-scores for each fold

            # K-Fold Cross-Validation
            for fold, (train_index, val_index) in enumerate(skf.split(X_train_full, y_train_full)):

                # Select "raw" data based on fold indices
                # (samples are columns in the raw layout, hence [:, index])
                X_train_fold = X_train_full_raw[:, train_index]
                y_train_fold = y_train_full_raw[:, train_index]
                X_val_fold = X_train_full_raw[:, val_index]
                y_val_fold = y_train_full_raw[:, val_index]

                # For scoring with sklearn, we need the normal labels
                # (positional selection, matching the positional fold indices)
                y_val_fold_series = y_train_full.iloc[val_index]

                # Train the refined model (with Early Stopping)
                params, _, _ = nn_model_refined(
                    X_train_fold, y_train_fold,
                    X_val_fold, y_val_fold,  # validation fold
                    n_h=n_h,
                    num_epochs=2000,
                    lambda_reg=lam,
                    learning_rate=lr,
                    patience=50
                )

                # Evaluate this fold
                y_pred_val_fold = predict(X_val_fold, params).flatten()
                score = f1_score(y_val_fold_series, y_pred_val_fold)
                fold_scores.append(score)

            # Compute average F1-score across folds
            avg_f1_score = np.mean(fold_scores)
            print(f"-> Mean F1 (K-Fold): {avg_f1_score:.4f}  (Scores: {[round(s, 2) for s in fold_scores]})")

            # Update best hyperparameters if this combination is better
            # (strict '>' keeps the earliest combination on ties)
            if avg_f1_score > best_score_f1:
                best_score_f1 = avg_f1_score
                best_hyperparams = {'n_h': n_h, 'lr': lr, 'lambda': lam}

print(f"\nGrid Search completed in {time.time() - start_time:.2f} seconds.")
print(f"\n--- BEST HYPERPARAMETERS (K-Fold) ---")
print(f"Best average F1-score: {best_score_f1:.4f}")
print(f"Best Hyperparameters: {best_hyperparams}")

# Final Retraining
# Train the final model on all training data (80% of the dataset)
# using the simple nn_model (Cell 14): a fixed 2000 epochs with no early
# stopping, because no validation data is held back at this point
print("\nRetraining the final model on ALL training data...")
best_model_params, _ = nn_model(
    X_train_full_raw, y_train_full_raw,
    n_h=best_hyperparams['n_h'],
    num_epochs=2000,  # fixed number of epochs
    lambda_reg=best_hyperparams['lambda'],
    learning_rate=best_hyperparams['lr'],
    print_cost=False  # no printing
)
print("Final model retrained.")

In [None]:
# Cell 20: Final Evaluation of the Refined Model (Post K-Fold)
# Uses 'best_model_params', the model retrained at the end of Cell 19.

# Make predictions on the test and train_full sets with the best model
# (predict() returns shape (1, m); flatten to 1-D for sklearn)
y_pred_refined_test = predict(X_test_raw, best_model_params).flatten()
y_pred_refined_train = predict(X_train_full_raw, best_model_params).flatten()

print("\n--- Performance Diagnostics (REFINED MODEL POST K-FOLD) ---")

# Classification report for the training set
print("Classification Report (TRAIN_FULL - Refined):")
print(classification_report(y_train_full, y_pred_refined_train, digits=4))
print("---")

# Classification report for the test set
print("Classification Report (TEST - Refined):")
print(classification_report(y_test, y_pred_refined_test, digits=4))
print("---")

# Confusion matrix for the test set (label 0 shown as 'No', label 1 as 'Yes')
print("\nConfusion Matrix (TEST - Refined):\n")
cm = confusion_matrix(y_test, y_pred_refined_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No', 'Yes'])
disp.plot(cmap=plt.cm.Blues, colorbar=False)
disp.ax_.grid(False)  # remove grid for cleaner plot
plt.show()

# Visualization: Distribution of predicted probabilities
# (forward_propagation returns the sigmoid outputs before thresholding)
print("\nDistribution of Predicted Probabilities (TEST - Refined):\n")
probs_refined, _ = forward_propagation(X_test_raw, best_model_params)
probs_flat = probs_refined.flatten()

# One histogram per true class; well-separated humps mean confident predictions
plt.figure(figsize=(10, 6))
plt.hist(probs_flat[y_test == 0], bins=30, alpha=0.5, label='Class 0 (No)')
plt.hist(probs_flat[y_test == 1], bins=30, alpha=0.5, label='Class 1 (Yes)')
plt.xlabel('Predicted Probability of "Yes"')
plt.ylabel('Frequency')
plt.title('Predicted Probability Distribution (Refined Model)')
plt.legend()
plt.show()

# Error Analysis
print("\n--- Error Analysis (TEST - Refined) ---")
# Positions (not index labels) of the test rows whose prediction is wrong
misclassified_idx = np.where(y_test != y_pred_refined_test)[0]
print(f"Total errors: {len(misclassified_idx)} out of {len(y_test)}")
print("Showing the first 5 misclassified examples:")

for idx in misclassified_idx[:5]:
    # Translate the position back to the original dataframe label so the
    # un-preprocessed feature values can be looked up in X
    original_idx = y_test.index[idx]
    true_label = y_test.iloc[idx]
    pred_label = y_pred_refined_test[idx]

    print(f"\n* Original Index: {original_idx}")
    print(f"  True Label: {true_label}, Predicted: {pred_label}")
    print(f"  Features: {X.loc[original_idx].to_dict()}")


In [None]:
# Cell 21: ROC Curve (Refined Model - Test Set)

# Get predicted probabilities for the test set
# ('best_model_params' is the model retrained at the end of Cell 19)
probs_refined_test, _ = forward_propagation(X_test_raw, best_model_params)
probs_refined_test = probs_refined_test.flatten()  # flatten for sklearn metrics

# Compute the AUC score (Area Under the Curve)
auc_refined_test = roc_auc_score(y_test, probs_refined_test)
print(f"AUC of the Refined Model (Test Set): {auc_refined_test:.4f}")

# Plot the ROC Curve
plt.figure(figsize=(8, 6))
ax = plt.gca()  # get current axes

# RocCurveDisplay appends "(AUC = ...)" to `name` in the legend on its own, so
# the name must not contain the score as well (it used to be shown twice).
RocCurveDisplay.from_predictions(
    y_test,
    probs_refined_test,
    name="Refined Model",
    ax=ax
)

# Plot the reference line for a random classifier
plt.plot([0, 1], [0, 1], 'r--', label='Random Classifier (AUC = 0.50)')

plt.title('ROC Curve - Refined Model Post-KFold (Test Set)')
plt.legend()
plt.show()

In [None]:
# Cell 22: Feature Importance Analysis (Refined Model)

print("--- Feature Importance Analysis (Weights from Layer 1 of the Refined Model) ---")

# Rebuild the names of the network inputs straight from the fitted preprocessor
# ('preprocessor', 'num_features' and 'cat_features' come from Cell 6):
# numeric columns first, followed by the one-hot columns of the categorical branch
print("Fetching feature names from the preprocessor...")
_, cat_branch, _ = preprocessor.transformers_[1]
onehot_encoder = cat_branch.named_steps['onehot']
onehot_features = onehot_encoder.get_feature_names_out(cat_features)
all_features = num_features + list(onehot_features)
print(f"Found {len(all_features)} features in total.")

# First-layer weights of the final model retrained in Cell 19, shape (n_h, n_x)
W1_refined = best_model_params['W1']

# Importance proxy: average absolute weight leaving each input feature,
# taken over all hidden neurons
feature_importance_refined = np.abs(W1_refined).mean(axis=0)

# Pair every feature name with its score for easy viewing
df_importance_refined = pd.DataFrame({
    'Feature': all_features,
    'Importance (Mean Abs W1)': feature_importance_refined,
})

# Ten highest-scoring features
print("\nTop 10 Features (based on mean absolute weights from the first layer):")
ranked_importance = df_importance_refined.sort_values(by='Importance (Mean Abs W1)', ascending=False)
display(ranked_importance.head(10))

In [None]:
# Cell 23: Prepare Data for Keras/TensorFlow Model

print("--- Section: Keras/TensorFlow Model ---")

# Unlike our "raw" model, Keras/TensorFlow expects data in shape (n_samples, n_features)
# We'll use the processed datasets from Cell 8 (before transposing).

# Reminder of available datasets:
# X_train_full_processed -> 80% of data, preprocessed features
# y_train_full           -> 80% of data, target labels
# X_test_processed       -> 20% of data, preprocessed features
# y_test                 -> 20% of data, target labels

# Create a validation split for Early Stopping in Keras
# We'll use 20% of the training data as a validation set
# NOTE(review): X_train_full_processed was produced by a preprocessor fitted on
# all of X_train_full (Cell 8), so the validation rows carved out here have
# already contributed to the imputation/scaling statistics.
X_train_k, X_val_k, y_train_k, y_val_k = train_test_split(
    X_train_full_processed, 
    y_train_full,
    test_size=0.20,       # 20% of the 80% training data = 16% of total dataset
    stratify=y_train_full, # keep the class distribution consistent
    random_state=42
)

# Define the number of input features (for the first layer of Keras)
n_features = X_train_k.shape[1]

# Display dataset shapes
print(f"Number of input features (input_shape): {n_features}")
print(f"Training set shape: {X_train_k.shape}")
print(f"Validation set shape: {X_val_k.shape}")
print(f"Test set shape: {X_test_processed.shape}")

In [None]:
# Cell 24: Build the Keras Model (Deeper Architecture)

# Retrieve the best L2 regularization value (lambda) found via K-Fold (Cell 18).
# Fall back to 0.1 (as in Cell 15) when 'best_hyperparams' does not exist
# (NameError) or exists without a 'lambda' entry (KeyError).
try:
    best_lambda = best_hyperparams['lambda']
except (NameError, KeyError):
    best_lambda = 0.1

print(f"Using L2 regularization (lambda) = {best_lambda}")

# NOTE(review): Keras' l2(l) adds l * sum(w**2) to the loss. If the "raw"
# model scaled its penalty differently (e.g. by 1 / (2 * m)), the same lambda
# is not an equivalent regularization strength here - confirm.
# NOTE(review): layer weights are randomly initialised; seed the RNGs earlier
# in the notebook if run-to-run reproducibility is required.

# Define the Sequential model
model_keras = Sequential()

# Explicit input specification. Declaring the shape with keras.Input replaces
# the deprecated `input_shape=` argument on the first Dense layer.
model_keras.add(keras.Input(shape=(n_features,)))

# First hidden layer: 16 neurons.
# 'tanh' activation is used to match the "raw" model.
model_keras.add(Dense(
    16,
    activation='tanh',
    kernel_regularizer=tf.keras.regularizers.l2(best_lambda)  # L2 regularization
))

# Second hidden layer: 8 neurons, same activation and regularization
model_keras.add(Dense(
    8,
    activation='tanh',
    kernel_regularizer=tf.keras.regularizers.l2(best_lambda)
))

# Output layer: a single sigmoid neuron giving a probability in (0, 1)
# for the binary classification target
model_keras.add(Dense(
    1,
    activation='sigmoid'
))

# Display the model architecture
print("\n--- Keras Model Architecture ---")
model_keras.summary()

In [None]:
# Cell 25: Compile and Train the Keras Model

# Upper bound on training length; EarlyStopping is expected to end training sooner
max_epochs = 1000

# Early Stopping callback: halts training once the validation loss has not
# improved for `patience` consecutive epochs, then restores the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',          # quantity being watched
    patience=50,                 # epochs to wait without improvement
    restore_best_weights=True    # roll back to the best epoch's weights
)

# Compile the model
model_keras.compile(
    optimizer='adam',                   # adaptive optimizer
    loss='binary_crossentropy',         # loss function for binary classification
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]  # metrics to track
)

# Train the model
print("\n--- Starting Keras training (with progress bars!) ---")

history = model_keras.fit(
    X_train_k, y_train_k,
    epochs=max_epochs,                  # maximum epochs
    batch_size=32,                      # mini-batch gradient descent
    validation_data=(X_val_k, y_val_k), # validation set monitored by EarlyStopping
    callbacks=[early_stopping],         # pass the EarlyStopping callback
    verbose=1                           # 1 = show progress bars
)

# Report how training actually ended instead of assuming an early stop:
# `stopped_epoch` stays at 0 unless the callback interrupted training.
epochs_run = len(history.history['loss'])
best_epoch = int(np.argmin(history.history['val_loss'])) + 1  # 1-based epoch number

if early_stopping.stopped_epoch > 0:
    print(f"\nKeras training completed (stopped early by EarlyStopping after {epochs_run} epochs).")
else:
    print(f"\nKeras training completed (ran all {epochs_run} epochs; EarlyStopping did not trigger).")
print(f"Lowest validation loss was reached at epoch {best_epoch}.")

In [None]:
# Cell 26: Final Evaluation of the Keras Model

print("\n--- Keras Model Evaluation (on Test Set) ---")

# evaluate() returns the loss followed by the compiled metrics
# (accuracy and AUC, in the order given to compile()).
test_metrics = model_keras.evaluate(X_test_processed, y_test, verbose=0)
loss, accuracy, auc = test_metrics
print(f"Test Loss:     {loss:.4f}")
print(f"Test Accuracy: {accuracy*100:.2f}%")
print(f"Test AUC:      {auc:.4f}")

# Predicted probabilities, then hard 0/1 labels using a 0.5 cut-off
y_pred_probs_k = model_keras.predict(X_test_processed)
y_pred_k = (y_pred_probs_k.flatten() > 0.5).astype(int)

# Per-class precision / recall / F1 on the test set
print("\nClassification Report (TEST - Keras):")
report_k = classification_report(y_test, y_pred_k, digits=4)
print(report_k)

# Confusion matrix, drawn without colorbar or grid for a cleaner display
print("\nConfusion Matrix (TEST - Keras):")
cm_k = confusion_matrix(y_test, y_pred_k)
disp_k = ConfusionMatrixDisplay(
    confusion_matrix=cm_k,
    display_labels=['No', 'Yes'],
)
disp_k.plot(cmap=plt.cm.Blues, colorbar=False)
disp_k.ax_.grid(False)
plt.show()


In [None]:
# Cell 27: ROC Curve (Keras Model - Test Set)

# Predicted PROBABILITIES for the test set, flattened to 1-D for sklearn metrics
probs_keras = model_keras.predict(X_test_processed).flatten()

# Compute the AUC score (Area Under the Curve)
auc_keras = roc_auc_score(y_test, probs_keras)
print(f"AUC of the Keras Model (Test Set): {auc_keras:.4f}")

# Plot the ROC curve on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 6))

# RocCurveDisplay appends "(AUC = ...)" to `name` in the legend by itself,
# so only the plain model name is passed; embedding the AUC in `name` as well
# would print it twice in the legend entry.
RocCurveDisplay.from_predictions(
    y_test,
    probs_keras,
    name="Keras Model",
    ax=ax
)

# Reference diagonal for a random classifier
ax.plot([0, 1], [0, 1], 'r--', label='Random Classifier (AUC = 0.50)')

ax.set_title('ROC Curve - Keras Model (Test Set)')
ax.legend()
plt.show()