In [None]:
# Cell 1: imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay

# Notebook-wide display configuration: pandas shows up to 50 columns and
# uses a 200-character display width; seaborn plots use the 'whitegrid' style.
for option_name, option_value in (('display.max_columns', 50), ('display.width', 200)):
    pd.set_option(option_name, option_value)
sns.set_style('whitegrid')

# Marker that the import/configuration cell ran to completion.
print("Imports ok")

In [None]:
# Cell 2: load and view CSV
# Read the survey responses from 'survey.csv' (path is relative to the
# notebook's working directory) and take a first look at the data.
df = pd.read_csv('survey.csv')
print("Shape:", df.shape)
display(df.head(6))

# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in display() would add a stray "None" output.
df.info()

In [None]:
# Cell 3: Exploratory Data Analysis
print("Columns:", list(df.columns))
print("\nTarget value counts (treatment):")
print(df['treatment'].value_counts(dropna=False))

# Missing values %: share of NaN per column, largest first.
# Only columns that actually have missing values are displayed.
missing_pct = df.isna().mean().sort_values(ascending=False) * 100
display(missing_pct[missing_pct > 0].round(2))

# Target distribution, bars ordered by frequency
plt.figure(figsize=(8, 4))
sns.countplot(data=df, x='treatment', order=df['treatment'].value_counts().index)
plt.title('Distribution of target: treatment')
plt.show()

# Age: keep only rows whose age lies in [AGE_MIN, AGE_MAX] (inclusive).
# Rows with a missing Age also fail the comparison and are dropped.
# df2 is a copy, so the raw frame `df` stays untouched.
AGE_MIN, AGE_MAX = 14, 100
df2 = df[(df['Age'] >= AGE_MIN) & (df['Age'] <= AGE_MAX)].copy()

# Histogram of the filtered ages. The title is built from the same bounds
# used by the filter so the label cannot drift from the actual range.
plt.figure(figsize=(8, 4))
sns.histplot(df2['Age'].dropna(), bins=30)
plt.title(f'Age distribution (Filtered {AGE_MIN}-{AGE_MAX} years)')
plt.show()

In [None]:
# Cell 4 — Final Data Preparation Pipeline

import warnings
import pandas as pd
from sklearn.preprocessing import FunctionTransformer

# Gender normalisation
# Collapses free-text gender answers into exactly three labels:
# "Male", "Female" or "Other".
def clean_gender(x):
    """Return 'Male', 'Female' or 'Other' for a raw gender answer.

    Missing values (as judged by ``pd.isna``) map to 'Other'. Any other
    value is converted to ``str``, stripped of surrounding whitespace and
    lower-cased before being compared with the known spellings below;
    anything that is not recognised is also 'Other'.
    """
    male_spellings = ('male', 'm', 'man', 'male-ish', 'maile', 'mal', 'cis male', 'male (cis)')
    female_spellings = ('female', 'f', 'woman', 'female (cis)', 'cis female')

    if not pd.isna(x):
        token = str(x).strip().lower()
        if token in male_spellings:
            return 'Male'
        if token in female_spellings:
            return 'Female'

    # Missing or unrecognised answer
    return 'Other'


# Create df3 and Apply Cleaning Steps
# Work on a copy of df2 so this cell can be re-run without touching df2.
df3 = df2.copy()

# Clean gender column
df3['Gender_clean'] = df3['Gender'].apply(clean_gender)

# Convert Yes/No columns to binary 1/0.
# After the replacement, anything that still cannot be parsed as a number
# becomes NaN (errors='coerce').
bin_cols = [
    'self_employed', 'family_history', 'treatment', 'remote_work', 'tech_company',
    'benefits', 'care_options', 'wellness_program', 'seek_help', 'anonymity'
]

with warnings.catch_warnings():
    # FutureWarnings raised inside this block are silenced on purpose.
    warnings.simplefilter("ignore", FutureWarning)
    for c in bin_cols:
        if c in df3.columns:
            series = df3[c].replace({'Yes': 1, 'No': 0})
            # Assign with df3[c] = ... rather than df3.loc[:, c] = ...:
            # the .loc form writes into the existing column in place on
            # pandas >= 2.0, which can leave it as object dtype. Plain column
            # assignment replaces the column, so the numeric dtype produced
            # by pd.to_numeric is kept regardless of the pandas version.
            df3[c] = pd.to_numeric(series, errors='coerce')

print("df3 cleaned (gender and binary columns).")


# Candidate model inputs
# Start from the wish-list below and keep only the columns that are
# actually present in df3 (original order preserved).
candidate_features = [
    'Age', 'Gender_clean', 'self_employed', 'family_history',
    'work_interfere', 'no_employees', 'remote_work', 'tech_company',
    'benefits', 'care_options', 'wellness_program', 'seek_help', 'anonymity'
]
candidate_features = [name for name in candidate_features if name in df3.columns]
print("Candidate features selected.")


# Split the candidates by dtype
# A feature counts as numerical when its df3 column dtype is int64 or
# float64 (and it is not the target); every other feature is categorical.
num_features, cat_features = [], []
for feature in candidate_features:
    is_numeric = df3[feature].dtype in ['int64', 'float64'] and feature != 'treatment'
    (num_features if is_numeric else cat_features).append(feature)

print(f"Numerical features: {num_features}")
print(f"Categorical features: {cat_features}")

# Numerical branch: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch:
#   1. fill gaps with the literal category 'Missing'
#   2. cast every value to str so the encoder sees a single type
#   3. one-hot encode; categories unseen during fit are ignored at transform
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('to_string', FunctionTransformer(lambda values: values.astype(str))),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each feature group through its branch; columns not listed in either
# group are dropped, and sparse_threshold=0 requests a dense output.
preprocessor = ColumnTransformer(
    [
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features),
    ],
    remainder='drop',
    sparse_threshold=0,
)

print("Preprocessor created.")


# Features / target
target_col = 'treatment'
print(f"Shape before dropping NaNs in target: {df3.shape}")

# Rows without a target label cannot be used for supervised training.
df3 = df3[df3[target_col].notna()]

print(f"Shape after dropping NaNs in target: {df3.shape}")

X = df3.loc[:, candidate_features]
y = df3[target_col]

# Informational message only; the dtype of y is not checked here.
print("Target column 'y' is numeric.")


# Hold-out split
# 80/20 split, stratified on y so both parts keep the class proportions.
# STATE is the shared random seed reused by later cells.
STATE = 42

split_parts = train_test_split(X, y, test_size=0.2, random_state=STATE, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")


# Random Forest pipeline
# Preprocessing and the classifier live in one estimator, so the
# preprocessing is fitted together with the model on whatever data the
# pipeline is trained on.
rf_model = RandomForestClassifier(
    random_state=STATE,
    class_weight='balanced',   # re-weight classes inversely to their frequency
)
pipe_rf = Pipeline(steps=[
    ('pre', preprocessor),
    ('model', rf_model),
])

print("Random Forest pipeline created.")
print("--- All preparation steps completed. You can now run Cell 5. ---")


In [None]:
# Cell 5: GridSearch cell
import time

# Step 1: make sure the training target is numeric
print("--- FORCING DATA CLEANUP ---")
print(f"Data type in y_train BEFORE fix: {y_train.dtype}")
print("Value counts in y_train BEFORE fix:")
print(y_train.value_counts(dropna=False))

# Map "Yes"/"No" to 1/0; anything that still is not a number becomes NaN.
y_temp_numeric = pd.to_numeric(
    y_train.replace({'Yes': 1, 'No': 0}),
    errors='coerce',
)

# Keep only the rows whose target survived the conversion, applying the
# same boolean mask to features and target so they stay aligned.
mask = y_temp_numeric.notna()
X_train_clean = X_train.loc[mask]
y_train_clean = y_temp_numeric.loc[mask]

print("\n--- DATA IS NOW CLEAN ---")
print("Value counts in y_train AFTER fix:")
print(y_train_clean.value_counts(dropna=False))
print(f"Shape of X_train (clean): {X_train_clean.shape}")
print(f"Shape of y_train (clean): {y_train_clean.shape}")


# Step 2: hyperparameter search on the cleaned training data
# Small grid: 2 forest sizes x 3 depth limits (None = unlimited depth).
param_grid_rf = {
    'model__n_estimators': [100, 150],
    'model__max_depth': [10, 15, None],
}

# 5-fold stratified CV, shuffled with the shared seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=STATE)

# Candidates are ranked by F1; up to 4 parallel jobs are used.
grid_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid_rf,
    cv=cv,
    scoring='f1',
    n_jobs=4,
)

print("\nStarting GridSearch on CLEAN data... this may take a minute.")
start_time = time.time()

# Fits the whole pipeline (preprocessing + forest) for every candidate/fold.
grid_rf.fit(X_train_clean, y_train_clean)

end_time = time.time()
print(f"GridSearch took {end_time - start_time:.2f} seconds.")
print("---")

# Step 3: report the winner
print(f"Best F1 score (in CV): {grid_rf.best_score_:.4f}")
print("Best parameters found:")
print(grid_rf.best_params_)


In [None]:
# Cell 6: Evaluate on Test Set

# Best pipeline found by the grid search (preprocessing + Random Forest).
best_rf = grid_rf.best_estimator_

# Predict on the held-out features; the pipeline applies its fitted
# preprocessing to X_test before the forest predicts.
y_pred_rf = best_rf.predict(X_test)

# Bring the test target into the same numeric form used for training.
print(f"y_test values BEFORE fix: {y_test.value_counts(dropna=False).index[0:2]}...")

# 'Yes'/'No' become 1/0; any other non-numeric value becomes NaN.
y_test_numeric = pd.to_numeric(
    y_test.replace({'Yes': 1, 'No': 0}),
    errors='coerce',
)

# Keep only rows with a valid label. The same boolean mask (as a positional
# array) is applied to the predictions so labels and predictions stay
# aligned row by row.
mask = y_test_numeric.notna()
y_test_clean = y_test_numeric.loc[mask]
y_pred_rf_clean = y_pred_rf[mask.to_numpy()]

print(f"y_test values AFTER fix: {y_test_clean.value_counts(dropna=False).index[0:2]}...")

# Per-class precision / recall / F1 on the test set.
print("\nRandom Forest Test Classification Report:")
rf_test_report = classification_report(y_test_clean, y_pred_rf_clean, digits=4)
print(rf_test_report)


In [None]:
# Cell 7: Define Preprocessor (LR-Selected Features)

# Reduced feature list for the second experiment.
# NOTE(review): per the original author's note these are the features kept by
# an L1-regularised logistic regression in a separate notebook; that
# selection is not reproduced here, so treat the list as given.
lr_features = [
    'Age',
    'Gender_clean',
    'family_history',
    'work_interfere',
    'benefits',
    'care_options',
    'anonymity'
]

print(f"Running Experiment 2 with {len(lr_features)} selected features.")

# Assign each selected feature to the numeric or categorical group by
# looking it up in the num_features / cat_features lists built earlier.
num_features_lr, cat_features_lr = [], []
for name in lr_features:
    if name in num_features:
        num_features_lr.append(name)
    if name in cat_features:
        cat_features_lr.append(name)

print(f"LR Numeric features: {num_features_lr}")
print(f"LR Categorical features: {cat_features_lr}")

# Fresh pipeline objects (suffix _lr) with the same steps as the first
# experiment.

# Numeric branch: median imputation followed by standardisation.
num_pipeline_lr = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: constant 'Missing' imputation, cast to str, one-hot.
cat_pipeline_lr = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('to_string', FunctionTransformer(lambda values: values.astype(str))),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Column router for the reduced feature set; unlisted columns are dropped
# and the output is dense (sparse_threshold=0).
preprocessor_lr = ColumnTransformer(
    transformers=[
        ('num', num_pipeline_lr, num_features_lr),
        ('cat', cat_pipeline_lr, cat_features_lr),
    ],
    remainder='drop',
    sparse_threshold=0,
)

print("New 'preprocessor_lr' created.")


In [None]:
# Cell 8: GridSearch for Random Forest using LR-selected features

# Random Forest pipeline that uses the reduced-feature preprocessor.
# Because preprocessor_lr has remainder='drop', the full feature frame can be
# passed in; columns outside the LR-selected set are discarded.
pipe_rf_lr = Pipeline([
    ('pre', preprocessor_lr),
    ('model', RandomForestClassifier(
        random_state=STATE,
        class_weight='balanced'   # re-weight classes inversely to their frequency
    ))
])
print("New pipeline 'pipe_rf_lr' has been created.")

# Same search space as the first experiment so the two runs are comparable.
param_grid_rf = {
    'model__n_estimators': [100, 150],  # Number of trees
    'model__max_depth': [10, 15, None], # Maximum depth of trees (None = unlimited)
}

# 5-fold stratified CV with the shared seed; candidates are ranked by F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=STATE)
grid_rf_lr = GridSearchCV(
    pipe_rf_lr,
    param_grid_rf,
    cv=cv,
    scoring='f1',
    n_jobs=4        # up to 4 parallel jobs
)

print("\nStarting GridSearch for Random Forest with LR-selected features...")
start_time_lr = time.time()

# Train on the cleaned training pair from Cell 5.
# X_train_clean and y_train_clean were filtered with the same mask, so their
# rows are guaranteed to match; fitting on the unfiltered X_train would fail
# with a length mismatch whenever that mask removed any row.
grid_rf_lr.fit(X_train_clean, y_train_clean)

end_time_lr = time.time()
print(f"GridSearch #2 finished in {end_time_lr - start_time_lr:.2f} seconds.")
print("---")
print(f"Best F1 score (from CV) using LR features: {grid_rf_lr.best_score_:.4f}")
print("Best hyperparameters found:")
print(grid_rf_lr.best_params_)


In [None]:
# Cell 9: Final Evaluation of Random Forest using LR-Selected Features

print("--- Results for Random Forest Model with LR-Selected Features ---")

# Best pipeline from the second grid search, used to predict the test set.
best_rf_lr = grid_rf_lr.best_estimator_
y_pred_rf_lr = best_rf_lr.predict(X_test)

# Numeric test labels: 'Yes'/'No' become 1/0, anything else non-numeric
# becomes NaN and is filtered out below.
y_test_numeric = pd.to_numeric(
    y_test.replace({'Yes': 1, 'No': 0}),
    errors='coerce',
)
mask = y_test_numeric.notna()
y_test_clean = y_test_numeric.loc[mask]

# Apply the same mask (as a positional boolean array) to the predictions so
# they stay aligned with the kept labels.
y_pred_rf_lr_clean = y_pred_rf_lr[mask.to_numpy()]

# Per-class precision / recall / F1 on the test set.
print("\nRandom Forest Test Classification Report [LR Features]:")
rf_lr_test_report = classification_report(y_test_clean, y_pred_rf_lr_clean, digits=4)
print(rf_lr_test_report)


In [None]:
# Cell 10: Graphical Overview of Hyperparameter Search (F1-Score vs. Max Depth)

def _depth_label(depth):
    """Readable tick value for a max_depth setting (missing = unlimited)."""
    if pd.isna(depth):
        return 'None (Full)'
    return int(depth)


# Full cross-validation table from the first grid search.
results = pd.DataFrame(grid_rf.cv_results_)

# Winning configuration and its mean CV score.
best_params = grid_rf.best_params_
best_n_estimators = best_params['model__n_estimators']
best_depth = best_params['model__max_depth']
best_score = grid_rf.best_score_

# Isolate the effect of max_depth: keep only the rows that use the winning
# number of trees, ordered by depth.
uses_best_n = results['param_model__n_estimators'] == best_n_estimators
depth_results = results.loc[uses_best_n].sort_values(by='param_model__max_depth')

# Tick values for the x-axis and the label of the winning depth.
x_labels = depth_results['param_model__max_depth'].apply(_depth_label)
best_depth_label = str(_depth_label(best_depth))

plt.figure(figsize=(9, 6))

# Mean cross-validated F1 for each depth setting.
plt.plot(
    x_labels.astype(str),
    depth_results['mean_test_score'],
    color='darkgreen',
    linestyle='-',
    marker='o',
    label='Mean F1-Score (5-Fold CV)',
)

# Mark the winning configuration on top of the line.
plt.scatter(
    best_depth_label,
    best_score,
    s=150,
    color='red',
    zorder=5,
    label=f'Optimal Depth: {best_depth_label} (F1: {best_score:.4f})',
)

# Labels and styling
plt.title('F1-Score vs. Maximum Tree Depth (Hyperparameter Search)')
plt.xlabel('Maximum Tree Depth (max_depth)')
plt.ylabel('Average F1-Score (Cross-Validation)')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)

plt.show()


In [None]:
# Cell 11: Confusion Matrix (Final Evaluation)

# Requires grid_rf to have been fitted (Cell 5).
best_rf = grid_rf.best_estimator_

# Predicted labels for the held-out features.
y_predicted = best_rf.predict(X_test)

# True labels as integers 0/1 ('No' -> 0, 'Yes' -> 1) so they are comparable
# with the model's predictions.
label_to_int = {'No': 0, 'Yes': 1}
y_true_converted = y_test.replace(label_to_int).astype(int)

# Counts of true vs. predicted labels.
cm = confusion_matrix(y_true_converted, y_predicted)

# Human-readable class names for the plot axes.
labels_en = ['No Treatment (0)', 'Sought Treatment (1)']
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels_en)

print("Confusion Matrix: Optimized Random Forest")
disp.plot(cmap=plt.cm.Blues, colorbar=False)
disp.ax_.grid(False)  # switch off grid lines on the matrix axes
plt.title('Random Forest - Confusion Matrix (Test Set)')

plt.show()


In [None]:
# Cell 12: ROC Curve (Receiver Operating Characteristic) - Random Forest

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import RocCurveDisplay, roc_auc_score

# Requires grid_rf to have been fitted (Cell 5) and X_test / y_test to exist.
best_rf = grid_rf.best_estimator_

# True labels as integers 0/1 ('No' -> 0, 'Yes' -> 1), the form passed to
# both RocCurveDisplay and roc_auc_score below.
label_to_int = {'No': 0, 'Yes': 1}
y_test_converted = y_test.replace(label_to_int).astype(int)

# One axes object shared by the ROC curve and the reference diagonal.
fig, ax = plt.subplots()

# ROC curve of the tuned Random Forest pipeline on the test set.
RocCurveDisplay.from_estimator(
    best_rf,
    X_test,
    y_test_converted,
    ax=ax,
    name='Random Forest (Tuned)',
)

# Diagonal reference line for a random classifier.
ax.plot([0, 1], [0, 1], 'k--', label='Chance (AUC = 0.50)')

# Titles, axis labels and legend
ax.set_title('ROC Curve (Optimized Random Forest Model)')
ax.set_xlabel('False Positive Rate (FPR)')
ax.set_ylabel('True Positive Rate (TPR / Recall)')
ax.legend(loc='lower right')

plt.show()

# AUC computed separately from the predicted probability of class 1
# (second column of predict_proba).
y_probs_rf = best_rf.predict_proba(X_test)[:, 1]
auc_score_rf = roc_auc_score(y_test_converted, y_probs_rf)
print(f"Area Under the Curve (AUC Score): {auc_score_rf:.4f}")
