In [None]:
# Cell One: imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay

# Notebook-wide display settings: pandas column/width limits and the seaborn
# grid style used by every figure below.
for option_name, option_value in (('display.max_columns', 50), ('display.width', 200)):
    pd.set_option(option_name, option_value)
sns.set_style('whitegrid')

# Printed last so the output confirms the whole setup cell ran.
print("Imports ok")

In [None]:
# Cell two: load and view CSV
# Reads the raw survey file from the working directory into `df`, which the
# following cells treat as the untouched source frame.
df = pd.read_csv('survey.csv')
print("Shape:", df.shape)
display(df.head(6))
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() rendered a stray "None" under the report.
df.info()

In [None]:
# Cell three: Exploratory Data Analysis
print("Columns:", list(df.columns))
print("\nTarget value counts (treatment):")
target_counts_with_nan = df['treatment'].value_counts(dropna=False)
print(target_counts_with_nan)

# Percentage of missing entries per column, largest first; only the columns
# that actually contain missing values are displayed.
missing_fraction = df.isna().mean()
missing_pct = missing_fraction.sort_values(ascending=False) * 100
columns_with_gaps = missing_pct > 0
display(missing_pct[columns_with_gaps].round(2))

# Count plot of the target, bars ordered by descending frequency.
treatment_order = df['treatment'].value_counts().index
plt.figure(figsize=(8, 4))
sns.countplot(data=df, x='treatment', order=treatment_order)
plt.title('Distribution of target: treatment')
plt.show()

# Age
# 1. Outlier removal: keep only rows whose Age lies inside the inclusive
#    [AGE_MIN, AGE_MAX] range. Rows with a missing Age fail both comparisons
#    and are dropped as well. `df2` is the frame the later cells build on.
AGE_MIN, AGE_MAX = 14, 100
df2 = df[(df['Age'] >= AGE_MIN) & (df['Age'] <= AGE_MAX)].copy()

# 2. Plot the filtered frame. The title is built from the same bounds as the
#    filter so the two cannot drift apart (it previously claimed 0-100 while
#    the filter started at 14).
plt.figure(figsize=(8,4))
sns.histplot(df2['Age'].dropna(), bins=30)
plt.title(f'Age distribution (Filtered {AGE_MIN}-{AGE_MAX} years)')
plt.show()

In [None]:
# Cell 4 (THE REAL, ALL-IN-ONE PREP CELL - v3 - THE FINAL ONE)
import warnings
import pandas as pd
from sklearn.preprocessing import FunctionTransformer # <-- VAMOS PRECISAR DISTO

# --- 1. Define Cleaning Function ---
def clean_gender(x):
    """Collapse a free-text gender answer into 'Male', 'Female' or 'Other'.

    The value is converted to a string, stripped of surrounding whitespace and
    lower-cased before being compared against the known spellings below.

    Parameters
    ----------
    x : object
        A single raw cell value (scalar); may be missing.

    Returns
    -------
    str
        'Male' or 'Female' when the normalised text is one of the recognised
        spellings; 'Other' for missing values and for everything else.
    """
    if pd.isna(x):
        return 'Other'

    male_spellings = {'male', 'm', 'man', 'male-ish', 'maile', 'mal',
                      'cis male', 'male (cis)'}
    female_spellings = {'female', 'f', 'woman', 'female (cis)', 'cis female'}

    normalised = str(x).strip().lower()
    if normalised in male_spellings:
        return 'Male'
    if normalised in female_spellings:
        return 'Female'
    return 'Other'

# --- 2. Copy df2 and APPLY ALL CLEANING ---
# Start from a fresh copy so re-running this cell always begins from df2.
df3 = df2.copy()

# Clean Gender: normalised label goes into a new column, 'Gender' is kept.
df3['Gender_clean'] = df3['Gender'].apply(clean_gender)

# Clean binary columns: 'Yes' -> 1, 'No' -> 0; every other non-numeric answer
# becomes NaN through errors='coerce'.
bin_cols = ['self_employed','family_history','treatment','remote_work','tech_company',
            'benefits', 'care_options', 'wellness_program', 'seek_help', 'anonymity']

with warnings.catch_warnings():
    # Silences the FutureWarning that recent pandas versions emit from
    # replace() here; the explicit to_numeric call below does the conversion.
    warnings.simplefilter(action='ignore', category=FutureWarning)
    for c in bin_cols:
        if c in df3.columns:
            series = df3[c].replace({'Yes':1, 'No':0})
            # Assign the whole column with df3[c] = ... instead of
            # df3.loc[:, c] = ...: since pandas 2.0 the .loc form writes into
            # the existing column in place, so an object column stayed object
            # and the dtype-based numeric/categorical split never saw these
            # columns as numeric.
            df3[c] = pd.to_numeric(series, errors='coerce')
print("df3 cleaned (Gender and binary columns).")

# --- 3. Define Features ---
# Columns considered as model inputs. Names that are not present in df3 are
# skipped rather than raising a KeyError.
wanted_features = [
    'Age', 'Gender_clean', 'self_employed', 'family_history',
    'work_interfere', 'no_employees', 'remote_work', 'tech_company',
    'benefits', 'care_options', 'wellness_program', 'seek_help', 'anonymity'
]
candidate_features = [name for name in wanted_features if name in df3.columns]
print("Candidate features selected.")

# --- 4. Split the candidates by dtype (input to the preprocessor) ---
# int64/float64 columns are treated as numeric; everything else is treated as
# categorical. The target is explicitly excluded from the numeric list.
num_features, cat_features = [], []
for name in candidate_features:
    is_numeric = df3[name].dtype in ['int64','float64'] and name != 'treatment'
    (num_features if is_numeric else cat_features).append(name)
print(f"Num features: {num_features}")
print(f"Cat features: {cat_features}")

# Numeric branch: fill gaps with the column median, then standardise.
num_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

# Categorical branch: fill gaps with the literal 'Missing', cast every value
# to str, then one-hot encode. The cast is the fix for the TypeError mentioned
# in the original notes: it guarantees the encoder only ever sees strings,
# even when a column mixes numbers with the 'Missing' placeholder.
cat_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('to_string', FunctionTransformer(lambda x: x.astype(str))),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
cat_pipeline = Pipeline(steps=cat_steps)

# Route each feature list to its branch; columns in neither list are dropped
# and the combined output is always dense (sparse_threshold=0).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features),
    ],
    remainder='drop',
    sparse_threshold=0,
)
print("Preprocessor created.")

# --- 5. Define X and y ---
target_col = 'treatment'
# Rows whose target could not be coded as 0/1 are NaN after cleaning and are
# removed before the split.
print(f"Shape before dropping target NaNs: {df3.shape}")
df3 = df3.dropna(subset=[target_col])
print(f"Shape after dropping target NaNs: {df3.shape}")

X = df3[candidate_features]
# Cast explicitly: depending on how the cleaning step assigned the column, the
# 0/1 labels can still sit in an object-dtype Series, which scikit-learn
# classifiers do not accept as a label type. After the dropna above the cast
# cannot meet a NaN.
y = df3[target_col].astype(int)
print("Target column 'y' is now numeric.")

# --- 6. Split the data ---
# 80/20 split, stratified on y so both parts keep the class proportions.
STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=STATE,
    stratify=y
)
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

# --- 7. Build the RF Pipeline ---
# Preprocessing and classifier are chained in one estimator so the preprocessor
# is fitted as part of the pipeline. The step names 'pre' and 'model' are
# referenced by the grid-search parameter names ('model__...').
rf_model = RandomForestClassifier(class_weight='balanced', random_state=STATE)
pipe_rf = Pipeline(steps=[
    ('pre', preprocessor),
    ('model', rf_model),
])
print("Random Forest pipeline created.")
print("--- ALL PREP IS DONE. YOU ARE CLEAR TO RUN CELL 8 (agora é a Cell 5). ---")

In [None]:
# Cell 5 (THE NEW, FINAL, ATOMIC BOMB CELL - Replaces your old GridSearch cell)
import pandas as pd
import numpy as np

import time

# --- 1. Defensive re-cleaning of the training target ---
# Harmless to repeat: 'Yes'/'No' are mapped to 1/0 if still present, anything
# non-numeric is coerced to NaN, and rows without a usable label are removed
# from both X and y.
print("--- FORCING DATA CLEANUP ---")
print(f"Data type in y_train BEFORE fix: {y_train.dtype}")
print("Value counts in y_train BEFORE fix:")
print(y_train.value_counts(dropna=False))

y_train_recoded = y_train.replace({'Yes': 1, 'No': 0})
y_temp_numeric = pd.to_numeric(y_train_recoded, errors='coerce')
mask = y_temp_numeric.notna()
X_train_clean = X_train.loc[mask]
y_train_clean = y_temp_numeric.loc[mask]

print("\n--- DATA IS NOW CLEAN ---")
print("Value counts in y_train AFTER fix:")
print(y_train_clean.value_counts(dropna=False))
print(f"Shape of X_train (clean): {X_train_clean.shape}")
print(f"Shape of y_train (clean): {y_train_clean.shape}")

# --- 2. Grid search on the cleaned training data ---
# Six combinations (2 tree counts x 3 depths; None = unlimited depth), each
# scored by F1 over 5 shuffled stratified folds, using 4 parallel workers.
param_grid_rf = dict(
    model__n_estimators=[100, 150],
    model__max_depth=[10, 15, None],
)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=STATE)
grid_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid_rf,
    cv=cv,
    scoring='f1',
    n_jobs=4,
)

print("\nStarting GridSearch on CLEAN data... this may take a minute.")
start_time = time.time()

grid_rf.fit(X_train_clean, y_train_clean)

end_time = time.time()
elapsed_seconds = end_time - start_time
print(f"GridSearch took {elapsed_seconds:.2f} seconds.")
print("---")

# Fitted attributes of GridSearchCV carry a trailing underscore.
print(f"Best F1 score (in CV): {grid_rf.best_score_:.4f}")
print("Best parameters found:")
print(grid_rf.best_params_)

In [None]:
# Cell 6 (Old Cell 9): Evaluate on Test Set

# The refitted best estimator is the whole pipeline, so X_test is passed in
# as-is and preprocessing happens inside predict().
best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)

# Normalise y_test the same way y_train was normalised before the grid search:
# map 'Yes'/'No' to 1/0 if they are still present, coerce anything non-numeric
# to NaN, and keep only the rows with a usable label.
labels_before = y_test.value_counts(dropna=False).index[0:2]
print(f"y_test values BEFORE fix: {labels_before}...")

y_test_recoded = y_test.replace({'Yes': 1, 'No': 0})
y_test_numeric = pd.to_numeric(y_test_recoded, errors='coerce')
mask = y_test_numeric.notna()

# The same mask is applied to the labels and to the predictions so the two
# stay aligned row for row. The predictions are positional, so the mask is
# applied as a plain boolean array.
y_test_clean = y_test_numeric.loc[mask]
y_pred_rf_clean = y_pred_rf[mask.to_numpy()]

labels_after = y_test_clean.value_counts(dropna=False).index[0:2]
print(f"y_test values AFTER fix: {labels_after}...")

print("\nRandom Forest Test Classification Report:")
print(classification_report(y_test_clean, y_pred_rf_clean, digits=4))

In [None]:
# Cell 7 (labelled "Cell 6" originally, duplicating the evaluation cell's
# number): EXPERIMENT 2 - Define Preprocessor (LR-Selected Features)

# --- 1. Reduced feature list ---
# Per the original author's note, these are the features an L1-penalised
# model in another notebook kept with a non-zero coefficient; 'no_employees'
# and 'tech_company' are left out because they did not appear in that top 10.
lr_features = [
    'Age',
    'Gender_clean',
    'family_history',
    'work_interfere',
    'benefits',
    'care_options',
    'anonymity',
]

print(f"Running Experiment 2 with {len(lr_features)} selected features.")

# --- 2. Numeric / categorical split ---
# Membership is taken from the num_features / cat_features lists of the first
# experiment, so each column is typed the same way in both experiments.
numeric_pool = set(num_features)
categorical_pool = set(cat_features)
num_features_lr = [name for name in lr_features if name in numeric_pool]
cat_features_lr = [name for name in lr_features if name in categorical_pool]

print(f"LR Num features: {num_features_lr}")
print(f"LR Cat features: {cat_features_lr}")

# --- 3. Fresh pipelines (suffix _lr) with the same steps as experiment 1 ---
num_pipeline_lr = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# The str cast before the encoder is the same TypeError fix as in experiment 1.
cat_pipeline_lr = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('to_string', FunctionTransformer(lambda x: x.astype(str))),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# --- 4. New preprocessor ---
# Columns outside the two lists are dropped; output is always dense.
preprocessor_lr = ColumnTransformer(
    transformers=[
        ('num', num_pipeline_lr, num_features_lr),
        ('cat', cat_pipeline_lr, cat_features_lr),
    ],
    remainder='drop',
    sparse_threshold=0,
)

print("New 'preprocessor_lr' created.")

In [None]:
# Cell 7: EXPERIMENT 2 - Run GridSearch (LR-Selected Features)

# --- 1. New RF pipeline built on the reduced-feature preprocessor ---
pipe_rf_lr = Pipeline([
    ('pre', preprocessor_lr),
    ('model', RandomForestClassifier(
        random_state=STATE,
        class_weight='balanced'
    ))
])
print("New 'pipe_rf_lr' created.")

# --- 2. Parameter grid (same values as experiment 1) ---
param_grid_rf = {
    'model__n_estimators': [100, 150],
    'model__max_depth': [10, 15, None],
}

# --- 3. Grid search: F1 over 5 shuffled stratified folds, 4 workers ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=STATE)
grid_rf_lr = GridSearchCV(
    pipe_rf_lr,
    param_grid_rf,
    cv=cv,
    scoring='f1',
    n_jobs=4
)

print("\nStarting NEW GridSearch (LR features)...")
start_time_lr = time.time()

# --- 4. Fit on the row-aligned training pair ---
# X_train_clean and y_train_clean were filtered with the same mask in the
# first grid-search cell. Fitting the unfiltered X_train against
# y_train_clean, as before, is only correct while that mask drops no rows.
# The ColumnTransformer selects the lr_features columns by name, so passing
# the full-width frame is fine.
grid_rf_lr.fit(X_train_clean, y_train_clean)

end_time_lr = time.time()
print(f"GridSearch #2 took {end_time_lr - start_time_lr:.2f} seconds.")
print("---")
print(f"Best F1 score (in CV) [LR Features]: {grid_rf_lr.best_score_:.4f}")
print("Best parameters found [LR Features]:")
print(grid_rf_lr.best_params_)

In [None]:
# Cell 8: EXPERIMENT 2 - Final Evaluation
print("--- Results for RF Model with LR-Selected Features ---")

# 1. Best pipeline from the second grid search, used to predict on the
#    original X_test (column selection happens inside the pipeline).
best_rf_lr = grid_rf_lr.best_estimator_
y_pred_rf_lr = best_rf_lr.predict(X_test)

# 2. The y_test clean-up is repeated here so this cell does not depend on the
#    first evaluation cell having been run: map 'Yes'/'No' to 1/0 if present,
#    coerce the rest to NaN, and keep only rows with a usable label.
y_test_recoded = y_test.replace({'Yes': 1, 'No': 0})
y_test_numeric = pd.to_numeric(y_test_recoded, errors='coerce')
mask = y_test_numeric.notna()
y_test_clean = y_test_numeric.loc[mask]
# Predictions are positional, so the mask is applied as a plain boolean array
# to keep them aligned with the filtered labels.
y_pred_rf_lr_clean = y_pred_rf_lr[mask.to_numpy()]

# 3. Report
print("\nRandom Forest Test Classification Report [LR Features]:")
print(classification_report(y_test_clean, y_pred_rf_lr_clean, digits=4))