In [None]:
# Cell 1: imports and options
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay

# Notebook-wide display configuration: pandas column/width limits and the seaborn theme
for option_name, option_value in (('display.max_columns', 50), ('display.width', 200)):
    pd.set_option(option_name, option_value)
sns.set_style('whitegrid')

# Marker that the import/configuration cell ran to completion
print("Imports ok")

In [None]:
# Cell 2: load the survey CSV and take a quick look at it
df = pd.read_csv('survey.csv')
print("Shape:", df.shape)
display(df.head(6))
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would add a stray "None" to the cell output.
df.info()

In [None]:
# Cell 3: exploratory data analysis on the raw frame
print("Colunas:", list(df.columns))
print("\nTarget value counts (treatment):")
print(df['treatment'].value_counts(dropna=False))

# Share of missing values per column (in %), highest first; only columns with gaps are shown
missing_pct = df.isna().mean().sort_values(ascending=False).mul(100)
display(missing_pct.loc[missing_pct > 0].round(2))

# Target distribution, bars ordered by frequency
target_order = df['treatment'].value_counts().index
fig_target, ax_target = plt.subplots(figsize=(8, 4))
sns.countplot(data=df, x='treatment', order=target_order, ax=ax_target)
ax_target.set_title('Distribution of target: treatment')
plt.show()

# Age distribution of the raw (uncleaned) column
fig_age, ax_age = plt.subplots(figsize=(8, 4))
sns.histplot(df['Age'].dropna(), bins=30, ax=ax_age)
ax_age.set_title('Age distribution')
plt.show()

In [None]:
# Cell 4: basic cleaning - age, gender normalisation, Yes/No mapping
df2 = df.copy()

# Age cleaning: coerce to numeric (unparseable entries become NaN)
df2['Age'] = pd.to_numeric(df2['Age'], errors='coerce')
print("Antes: n null age =", df2['Age'].isna().sum())
# Blank out implausible ages, keeping 14-100 inclusive (optional filter).
# Series.where() builds a new column and upcasts to float when NaN is introduced,
# whereas assigning np.nan through .loc into an integer column is a silent upcast
# that recent pandas versions flag as deprecated.
df2['Age'] = df2['Age'].where(df2['Age'].between(14, 100))
print("Depois filtro: n null age =", df2['Age'].isna().sum())

# Normalise Gender into three buckets: Male / Female / Other
def clean_gender(x):
    """Collapse a free-text gender answer into 'Male', 'Female' or 'Other'.

    The answer is stripped and lower-cased, then looked up in a fixed alias
    table. Missing values and every answer not in the table map to 'Other'.
    """
    alias_to_label = {
        **dict.fromkeys(
            ('male', 'm', 'man', 'male-ish', 'maile', 'mal', 'cis male', 'male (cis)'),
            'Male',
        ),
        **dict.fromkeys(
            ('female', 'f', 'woman', 'female (cis)', 'cis female'),
            'Female',
        ),
    }
    if pd.isna(x):
        return 'Other'
    normalised = str(x).strip().lower()
    # Anything unlisted (including trans / non-binary answers) is grouped as 'Other'
    return alias_to_label.get(normalised, 'Other')

df2['Gender_clean'] = df2['Gender'].apply(clean_gender)

# Encode 'Yes'/'No' answers as 1/0 in the binary columns that are present.
bin_cols = ['self_employed','family_history','treatment','remote_work','tech_company']
for c in bin_cols:
    # Only text columns are mapped. A column that is already numeric (0/1) is left
    # untouched: mapping it through the Yes/No table would turn every value into NaN.
    if c in df2.columns and not pd.api.types.is_numeric_dtype(df2[c]):
        df2[c] = df2[c].map({'Yes':1, 'No':0})
print("Cleaned basic columns. Sample:")
display(df2[['Age','Gender','Gender_clean','self_employed','family_history','treatment']].head())

# Target distribution after cleaning, bars ordered by frequency
plt.figure(figsize=(8,4))
sns.countplot(data=df2, x='treatment', order=df2['treatment'].value_counts().index)
plt.title('Distribution of target: treatment')
plt.show()

# Age distribution after the plausibility filter
plt.figure(figsize=(8,4))
sns.histplot(df2['Age'].dropna(), bins=30)
plt.title('Age distribution')
plt.show()


In [None]:
# Cell 5: choose the initial feature set
df3 = df2.copy()

# A binary 'long_hours' feature (e.g. working hours > 50) is not created here:
# the notebook assumes this dataset has no hours column.
# Features to try (adjust as needed):
wanted_features = [
    'Age',
    'Gender_clean',
    'self_employed',
    'family_history',
    'work_interfere',   # categorical: 'Never','Rarely','Sometimes','Often'
    'no_employees',     # categorical: size of company
    'remote_work',
    'tech_company',
    'benefits',
    'care_options',
    'wellness_program',
    'seek_help',
    'anonymity'
]

# Keep only the requested columns that actually exist in df3, in the order listed
candidate_features = []
for feature_name in wanted_features:
    if feature_name in df3.columns:
        candidate_features.append(feature_name)
print("Candidate features used:", candidate_features)

# Quick peek at the most frequent values of text-like / low-cardinality columns
for feature_name in candidate_features:
    column = df3[feature_name]
    if not (column.dtype=='object' or column.nunique() < 20):
        continue
    print(f"\n--- {feature_name} unique values ---")
    print(column.fillna('NA').value_counts().head(10))


In [None]:
# Cell 6: preprocessing pipeline (no feature engineering)

# Split the original candidate list into numeric and categorical columns,
# preserving the order of candidate_features within each group
num_features = []
cat_features = []
for feature_name in candidate_features:
    is_numeric = df3[feature_name].dtype in ['int64','float64'] and feature_name!='treatment'
    (num_features if is_numeric else cat_features).append(feature_name)

print("num_features:", num_features)
print("cat_features:", cat_features)

# Numeric columns: median imputation followed by standardisation
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the literal 'Missing', then dense one-hot
# encoding that ignores categories unseen during fit
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Numeric block first, categorical block second; any other column is dropped
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features),
    ],
    remainder='drop',
    sparse_threshold=0,
)


In [None]:
# Cell 7: build X / y and split into train and test
X = df3[candidate_features].copy()
y = df3['treatment']
if y.dtype=='object':
    # Target still holds 'Yes'/'No' text: encode it as 1/0
    y = y.map({'Yes':1,'No':0})

# Drop rows whose target is missing
mask = y.notna()
X, y = X.loc[mask], y.loc[mask].astype(int)

# 80/20 split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Train/test shapes:", X_train.shape, X_test.shape)
print("Train target dist:", np.bincount(y_train)/len(y_train))


In [None]:
# Cell 8: Dummy baseline and logistic regression with cross-validation

# Majority-class baseline, scored by accuracy on the test split
dummy = Pipeline(steps=[
    ('preproc', preprocessor),
    ('clf', DummyClassifier(strategy='most_frequent')),
]).fit(X_train, y_train)
print("Baseline most frequent test score (accuracy):", dummy.score(X_test, y_test))

# Class-weighted logistic regression, scored by F1 over 5 shuffled stratified folds
log_clf = LogisticRegression(max_iter=1000, class_weight='balanced', solver='liblinear')
log_pipe = Pipeline(steps=[('preproc', preprocessor), ('clf', log_clf)])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores_f1 = cross_val_score(log_pipe, X_train, y_train, cv=cv, scoring='f1')
print("Logistic CV F1 mean ¬± std:", scores_f1.mean().round(4), scores_f1.std().round(4))

# Refit on the whole training split and evaluate on the held-out test split
y_pred = log_pipe.fit(X_train, y_train).predict(X_test)
print("Logistic Test classification report:")
print(classification_report(y_test, y_pred, digits=4))


In [None]:
# Cell 9: confusion matrix of the original (untuned) model
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Counts of true labels (y_test) against the untuned model's predictions (y_pred)
cm_original = confusion_matrix(y_test, y_pred)

print("Confusion Matrix (Original Model - No Tuning):")

# Build the display and draw it in one step; plot() returns the display object
disp_original = ConfusionMatrixDisplay(
    confusion_matrix=cm_original,
    display_labels=['No', 'Yes'],
).plot(cmap=plt.cm.Blues, colorbar=False)
disp_original.ax_.grid(False)  # turn the grid lines off on the matrix axes
plt.show()

In [None]:
# Cell 10: define the tuning pipeline and the grid search
# 1. Full pipeline: the shared `preprocessor` followed by LogisticRegression.
#    solver='liblinear' accepts both the 'l1' and 'l2' penalties searched below;
#    the higher max_iter is meant to avoid convergence warnings.
pipe_lr = Pipeline([
    ('pre', preprocessor),
    ('model', LogisticRegression(solver='liblinear', max_iter=1000))
])

# 2. Parameter grid: regularisation strength (C) and regularisation type (penalty).
#    The 'model__' prefix routes each parameter to the pipeline step named 'model'.
param_grid = {
    'model__penalty': ['l1', 'l2'],
    'model__C': [0.001, 0.01, 0.1, 1, 10, 100]
}

# 3. Grid search optimising F1, run in parallel on all cores (n_jobs=-1).
#    The splitter uses the same configuration (5 stratified folds, shuffled,
#    seed 42) as the baseline cross-validation in cell 8, so the tuned CV score
#    is measured on the same folds. A bare cv=5 would use unshuffled folds.
tuning_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_lr = GridSearchCV(pipe_lr, param_grid, cv=tuning_cv, scoring='f1', n_jobs=-1)

In [None]:
# Cell 11: run the grid search and report timing and the best configuration
import time

# Wall-clock timing around the fit on the training split
start_time = time.time()
grid_lr.fit(X_train, y_train)
end_time = time.time()

# One line per item: duration, separator, best CV F1, best parameter set
print(
    f"GridSearch demorou {end_time - start_time:.2f} segundos.",
    "---",
    f"Melhor pontua√ß√£o F1 (em CV): {grid_lr.best_score_:.4f}",
    "Melhores par√¢metros encontrados:",
    grid_lr.best_params_,
    sep="\n",
)

In [None]:
# Cell 12: evaluate the tuned model on the test split
from sklearn.metrics import classification_report

# Best pipeline found by the grid search
best_lr = grid_lr.best_estimator_

# Hard class predictions for the held-out rows
y_pred_lr_tuned = best_lr.predict(X_test)

tuned_report = classification_report(y_test, y_pred_lr_tuned)
print("Relat√≥rio de Classifica√ß√£o (Modelo Otimizado no Teste):")
print(tuned_report)

In [None]:
# Cell 13: feature importance from the tuned logistic regression coefficients
import pandas as pd

# 1. Fitted classifier inside the tuned pipeline
model = best_lr.named_steps['model']

# 2. Fitted ColumnTransformer of the tuned pipeline. It gets its own name so the
#    notebook-level `preprocessor` (the object cells 8 and 10 build their pipelines
#    from) is not silently rebound when this cell runs.
fitted_preprocessor = best_lr.named_steps['pre']

# 3. Output column names. The 'cat' transformer is looked up by name rather than
#    by position, and its 'onehot' step expands cat_features into dummy names.
onehot_features = (
    fitted_preprocessor.named_transformers_['cat']
    .named_steps['onehot']
    .get_feature_names_out(cat_features)
)

# Order must match the ColumnTransformer output: numeric block, then one-hot block
all_features = num_features + list(onehot_features)

# 4. One coefficient per transformed column
coefs = model.coef_[0]
assert len(all_features) == len(coefs), "feature names and coefficients are misaligned"
feature_importance = pd.DataFrame({
    'Feature': all_features,
    'Coeficiente': coefs,
    'Abs_Coef': np.abs(coefs)  # absolute value, used to rank by magnitude
})

# 5. Ten largest coefficients by magnitude (either sign)
print("Features mais importantes (impacto total):")
display(feature_importance.sort_values(by='Abs_Coef', ascending=False).head(10))

# 6. Five most positive coefficients (push towards treatment = 1)
print("\nFeatures que mais indicam 'Treatment = Yes':")
display(feature_importance.sort_values(by='Coeficiente', ascending=False).head(5))

# 7. Five most negative coefficients (push towards treatment = 0)
print("\nFeatures que mais indicam 'Treatment = No':")
display(feature_importance.sort_values(by='Coeficiente', ascending=True).head(5))

In [None]:
# Cell 14: confusion matrix of the tuned model
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Counts of true labels against the tuned model's predictions
cm = confusion_matrix(y_test, y_pred_lr_tuned)

print("Confusion Matrix (Optimized Model):")

# Build the display and draw it without the colour scale; plot() returns the display
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=['No', 'Yes'],
).plot(cmap=plt.cm.Blues, colorbar=False)

# Switch the grid lines off on the matrix axes
disp.ax_.grid(False)

plt.show()

In [None]:
# Cell 15: ROC curve of the tuned model against the chance diagonal
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay, roc_auc_score

# Shared axes so the model curve and the chance line land on one plot
fig, ax = plt.subplots()

# Model curve, drawn onto the shared axes and labelled for the legend
roc_kwargs = dict(name='Logistic Regression (Tuned)', ax=ax)
RocCurveDisplay.from_estimator(best_lr, X_test, y_test, **roc_kwargs)

# Chance reference: black dashed diagonal from (0, 0) to (1, 1)
ax.plot([0, 1], [0, 1], 'k--', label='Chance (AUC = 0.50)')

ax.set_title('ROC Curve (Optimized Model)')
ax.legend()
plt.show()

# AUC as a plain number, computed from the probability of the second class column
y_probs = best_lr.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_probs)
print(f"Area Under the Curve (AUC Score): {auc_score:.4f}")

In [None]:
# Cell 16: error analysis of the tuned model on the test split
import matplotlib.pyplot as plt
import seaborn as sns


def _print_error_profile(title, rows):
    """Print a banner with `title` and, if `rows` is non-empty, its Age mean
    and the value counts of Gender_clean and work_interfere."""
    print("\n" + "="*60)
    print(title)
    print("="*60)
    if len(rows) > 0:
        print("\nIdade m√©dia:", rows['Age'].mean())
        print("\nG√©nero:")
        print(rows['Gender_clean'].value_counts())
        print("\nWork Interfere:")
        print(rows['work_interfere'].value_counts())


# 1. Flag the test rows whose prediction differs from the true label
errors_mask = (y_pred_lr_tuned != y_test)

# 2. Features, true labels and predictions of the misclassified rows
X_test_errors = X_test[errors_mask]
y_test_errors = y_test[errors_mask]
y_pred_errors = y_pred_lr_tuned[errors_mask]

print(f"Total de erros: {errors_mask.sum()} / {len(y_test)} ({errors_mask.sum()/len(y_test)*100:.1f}%)")
print("\n" + "="*60)

# 3. Split the errors into the two kinds
false_positives = (y_pred_errors == 1) & (y_test_errors == 0)
false_negatives = (y_pred_errors == 0) & (y_test_errors == 1)

print(f"\n‚ùå Falsos Positivos: {false_positives.sum()} casos")
print("   (Modelo disse 'Yes' mas era 'No')")
print(f"\n‚ùå Falsos Negativos: {false_negatives.sum()} casos")
print("   (Modelo disse 'No' mas era 'Yes')")

# 4. Profile of the false positives
fp_data = X_test_errors[false_positives]
_print_error_profile("üìä PERFIL DOS FALSOS POSITIVOS:", fp_data)

# 5. Profile of the false negatives
fn_data = X_test_errors[false_negatives]
_print_error_profile("üìä PERFIL DOS FALSOS NEGATIVOS:", fn_data)

# 6. Visual comparison of correctly and incorrectly classified rows
X_test_correct = X_test[~errors_mask]
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: age histograms
axes[0].hist(X_test_correct['Age'].dropna(), bins=20, alpha=0.5, label='Corretos', color='green')
axes[0].hist(X_test_errors['Age'].dropna(), bins=20, alpha=0.5, label='Erros', color='red')
axes[0].set_xlabel('Idade')
axes[0].set_ylabel('Frequ√™ncia')
axes[0].set_title('Distribui√ß√£o de Idade: Corretos vs Erros')
axes[0].legend()

# Plot 2: work_interfere counts, side by side.
# value_counts() orders each result by its own frequencies and omits absent
# categories, so the two results can differ in order and length. Both are
# reindexed onto one shared category list (order of the correct group first,
# then any category seen only among the errors) so each pair of bars refers to
# the same category and both bar() calls receive equally long inputs.
work_counts_correct = X_test_correct['work_interfere'].value_counts()
work_counts_errors = X_test_errors['work_interfere'].value_counts()
work_levels = list(work_counts_correct.index) + [
    level for level in work_counts_errors.index if level not in work_counts_correct.index
]
work_counts_correct = work_counts_correct.reindex(work_levels, fill_value=0)
work_counts_errors = work_counts_errors.reindex(work_levels, fill_value=0)

x = range(len(work_levels))
width = 0.35
axes[1].bar([i - width/2 for i in x], work_counts_correct.values, width, label='Corretos', color='green', alpha=0.7)
axes[1].bar([i + width/2 for i in x], work_counts_errors.values, width, label='Erros', color='red', alpha=0.7)
axes[1].set_xlabel('Work Interfere')
axes[1].set_ylabel('Frequ√™ncia')
axes[1].set_title('Work Interfere: Corretos vs Erros')
axes[1].set_xticks(list(x))
axes[1].set_xticklabels(work_levels, rotation=45)
axes[1].legend()

plt.tight_layout()
plt.show()