
# RandomForest Classifier Pipeline com Scikit-Learn

Este notebook organiza o pipeline de machine learning utilizando **RandomForest** para classificação, incluindo:
- Pré-processamento de dados (numéricos e categóricos)
- Validação cruzada com métricas de avaliação
- Importância das variáveis
- Predição no conjunto de teste e geração do arquivo de submissão


## 1. Imports e Configurações

In [1]:
# importando todas as bibliotecas necessárias
import os
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib

# Select the Tk-based GUI backend; the original author's note says this is for
# interactive plots on Windows.
# NOTE(review): this runs after pyplot was imported and replaces whatever backend
# the notebook front end had chosen, so figures presumably open in a separate
# window instead of inline -- confirm that is intended on non-Windows/headless hosts.
matplotlib.use('TkAgg')


## 2. Carregando os dados

In [9]:
# Data folder, resolved relative to the working directory: up one level, then 'data'
BASE_DIR = os.path.join('..', 'data')

# Build both CSV locations from the same base folder
train_path, test_path = (
    os.path.join(BASE_DIR, csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Read the training split first, then the test split
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

## 3. Preparação dos dados

In [10]:

# Target vector: the 'labels' column of the training frame
y = train_df['labels']

# Feature matrices: drop the identifier everywhere, and the target from train
X = train_df.drop(columns=['id', 'labels'])
X_test = test_df.drop(columns=['id'])

# Column groups handed to the preprocessing step: the one categorical column is
# named explicitly, the numeric group is every numeric-dtype column of X
categorical_features = ['category_code']
numerical_features = list(X.select_dtypes(include=np.number).columns)


## 4. Pré-processamento

In [11]:

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode into a dense array; handle_unknown='ignore' keeps transform from
# failing on categories that were not present at fit time
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Numeric branch: fill missing values with the column median, then standardize
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Route each column group through its own branch (numeric first, categorical second)
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])


## 5. Definindo o modelo e o pipeline

In [12]:

# Random forest with 200 trees, 'balanced' class weighting and a fixed seed
rf_model = RandomForestClassifier(n_estimators=200,
                                  class_weight='balanced',
                                  random_state=42)

# End-to-end estimator: preprocessing followed by the classifier, so both are
# fitted together whenever the pipeline is fitted
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_model),
])


## 6. Validação cruzada

In [13]:

# Stratified 5-fold splitter, shuffled with a fixed seed so the folds are reproducible
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'f1', 'roc_auc']

# Fit and score the full pipeline on every fold; only held-out scores are kept
cv_results = cross_validate(
    pipeline, X, y, cv=cv,
    scoring=scoring,
    return_train_score=False
)

print("\nCross-Validation Metrics per Fold")
# Take the fold count from the splitter rather than repeating the literal 5, so the
# report stays correct (no IndexError, no skipped folds) if n_splits is changed
for i in range(cv.get_n_splits()):
    print(f"Fold {i+1}: Accuracy={cv_results['test_accuracy'][i]:.3f}, "
          f"F1={cv_results['test_f1'][i]:.3f}, ROC-AUC={cv_results['test_roc_auc'][i]:.3f}")

# Mean of each metric across the folds
print("\nAverage Metrics")
print(f"Accuracy : {np.mean(cv_results['test_accuracy']):.3f}")
print(f"F1-score : {np.mean(cv_results['test_f1']):.3f}")
print(f"ROC-AUC  : {np.mean(cv_results['test_roc_auc']):.3f}")



Cross-Validation Metrics per Fold
Fold 1: Accuracy=0.800, F1=0.857, ROC-AUC=0.809
Fold 2: Accuracy=0.744, F1=0.822, ROC-AUC=0.765
Fold 3: Accuracy=0.752, F1=0.816, ROC-AUC=0.795
Fold 4: Accuracy=0.798, F1=0.862, ROC-AUC=0.796
Fold 5: Accuracy=0.798, F1=0.854, ROC-AUC=0.867

Average Metrics
Accuracy : 0.779
F1-score : 0.842
ROC-AUC  : 0.806


## 7. Importância das variáveis

In [14]:

# Fit the whole pipeline on the full training set; the fitted pipeline is also
# what the prediction cell uses afterwards
pipeline.fit(X, y)

# Names of the transformed columns, paired with the forest's importance scores
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
importances = pipeline.named_steps['classifier'].feature_importances_

# Keep the 15 highest-scoring features, most important first
feat_imp = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by='Importance', ascending=False).head(15)

plt.figure(figsize=(10,6))
# Passing `palette` without `hue` is deprecated in seaborn; mapping hue to the same
# variable as y and hiding the legend gives the same figure without the warning
sns.barplot(data=feat_imp, x='Importance', y='Feature',
            hue='Feature', palette='mako', legend=False)
plt.title("Top 15 Features - RandomForest")
plt.tight_layout()
plt.show()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=feat_imp, x='Importance', y='Feature', palette='mako')


## 8. Predição no conjunto de teste e submissão

In [19]:
# Locations: the data folder and the submission file written inside it
BASE_DIR = os.path.join('..', 'data')
submission_path = os.path.join(BASE_DIR, 'submission_randomforest.csv')

# Re-read the test split from disk and separate identifiers from features
test_df = pd.read_csv(os.path.join(BASE_DIR, 'test.csv'))
test_ids = test_df['id']
X_test = test_df.drop(columns=['id'])

# Predict with the pipeline object fitted in an earlier cell
test_predictions = pipeline.predict(X_test)

# Two-column submission table (id, labels), written without the index column
submission_df = pd.DataFrame({'id': test_ids, 'labels': test_predictions})
submission_df.to_csv(submission_path, index=False)

print(f"Submission file saved at: {submission_path}")



Submission file saved at: ..\data\submission_randomforest.csv
