
# RandomForest Classifier Pipeline com Scikit-Learn

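Este notebook organiza um pipeline de machine learning para classificação. **Nota:** apesar do título, o modelo efetivamente treinado e otimizado na seção 6 é o **GradientBoostingClassifier**; o `RandomForestClassifier` é apenas importado. O notebook inclui: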
- Pré-processamento de dados (numéricos e categóricos)
- Validação cruzada com métricas de avaliação
- Importância das variáveis
- Predição no conjunto de teste e geração do arquivo de submissão


## 1. Imports e Configurações

In [1]:
# import all required libraries
import os
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib

# Select the interactive TkAgg backend (intended for interactive plot windows on Windows).
# matplotlib.use raises ImportError when the backend cannot be set up (for example,
# Tk is not installed or the session is headless); in that case keep the current
# backend so the rest of the notebook still runs.
try:
    matplotlib.use('TkAgg')
except ImportError as backend_error:
    print(f"Backend TkAgg indisponível ({backend_error}); mantendo '{matplotlib.get_backend()}'.")


## 2. Carregando os dados

In [2]:
# Data files are read from the 'data' directory one level above the working directory.
BASE_DIR = os.path.join('..', 'data')
train_path, test_path = (
    os.path.join(BASE_DIR, file_name) for file_name in ('train.csv', 'test.csv')
)

# Read both CSV files into DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

## 3. Preparação dos dados

In [3]:

# Target column, then the feature matrices with the identifier (and target) removed.
y = train_df['labels']
X = train_df.drop(columns=['id', 'labels'])
X_test = test_df.drop(columns=['id'])

# Column groups: the single categorical column, and every numeric-dtype column of X.
categorical_features = ['category_code']
numerical_features = list(X.select_dtypes(include=np.number).columns)


## 4. Análise Descritiva do Dataset

In [4]:
# Basic overview of the training DataFrame: schema, summary statistics, target balance.
print("Informações Gerais do DataFrame de Treino:")
train_df.info()

# Use "\n" (a real newline); the previous "\\n" printed the two literal
# characters backslash + n at the start of each heading.
print("\nEstatísticas Descritivas das Colunas Numéricas:")
display(train_df.describe())

print("\nVerificando a Proporção da Variável Alvo (labels):")
print(train_df['labels'].value_counts(normalize=True))

Informações Gerais do DataFrame de Treino:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 646 entries, 0 to 645
Data columns (total 33 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        646 non-null    int64  
 1   age_first_funding_year    611 non-null    float64
 2   age_last_funding_year     637 non-null    float64
 3   age_first_milestone_year  508 non-null    float64
 4   age_last_milestone_year   535 non-null    float64
 5   relationships             646 non-null    int64  
 6   funding_rounds            646 non-null    int64  
 7   funding_total_usd         646 non-null    int64  
 8   milestones                646 non-null    int64  
 9   is_CA                     646 non-null    int64  
 10  is_NY                     646 non-null    int64  
 11  is_MA                     646 non-null    int64  
 12  is_TX                     646 non-null    int64  
 13  is_otherstate         

Unnamed: 0,id,age_first_funding_year,age_last_funding_year,age_first_milestone_year,age_last_milestone_year,relationships,funding_rounds,funding_total_usd,milestones,is_CA,...,is_consulting,is_othercategory,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,labels
count,646.0,611.0,637.0,508.0,535.0,646.0,646.0,646.0,646.0,646.0,...,646.0,646.0,646.0,646.0,646.0,646.0,646.0,646.0,646.0,646.0
mean,461.577399,2.341718,4.037724,3.352657,4.944729,7.948916,2.351393,29496330.0,1.913313,0.54644,...,0.003096,0.304954,0.329721,0.260062,0.51548,0.419505,0.235294,0.091331,2.848655,0.647059
std,264.859464,2.468275,2.950923,2.866952,3.213319,7.397602,1.357856,226199900.0,1.337095,0.498224,...,0.055598,0.460745,0.470476,0.439008,0.500148,0.49386,0.424511,0.288303,1.89405,0.478255
min,1.0,0.0,0.0,0.0,0.0,0.0,1.0,11000.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,233.25,0.68,1.87,1.185,2.54,3.0,1.0,3000000.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5,0.0
50%,459.5,1.65,3.61,2.785,4.62,6.0,2.0,10200000.0,2.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.3333,1.0
75%,692.5,3.6,5.59,4.935,6.88,10.0,3.0,25875000.0,3.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,4.0,1.0
max,923.0,21.9,21.9,24.68,24.68,63.0,8.0,5700000000.0,6.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,16.0,1.0


\nVerificando a Proporção da Variável Alvo (labels):
labels
1    0.647059
0    0.352941
Name: proportion, dtype: float64


## 5. Análise Visual e Validação de Hipóteses

In [5]:
# gráficos para análise
import seaborn as sns
import matplotlib.pyplot as plt

# Hypothesis 1: mean of 'labels' for each value of 'is_CA' (no error bars).
ca_ax = sns.barplot(data=train_df, x='is_CA', y='labels', errorbar=None)
ca_ax.set_title('Taxa de Sucesso: Califórnia vs. Outros Estados')
ca_ax.set_ylabel('Taxa Média de Sucesso')
plt.show()

# Hypothesis 2: distribution of 'relationships' for each value of 'labels'.
rel_ax = sns.boxplot(data=train_df, x='labels', y='relationships')
rel_ax.set_title('Distribuição de Conexões por Sucesso da Startup')
rel_ax.set_xlabel('0 = Insucesso, 1 = Sucesso')
plt.show()

# Main categorical variable: row count per 'category_code', most frequent first.
plt.figure(figsize=(12, 6))
sector_order = train_df['category_code'].value_counts().index
sector_ax = sns.countplot(data=train_df, y='category_code', order=sector_order)
sector_ax.set_title('Distribuição de Startups por Setor')
# Log scale on the count axis keeps small categories visible next to large ones.
sector_ax.set_xscale('log')
plt.show()

## 6. Otimização do Modelo

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier

# Rebuild the feature matrix (identifier and target dropped) and the target vector.
X, y = train_df.drop(columns=['id', 'labels']), train_df['labels']

# Feature-engineering helper used as the first pipeline step.
def create_features(df):
    """Return a copy of ``df`` with three derived columns appended.

    Added columns:
      - ``funding_duration_years``: ``age_last_funding_year`` minus
        ``age_first_funding_year``, with negative results replaced by 0.
      - ``milestone_duration_years``: ``age_last_milestone_year`` minus
        ``age_first_milestone_year``, with negative results replaced by 0.
      - ``usd_per_round``: ``funding_total_usd`` divided by
        ``funding_rounds + 1e-6`` (the small offset avoids division by zero).

    Missing inputs propagate as NaN. The input frame is not modified.
    """
    funding_span = df['age_last_funding_year'] - df['age_first_funding_year']
    milestone_span = df['age_last_milestone_year'] - df['age_first_milestone_year']
    rounds_offset = df['funding_rounds'] + 1e-6

    # assign() returns a new frame; clip(lower=0) zeroes negatives and leaves NaN as is.
    return df.assign(
        funding_duration_years=funding_span.clip(lower=0),
        milestone_duration_years=milestone_span.clip(lower=0),
        usd_per_round=df['funding_total_usd'] / rounds_offset,
    )

# Preprocessing pipeline.
# Numeric columns of X plus the three engineered columns; the engineered names are
# listed explicitly because they only exist after create_features has run.
new_numerical_features = [
    *X.select_dtypes(include=np.number).columns,
    'funding_duration_years',
    'milestone_duration_years',
    'usd_per_round',
]
categorical_features = ['category_code']

# Numeric branch: median imputation followed by standardisation.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation, then one-hot encoding that
# ignores categories not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, new_numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Final pipeline: feature creation -> preprocessing -> Gradient Boosting classifier.
# class_weight was removed because it is not a direct parameter of this classifier.
pipeline_final = Pipeline(steps=[
    ('feature_creation', FunctionTransformer(create_features)),
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42)),
])

# Stratified 5-fold split, shuffled with a fixed seed.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Search space specific to Gradient Boosting; the "classifier__" prefix routes
# each entry to the 'classifier' step of the pipeline.
param_dist_gb = {
    # number of boosting stages
    "classifier__n_estimators": [100, 200, 300],
    # learning rate (described by the author as the most important parameter)
    "classifier__learning_rate": [0.01, 0.05, 0.1],
    # depth of the individual trees
    "classifier__max_depth": [3, 5, 7],
    # fraction of samples used to fit each tree
    "classifier__subsample": [0.8, 0.9, 1.0],
    "classifier__max_features": ["sqrt", "log2"],
}

# Randomized search: 50 sampled configurations (kept wide), scored by accuracy,
# using all available cores.
random_search = RandomizedSearchCV(
    pipeline_final,
    param_dist_gb,
    n_iter=50,
    scoring="accuracy",
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

print("🚀 Iniciando a otimização com Gradient Boosting e Engenharia de Features...")
random_search.fit(X, y)

# Show the outcome of the hyperparameter search.
# The headings use "\n" (a real newline); the previous "\\n" printed the two
# literal characters backslash + n instead of a blank line.
print("\n--- ✅ Resultados da Otimização Final ---")
print(f"Melhor acurácia encontrada: {random_search.best_score_:.4f}")
print("\nMelhores hiperparâmetros:")
print(random_search.best_params_)

print("\n--- 📊 Avaliando o Desempenho do Modelo Otimizado Definitivo ---")
best_model_gb = random_search.best_estimator_
scoring_metrics = ['accuracy', 'f1', 'roc_auc']
# NOTE(review): this re-evaluation uses the same cv_strategy folds that selected
# the hyperparameters, so these means are likely optimistic estimates.
final_cv_results = cross_validate(best_model_gb, X, y, cv=cv_strategy, scoring=scoring_metrics)

print("\nNovas Métricas Médias (Gradient Boosting Otimizado):")
print(f"  - Acurácia: {np.mean(final_cv_results['test_accuracy']):.4f}")
print(f"  - F1-score: {np.mean(final_cv_results['test_f1']):.4f}")
print(f"  - ROC-AUC : {np.mean(final_cv_results['test_roc_auc']):.4f}")

🚀 Iniciando a otimização com Gradient Boosting e Engenharia de Features...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
\n--- ✅ Resultados da Otimização Final ---
Melhor acurácia encontrada: 0.8003
\nMelhores hiperparâmetros:
{'classifier__subsample': 0.8, 'classifier__n_estimators': 300, 'classifier__max_features': 'log2', 'classifier__max_depth': 3, 'classifier__learning_rate': 0.05}
\n--- 📊 Avaliando o Desempenho do Modelo Otimizado Definitivo ---
\nNovas Métricas Médias (Gradient Boosting Otimizado):
  - Acurácia: 0.8003
  - F1-score: 0.8539
  - ROC-AUC : 0.8081


## 7. Predição e Submissão

In [7]:
print("Fazendo previsões no conjunto de teste com o modelo otimizado...")

# Predict on the test features (identifier column removed) with the tuned pipeline.
X_test_final = test_df.drop(columns=['id'])
test_predictions_final = best_model_gb.predict(X_test_final)

# Build the submission DataFrame: one predicted label per test id.
submission_df_final = pd.DataFrame({
    'id': test_df['id'],
    'labels': test_predictions_final
})

# Write the CSV file without the index column.
submission_path_final = 'submission_final_otimizado.csv'
submission_df_final.to_csv(submission_path_final, index=False)

# "\n" (a real newline); the previous "\\n" printed the literal characters backslash + n.
print(f"\nArquivo de submissão '{submission_path_final}' criado com sucesso!")
display(submission_df_final.head())

Fazendo previsões no conjunto de teste com o modelo otimizado...
\nArquivo de submissão 'submission_final_otimizado.csv' criado com sucesso!


Unnamed: 0,id,labels
0,70,1
1,23,0
2,389,1
3,872,1
4,920,1
