In [55]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.7.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.7.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.7.0 scikit-optimize-0.10.2


In [56]:
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest, f_classif

def ajustando_df(df):
  """Clean a passenger frame and add the engineered features used by the models.

  Works on a copy, so the frame passed in is left untouched.

  Steps:
    * extract the honorific from ``Name`` (only used to impute ``Age``);
    * add ``family_size`` (``SibSp + Parch + 1``) and ``is_alone``;
    * fill missing ``Age`` with the median age of the same title, then with
      the overall median for anything still missing;
    * drop ``Name``, ``Cabin`` and ``Ticket``, and rows with missing ``Embarked``;
    * encode ``Sex`` as 0 (female) / 1 (male);
    * add the interaction features ``Pclass_Sex`` and ``Fare_Sex``.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns Name, Age, SibSp, Parch, Cabin, Ticket,
      Embarked, Sex, Pclass and Fare.

  Returns
  -------
  pandas.DataFrame
      A new frame with the original index (minus the dropped rows).

  Notes
  -----
  The medians are computed on whatever frame is passed in. NOTE(review): when
  this is called before the train/test split, test rows contribute to the
  imputation values - confirm that is acceptable.
  """
  df = df.copy()

  # Anchor on a word boundary and the trailing dot so that "Mrs." is not
  # captured as "Mr" and surnames such as "Drew" are not captured as "Dr".
  title = df['Name'].str.extract(r'\b(Mrs|Mr|Miss|Ms|Dr|Rev|Major)\.', expand=False)

  df['family_size'] = df['SibSp'] + df['Parch'] + 1
  df['is_alone'] = np.where(df['family_size'] == 1, 1, 0)

  # fillna only touches missing ages, so passengers whose title is not in the
  # list above keep their recorded age instead of having it overwritten.
  title_median_age = df['Age'].groupby(title).transform('median')
  df['Age'] = df['Age'].fillna(title_median_age)

  df = df.drop(columns=['Name', 'Cabin', 'Ticket'])
  df = df.dropna(subset=['Embarked'])

  # Fallback for rows without a recognised title or whose title group has no known age.
  df['Age'] = df['Age'].fillna(df['Age'].median())

  df['Sex'] = df['Sex'].map({'female': 0, 'male': 1})
  df['Pclass_Sex'] = df['Sex'] * df['Pclass']
  df['Fare_Sex'] = df['Sex'] * df['Fare']
  return df


# Numeric columns handed to the MinMaxScaler step.
features_minmax = [
    'Age', 'Fare', 'SibSp', 'family_size', 'Parch', 'Pclass_Sex', 'Fare_Sex',
]
# Categorical columns handed to the OneHotEncoder step.
features_onehot = ['Embarked']

# One (name, transformer, columns) triple per column group.
scaling_step = ('num', MinMaxScaler(), features_minmax)
encoding_step = ('cat', OneHotEncoder(handle_unknown='ignore'), features_onehot)

preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

In [57]:
df = pd.read_csv('train.csv')
df = ajustando_df(df)

y = df['Survived']
df_new = df.drop(columns=['Survived', 'PassengerId'])

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df_new, y, test_size=0.2, random_state=42
)

# The preprocessor is fitted on the training rows only and then applied,
# unchanged, to both partitions.
preprocessor.fit(X_train)
feature_names = preprocessor.get_feature_names_out()


def _transform_features(X):
    """Apply the fitted ``preprocessor`` to ``X``.

    Returns a pair ``(transformed, combined)``: ``transformed`` holds only the
    preprocessor output as a DataFrame indexed like ``X``; ``combined`` is
    ``transformed`` followed by the columns of ``X`` that are in neither
    ``features_minmax`` nor ``features_onehot``.
    """
    transformed = pd.DataFrame(
        preprocessor.transform(X), columns=feature_names, index=X.index
    )
    untouched = X.drop(columns=features_minmax + features_onehot)
    return transformed, pd.concat([transformed, untouched], axis=1)


X_train_transform, X_train = _transform_features(X_train)
X_test_transform, X_test = _transform_features(X_test)

In [58]:
# Keep the 6 best features as ranked by the ANOVA F-test.
selector = SelectKBest(f_classif, k=6)
selector.fit(X_train, y_train)

# The selector is fitted on the training partition and applied to both.
X_train_best_features = selector.transform(X_train)
X_test_best_features = selector.transform(X_test)

# Boolean mask over X_train's columns marking the retained features.
mask = selector.get_support()
selected_features = X_train.columns[mask]

print("Features escolhidas:", selected_features.tolist())

Features escolhidas: ['num__Fare', 'num__Pclass_Sex', 'cat__Embarked_C', 'Pclass', 'Sex', 'is_alone']


In [None]:
from skopt import gp_minimize
from skopt.space import Integer, Real, Categorical
from skopt.utils import use_named_args
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# Search space. The order of the dimensions is the order of the values in
# the optimiser's result vector.
space = [
    # Random forest
    Integer(low=50, high=200, name="n_estimators_rf"),
    Integer(low=2, high=10, name="max_depth_rf"),

    # SVM
    Real(low=0.01, high=10.0, name="C_svm"),
    Categorical(categories=["linear", "rbf"], name="kernel_svm"),

    # XGBoost
    Integer(low=50, high=200, name="n_estimators_xgb"),
    Real(low=0.01, high=0.3, name="learning_rate_xgb"),
    Integer(low=2, high=10, name="max_depth_xgb"),

    # Voting strategy of the VotingClassifier
    Categorical(categories=["soft", "hard"], name="voting_type"),
]
# Objective function
@use_named_args(space)
def objective(**params):
    """Score one hyper-parameter configuration.

    ``params`` carries one value per dimension of ``space``, keyed by the
    dimension name. A stacking ensemble and a voting ensemble are built from
    the same three base learners (random forest, SVM, XGBoost) and each is
    evaluated with 5-fold cross-validated accuracy on
    ``X_train_best_features`` / ``y_train``.

    Returns the negated average of the two mean accuracies, so that
    minimising this value maximises the combined accuracy.

    NOTE(review): the feature selection that produced
    ``X_train_best_features`` was fitted on the whole training partition, so
    the fold scores may be optimistic - confirm this is acceptable.
    """
    forest = RandomForestClassifier(
        n_estimators=params["n_estimators_rf"],
        max_depth=params["max_depth_rf"],
        random_state=42,
    )
    svm = SVC(
        C=params["C_svm"],
        kernel=params["kernel_svm"],
        probability=True,  # required for voting="soft"
        random_state=42,
    )
    booster = XGBClassifier(
        n_estimators=params["n_estimators_xgb"],
        learning_rate=params["learning_rate_xgb"],
        max_depth=params["max_depth_xgb"],
        eval_metric="logloss",
        random_state=42,
    )
    base_learners = [("rf", forest), ("svc", svm), ("xgb", booster)]

    # Stacking: RF + SVM + XGB as base learners, logistic regression as the
    # meta-model; passthrough=True is forwarded as in the original setup.
    stacking = StackingClassifier(
        estimators=base_learners,
        final_estimator=LogisticRegression(max_iter=1000, random_state=42),
        passthrough=True,
    )

    # Voting over the same three base learners.
    voting = VotingClassifier(
        estimators=base_learners,
        voting=params["voting_type"],
    )

    # Evaluate both ensembles, stacking first.
    mean_accuracy = {}
    for label, ensemble in (("stack", stacking), ("vote", voting)):
        fold_scores = cross_val_score(
            ensemble, X_train_best_features, y_train, cv=5, scoring="accuracy"
        )
        mean_accuracy[label] = np.mean(fold_scores)

    # Equal weighting: 50% stacking, 50% voting.
    return -(0.5 * mean_accuracy["stack"] + 0.5 * mean_accuracy["vote"])

# Run the optimiser for 40 evaluations of the objective.
res = gp_minimize(func=objective, dimensions=space, n_calls=40, random_state=42)

# The objective returns the negated score, so flip the sign back for display.
best_score = -res.fun
best_by_name = {dim.name: value for dim, value in zip(space, res.x)}

print("Melhor score combinado:", best_score)
print("Melhores hiperparâmetros:", res.x)
print("Nomeados:", best_by_name)

In [None]:
# Base learners
model1 = RandomForestClassifier(n_estimators=200, random_state=42)
# NOTE(review): learning_rate=0.001 with 200 trees is a very small total
# step - confirm this value is intended.
model2 = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.001,
    max_depth=5,
    random_state=42,
)
model3 = SVC(probability=True, kernel="rbf", random_state=42)

# Meta-model that combines the base learners in the stack
meta_model = LogisticRegression()

# ----- STACKING -----
stacking_estimators = [("rf", model1), ("gb", model2), ("svc", model3)]
stacking_clf = StackingClassifier(
    estimators=stacking_estimators,
    final_estimator=meta_model,
    cv=5,
)

# fit() returns the fitted estimator, so prediction can be chained.
y_pred_stack = stacking_clf.fit(X_train_best_features, y_train).predict(
    X_test_best_features
)

print("Stacking Accuracy:", accuracy_score(y_test, y_pred_stack))

# ----- VOTING -----
voting_clf = VotingClassifier(
    estimators=[("rf", model1), ("gb", model2), ("svc", model3)],
    voting="soft"
)

voting_clf.fit(X_train_best_features, y_train)

# Predict each partition exactly once; the test predictions are reused for
# every test-set metric computed in this cell.
y_pred_vote = voting_clf.predict(X_test_best_features)
y_train_pred = voting_clf.predict(X_train_best_features)
y_test_pred = y_pred_vote  # same fitted model, same input

# Accuracy on train and test
acc_train = accuracy_score(y_train, y_train_pred)
acc_test = accuracy_score(y_test, y_test_pred)

print("Voting Accuracy:", acc_test)

print(f"Voting Ensemble - Acurácia Treino: {acc_train:.4f}")
print(f"Voting Ensemble - Acurácia Teste:  {acc_test:.4f}")

# Overfitting heuristic: flag a train/test accuracy gap above 5 points.
if acc_train - acc_test > 0.05:
    print("⚠️ Possível Overfitting (diferença > 5%)")
else:
    print("✅ Sem sinais fortes de overfitting")

# ----- CONFUSION MATRIX -----
# Computed from the voting ensemble's test-set predictions.
cm = confusion_matrix(y_test, y_pred_vote)
print("\nMatriz de Confusão (Voting):", cm, sep="\n")

# ----- CLASSIFICATION REPORT -----
voting_report = classification_report(y_test, y_pred_vote)
print("\nClassification Report (Voting):", voting_report, sep="\n")