In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
import optuna


In [2]:
# Load the three competition CSV files (training set, test set and the
# sample submission template) in one pass, in that order.
train, test, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/spaceship-titanic/train.csv",
        "/kaggle/input/spaceship-titanic/test.csv",
        '/kaggle/input/spaceship-titanic/sample_submission.csv',
    )
)

In [3]:
# Identify numeric and categorical FEATURE columns.
# select_dtypes(include=np.number) does not match boolean columns, so a boolean
# target would end up in the non-numeric group. Drop the target and the row
# identifier from BOTH groups so neither is ever imputed or encoded as a feature.
non_feature_cols = ["Transported", "PassengerId"]
num_cols = train.select_dtypes(include=np.number).columns.drop(non_feature_cols, errors="ignore")
cat_cols = train.select_dtypes(exclude=np.number).columns.drop(non_feature_cols, errors="ignore")

# Work on a copy so the raw training frame stays available to later cells.
df = train.copy()

# Impute missing values: median for numeric columns, most frequent value for
# categorical columns.
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0]).infer_objects(copy=False)

# Encode categorical columns as integer codes. One fitted encoder is kept per
# column (in `encoders`) so the exact category -> code mapping stays available
# after the loop; `le` still ends up bound to the last fitted encoder.
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# Split features and target.
if "Transported" in df.columns:
    X = df.drop(["Transported", "PassengerId"], axis=1, errors='ignore')
    # NOTE(review): assumes the target is boolean/numeric so it casts cleanly
    # to 0/1 - confirm against the dataset.
    y = df["Transported"].astype(int)
else:
    X = df.drop(["PassengerId"], axis=1, errors='ignore')
    y = None  # No target available (e.g. a test-style frame)

# Standardize features.
# NOTE: the scaler is fitted on every row before the hold-out split below, so
# the validation rows contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Hold-out split (only possible when a target exists).
if y is not None:
    X_train, X_val, y_train, y_val = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42
    )


  df[col] = df[col].fillna(df[col].mode()[0]).infer_objects(copy=False)


In [4]:
# Shared 5-fold splitter (shuffled, fixed seed).
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Majority-class baseline: the accuracy floor any real model has to beat.
dummy = DummyClassifier(strategy="most_frequent")
baseline_scores = cross_val_score(dummy, X_scaled, y, cv=cv, scoring="accuracy")
baseline_acc = baseline_scores.mean()
print("Baseline Accuracy:", baseline_acc)


Baseline Accuracy: 0.5036236981294887


In [5]:
# Candidate classifiers, all scored with the same CV splitter and metric.
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42, eval_metric="logloss"),
    "KNN": KNeighborsClassifier()
}

# Mean cross-validated accuracy per candidate, keyed by model label.
results = {
    label: cross_val_score(estimator, X_scaled, y, cv=cv, scoring="accuracy").mean()
    for label, estimator in models.items()
}

print(pd.DataFrame(results.items(), columns=["Modelo", "Accuracy"]))


         Modelo  Accuracy
0  RandomForest  0.787991
1       XGBoost  0.789715
2           KNN  0.762913


In [6]:
# Base learners for the ensemble, each with the same settings used when the
# models were scored individually.
base_estimators = [
    ("rf", RandomForestClassifier(random_state=42)),
    ("xgb", XGBClassifier(random_state=42, eval_metric="logloss")),
    ("knn", KNeighborsClassifier()),
]

# Soft voting averages the predicted class probabilities of the base learners.
voting = VotingClassifier(estimators=base_estimators, voting="soft")

voting_scores = cross_val_score(voting, X_scaled, y, cv=cv, scoring="accuracy")
ensemble_acc = voting_scores.mean()
print("VotingClassifier Accuracy:", ensemble_acc)


VotingClassifier Accuracy: 0.7907502757758615


In [7]:
def objective(trial):
    """Score one random-forest configuration proposed by an Optuna trial.

    Samples ``n_estimators`` from [100, 500] and ``max_depth`` from [3, 15],
    then returns the mean cross-validated accuracy of the resulting forest on
    ``X_scaled`` / ``y`` using the shared ``cv`` splitter.
    """
    sampled = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
    }
    forest = RandomForestClassifier(random_state=42, **sampled)
    fold_scores = cross_val_score(forest, X_scaled, y, cv=cv, scoring="accuracy")
    return fold_scores.mean()

# Hyper-parameter search that maximizes mean CV accuracy.
# The sampler is seeded so the sequence of sampled configurations - and
# therefore best_params used for the final model - is the same on every
# Restart & Run All, in line with the fixed seeds used everywhere else.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

print("Melhores parâmetros:", study.best_params)
print("Melhor accuracy:", study.best_value)


[I 2025-10-22 20:46:49,403] A new study created in memory with name: no-name-71c34c81-0e93-4941-ae34-2b4bf0c8c4fe
[I 2025-10-22 20:47:07,358] Trial 0 finished with value: 0.7955832849719194 and parameters: {'n_estimators': 342, 'max_depth': 13}. Best is trial 0 with value: 0.7955832849719194.
[I 2025-10-22 20:47:24,140] Trial 1 finished with value: 0.7944328678505894 and parameters: {'n_estimators': 416, 'max_depth': 8}. Best is trial 0 with value: 0.7955832849719194.
[I 2025-10-22 20:47:32,288] Trial 2 finished with value: 0.7913273040932616 and parameters: {'n_estimators': 221, 'max_depth': 7}. Best is trial 0 with value: 0.7955832849719194.
[I 2025-10-22 20:47:47,402] Trial 3 finished with value: 0.798689113421136 and parameters: {'n_estimators': 324, 'max_depth': 10}. Best is trial 3 with value: 0.798689113421136.
[I 2025-10-22 20:48:03,103] Trial 4 finished with value: 0.796043253301535 and parameters: {'n_estimators': 396, 'max_depth': 8}. Best is trial 3 with value: 0.7986891134

Melhores parâmetros: {'n_estimators': 429, 'max_depth': 10}
Melhor accuracy: 0.7998393320235497


In [8]:
# Work on a copy so the raw test frame stays untouched.
test_df = test.copy()

# Keep only the columns that exist in the test set (this drops "Transported"
# if it was carried over from the training column lists).
cat_cols = [c for c in cat_cols if c in test_df.columns]
num_cols = [c for c in num_cols if c in test_df.columns]

# Impute missing values with statistics taken from the RAW training frame.
# `df` must not be used here for categorical columns: by this point its
# categorical columns hold integer label codes, not the original categories.
for col in num_cols:
    test_df[col] = test_df[col].fillna(train[col].median())

# Encode categorical columns with the same category -> code mapping used for
# training. The mapping is rebuilt by repeating the training-time imputation on
# the raw training column and fitting a LabelEncoder on it, which yields the
# same sorted classes. Fitting on the already-encoded `df[col]` would learn
# integer classes, so every real test category would be treated as unknown.
for col in cat_cols:
    train_mode = train[col].mode()[0]
    train_col = train[col].fillna(train_mode).infer_objects(copy=False)
    test_col = test_df[col].fillna(train_mode).infer_objects(copy=False)

    le = LabelEncoder()
    le.fit(train_col)
    # A plain dict lookup replaces a per-row le.transform([x]) call.
    code_of = {label: code for code, label in enumerate(le.classes_.tolist())}

    # Categories never seen in training are mapped to -1.
    test_df[col] = test_col.map(code_of).fillna(-1).astype(int)

# Scale with the scaler fitted on the training features. Selecting by
# X.columns guarantees the same features, in the same order, as at fit time.
test_scaled = scaler.transform(test_df[X.columns])

# Train the final model on all training rows with the best Optuna parameters.
best_rf = RandomForestClassifier(
    n_estimators=study.best_params["n_estimators"],
    max_depth=study.best_params["max_depth"],
    random_state=42
)

best_rf.fit(X_scaled, y)

# Predict and convert the 0/1 labels back to booleans for the submission.
predictions = best_rf.predict(test_scaled)
predictions_bool = predictions.astype(bool)

# Build and write submission.csv.
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": predictions_bool
})

submission.to_csv("submission.csv", index=False)
print("✅ Arquivo submission.csv gerado com sucesso!")
display(submission.head())


✅ Arquivo submission.csv gerado com sucesso!


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
