In [None]:
# ============================================================
# Spaceship Titanic - Análise Exploratória e Baseline com 10-Fold CV
# ============================================================

# 1. Importação das bibliotecas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Plot styling defaults (seaborn theme first, then the default figure size)
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

# ============================================================
# 2. Data loading
# ============================================================
train_df = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test_df = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

# Report the (rows, columns) shape of each split
for label, frame in (
    ("Dimensões do treino:", train_df),
    ("Dimensões do teste:", test_df),
):
    print(label, frame.shape)

# Last expression of the cell: preview the first training rows
train_df.head()


In [None]:
# Work on copies so the raw frames loaded earlier stay untouched and this
# cell can be re-run safely.
df_train = train_df.copy()
df_test = test_df.copy()

# Missing-value imputation.
#  - The target column is skipped: it is not expected in the test frame
#    (indexing it there raises KeyError) and labels should not be imputed.
#  - Fill values are computed on the training frame and reused for the test
#    frame, so both sets go through the same preprocessing.
#  - Columns are reassigned instead of calling fillna(inplace=True) on a
#    column selection, which pandas deprecates as chained assignment.
TARGET_COL = "Transported"
for col in df_train.columns:
    if col == TARGET_COL:
        continue

    train_values = df_train[col]
    # Booleans count as numeric for pandas, but the most frequent value is a
    # more meaningful filler for them than a median.
    is_number = (
        pd.api.types.is_numeric_dtype(train_values)
        and not pd.api.types.is_bool_dtype(train_values)
    )
    if is_number:
        fill_value = train_values.median()
    else:
        modes = train_values.mode()
        if modes.empty:
            # Column is entirely missing: there is nothing to impute with.
            continue
        fill_value = modes.iloc[0]

    df_train[col] = train_values.fillna(fill_value)
    if col in df_test.columns:
        df_test[col] = df_test[col].fillna(fill_value)

# Split features and target; the passenger id is an identifier, not a feature.
X = df_train.drop(columns=[TARGET_COL, "PassengerId"])
y = df_train[TARGET_COL].astype(int)

# One-hot encode the non-numeric columns.
# NOTE(review): get_dummies creates one column per distinct value, so any
# high-cardinality text column will widen X considerably - verify the
# resulting shape.
X = pd.get_dummies(X)
df_test_encoded = pd.get_dummies(df_test.drop(columns=["PassengerId"]))

# Align the test matrix with the training columns: categories unseen in the
# training data are dropped, categories missing from the test data become 0.
df_test_encoded = df_test_encoded.reindex(columns=X.columns, fill_value=0)

# ============================================================
# 10-fold stratified cross-validation
# ============================================================
# Fixed seeds keep both the fold assignment and the forest reproducible.
model = RandomForestClassifier(n_estimators=100, random_state=42)
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# One accuracy value per fold, in fold order
fold_accuracies = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y), start=1):
    # Slice the training and validation parts of this fold by position
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Fit on the training part, then score on the held-out part
    y_val_pred = model.fit(X_train, y_train).predict(X_val)
    acc = accuracy_score(y_val, y_val_pred)
    fold_accuracies.append(acc)
    print(f"Fold {fold} - Accuracy: {acc:.4f}")

# Summary across folds (np.std uses its default ddof=0)
mean_accuracy = np.mean(fold_accuracies)
std_accuracy = np.std(fold_accuracies)
print("\nMédia de Accuracy nos 10 folds:", mean_accuracy)
print("Desvio padrão:", std_accuracy)


In [None]:
# Histogram of the per-fold accuracies with a KDE curve overlaid
hist_fig, hist_ax = plt.subplots(figsize=(8, 5))
sns.histplot(fold_accuracies, bins=5, kde=True, color="blue", ax=hist_ax)
hist_ax.set_title("Distribuição das accuracies por fold")
hist_ax.set_xlabel("Accuracy")
hist_ax.set_ylabel("Frequência")
plt.show()

# Box plot showing the spread of the per-fold accuracies
box_fig, box_ax = plt.subplots(figsize=(6, 4))
sns.boxplot(x=fold_accuracies, color="orange", ax=box_ax)
box_ax.set_title("Variação da accuracy nos folds")
box_ax.set_xlabel("Accuracy")
plt.show()


In [None]:
# Refit the estimator on the complete training matrix
model.fit(X, y)

# Predict labels for the encoded test features
y_test_pred = model.predict(df_test_encoded)

# Submission table: ids from the raw test frame plus the predictions cast
# to bool
submission = test_df[["PassengerId"]].assign(
    Transported=y_test_pred.astype(bool)
)

# Write the file without the index column and confirm on stdout
submission.to_csv("submission.csv", index=False)
print("Arquivo submission.csv criado!")
