In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier

# Read the training and test splits from the working directory
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Keep the test PassengerId column: it is dropped from the feature frame
# further down and is needed again when the submission is written
passenger_ids = test["PassengerId"]

# Pull the label column out of the training frame as booleans, leaving
# train with feature columns only
y_train = train.pop("Transported").astype(bool)

# Stack train on top of test under a fresh 0..n-1 index so that every
# preprocessing step is applied to both splits in one pass
combined = pd.concat([train, test], axis=0, ignore_index=True)

# Cabin split: Deck / Num / Side
combined[["Deck", "Num", "Side"]] = combined["Cabin"].str.split("/", expand=True)
# str.split yields strings, so without this conversion Num would be selected
# as an object (categorical) column, imputed with the most frequent value and
# label-encoded in lexicographic order ("10" < "2"), losing its numeric
# ordering. Missing cabins stay NaN here and are handled by the numeric
# imputation step that follows.
combined["Num"] = pd.to_numeric(combined["Num"], errors="coerce")
combined.drop(columns=["Cabin", "Name", "PassengerId"], inplace=True)

# New features: total, mean and standard deviation of the five expense
# columns for each row. NaN entries are skipped by sum/mean/std
# (pandas default skipna=True).
spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
spending = combined[spend_cols]
combined["Total_Bill"] = spending.sum(axis=1)
combined["mean_bill"] = spending.mean(axis=1)
combined["std_bill"] = spending.std(axis=1)

# Partition the columns by dtype: object columns are treated as categorical,
# int64/float64 columns as numeric
cat_cols = combined.select_dtypes(include="object").columns
num_cols = combined.select_dtypes(include=["int64", "float64"]).columns

# Fill missing values: most frequent value for the categorical columns,
# column mean for the numeric ones
cat_imputer = SimpleImputer(strategy="most_frequent")
num_imputer = SimpleImputer(strategy="mean")
for imputer, cols in ((cat_imputer, cat_cols), (num_imputer, num_cols)):
    combined[cols] = imputer.fit_transform(combined[cols])

# Replace every categorical column with integer codes. The one encoder is
# refit per column, so afterwards it only holds the classes of the last
# column processed.
encoder = LabelEncoder()
for col in cat_cols:
    combined[col] = encoder.fit_transform(combined[col])

# Standardise the numeric columns (the freshly encoded categorical columns
# are not in num_cols and are left as integer codes)
scaler = StandardScaler().fit(combined[num_cols])
combined[num_cols] = scaler.transform(combined[num_cols])

# Undo the concatenation: the first len(y_train) rows of `combined` came
# from train, everything after them from test
n_train = len(y_train)
X_train = combined.iloc[:n_train]
X_test = combined.iloc[n_train:]

# Fit an XGBoost classifier with a fixed seed on the training rows
model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    use_label_encoder=False,
    eval_metric="logloss",
)
model.fit(X_train, y_train)

# Predicted class for every test row
preds = model.predict(X_test)

# Create submission DataFrame.
# XGBClassifier.predict returns 0/1 class labels (see the XGBoost API
# reference), while the training target was boolean. Cast to bool so the
# Transported column is written as True/False rather than 0/1; the cast is
# a no-op if the predictions are already boolean.
submission = pd.DataFrame({
    "PassengerId": passenger_ids,
    "Transported": np.asarray(preds).astype(bool),
})

# Save CSV without the index so the file holds only the two columns above
submission.to_csv("submission_xgboost.csv", index=False)
print("Submission file saved as submission_xgboost.csv")
