In [None]:
import numpy as np 
import pandas as pd 
import os

from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def load_data(train_path, test_path):
    """Read the train and test CSV files and pull out the target and the ids.

    Parameters
    ----------
    train_path : path of the training CSV; must contain a ``Transported`` column.
    test_path : path of the test CSV; must contain a ``PassengerId`` column.

    Returns
    -------
    tuple ``(train, test, train_res, test_id)`` where ``train`` and ``test``
    are the frames as read, ``train_res`` is ``Transported`` cast to int and
    ``test_id`` is the test ``PassengerId`` column.
    """
    train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

    # Target as integers rather than the raw column values.
    train_res = train['Transported'].astype(int)
    test_id = test['PassengerId']

    return train, test, train_res, test_id

In [None]:
def _align_categories(train_col, test_col, missing_label='Missing'):
    """Cast a train/test column pair to one shared categorical dtype.

    The category list is the train categories, followed by any labels that
    only occur in test, followed by ``missing_label``. NaN values in both
    columns are replaced by ``missing_label``. Sharing the category list
    keeps the integer category codes identical between the two frames.
    """
    train_cat = train_col.astype('category')
    test_cat = test_col.astype('category')

    categories = list(train_cat.cat.categories)
    test_only = [label for label in test_cat.cat.categories if label not in categories]
    categories.extend(test_only)
    if missing_label not in categories:
        categories.append(missing_label)

    train_cat = train_cat.cat.set_categories(categories).fillna(missing_label)
    test_cat = test_cat.cat.set_categories(categories).fillna(missing_label)
    return train_cat, test_cat


def preprocess_data(train, test):
    """Clean the raw train/test frames and derive the model features.

    Steps, applied to both frames:
      * split ``Cabin`` on '/' and keep the first and third parts as
        ``Cabin_1`` and ``Cabin_3``;
      * fill missing spending amounts with 0 and add their row sum as
        ``Spending``;
      * convert ``CryoSleep``, ``VIP``, ``HomePlanet``, ``Destination``,
        ``Cabin_1`` and ``Cabin_3`` to categoricals with a ``'Missing'``
        category, using the same category list for train and test;
      * add ``FamilySize`` from ``PassengerId``;
      * fill missing ``Age`` with the mean age of the training frame;
      * drop ``Name``, ``PassengerId``, ``Cabin`` (and ``Transported`` from
        train).

    Parameters
    ----------
    train : DataFrame with the raw training columns, including ``Transported``.
    test : DataFrame with the raw test columns.

    Returns
    -------
    tuple ``(train, test)`` of new DataFrames. The inputs are not modified,
    so the function can be re-run on the same raw frames.
    """
    spend_cols = ['FoodCourt', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck']
    categorical_columns = ['CryoSleep', 'VIP', 'HomePlanet', 'Destination', 'Cabin_1', 'Cabin_3']

    # Work on copies so the caller's raw frames stay intact.
    train = train.copy()
    test = test.copy()

    for frame in (train, test):
        # reindex guarantees three part-columns even if the split yields fewer.
        cabin_parts = frame['Cabin'].str.split('/', expand=True).reindex(columns=range(3))
        frame['Cabin_1'] = cabin_parts[0]
        frame['Cabin_3'] = cabin_parts[2]

        # A missing amount is treated as "nothing spent".
        frame[spend_cols] = frame[spend_cols].fillna(0)

    # One shared category list per column: if train and test were converted
    # independently, the same label could end up with a different integer
    # code in each frame whenever their label sets differ.
    for col in categorical_columns:
        train[col], test[col] = _align_categories(train[col], test[col])

    for frame in (train, test):
        # NOTE(review): this is the number after '_' in PassengerId, not a
        # size computed from the group; the column name is kept so that
        # downstream code sees the same feature set.
        frame['FamilySize'] = frame['PassengerId'].str.split('_').str[1].astype(int)
        frame['Spending'] = frame[spend_cols].sum(axis=1)

    # The imputation value comes from train only and is reused for test.
    mean_age = train['Age'].mean()
    train['Age'] = train['Age'].fillna(mean_age)
    test['Age'] = test['Age'].fillna(mean_age)

    non_feature_cols = ['Name', 'PassengerId', 'Cabin']
    train = train.drop(columns=non_feature_cols + ['Transported'])
    test = test.drop(columns=non_feature_cols)
    return train, test


In [None]:
def apply_clustering(train, test, n_clusters=5, random_state=42):
    """Add a ``SpendingCluster`` categorical feature from K-Means on spending.

    Rows with ``Spending > 0`` are standardised and clustered; the scaler and
    the K-Means model are fitted on the training rows only and then applied to
    the test rows. Rows with no spending keep the sentinel label ``-1``.

    Parameters
    ----------
    train, test : DataFrames containing ``Spending`` and the five individual
        spending columns.
    n_clusters : number of K-Means clusters (default 5, the previous
        hard-coded value).
    random_state : seed passed to K-Means (default 42, as before).

    Returns
    -------
    tuple ``(train, test)`` of new DataFrames with the extra
    ``SpendingCluster`` column; the inputs are not modified.
    """
    features = ['Spending', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    train = train.copy()
    test = test.copy()

    train_spent = train['Spending'] > 0
    test_spent = test['Spending'] > 0

    # Everyone starts in the "did not spend" bucket.
    train['SpendingCluster'] = -1
    test['SpendingCluster'] = -1

    # Skip fitting when there is nothing to cluster instead of letting the
    # scaler fail on an empty selection.
    if train_spent.any():
        scaler = StandardScaler()
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)

        train_scaled = scaler.fit_transform(train.loc[train_spent, features])
        train.loc[train_spent, 'SpendingCluster'] = kmeans.fit_predict(train_scaled)

        if test_spent.any():
            # Reuse the scaler and centroids learned on train.
            test_scaled = scaler.transform(test.loc[test_spent, features])
            test.loc[test_spent, 'SpendingCluster'] = kmeans.predict(test_scaled)

    # A fixed category list keeps category codes identical in train and test
    # even when one frame happens to lack a cluster id or the -1 sentinel.
    cluster_dtype = pd.CategoricalDtype(categories=[-1] + list(range(n_clusters)))
    train['SpendingCluster'] = train['SpendingCluster'].astype(cluster_dtype)
    test['SpendingCluster'] = test['SpendingCluster'].astype(cluster_dtype)
    return train, test


In [None]:
def tune_xgb(train, train_res):
    """Grid-search XGBoost hyper-parameters and return the best estimator.

    The search covers 3 x 3 x 3 x 2 x 2 = 108 parameter combinations, each
    scored by accuracy with 3-fold cross-validation, using all available
    cores (``n_jobs=-1``).

    Parameters
    ----------
    train : feature matrix passed to ``GridSearchCV.fit``.
    train_res : target values aligned with ``train``.

    Returns
    -------
    The ``best_estimator_`` of the fitted search.
    """
    base_model = XGBClassifier(
        eval_metric="logloss",
        use_label_encoder=False,
        random_state=42,
        enable_categorical=True,
    )
    search_space = dict(
        n_estimators=[100, 200, 300],
        max_depth=[3, 5, 7],
        learning_rate=[0.01, 0.1, 0.2],
        subsample=[0.8, 1],
        colsample_bytree=[0.8, 1],
    )

    search = GridSearchCV(
        estimator=base_model,
        param_grid=search_space,
        scoring='accuracy',
        cv=3,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(train, train_res)

    print("Best parameters for XGBoost:", search.best_params_)
    return search.best_estimator_

In [None]:
def train_meta_models(oof_preds, train_res, train_columns):
    """Compare candidate meta-models on the out-of-fold predictions and fit the best.

    Each candidate is scored with 5-fold cross-validated accuracy on the
    three base-model prediction columns; the one with the highest mean score
    is then fitted on all rows.

    Parameters
    ----------
    oof_preds : array-like with three columns, labelled here as
        ``XGB``, ``LGB`` and ``CatBoost``.
    train_res : target values aligned with the rows of ``oof_preds``.
    train_columns : unused; kept so existing callers do not break.

    Returns
    -------
    tuple ``(meta_models, best_meta_model)`` where ``meta_models`` maps the
    candidate name to its estimator (only the best one is fitted) and
    ``best_meta_model`` is the name of the winner.
    """
    oof_preds_df = pd.DataFrame(oof_preds, columns=["XGB", "LGB", "CatBoost"])
    meta_models = {
        "LogisticRegression": LogisticRegression(),
        "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42)
    }

    meta_model_scores = {}
    for name, model in meta_models.items():
        scores = cross_val_score(model, oof_preds_df, train_res, cv=5, scoring='accuracy')
        meta_model_scores[name] = scores.mean()
        print(f"{name} CV Accuracy: {scores.mean():.4f}")

    # cross_val_score fits clones, so the winner still has to be fitted here.
    best_meta_model = max(meta_model_scores, key=meta_model_scores.get)
    meta_models[best_meta_model].fit(oof_preds_df, train_res)

    return meta_models, best_meta_model
    

In [None]:
def predict(models, meta_models, best_meta_model, test, test_id, train_columns,
            output_path='submission.csv'):
    """Stack the base-model probabilities, apply the meta-model and save a submission.

    Parameters
    ----------
    models : dict mapping a name to a fitted base model exposing
        ``predict_proba``; the dict order defines the stacked column order.
    meta_models : dict mapping a name to a meta-model.
    best_meta_model : key in ``meta_models`` of the fitted model to use.
    test : DataFrame holding the test features.
    test_id : passenger ids written to the ``PassengerId`` column.
    train_columns : feature columns (and their order) to select from ``test``.
    output_path : where the CSV is written (default ``'submission.csv'``).

    Returns
    -------
    None. The predictions are written to ``output_path``.
    """
    # Select the columns in the order given so they match the training layout.
    test_features = test[train_columns]
    test_preds = np.column_stack(
        [models[name].predict_proba(test_features)[:, 1] for name in models.keys()]
    )

    meta_model = meta_models[best_meta_model]

    # If the meta-model recorded feature names when it was fitted, hand it a
    # frame with those names rather than a bare array, so fit and predict
    # inputs are labelled consistently.
    fitted_names = getattr(meta_model, 'feature_names_in_', None)
    if fitted_names is not None and len(fitted_names) == test_preds.shape[1]:
        meta_input = pd.DataFrame(test_preds, columns=list(fitted_names))
    else:
        meta_input = test_preds

    final_preds = np.asarray(meta_model.predict(meta_input))

    submission = pd.DataFrame({'PassengerId': test_id, 'Transported': final_preds.astype(bool)})
    submission.to_csv(output_path, index=False)
    print("Submission file saved!")

In [None]:
# End-to-end run: load -> preprocess -> cluster -> base models -> meta-model -> submission.
train, test, train_res, test_id = load_data(
    train_path="/kaggle/input/spaceship-titanic/train.csv",
    test_path="/kaggle/input/spaceship-titanic/test.csv",
)

# Feature engineering; each step returns the (train, test) pair it produced.
train, test = preprocess_data(train=train, test=test)
train, test = apply_clustering(train=train, test=test)

# NOTE(review): train_base_models is not defined in the cells visible here;
# it must be defined earlier in the notebook for this cell to run.
models, oof_preds = train_base_models(train, train_res)

# Stack: choose and fit the meta-model on the out-of-fold predictions,
# then write the submission file for the test set.
meta_models, best_meta_model = train_meta_models(
    oof_preds=oof_preds,
    train_res=train_res,
    train_columns=train.columns,
)
predict(
    models=models,
    meta_models=meta_models,
    best_meta_model=best_meta_model,
    test=test,
    test_id=test_id,
    train_columns=train.columns,
)