In [38]:
import warnings

import pandas as pd
import numpy as np
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
# enable_iterative_imputer must be imported BEFORE IterativeImputer is imported
# from sklearn.impute; otherwise the import below raises ImportError on a fresh kernel.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


In [6]:
# Read the competition CSVs (train, test, sample submission) in one pass
train_data, test_data, sample_output = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./spaceship-titanic/train.csv",
        "./spaceship-titanic/test.csv",
        "./spaceship-titanic/sample_submission.csv",
    )
)

In [7]:
# Summary statistics of the training set's numeric columns
train_data.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [8]:
# Preview the first rows of the raw training set
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


## Feature Engineering

In [30]:
# Cabin
def clean_cabin_data(x, index=0):
    """Return the `index`-th '/'-separated part of a cabin string.

    Values that `pd.isna` reports as missing are returned unchanged.
    """
    return x if pd.isna(x) else x.split('/')[index]

# Name
def clean_name(x, index=0):
    """Return the `index`-th space-separated token of a passenger name.

    Values that `pd.isna` reports as missing are returned unchanged.
    """
    # Guard clause: nothing to split when the name is missing
    if pd.isna(x):
        return x
    tokens = x.split(' ')
    return tokens[index]

In [31]:
def _split_token(values, sep, index):
    """Return the `index`-th `sep`-separated token of every string in `values`.

    Missing entries, and strings with too few tokens, yield NaN instead of raising.
    """
    # Cast to object so the .str accessor also works on an all-NaN (float) column.
    return values.astype(object).str.split(sep).str.get(index)


def _others_sharing_value(values):
    """Map each entry to the number of OTHER rows holding the same value (NaN stays NaN)."""
    # value_counts() skips NaN, so NaN entries find no match in the mapping and remain NaN.
    return values.map(values.value_counts()) - 1


def generate_features(df):
    """Build the engineered feature frame from a raw passenger DataFrame.

    Reads the columns 'Cabin', 'Name', 'HomePlanet' and 'Destination'.

    Added columns:
      * Cabin_data_0 / Cabin_data_1 / Cabin_data_2: the three '/'-separated parts
        of 'Cabin'. Cabin_data_1 is kept as the raw string token.
      * first_name_unique / last_name_unique: for the first / second
        space-separated token of 'Name', how many other rows of this same frame
        share that token.

    'HomePlanet', 'Destination', 'Cabin_data_0' and 'Cabin_data_2' are then
    replaced by dummy columns (first level dropped, NaN encoded as its own
    `<col>_nan` column), and 'Cabin_data_2_nan' is removed.

    All counts and dummy levels are derived from `df` alone, so two different
    frames (e.g. train and test) can produce different columns and count scales.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data. It is NOT modified; the work is done on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the original columns plus the engineered ones.
    """
    # Work on a copy so the caller's frame does not silently gain new columns.
    features = df.copy()

    # Cabin
    for i in [0, 1, 2]:
        features[f'Cabin_data_{i}'] = _split_token(features['Cabin'], '/', i)

    # Name
    first_name_list = _split_token(features['Name'], ' ', 0)
    last_name_list = _split_token(features['Name'], ' ', 1)

    # Mapping through value_counts() avoids relying on the column labels that
    # value_counts().reset_index() produces, which vary between pandas versions.
    features['first_name_unique'] = _others_sharing_value(first_name_list)
    features['last_name_unique'] = _others_sharing_value(last_name_list)

    # Generate Dummies
    features = pd.get_dummies(
        features,
        columns=['HomePlanet', 'Destination', 'Cabin_data_0', 'Cabin_data_2'],
        drop_first=True,
        dummy_na=True,
    )
    return features.drop(columns=['Cabin_data_2_nan'])

In [32]:
# Build the engineered training frame (cabin parts, name-sharing counts, dummy columns)
train_data_clean = generate_features(train_data)

In [33]:
# Model inputs, grouped by where they come from; order is the column order fed to the model
X_columns = [
    # columns taken directly from the raw data
    'CryoSleep', 'Age', 'VIP',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    # engineered cabin / name columns
    'Cabin_data_1', 'first_name_unique', 'last_name_unique',
    # HomePlanet dummies
    'HomePlanet_Europa', 'HomePlanet_Mars', 'HomePlanet_nan',
    # Destination dummies
    'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'Destination_nan',
    # first cabin part dummies
    'Cabin_data_0_B', 'Cabin_data_0_C', 'Cabin_data_0_D', 'Cabin_data_0_E',
    'Cabin_data_0_F', 'Cabin_data_0_G', 'Cabin_data_0_T', 'Cabin_data_0_nan',
    # third cabin part dummy
    'Cabin_data_2_S',
]
# Name of the single target column
y_columns = 'Transported'

In [34]:
# Feature matrix: the model input columns, in X_columns order
X = train_data_clean.loc[:, X_columns]
# Target, kept as a single-column DataFrame (list selector) rather than a Series
y = train_data_clean.loc[:, [y_columns]]

In [35]:
# Define the custom transformer
class MissingIndicatorAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends a 0/1 `<col>_is_missing` column per listed feature.

    Parameters
    ----------
    features : iterable of column labels
        Columns of the input frame for which a missing-value flag is added.
    """

    def __init__(self, features):
        self.features = features

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of `X` with one `<col>_is_missing` integer column per feature."""
        augmented = X.copy()  # leave the caller's frame untouched
        for name in self.features:
            flag = augmented[name].isna().astype(int)  # 1 where the value is missing, else 0
            augmented[f'{name}_is_missing'] = flag
        return augmented

In [56]:
# XGBoost classifier used as the final pipeline step; seed fixed for repeatable runs
xgb_model = XGBClassifier(**{
    'n_estimators': 300,
    'learning_rate': 0.05,
    'eval_metric': 'logloss',
    'random_state': 42,
})

In [57]:
# Three stages: add missing-value flags, impute the gaps, classify with xgb_model
numerical_pipeline = Pipeline([
    (
        'add_missing_tag',
        MissingIndicatorAdder(features=[
            'CryoSleep', 'Age', 'VIP',
            'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
        ]),
    ),
    ('imputer', IterativeImputer(random_state=42)),
    ('classifier', xgb_model),
])

In [58]:
# Fit the whole pipeline (missing flags -> imputer -> classifier) on the full training set
numerical_pipeline.fit(X,y)

0,1,2
,steps,"[('add_missing_tag', ...), ('imputer', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,features,"['CryoSleep', 'Age', ...]"

0,1,2
,estimator,
,missing_values,
,sample_posterior,False
,max_iter,10
,tol,0.001
,n_nearest_features,
,initial_strategy,'mean'
,fill_value,
,imputation_order,'ascending'
,skip_complete,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [59]:
# 5-fold cross-validated accuracy of the full pipeline; prints per-fold scores and their mean
scores = cross_val_score(
    estimator=numerical_pipeline,
    X=X,
    y=y,
    scoring='accuracy',
    cv=5,
)
print(f"\nCross-Validation Scores: {scores}, {scores.mean()}")


Cross-Validation Scores: [0.7573318  0.75215641 0.79758482 0.83544304 0.77675489], 0.7838541918261821


In [60]:
# Apply the same feature engineering to the test set.
# NOTE(review): dummy levels and name counts are derived from the frame passed in,
# so the resulting columns are not guaranteed to match X_columns — verify before predicting.
test_data_clean = generate_features(test_data)

In [63]:
# Display the engineered test frame to check the generated columns
test_data_clean

Unnamed: 0,PassengerId,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,Destination_nan,Cabin_data_0_B,Cabin_data_0_C,Cabin_data_0_D,Cabin_data_0_E,Cabin_data_0_F,Cabin_data_0_G,Cabin_data_0_T,Cabin_data_0_nan,Cabin_data_2_S
0,0013_01,True,G/3/S,27.0,False,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,True,False,False,True
1,0018_01,False,F/4/S,19.0,False,0.0,9.0,0.0,2823.0,0.0,...,False,False,False,False,False,True,False,False,False,True
2,0019_01,True,C/0/S,31.0,False,0.0,0.0,0.0,0.0,0.0,...,False,False,True,False,False,False,False,False,False,True
3,0021_01,False,C/1/S,38.0,False,0.0,6652.0,0.0,181.0,585.0,...,False,False,True,False,False,False,False,False,False,True
4,0023_01,False,F/5/S,20.0,False,10.0,0.0,635.0,0.0,0.0,...,False,False,False,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,True,G/1496/S,34.0,False,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,True,False,False,True
4273,9269_01,False,,42.0,False,0.0,847.0,17.0,10.0,144.0,...,False,False,False,False,False,False,False,False,True,False
4274,9271_01,True,D/296/P,,False,0.0,0.0,0.0,0.0,0.0,...,False,False,False,True,False,False,False,False,False,False
4275,9273_01,False,D/297/P,,False,0.0,2680.0,0.0,0.0,523.0,...,True,False,False,True,False,False,False,False,False,False


In [64]:
# Predict the target for the test set using the same feature columns as in training
output = numerical_pipeline.predict(test_data_clean[X_columns])

In [66]:
# Inspect the first prediction (the recorded output shows an integer label)
output[0]

np.int64(1)

In [69]:
# Attach the predicted labels to the raw test frame as the 'Transported' column
test_data['Transported'] = output

In [72]:
# Cast the integer labels to bool; the sample submission shows Transported as True/False
test_data['Transported'] = test_data['Transported'].astype(bool)

In [74]:
# Write the submission file: PassengerId plus predicted Transported, without the row index
test_data[['PassengerId', 'Transported']].to_csv("output_1.csv", index=False)

In [67]:
# Display the provided sample submission for reference (columns: PassengerId, Transported)
sample_output

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False


## Data Imputation

In [49]:
# IterativeImputer is an experimental feature in scikit-learn
# We need this import to use it and will silence the warning for a clean output
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Use pd.get_dummies() with dummy_na=True
# NOTE(review): neither `df` nor a 'City' column is defined anywhere in the visible
# notebook, and this cell has no execution count — it looks like a leftover example
# and would presumably raise NameError on Restart & Run All; confirm or remove.
df_method1 = pd.get_dummies(df, columns=['City'], dummy_na=True)

# Print the encoded frame between a header line and a 50-dash separator
print("\n--- Method 1: DataFrame after encoding (NaN as a category) ---")
print(df_method1)
print("-" * 50)

In [52]:
# --- 2. Basic Implementation with Default Estimator (BayesianRidge) ---
# NOTE(review): the recorded output of this cell is
# "ValueError: could not convert string to float: 'Europa'", i.e. the X in scope
# when it last ran still held raw string categories — confirm which X is intended.
with warnings.catch_warnings():
    # Silence every warning raised inside this block
    warnings.simplefilter("ignore")

    # No `estimator` argument is given, so the imputer uses its default one
    imputer_default = IterativeImputer(random_state=42, max_iter=10)

    # Impute, then wrap the result in a DataFrame carrying X's column labels
    imputed_values = imputer_default.fit_transform(X)
    df_imputed_default = pd.DataFrame(imputed_values, columns=X.columns)

ValueError: could not convert string to float: 'Europa'

In [None]:
# Report on the imputed frame: per-column count of remaining missing values,
# then a preview of the first rows, closed by a 50-dash separator.
# Relies on `df_imputed_default` having been created by the imputation cell.
print("--- Results with Default Estimator (BayesianRidge) ---")
print("Missing values after imputation:")
print(df_imputed_default.isnull().sum())
# Let's inspect a few imputed values
print("\nSample imputed values (first 5 rows):")
print(df_imputed_default.head())
print("-" * 50)