In [51]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
import warnings

In [2]:
# Import Data
train_data = pd.read_csv("./spaceship-titanic/train.csv")
test_data = pd.read_csv("./spaceship-titanic/test.csv")
sample_output = pd.read_csv("./spaceship-titanic/sample_submission.csv")

In [3]:
train_data.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [4]:
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


## Feature Engineering

In [5]:
# Cabin
def clean_cabin_data(x, index=0):
    if pd.isna(x):
        return x
    else:
        return x.split('/')[index]

for i in [0,1,2]:
    train_data[f'Cabin_data_{i}'] = train_data['Cabin'].apply(lambda x: clean_cabin_data(x, index=i))

In [31]:
# Name
def clean_name(x, index=0):
    if pd.isna(x):
        return x
    else:
        return x.split(' ')[index]

first_name_list = train_data['Name'].apply(lambda x: clean_name(x,0))
last_name_list = train_data['Name'].apply(lambda x: clean_name(x,1))

first_name_dict={row['Name']:row['count']-1 for idx, row in first_name_list.value_counts().reset_index().iterrows()}
last_name_dict={row['Name']:row['count']-1 for idx, row in last_name_list.value_counts().reset_index().iterrows()}

train_data['first_name_unique'] = first_name_list.apply(lambda x: first_name_dict[x] if not pd.isna(x) else x)
train_data['last_name_unique'] = last_name_list.apply(lambda x: last_name_dict[x] if not pd.isna(x) else x)

In [60]:
train_data = pd.get_dummies(train_data, columns=['HomePlanet', 'Destination', 'Cabin_data_0',  'Cabin_data_2',], drop_first=True, dummy_na=True)

In [63]:
train_data.drop(columns= ['Cabin_data_2_nan'], inplace=True)

In [65]:
train_data.columns

Index(['PassengerId', 'CryoSleep', 'Cabin', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Transported',
       'Cabin_data_1', 'first_name_unique', 'last_name_unique',
       'HomePlanet_Europa', 'HomePlanet_Mars', 'HomePlanet_nan',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e',
       'Destination_nan', 'Cabin_data_0_B', 'Cabin_data_0_C', 'Cabin_data_0_D',
       'Cabin_data_0_E', 'Cabin_data_0_F', 'Cabin_data_0_G', 'Cabin_data_0_T',
       'Cabin_data_0_nan', 'Cabin_data_2_S'],
      dtype='object')

In [76]:
X_columns = ['CryoSleep',  'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 
       'Cabin_data_1', 'first_name_unique', 'last_name_unique',
       'HomePlanet_Europa', 'HomePlanet_Mars', 'HomePlanet_nan',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e',
       'Destination_nan', 'Cabin_data_0_B', 'Cabin_data_0_C', 'Cabin_data_0_D',
       'Cabin_data_0_E', 'Cabin_data_0_F', 'Cabin_data_0_G', 'Cabin_data_0_T',
       'Cabin_data_0_nan', 'Cabin_data_2_S']
y_columns = 'Transported'

In [77]:
X = train_data[X_columns]
y = train_data[[y_columns]]

In [80]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [81]:
# Define the custom transformer
class MissingIndicatorAdder(BaseEstimator, TransformerMixin):
    def __init__(self, features):
        self.features = features

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        for col in self.features:
            X_copy[f'{col}_is_missing'] = X_copy[col].isnull().astype(int)
        return X_copy

In [78]:
X.isna().sum()

CryoSleep                    217
Age                          179
VIP                          203
RoomService                  181
FoodCourt                    183
ShoppingMall                 208
Spa                          183
VRDeck                       188
Cabin_data_1                 199
first_name_unique            200
last_name_unique             200
HomePlanet_Europa              0
HomePlanet_Mars                0
HomePlanet_nan                 0
Destination_PSO J318.5-22      0
Destination_TRAPPIST-1e        0
Destination_nan                0
Cabin_data_0_B                 0
Cabin_data_0_C                 0
Cabin_data_0_D                 0
Cabin_data_0_E                 0
Cabin_data_0_F                 0
Cabin_data_0_G                 0
Cabin_data_0_T                 0
Cabin_data_0_nan               0
Cabin_data_2_S                 0
dtype: int64

In [90]:
from sklearn.ensemble import RandomForestClassifier

In [85]:
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

In [99]:
numerical_pipeline = Pipeline(steps=[
    ('add_missing_tag', MissingIndicatorAdder(features=['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck',])),
        ('imputer', IterativeImputer(random_state=42)),
    ('classifier', xgb_model)
])

In [100]:
numerical_pipeline.fit(X,y)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,steps,"[('add_missing_tag', ...), ('imputer', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,features,"['CryoSleep', 'Age', ...]"

0,1,2
,estimator,
,missing_values,
,sample_posterior,False
,max_iter,10
,tol,0.001
,n_nearest_features,
,initial_strategy,'mean'
,fill_value,
,imputation_order,'ascending'
,skip_complete,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [101]:
scores = cross_val_score(numerical_pipeline, X, y, cv=5, scoring='accuracy')
print(f"\nCross-Validation Scores: {scores}, {scores.mean()}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Cross-Validation Scores: [0.75618171 0.7492812  0.80448534 0.84004603 0.78078251], 0.7861553569337033


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


np.float64(0.7915602991283034)

## Data Imputation

In [49]:
# IterativeImputer is an experimental feature in scikit-learn
# We need this import to use it and will silence the warning for a clean output
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Use pd.get_dummies() with dummy_na=True
df_method1 = pd.get_dummies(df, columns=['City'], dummy_na=True)

print("\n--- Method 1: DataFrame after encoding (NaN as a category) ---")
print(df_method1)
print("-" * 50)

In [52]:
# --- 2. Basic Implementation with Default Estimator (BayesianRidge) ---
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    # Instantiate the imputer with default parameters
    # The default estimator is BayesianRidge
    imputer_default = IterativeImputer(
        max_iter=10, 
        random_state=42
    )

    # Fit and transform the data
    df_imputed_default = pd.DataFrame(imputer_default.fit_transform(X), columns=X.columns)

ValueError: could not convert string to float: 'Europa'

In [None]:
print("--- Results with Default Estimator (BayesianRidge) ---")
print("Missing values after imputation:")
print(df_imputed_default.isnull().sum())
# Let's inspect a few imputed values
print("\nSample imputed values (first 5 rows):")
print(df_imputed_default.head())
print("-" * 50)