## Spaceship Titanic Prediction with CatBoost

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold

## Import datasets

In [None]:
# Load the competition data. The target column is split off the training
# frame so that train and test share the same columns and can be encoded
# together in a single get_dummies call.
train = pd.read_csv("../input/spaceship-titanic/train.csv")
train_targets = train.pop("Transported")
test = pd.read_csv("../input/spaceship-titanic/test.csv")
# Remember the split point now: `train` is rebound to the encoded frame below.
n_train = len(train)
data = pd.concat([train, test])

# Impute missing cabins with the most frequent cabin value. `fillna` is used
# instead of `replace(np.NAN, ...)` because the `np.NAN` alias no longer
# exists in NumPy 2.0.
data["Cabin"] = data["Cabin"].fillna(data["Cabin"].mode()[0])

# A cabin string is split on "/" into three parts: Deck, Num and Side.
cabin_parts = data["Cabin"].str.split("/")
data["Deck"] = cabin_parts.str[0]
data["Num"] = cabin_parts.str[1]
data["Side"] = cabin_parts.str[2]
# NOTE(review): "Num" stays a string column, so get_dummies one-hot encodes
# every distinct cabin number -- confirm this is intended rather than a
# numeric feature.

# Drop the raw cabin string and the identifier/name columns before encoding.
data = data.drop(columns=["Cabin", "PassengerId", "Name"])
data = pd.get_dummies(data)

# Split the encoded frame back into its train and test parts by position.
train = data.iloc[:n_train]
test = data.iloc[n_train:]


In [None]:
# Preview the first rows of the combined (train + test) one-hot encoded table.
data.head()

## Model Development

In [None]:
# Train one CatBoost classifier per stratified fold. The held-out part of
# each fold is passed to fit() as the evaluation set.
params = {
    "iterations": 10000,
    "depth": 8,
    "early_stopping_rounds": 1000,
    "eval_metric": "Accuracy",
    "verbose": 1000,
}
kfold = StratifiedKFold(n_splits=7, shuffle=True, random_state=2022)
models = []
for fold, (fit_idx, val_idx) in enumerate(kfold.split(train, train_targets)):
    # Positional slices of the features and targets for this fold.
    x_train, y_train = train.iloc[fit_idx], train_targets.iloc[fit_idx]
    x_val, y_val = train.iloc[val_idx], train_targets.iloc[val_idx]

    model = CatBoostClassifier(**params)
    model.fit(x_train, y_train, eval_set=(x_val, y_val))

    # Persist the fold model, then reload it from disk before keeping it,
    # so the collected models are the ones read back from the saved files.
    file_path = f"model_{fold}.model"
    model.save_model(file_path)
    model.load_model(file_path)
    models.append(model)

## Feature Importance
Let's take a look at the feature importances reported by each fold's model.

In [None]:
# Print the 30 highest-importance (feature name, importance) pairs per fold.
separator = "=" * 100
feature_names = train.columns
for fold, model in enumerate(models):
    print(separator)
    print(f"Feature Importance for fold {fold}:")
    print(separator)
    # Pair each column with its importance and rank by importance, descending.
    ranked = sorted(
        zip(feature_names, model.get_feature_importance()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for name_and_score in ranked[:30]:
        print(name_and_score)

## Submission

In [None]:
def inference(df, models):
    """Average the models' positive-class probabilities and threshold at 0.5.

    Args:
        df: Feature table passed unchanged to each model's ``predict_proba``.
        models: Iterable of fitted models exposing ``predict_proba(df)`` that
            returns a 2-D array whose column 1 is the positive-class
            probability.

    Returns:
        A boolean NumPy array with one entry per row of ``df``; ``True``
        where the mean probability across models is strictly greater
        than 0.5.

    Raises:
        ValueError: If ``models`` is empty. Averaging nothing would yield NaN
            and silently turn every prediction into ``False``.
    """
    probabilities = [model.predict_proba(df)[:, 1] for model in models]
    if not probabilities:
        raise ValueError("inference() requires at least one model")
    # The comparison already produces a boolean array.
    return np.mean(probabilities, axis=0) > 0.5

In [None]:
# Start from the sample submission and overwrite its "Transported" column
# with the ensemble predictions for the test set.
# NOTE(review): predictions are assigned by position -- presumably the sample
# submission rows are in the same order as test.csv; confirm.
sample = pd.read_csv("../input/spaceship-titanic/sample_submission.csv")
submission = sample.assign(Transported=inference(test, models))
submission.to_csv("submission.csv", index=False)
submission.head()