## Spaceship Titanic Prediction with LGBM

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold

## Load datasets and engineer features

In [None]:
# Load the competition data. The target column is split off the training
# frame so that train and test carry the same feature columns.
train = pd.read_csv("../input/spaceship-titanic/train.csv")
train_targets = train.pop("Transported")
test = pd.read_csv("../input/spaceship-titanic/test.csv")

# Remember the split point before the frames are merged, so the combined
# frame can be cut back into train/test after encoding.
n_train = len(train)

# Encode train and test together so get_dummies produces identical columns
# for both. ignore_index gives the combined frame a unique index.
data = pd.concat([train, test], ignore_index=True)

# Missing cabins are imputed with the most frequent cabin. fillna replaces
# the former replace(np.NAN, ...) call: the np.NAN alias was removed in
# NumPy 2.0 and raises AttributeError there.
cabin = data["Cabin"].fillna(data["Cabin"].mode()[0])

# Split the "/"-separated cabin string once into its three components.
cabin_parts = cabin.astype(str).str.split("/", expand=True)
data["Deck"] = cabin_parts[0]
# Keep the cabin number numeric. Left as a string, get_dummies would turn
# every distinct number into its own one-hot column.
data["Num"] = pd.to_numeric(cabin_parts[1])
data["Side"] = cabin_parts[2]

# Drop the raw cabin string and the identifier/name columns, then one-hot
# encode the remaining non-numeric columns.
data = data.drop(columns=["Cabin", "PassengerId", "Name"])
data = pd.get_dummies(data)

# Cut the encoded frame back into its train and test parts. The test part
# is re-indexed from 0, as it was before concatenation.
train = data.iloc[:n_train]
test = data.iloc[n_train:].reset_index(drop=True)

## Model Development

In [None]:
# Train one LightGBM classifier per stratified fold and keep all of them so
# their predictions can be averaged later.
kfold = StratifiedKFold(n_splits=7, shuffle=True, random_state=2022)
models = []
for fold_train_idx, fold_valid_idx in kfold.split(train, train_targets):
    fold_model = LGBMClassifier()
    # The held-out fold is passed as the evaluation set during fitting.
    fold_model.fit(
        train.iloc[fold_train_idx],
        train_targets.iloc[fold_train_idx],
        eval_set=(train.iloc[fold_valid_idx], train_targets.iloc[fold_valid_idx]),
    )
    models.append(fold_model)

## Submission

In [None]:
def inference(df, models, threshold=0.5):
    """Predict boolean labels by averaging the models' probabilities.

    Args:
        df: Feature matrix passed unchanged to each model's
            ``predict_proba``.
        models: Non-empty iterable of fitted classifiers. Column index 1 of
            each ``predict_proba`` result is used as the positive-class
            probability.
        threshold: Cut-off applied to the averaged probability. Defaults to
            0.5.

    Returns:
        A boolean NumPy array with one entry per row, ``True`` where the
        mean probability is strictly greater than ``threshold``.

    Raises:
        ValueError: If ``models`` is empty. Averaging nothing would give NaN
            and silently predict ``False`` for every row.
    """
    models = list(models)
    if not models:
        raise ValueError("inference() requires at least one fitted model")

    positive_probs = [model.predict_proba(df)[:, 1] for model in models]
    mean_prob = np.mean(positive_probs, axis=0)
    return np.asarray(mean_prob > threshold, dtype=np.bool_)

In [None]:
# Set the "Transported" column of the sample submission to the ensemble's
# predictions for the test set, then write the result to disk.
predictions = inference(test, models)
submission = pd.read_csv(
    "../input/spaceship-titanic/sample_submission.csv"
).assign(Transported=predictions)
submission.to_csv("submission.csv", index=False)
# Last expression of the cell: shows the first rows as a quick check.
submission.head()