In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import log_loss
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

from xgboost import XGBClassifier

import optuna

In [None]:
import warnings

# Silence every Python warning for the rest of the session so cell outputs stay short.
# This also hides deprecation notices emitted by the libraries used below.
warnings.simplefilter("ignore")

# Load data

In [None]:
# The training CSV carries a `kfold` column (used by the cross-validation cells below);
# the test CSV is the competition file.
train_csv = "../input/titanic-spaceship-competition-using-kfolds-data/TitanicSpaceship-5folds.csv"
test_csv = "../input/spaceship-titanic/test.csv"

# Both frames are indexed by PassengerId.
X, X_test = (pd.read_csv(path, index_col="PassengerId") for path in (train_csv, test_csv))

X

# Data preprocessing

### Explore Data

In [None]:
# Quick sanity report: for each check, print the value for the training frame
# first and for the test frame second.
rule = "-----------------------------"
checks = {
    "Shapes": lambda frame: frame.shape,
    "NaN values": lambda frame: frame.isna().sum().sum(),
    "Duplicates": lambda frame: frame.duplicated().sum(),
}

for title, check in checks.items():
    print(rule)
    print(title)
    print(rule)
    for frame in (X, X_test):
        print(check(frame))

### Drop high-cardinality categorical columns so the remaining ones can be encoded later

In [None]:
# Columns stored with the generic object dtype are treated as categorical.
object_cols = [col for col in X.columns if X[col].dtype == "O"]

# A categorical column with more than 10 distinct values in X counts as
# high-cardinality and is removed from both frames.
high_cardinality_cols = [col for col in object_cols if X[col].nunique() > 10]

X = X.drop(columns=high_cardinality_cols)
X_test = X_test.drop(columns=high_cardinality_cols)

# Keep object_cols in sync with the columns that are still present.
object_cols = [col for col in object_cols if col not in high_cardinality_cols]

X.head()

### Handle missing values

In [None]:
# Columns that need imputation: any column of X with a missing value in the
# training frame OR in the test frame. Looking at X alone would leave gaps in
# X_test untouched for columns that happen to be complete in X.
nan_cols = [
    col
    for col in X.columns
    if X[col].isna().any() or (col in X_test.columns and X_test[col].isna().any())
]

# Split by kind so each group gets its own imputation strategy.
num_nan_cols = [col for col in nan_cols if col not in object_cols]
cat_nan_cols = [col for col in nan_cols if col in object_cols]

In [None]:
# Display the categorical columns that will be imputed.
cat_nan_cols

In [None]:
# Categorical gaps are filled with the most frequent value of each column.
imputer = SimpleImputer(strategy="most_frequent")

# Learn the fill values from X and apply them to X in one step, then reuse the
# same fitted imputer for X_test.
X[cat_nan_cols] = imputer.fit_transform(X[cat_nan_cols])
X_test[cat_nan_cols] = imputer.transform(X_test[cat_nan_cols])

X.head()

In [None]:
# Numeric gaps are filled with the per-column median learned from X.
imputer = SimpleImputer(strategy="median").fit(X[num_nan_cols])

# Apply the same fitted medians to the training frame and then to the test frame.
for frame in (X, X_test):
    frame[num_nan_cols] = imputer.transform(frame[num_nan_cols])

X.head()

## Let's score our dataset!

In [None]:
# Baseline: XGBoost with mostly default hyper-parameters, evaluated on the
# fold assignment stored in the `kfold` column of X.
preds = []   # per fold: predicted probability of the positive class for the test set
losses = []  # per fold: validation log loss

for fold in range(5):
    test = X_test.copy()
    X_train = X[X.kfold != fold].drop("kfold", axis=1)
    X_valid = X[X.kfold == fold].drop("kfold", axis=1)
    y_train = X_train.pop("Transported")
    y_valid = X_valid.pop("Transported")

    # The encoder is fitted on the training part of the fold only and then
    # applied unchanged to the validation part and to the test set.
    enc = OrdinalEncoder()
    enc.fit(X_train[object_cols])
    X_train[object_cols] = enc.transform(X_train[object_cols])
    X_valid[object_cols] = enc.transform(X_valid[object_cols])
    test[object_cols] = enc.transform(test[object_cols])

    model = XGBClassifier(
        random_state=0,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
        n_estimators=500,
    )
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    preds.append(model.predict_proba(test)[:, 1])

    # log_loss scores probabilities. Passing hard 0/1 labels from predict()
    # would only report a rescaled error rate, not the model's log loss.
    loss = log_loss(y_valid, model.predict_proba(X_valid)[:, 1])
    print("--------------------------------------------------")
    print(loss)
    print("--------------------------------------------------")
    losses.append(loss)

print()
print(np.mean(losses))

# Tune the model with Optuna and build the submission

In [None]:
def run(trial, fold=0):
    """Optuna objective: validation log loss of an XGBoost model on one fold.

    Samples hyper-parameters from ``trial``, trains an ``XGBClassifier`` on the
    rows of the global frame ``X`` whose ``kfold`` differs from ``fold`` and
    scores it on the rows where ``kfold == fold``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        fold: Fold id held out for validation. Defaults to 0, the only fold
            this objective evaluated before the parameter was added.

    Returns:
        float: Log loss of the predicted probabilities on the validation fold
        (lower is better).
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    # suggest_float(..., log=True) is the replacement for the deprecated
    # suggest_loguniform and matches how learning_rate is sampled above.
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    X_train = X[X.kfold != fold].drop("kfold", axis=1)
    X_valid = X[X.kfold == fold].drop("kfold", axis=1)
    y_train = X_train.pop("Transported")
    y_valid = X_valid.pop("Transported")

    # Fit the encoder on the training part only, then apply it to validation.
    enc = OrdinalEncoder()
    enc.fit(X_train[object_cols])
    X_train[object_cols] = enc.transform(X_train[object_cols])
    X_valid[object_cols] = enc.transform(X_valid[object_cols])

    model = XGBClassifier(
        random_state=0,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
        n_estimators=500,
        learning_rate=learning_rate,
        # The keyword must be spelled reg_lambda; the previous spelling
        # "reg_lamba" is not an XGBoost parameter name.
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth
    )
    model.fit(X_train, y_train)

    # log_loss must be fed probabilities; hard labels from predict() would
    # reduce the objective to a rescaled error rate.
    loss = log_loss(y_valid, model.predict_proba(X_valid)[:, 1])

    return float(loss)

In [None]:
# Hyper-parameter search, left disabled (1000 trials); uncomment both lines to re-run it.
# study = optuna.create_study(direction='minimize')
# study.optimize(run, n_trials=1000)

In [None]:
# Hard-coded hyper-parameters passed to XGBClassifier(**best_params) below.
best_params = {
    'n_estimators': 500,
    'random_state': 0,
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
    'predictor': 'gpu_predictor',
    'learning_rate': 0.014488419170389398,
    # Key was previously misspelled 'reg_lamba', which is not an XGBoost parameter name.
    # NOTE(review): this value was presumably found while the misspelled name
    # was in use, so it may never have influenced a trial - consider re-tuning.
    'reg_lambda': 4.620380549431809,
    'reg_alpha': 0.0040896213001427855,
    'subsample': 0.7847668791304616,
    'colsample_bytree': 0.9646100147515869,
    'max_depth': 6
}

In [None]:
# Retrain on every fold with best_params, report the validation log loss and
# collect the test-set probabilities of each fold model.
preds = []   # per fold: predicted probability of the positive class for the test set
losses = []  # per fold: validation log loss

for fold in range(5):
    test = X_test.copy()
    X_train = X[X.kfold != fold].drop("kfold", axis=1)
    X_valid = X[X.kfold == fold].drop("kfold", axis=1)
    y_train = X_train.pop("Transported")
    y_valid = X_valid.pop("Transported")

    # The encoder is fitted on the training part of the fold only and then
    # applied unchanged to the validation part and to the test set.
    enc = OrdinalEncoder()
    enc.fit(X_train[object_cols])
    X_train[object_cols] = enc.transform(X_train[object_cols])
    X_valid[object_cols] = enc.transform(X_valid[object_cols])
    test[object_cols] = enc.transform(test[object_cols])

    model = XGBClassifier(**best_params)
    model.fit(X_train, y_train)

    # Keep probabilities (column 1 = positive class) instead of hard labels, so
    # the per-fold average taken in the next cell is a soft vote.
    preds.append(model.predict_proba(test)[:, 1])

    # log_loss scores probabilities. Passing hard 0/1 labels from predict()
    # would only report a rescaled error rate, not the model's log loss.
    loss = log_loss(y_valid, model.predict_proba(X_valid)[:, 1])
    print("--------------------------------------------------")
    print(loss)
    print("--------------------------------------------------")
    losses.append(loss)

print()
print(np.mean(losses))

In [None]:
# One column per fold model, one row per test passenger; average across folds.
fold_matrix = np.column_stack(preds)
preds_ = fold_matrix.mean(axis=1)

# A passenger is labelled True when the fold average reaches 0.5.
preds_ = [bool(score >= 0.5) for score in preds_]

## Let's make submission!

In [None]:
# Start from the sample submission and overwrite its Transported column with
# the final predictions (assigned by position).
ss = pd.read_csv("../input/spaceship-titanic/sample_submission.csv").assign(Transported=preds_)
ss

In [None]:
# Write the submission to the working directory; index=False keeps the
# DataFrame's row index out of the file.
ss.to_csv("submission.csv", index=False)