# Spaceship Titanic
[Kaggle Competiton](https://www.kaggle.com/competitions/spaceship-titanic/)


    - PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
    - HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
    - CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
    - Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
    - Destination - The planet the passenger will be debarking to.
    - Age - The age of the passenger.
    - VIP - Whether the passenger has paid for special VIP service during the voyage.
    - RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
    - Name - The first and last names of the passenger.
    - Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.



In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import xgboost as xgb
import optuna
from sklearn.metrics import accuracy_score


def convert(dataframe):
    dataframe["HomePlanet"]=dataframe["HomePlanet"].astype("category")
    dataframe["Destination"]=dataframe["Destination"].astype("category")
    dataframe[["group", "number_within_group"]]=dataframe["PassengerId"].str.split("_", expand=True)
    dataframe["group"]=dataframe["group"].astype(int)
    dataframe["number_within_group"]=dataframe["number_within_group"].astype(int)
    dataframe[["deck", "num", "side"]]=dataframe["Cabin"].str.split("/", expand=True)
    dataframe["deck"]=dataframe["deck"].astype("category")
    dataframe["num"]=dataframe["num"].astype("category")
    dataframe["side"]=dataframe["side"].astype("category")
    dataframe = dataframe.drop(["Cabin", "PassengerId", "Name"], axis=1) #also remove name
    dataframe["CryoSleep"] =dataframe["CryoSleep"].astype(bool)
    dataframe["VIP"] =dataframe["VIP"].astype(bool)
    dataframe["Age"]=pd.to_numeric(dataframe['Age'], errors='coerce', downcast='integer')
    dataframe["RoomService"]=pd.to_numeric(dataframe['RoomService'], errors='coerce', downcast='integer')
    dataframe["FoodCourt"]=pd.to_numeric(dataframe['FoodCourt'], errors='coerce', downcast='integer')
    dataframe["ShoppingMall"]=pd.to_numeric(dataframe['ShoppingMall'], errors='coerce', downcast='integer')
    dataframe["Spa"]=pd.to_numeric(dataframe['Spa'], errors='coerce', downcast='integer')
    dataframe["VRDeck"]=pd.to_numeric(dataframe['VRDeck'], errors='coerce', downcast='integer')
    return dataframe


mnist_train = pd.read_csv("/home/rainer/Downloads/ML_datasets/spaceship_titanic_train.csv")
mnist_train = convert(mnist_train)
mnist_train["Transported"] =mnist_train["Transported"].astype(bool)

X, Y = mnist_train.drop('Transported', axis=1), mnist_train[['Transported']]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1)
dtrain_reg = xgb.DMatrix(X_train, Y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, Y_test, enable_categorical=True)

In [2]:
def objective(trial):
    param = {
        'tree_method':'gpu_hist',
        "objective": "binary:hinge",
        'eta': trial.suggest_float('eta', 0.01, 0.5),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.01, 10, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1),

    }

    model = xgb.train(
        params=param,
        dtrain=dtrain_reg,
        num_boost_round=200,
    )
    preds = model.predict(dtest_reg)
    accuracy = accuracy_score(Y_test, preds)
    return accuracy

In [3]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=2000)
print('Number of finished trials: {}'.format(len(study.trials)))
print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))
print('  Params: ')

for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

[32m[I 2023-04-29 09:43:13,939][0m A new study created in memory with name: no-name-cbd59dc9-adcb-4643-bb80-95056042dad3[0m
[32m[I 2023-04-29 09:43:15,676][0m Trial 0 finished with value: 0.7893284268629255 and parameters: {'eta': 0.17023924559965342, 'max_depth': 9, 'min_child_weight': 4.033255941559674, 'subsample': 0.9675481813645649, 'colsample_bynode': 0.1479972913492718}. Best is trial 0 with value: 0.7893284268629255.[0m
[32m[I 2023-04-29 09:43:17,029][0m Trial 1 finished with value: 0.797148114075437 and parameters: {'eta': 0.19510481745732886, 'max_depth': 10, 'min_child_weight': 6.091361727596768, 'subsample': 0.9967733380144969, 'colsample_bynode': 0.14960101142456422}. Best is trial 1 with value: 0.797148114075437.[0m
[32m[I 2023-04-29 09:43:17,369][0m Trial 2 finished with value: 0.8031278748850046 and parameters: {'eta': 0.1876686415367159, 'max_depth': 3, 'min_child_weight': 19.956213968427317, 'subsample': 0.21877715803079412, 'colsample_bynode': 0.3660538517

Number of finished trials: 2000
Best trial:
  Value: 0.8219871205151794
  Params: 
    eta: 0.10639974467535045
    max_depth: 10
    min_child_weight: 10.486339500146217
    subsample: 0.9875003085612942
    colsample_bynode: 0.18579214962585772
