# Spaceship Titanic
[Kaggle Competiton](https://www.kaggle.com/competitions/spaceship-titanic/)


    - PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
    - HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
    - CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
    - Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
    - Destination - The planet the passenger will be debarking to.
    - Age - The age of the passenger.
    - VIP - Whether the passenger has paid for special VIP service during the voyage.
    - RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
    - Name - The first and last names of the passenger.
    - Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.



In [13]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import xgboost as xgb

def convert(dataframe):
    dataframe["HomePlanet"]=dataframe["HomePlanet"].astype("category")
    dataframe["Destination"]=dataframe["Destination"].astype("category")
    dataframe[["group", "number_within_group"]]=dataframe["PassengerId"].str.split("_", expand=True)
    dataframe["group"]=dataframe["group"].astype(int)
    dataframe["number_within_group"]=dataframe["number_within_group"].astype(int)
    dataframe[["deck", "num", "side"]]=dataframe["Cabin"].str.split("/", expand=True)
    dataframe["deck"]=dataframe["deck"].astype("category")
    dataframe["num"]=dataframe["num"].astype("category")
    dataframe["side"]=dataframe["side"].astype("category")
    dataframe = dataframe.drop(["Cabin", "PassengerId", "Name"], axis=1) #also remove name
    dataframe["CryoSleep"] =dataframe["CryoSleep"].astype(bool)
    dataframe["VIP"] =dataframe["VIP"].astype(bool)
    dataframe["Age"]=pd.to_numeric(dataframe['Age'], errors='coerce', downcast='integer')
    dataframe["RoomService"]=pd.to_numeric(dataframe['RoomService'], errors='coerce', downcast='integer')
    dataframe["FoodCourt"]=pd.to_numeric(dataframe['FoodCourt'], errors='coerce', downcast='integer')
    dataframe["ShoppingMall"]=pd.to_numeric(dataframe['ShoppingMall'], errors='coerce', downcast='integer')
    dataframe["Spa"]=pd.to_numeric(dataframe['Spa'], errors='coerce', downcast='integer')
    dataframe["VRDeck"]=pd.to_numeric(dataframe['VRDeck'], errors='coerce', downcast='integer')
    return dataframe


mnist_train = pd.read_csv("/home/rainer/Downloads/ML_datasets/spaceship_titanic_train.csv")
mnist_train = convert(mnist_train)
mnist_train["Transported"] =mnist_train["Transported"].astype(bool)

X, Y = mnist_train.drop('Transported', axis=1), mnist_train[['Transported']]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1)
dtrain_reg = xgb.DMatrix(X_train, Y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, Y_test, enable_categorical=True)

In [14]:
mnist_train.dtypes

HomePlanet             category
CryoSleep                  bool
Destination            category
Age                     float64
VIP                        bool
RoomService             float64
FoodCourt               float64
ShoppingMall            float64
Spa                     float64
VRDeck                  float64
Transported                bool
group                     int64
number_within_group       int64
deck                   category
num                    category
side                   category
dtype: object

In [15]:
mnist_train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,group,number_within_group,deck,num,side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,1,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,2,1,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,3,1,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,3,2,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,4,1,F,1,S


In [16]:
# Define hyperparameters
params = {
    "objective": "binary:hinge",
    "tree_method": "gpu_hist"
}
n=100
results = xgb.cv(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
    nfold=5,
    verbose_eval=10, # print eval every xth round
    metrics=["logloss", "error"],
)

[0]	train-logloss:18.40655+0.12457	train-error:0.49962+0.00338	test-logloss:18.40644+0.49841	test-error:0.49961+0.01353
[10]	train-logloss:8.07723+0.22515	train-error:0.21924+0.00611	test-logloss:10.62456+0.33069	test-error:0.28839+0.00898
[20]	train-logloss:5.34904+0.16932	train-error:0.14519+0.00460	test-logloss:8.16619+0.32961	test-error:0.22166+0.00895
[30]	train-logloss:4.64121+0.14257	train-error:0.12598+0.00387	test-logloss:7.78194+0.25201	test-error:0.21123+0.00684
[40]	train-logloss:4.04781+0.15570	train-error:0.10987+0.00423	test-logloss:7.59545+0.24109	test-error:0.20617+0.00654
[50]	train-logloss:3.59994+0.15479	train-error:0.09771+0.00420	test-logloss:7.54459+0.23490	test-error:0.20479+0.00638
[60]	train-logloss:3.24250+0.20374	train-error:0.08801+0.00553	test-logloss:7.56719+0.20502	test-error:0.20540+0.00556
[70]	train-logloss:2.87092+0.21233	train-error:0.07793+0.00576	test-logloss:7.60674+0.19701	test-error:0.20647+0.00535
[80]	train-logloss:2.49369+0.19832	train-error

In [17]:
dfull = xgb.DMatrix(X, Y, enable_categorical=True)
model = xgb.train(
    params=params,
    dtrain=dfull,
    num_boost_round=100
)
mnist_test = pd.read_csv("/home/rainer/Downloads/ML_datasets/spaceship_titanic_test.csv")
spaceship_finalPred = convert(mnist_test)
spaceship_finalPred_matrix = xgb.DMatrix(spaceship_finalPred, enable_categorical=True)
predictions = model.predict(spaceship_finalPred_matrix)
predictionsDf = pd.DataFrame(predictions, columns=["Transported"])
predictionsDf["PassengerId"]=mnist_test["PassengerId"]

In [18]:
predictionsDf

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True
