In [None]:
! pip install cuml

In [None]:
import os
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Folder that holds the input CSV files.
DIR = '../input/spaceship-titanic'

# Read, in order: the training set, the test set, and the sample submission
# (the latter is filled with predictions at the end of the notebook).
train_df, test_df, sub = (
    pd.read_csv(os.path.join(DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# (rows, columns) of the training frame, then of the test frame.
tuple(frame.shape for frame in (train_df, test_df))

In [None]:
# First five rows of the raw training data.
train_df.head(n=5)

In [None]:
# First five rows of the raw test data.
test_df.head(n=5)

In [None]:
# True when no PassengerId value occurs more than once in the training set.
not train_df["PassengerId"].duplicated().any()

In [None]:
# Summary statistics of the Cabin column.
cabin_column = train_df["Cabin"]
cabin_column.describe()

In [None]:
# Number of rows whose Name repeats an earlier row's Name; a repeated name may
# indicate a passenger who travelled more than once and could be an extra
# feature.
# Missing names are dropped first: duplicated() treats NaN values as equal to
# each other, so counting on the raw column also counts every missing name
# after the first one as a "repeat" and overstates the figure.
train_df["Name"].dropna().duplicated().sum()

In [None]:
def transform(df):
    """Add name- and cabin-derived feature columns to ``df`` in place.

    Columns added:
        UniqTrans: ``True`` when the row's ``Name`` is not shared with any
            other row of ``df``. Rows with a missing ``Name`` are also
            ``True``: an unknown name is no evidence of a repeated one.
        CabinTrans_0, CabinTrans_1, CabinTrans_2: the first three
            '/'-separated parts of ``Cabin``; parts that are absent (or a
            missing ``Cabin``) are left as missing values.

    Args:
        df: DataFrame with ``Name`` and ``Cabin`` columns; it is modified.

    Returns:
        The same ``df`` object, so the call can be chained. Callers that
        ignore the return value keep working as before.
    """
    names = df["Name"]
    # keep=False marks every member of a repeated group, not only the later
    # occurrences. duplicated() also treats NaN values as equal to each other,
    # hence the explicit notna() guard.
    shared_name = names.notna() & names.duplicated(keep=False)
    df["UniqTrans"] = ~shared_name

    # n=2 caps the split at three parts and reindex() pads to exactly three
    # columns, so the assignment below cannot fail on cabin strings that have
    # fewer or more separators than expected.
    cabin_parts = (
        df["Cabin"]
        .str.split(pat='/', n=2, expand=True)
        .reindex(columns=range(3))
    )
    df[['CabinTrans_0', 'CabinTrans_1', 'CabinTrans_2']] = cabin_parts
    return df

In [None]:
# Add the engineered columns (UniqTrans, CabinTrans_*) to both frames in place.
# NOTE: this cell reassigns train_df / test_df without the raw columns that
# transform() reads, so it can only be run once per kernel session.
transform(train_df)
transform(test_df)

# Raw columns that are dropped once their derived features exist.
RAW_COLUMNS = ["PassengerId", "Name", "Cabin"]
train_df = train_df.drop(columns=RAW_COLUMNS)
test_df = test_df.drop(columns=RAW_COLUMNS)

# Separate the target from the predictors.
y_train = train_df["Transported"]
X_train = train_df.drop(columns=["Transported"])
X_test = test_df

# The encoder is fitted on the training frame only, so both frames must expose
# the same predictors in the same order before they are encoded.
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

# NOTE(review): CabinTrans_1 is the middle part of Cabin; if it is a cabin
# number, one-hot encoding creates one column per distinct value — consider
# treating it as numeric instead.
cat_attribs = ["HomePlanet", "CryoSleep", "Destination", "VIP", "CabinTrans_0", "CabinTrans_1", "CabinTrans_2"]
num_attribs = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

# One-hot encode cat_attribs, standardise num_attribs, and pass every other
# column through unchanged. handle_unknown='ignore' keeps transform() from
# raising on categories that were not seen while fitting.
full_pipeline = ColumnTransformer(
    [
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs),
        ('std_scaler', StandardScaler(), num_attribs),
    ],
    remainder='passthrough',
)
encoder = full_pipeline.fit(X_train)
X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

# Both encoded matrices must have the same width for the model to score X_test.
assert X_train.shape[1] == X_test.shape[1], "encoded train/test widths differ"

In [None]:
# First five rows of the training frame as it stands after the previous cell.
train_df.head(n=5)

In [None]:
# First five rows of the test frame as it stands after the previous cell.
test_df.head(n=5)

In [None]:
# Settings for the gradient-boosted classifier, collected in one place.
# NOTE(review): "gpu_hist" presumably requires a GPU-enabled XGBoost build, and
# newer XGBoost releases may prefer tree_method="hist" with device="cuda" —
# confirm against the installed version.
# NOTE(review): enable_categorical likely has no effect here because the model
# is fitted on the already-encoded matrix — verify.
xgb_params = {
    "tree_method": "gpu_hist",
    "objective": 'binary:logistic',
    "enable_categorical": True,
}
xgb = XGBClassifier(**xgb_params)

In [None]:
# Train the classifier on the encoded training matrix and its labels.
xgb.fit(X=X_train, y=y_train)

In [None]:
# Predict a label for every row of the encoded test matrix.
y_pred = xgb.predict(X_test)

In [None]:
# Show a bounded preview of the predictions as the cell's output instead of
# printing the whole array.
y_pred[:10]

In [None]:
# First five rows of the sample submission template.
sub.head(n=5)

In [None]:
# Predictions are written positionally into the template, so there must be
# exactly one prediction per submission row.
assert len(y_pred) == len(sub), "prediction count does not match the submission template"

# Depending on the XGBoost version, predict() may return 0/1 integers rather
# than the boolean labels the model was trained on. Cast so the column is
# written as True/False.
# NOTE(review): presumably sample_submission.csv uses True/False as well —
# verify against the file.
sub["Transported"] = np.asarray(y_pred).astype(bool)
sub.to_csv("sub.csv", index=False)