In [None]:
import numpy as np
import pandas as pd

In [None]:
train_df = pd.read_csv("../input/spaceship-titanic/train.csv")
test_df = pd.read_csv('../input/spaceship-titanic/test.csv')

In [None]:
train_df.head()

In [None]:
for col in train_df:
    print(f'{col:15}: {train_df[col].unique()}')

In [None]:
train_df.isnull().mean()

Observations from the unique values and the null ratios:
1. Missing values in HomePlanet, CryoSleep, Cabin, Destination, VIP and Name will be filled with a placeholder (maybe "M" for missing?).
2. Also experiment with filling them with the most frequent value, to see whether that improves the accuracy of the model.
3. RoomService, FoodCourt, ShoppingMall, Spa and VRDeck could be summed up as TotalSpent.
4. Try treating Age and TotalSpent as continuous variables first, and then try binning the values after looking at their variation.
5. Name should be divided into first/last name; the last name could be useful because it can indicate family information.
6. PassengerId should be split into two parts, because the first part gives us group information.
7. Cabin should be separated into 3 parts.

In [None]:
test_df.isnull().mean()

In [None]:
# deal with train_df, test_df at the same time (both frames are modified in place)
combined = train_df, test_df

# Second, divide cabin and name and passengerid, sum up total spent money
for dataset in combined:
    # `n=` caps the number of pieces, so a value with an extra separator
    # (e.g. a three-word name) cannot produce more columns than are assigned;
    # anything after the last allowed split stays in the final column.
    dataset[['Cabin_1', 'Cabin_2', 'Cabin_3']] = dataset['Cabin'].str.split('/', n=2, expand=True)
    dataset[['FirstName', 'LastName']] = dataset['Name'].str.split(' ', n=1, expand=True)
    dataset[['GroupId', 'PersonId']] = dataset['PassengerId'].str.split('_', n=1, expand=True)

    # Plain `+` propagates NaN: a passenger with any missing spend column gets
    # a missing TotalSpent as well.
    dataset['TotalSpent'] = (
        dataset['RoomService']
        + dataset['FoodCourt']
        + dataset['ShoppingMall']
        + dataset['Spa']
        + dataset['VRDeck']
    )


In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
# Columns that are not part of `cols` below (raw identifiers and their leftovers).
maybe_not_necessary_var = [
    'PassengerId',
    'Name',
    'Cabin',
    'PersonId',
    "FirstName",
]

# Columns that get integer-encoded as categories.
cat_var = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
    'Cabin_1', 'Cabin_2', 'Cabin_3',
    'GroupId', 'LastName',
]

# Numeric spend columns, including the derived total.
cont_var = [
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'TotalSpent',
]

# NOTE(review): 'Age' appears in none of these lists, so it is not part of
# `cols` -- confirm that leaving it out is intended.
cols = [*cat_var, *cont_var]

In [None]:
# Target column on its own, the remaining training columns as features,
# and an independent copy of the test frame.
TARGET_COL = 'Transported'
y = train_df[TARGET_COL]
X = train_df.drop(columns=TARGET_COL)
Xt = test_df.copy()

In [None]:
def _as_str_unless_null(value):
    """Return ``str(value)`` for non-null values; pass null values through unchanged."""
    return str(value) if pd.notnull(value) else value


# Turn the non-null CryoSleep / VIP entries of both frames into their string
# form; missing entries stay missing.
for frame in (X, Xt):
    for flag_col in ('CryoSleep', 'VIP'):
        frame[flag_col] = frame[flag_col].apply(_as_str_unless_null)

In [None]:
X.isnull().sum()

In [None]:
def Categorize(x, xt, cat_var, cols):
    """Integer-encode the categorical columns of a train/test pair.

    Codes are learned from the training frame only: within each column the
    distinct values are numbered 1, 2, ... in order of first appearance.
    Missing values are first replaced by the placeholder ``0`` and are then
    numbered like any other value. A test value that never occurs in the
    training column is treated like the placeholder: it receives the
    placeholder's code when the training column had missing values, and the
    code ``0`` otherwise.

    Every value is encoded with a single dictionary lookup, so a raw value
    that happens to equal an already assigned code (e.g. integer categories)
    is never re-mapped a second time.

    Args:
        x: training DataFrame.
        xt: test DataFrame.
        cat_var: names of the columns to encode.
        cols: names of the columns to keep in the returned frames.

    Returns:
        ``(x_encoded, xt_encoded)``: copies restricted to ``cols`` whose
        ``cat_var`` columns hold integer codes. The inputs are not modified.

    Note:
        A genuine raw value equal to ``0`` is indistinguishable from the
        missing-value placeholder.
    """
    x = x[cols].copy()
    xt = xt[cols].copy()

    for col in cat_var:
        # Work on object dtype so the integer placeholder can sit next to
        # string labels.
        train_vals = x[col].astype(object).where(x[col].notnull(), 0)
        test_vals = xt[col].astype(object).where(xt[col].notnull(), 0)

        codes = {key: code for code, key in enumerate(train_vals.unique(), 1)}
        # Unseen test values fall back to the placeholder's code (0 if the
        # training column had no missing values).
        unseen_code = codes.get(0, 0)

        x[col] = [codes[value] for value in train_vals]
        xt[col] = [codes.get(value, unseen_code) for value in test_vals]

    return x, xt
        

In [None]:
Xc,Xtc = Categorize(X, Xt, cat_var, cols)

In [None]:
Xc

In [None]:
def FillContVar(x, xt, cont_var, group_var, cols):
    """Fill missing continuous values with per-group medians.

    Each frame is processed independently, using its own medians. For every
    column in ``cont_var`` a missing entry is replaced by the median of that
    column within the row's ``group_var`` group. If the group has no observed
    value at all, the median of the whole column is used instead. Observed
    values are never altered.

    ``transform('median')`` returns a result aligned with the frame's own
    index, so the fill does not depend on how ``groupby(...).apply`` labels
    its output.

    Args:
        x: training DataFrame.
        xt: test DataFrame.
        cont_var: names of the continuous columns to fill.
        group_var: column name(s) defining the groups.
        cols: names of the columns to keep in the returned frames.

    Returns:
        ``(x_filled, xt_filled)``: copies restricted to ``cols``. The inputs
        are not modified.
    """
    x = x[cols].copy()
    xt = xt[cols].copy()

    for dataset in (x, xt):
        for cv in cont_var:
            group_median = dataset.groupby(group_var)[cv].transform('median')
            column_median = dataset[cv].median()
            dataset[cv] = dataset[cv].fillna(group_median).fillna(column_median)
    return x, xt

In [None]:
Xcc,Xtcc = FillContVar(Xc, Xtc, cont_var, ['HomePlanet', 'CryoSleep'], cols)

In [None]:
Xcc.head()

In [None]:
Xtcc.head()

In [None]:
Xcc.isnull().sum()

In [None]:
Xtcc.isnull().sum()

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline forest; `oob_score=True` makes the out-of-bag estimate available after fitting.
rfclf = RandomForestClassifier(
    n_estimators=1222,
    min_samples_leaf=4,
    oob_score=True,
    n_jobs=-1,
    random_state=2,
)

In [None]:
rfclf.fit(Xcc, y)

In [None]:
rfclf.oob_score_

In [None]:
pred_rf = rfclf.predict(Xtcc)

In [None]:
submit_rf = pd.DataFrame({'PassengerId': Xt['PassengerId'], 'Transported': pred_rf})

A random forest model will be the baseline. The score when submitted was 0.79705

In [None]:
submit_rf.to_csv("submission_rf.csv",index=False)

Try using neural networks (fastai library)

In [None]:
!pip install fastai -Uqq

In [None]:
from fastai.tabular.all import *
from torch.utils.data import Dataset, Subset

Making a dataset for our data

In [None]:
class STitanicDataset(Dataset):
    """Map-style dataset yielding ``(categorical, continuous[, label])`` rows.

    Args:
        df: DataFrame holding the categorical and continuous columns; the
            categorical columns must be castable to ``int``.
        y: optional label Series positionally aligned with ``df``. When given,
            every item also carries ``int(label)``.
        shuffle: if true, rows are permuted once at construction time. The
            same permutation is applied to ``y`` so labels stay attached to
            their rows.
        cat_cols: categorical column names; defaults to the notebook-level
            ``cat_var`` list.
        cont_cols: continuous column names; defaults to the notebook-level
            ``cont_var`` list.
    """

    def __init__(self, df, y=None, shuffle=False, cat_cols=None, cont_cols=None):
        cat_cols = cat_var if cat_cols is None else cat_cols
        cont_cols = cont_var if cont_cols is None else cont_cols

        self.is_train = y is not None
        self.y = y
        self.df = df.copy()
        if shuffle:
            # One positional permutation, used for both features and labels.
            order = np.random.permutation(len(self.df))
            self.df = self.df.iloc[order]
            if self.is_train:
                self.y = y.iloc[order]

        self.cat = self.df[cat_cols].astype(int)
        self.cont = self.df[cont_cols]
        # Cache dense arrays once so item access does not go through a
        # per-row DataFrame lookup.
        self._cat_arr = self.cat.to_numpy()
        self._cont_arr = self.cont.to_numpy()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        cat_row = self._cat_arr[idx]
        cont_row = self._cont_arr[idx]
        if self.is_train:
            return cat_row, cont_row, int(self.y.iloc[idx])
        return cat_row, cont_row

In [None]:
train_ds = STitanicDataset(Xcc, y=y)
test_ds = STitanicDataset(Xtcc)

Splitting the data into training and validation sets

In [None]:
# NOTE(review): `cut` is not referenced by any of the cells shown here; the
# splitter below is built without it -- confirm whether it was meant to set
# the validation fraction.
cut = 0.8
# Fixed seed so the split is the same on every run.
rs = RandomSplitter(seed=1)
# Two index collections over `train_ds`; they are used to build the training
# and validation subsets.
train_n, valid_n = rs(train_ds)

In [None]:
train_ds_sub = Subset(train_ds, train_n)
valid_ds_sub = Subset(train_ds, valid_n)

In [None]:
# Reshuffle the training samples at every epoch; without `shuffle=True` the
# loader replays the same batch composition each epoch. The validation loader
# keeps a fixed order.
dlt = torch.utils.data.DataLoader(train_ds_sub, batch_size=64, shuffle=True)
dlv = torch.utils.data.DataLoader(valid_ds_sub, batch_size=64)

In [None]:
dls = DataLoaders(dlt, dlv)

The model used for tabular data

In [None]:
class TabModel(Module):
    """Feed-forward network over embedded categorical and raw continuous features.

    Every categorical column has its own ``nn.Embedding``. The embedding
    outputs are concatenated, passed through the embedding dropout, joined
    with the continuous columns and fed through a stack of
    BatchNorm -> Dropout -> Linear -> Mish stages, followed by one final
    linear layer.

    Args:
        emb_sz: sequence of ``(num_embeddings, embedding_dim)`` pairs, one per
            categorical column.
        cat_n: number of categorical columns read from ``cat`` in ``forward``.
        cont_n: number of continuous columns read from ``cont`` in ``forward``.
        layers: layer widths; all entries but the last are hidden sizes and the
            last one is the output size, so at least two entries are required.
    """

    def __init__(self, emb_sz, cat_n, cont_n, layers):
        super(TabModel, self).__init__()
        self.cat_n = cat_n
        self.cont_n = cont_n
        self.embeddings = nn.ModuleList(nn.Embedding(ni, nf) for ni, nf in emb_sz)
        self.emb_drop = nn.Dropout(0.2)
        self.linear_drop = nn.Dropout(0.5)

        in_features = sum(nf for _, nf in emb_sz) + cont_n
        # Consecutive (input width, output width) pairs of the hidden stack.
        widths = [in_features, *layers[:-1]]
        stages = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            stages += [
                nn.BatchNorm1d(n_in),
                self.linear_drop,  # one Dropout module shared by all stages
                nn.Linear(n_in, n_out),
                nn.Mish(),
            ]
        self.linear = nn.Sequential(*stages)
        self.final_layer = nn.Linear(layers[-2], layers[-1])

    def forward(self, cat, cont):
        """Return the final-layer outputs for a batch.

        ``cat`` and ``cont`` are indexed as 2-D ``[row, column]`` tensors.
        """
        parts = []
        if self.cat_n:
            embedded = torch.cat(
                [self.embeddings[i](cat[:, i].int()) for i in range(self.cat_n)],
                dim=1,
            )
            # Embedding dropout was constructed in __init__ but never applied;
            # it only acts in training mode.
            parts.append(self.emb_drop(embedded))
        parts.extend(cont[:, i].unsqueeze(1) for i in range(self.cont_n))

        inputs = torch.cat(parts, dim=1).float()
        hidden = self.linear(inputs)
        return self.final_layer(hidden)

Getting the embedding size for each categorical variable

In [None]:
# One [num_embeddings, embedding_dim] pair per categorical column, in `cat_var`
# order: the row count is the number of distinct values plus one, and the
# width is derived from that row count.
category_counts = [len(Xcc[name].unique()) + 1 for name in cat_var]
embsz = [[n_rows, int(1.6 * n_rows ** 0.6)] for n_rows in category_counts]

print(embsz)

In [None]:
learn = Learner(dls, TabModel(embsz, len(cat_var), len(cont_var), [50, 25, 2]), loss_func=CrossEntropyLossFlat(), metrics=accuracy)

In [None]:
learn.lr_find()

In [None]:
learn.fit_one_cycle(4, 1e-3)

In [None]:
dl_test = torch.utils.data.DataLoader(test_ds, batch_size=64)

code for predicting 

In [None]:
def predictdata(model, dl_test):
    """Predict a boolean class for every row served by ``dl_test``.

    The model is switched to evaluation mode and left in it. Inference runs
    without gradient tracking, each batch is moved to the device of the
    model's parameters, and the per-batch predictions are collected on the CPU
    and concatenated once at the end.

    Args:
        model: module called as ``model(cat, cont)`` that returns one score per
            class along dimension 1.
        dl_test: iterable of ``(cat, cont)`` tensor batches.

    Returns:
        1-D CPU bool tensor; an element is ``True`` when the arg-max class
        index is non-zero. Empty when ``dl_test`` yields no batches.
    """
    model.eval()

    # Parameter-free models are simply run on the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    batch_preds = []
    with torch.no_grad():
        for cat, cont in dl_test:
            scores = model(cat.to(device), cont.to(device))
            batch_preds.append(scores.argmax(1).cpu())

    if not batch_preds:
        return torch.empty(0, dtype=torch.bool)
    return torch.cat(batch_preds).bool()

In [None]:
pred_nn = predictdata(learn.model, dl_test)

In [None]:
# Hand pandas a plain NumPy array rather than a torch tensor: `.cpu()` makes
# this work wherever the predictions live, and the column gets a regular bool
# dtype instead of depending on implicit tensor-to-array conversion.
submit_nn = pd.DataFrame({
    'PassengerId': Xt['PassengerId'],
    'Transported': pred_nn.cpu().numpy(),
})

In [None]:
submit_nn.to_csv("submission_rnn.csv",index=False)