# Start

In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Registers tqdm's progress_apply / progress_map methods on pandas objects.
# NOTE(review): none of the cells visible in this notebook call those methods.
tqdm.pandas()

# Check Input

In [None]:
# Read the training CSV from the competition input directory and display it.
# The path is relative to the notebook's working directory.
train_df = pd.read_csv("../input/spaceship-titanic/train.csv")
train_df

In [None]:
# Read the test CSV (the rows that predictions are produced for further below)
# and display it. The path is relative to the notebook's working directory.
test_df = pd.read_csv("../input/spaceship-titanic/test.csv")
test_df

In [None]:
# Summary statistics of passenger age in the training frame.
train_df["Age"].describe()

# Check Categorical Data

In [None]:
# Every distinct HomePlanet value (NaN included) seen in either split.
HomePlanet = set(train_df["HomePlanet"].unique()) | set(test_df["HomePlanet"].unique())
HomePlanet

In [None]:
# Every distinct Destination value (NaN included) seen in either split.
Destination = set(train_df["Destination"].unique()) | set(test_df["Destination"].unique())
Destination

In [None]:
# For Cabin, we can separate by deck/num/side.
# Missing cabins are replaced by the placeholder "-1/-1/-1" before splitting, so
# each part shows up as the string '-1'. This avoids `np.NaN`, which was removed
# in NumPy 2.0, and the identity comparison against NaN.
DNS = train_df.Cabin.fillna("-1/-1/-1").str.split("/", expand=True)
D, N, S = (set(DNS[part]) for part in range(3))
# Only deck and side are shown; the cabin-number set is large.
D, S

# Create a function to do data cleaning

In [None]:
# Integer codes for the categorical columns. Code 0 is never assigned here: it is
# what clean_data uses for any value that is missing or not in the mapping.
hp_map = {'Earth':1, 'Europa':2, 'Mars':3}
ds_map = {'55 Cancri e':1, 'PSO J318.5-22':2, 'TRAPPIST-1e':3}
d_map = {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'T':8}
s_map = {'P':1, 'S':2}

def clean_data(df, train=False):
    """Turn a raw passenger frame into an all-numeric feature frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing PassengerId, the five spending columns, CryoSleep,
        VIP, HomePlanet, Destination, Cabin and Age (plus Transported when
        ``train`` is true).
    train : bool, default False
        When true, a ``target`` column (Transported cast to int) is appended.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` is not modified. Columns, in order: PassengerId,
        RoomService, FoodCourt, ShoppingMall, Spa, VRDeck, CryoSleep, VIP,
        HomePlanet, Destination, D, N, S, Age>0, Age>20, Age>40, Age>60, Age>80
        and, if requested, target.
    """
    # Missing values in these columns become 0 (PassengerId included).
    new_df = df[["PassengerId", "RoomService", "FoodCourt",
                 "ShoppingMall", "Spa", "VRDeck"]].fillna(0)

    # Boolean flags: True -> 1, False -> 0, missing -> -1.
    new_df["CryoSleep"] = df["CryoSleep"].fillna(-1).astype(int)
    new_df["VIP"] = df["VIP"].fillna(-1).astype(int)

    # Categorical codes; anything missing or unmapped becomes 0.
    new_df["HomePlanet"] = df["HomePlanet"].map(hp_map).fillna(0).astype(int)
    new_df["Destination"] = df["Destination"].map(ds_map).fillna(0).astype(int)

    # Cabin is "deck/num/side". Missing cabins get a placeholder so the split
    # always yields three parts. This replaces the former `x is not np.NaN`
    # test, which breaks on NumPy >= 2.0 where `np.NaN` no longer exists.
    cabin = df["Cabin"].fillna("-1/-1/-1").str.split("/", expand=True)
    new_df['D'] = cabin[0].map(d_map).fillna(0).astype(int)
    # Cabin number as a real integer (-1 when missing or unparsable) rather
    # than the strings the previous implementation stored.
    new_df['N'] = pd.to_numeric(cabin[1], errors="coerce").fillna(-1).astype(int)
    new_df['S'] = cabin[2].map(s_map).fillna(0).astype(int)

    # Cumulative age flags. Despite the column name the comparison is `>=`, and
    # a missing age is filled with 100, which sets every flag to 1.
    age = df['Age'].fillna(100)
    for i in range(0, 100, 20):
        new_df[f'Age>{i}'] = (age >= i).astype(int)

    if train:
        new_df['target'] = df['Transported'].astype(int)
    return new_df

In [None]:
# Feature frame for the training split, including the `target` column.
new_train = clean_data(train_df, train=True)
new_train

In [None]:
# Feature frame for the test split; no `target` column is added.
new_test = clean_data(test_df, train=False)
new_test

# Training

In [None]:
import lightgbm
from sklearn.model_selection import train_test_split as tts

# Fitted estimators; the prediction cell averages over everything in this list.
models = []
model = lightgbm.LGBMClassifier(
    n_estimators=20000,
    num_leaves = 50,
    max_depth=8,
    learning_rate=0.003,
    objective='binary',
    feature_fraction=0.75,
    lambda_l1=7.5,
    lambda_l2=2.5,
    #metric='binary_accuracy',
    # Passed through to LightGBM, which documents `early_stopping` as an alias of
    # `early_stopping_round`: stop once the validation metric has not improved
    # for this many rounds.
    early_stopping=1000
)

# Take the feature list from the test frame so train and test share the same
# columns in the same order; PassengerId is an identifier, not a feature.
train_cols = [c for c in new_test.columns if c != 'PassengerId']
X = new_train[train_cols].values
y = new_train['target']
# 80/20 hold-out split with a fixed seed so the split is reproducible.
train_x, valid_x, train_y, valid_y = tts(X, y, test_size=0.2, shuffle=True, random_state=101)
# `fit(verbose=...)` was removed in LightGBM 4.0; the log_evaluation callback
# prints the evaluation results every 100 iterations instead.
lgbm1 = model.fit(train_x, train_y,
                  eval_set=[(train_x, train_y), (valid_x, valid_y)],
                  callbacks=[lightgbm.log_evaluation(period=100)]
                 )
print(f"train score: {lgbm1.score(train_x, train_y)}")
print(f"validation score: {lgbm1.score(valid_x, valid_y)}")
models.append(lgbm1)

# Prediction

In [None]:
# Predict class labels with every fitted model, then average the labels per row
# and round the average back to a 0/1 label.
test_features = new_test[train_cols].values
preds = [fitted.predict(test_features) for fitted in models]
preds = np.round(np.mean(preds, axis=0))

In [None]:
# Build the submission frame: one boolean `Transported` per test passenger,
# true exactly where the rounded prediction equals 1.
sub = pd.DataFrame({
    "PassengerId": new_test["PassengerId"],
    'Transported': preds == 1.,
})
sub.to_csv("submission.csv", index=False)
sub