In [None]:
import pandas as pd
import tensorflow_decision_forests as tfdf

# Load the train/test CSVs (paths are relative to the working directory).
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

# Lower-case every column name so later cells can rely on one spelling, and
# drop the "name" column from both frames.
train_df = train_df.rename(columns=str.lower).drop(columns=["name"])
test_df = test_df.rename(columns=str.lower).drop(columns=["name"])

# Keep only rows that carry a label. The explicit .copy() makes train_df an
# independent frame instead of a filtered slice, so later column assignments
# do not trigger pandas' SettingWithCopyWarning.
train_df = train_df[train_df["transported"].notna()].copy()

# DataFrame.info() prints its own report and returns None, so it must not be
# wrapped in print() (that appends a stray "None" line to the output).
train_df.info()
train_df.head()


In [None]:
def fill_na(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with missing feature values replaced by defaults.

    Fill values:
      * ``homeplanet``, ``cabin``, ``destination`` -> the string ``"none"``
      * ``cryosleep``, ``vip`` -> ``False``
      * ``age`` -> the mean of ``df["age"]`` (computed on the frame passed in)
      * ``roomservice``, ``foodcourt``, ``shoppingmall``, ``spa``, ``vrdeck`` -> ``0``

    Args:
        df: Frame expected to hold the columns listed above.

    Returns:
        A new DataFrame with the fills applied; ``df`` itself is not modified.

    Raises:
        KeyError: if ``df`` has no ``age`` column.
    """
    # A single DataFrame-level fillna replaces the former per-column
    # ``df[col].fillna(..., inplace=True)`` calls. Those act on a temporary
    # Series (chained assignment): pandas warns about the pattern, and with
    # copy-on-write enabled it leaves ``df`` unchanged.
    fill_values = {
        "homeplanet": "none",
        "cryosleep": False,
        "cabin": "none",
        "destination": "none",
        "age": df["age"].mean(),
        "vip": False,
        "roomservice": 0,
        "foodcourt": 0,
        "shoppingmall": 0,
        "spa": 0,
        "vrdeck": 0,
    }
    return df.fillna(value=fill_values)


# Impute both splits with the shared helper (train first, then test), then
# show the resulting schema of the training frame.
train_df, test_df = map(fill_na, (train_df, test_df))
train_df.info()


In [None]:
# Feature groups used by the exploratory plots.
# Earlier dtype-based selection, kept for reference:
# num_cols = [k for k, v in train_df.dtypes.items() if v == "float64"]
# cat_cols = [x for x in train_df.columns if x not in num_cols]
num_cols = [
    "age",
    "roomservice",
    "foodcourt",
    "shoppingmall",
    "spa",
    "vrdeck",
]
# "cabin" is left out of this list (an earlier variant included it):
# cat_cols = ["homeplanet", "cryosleep", "cabin", "destination", "vip"]
cat_cols = [
    "homeplanet",
    "cryosleep",
    "destination",
    "vip",
]
print(num_cols, cat_cols)

# Class balance of the label.
train_df["transported"].value_counts()


In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Pairwise plot grid of the numeric features, coloured by the label; only the
# lower triangle is drawn (corner=True) with KDE curves on the diagonal.
t = ["transported", *num_cols]

sns.pairplot(
    data=train_df[t],
    hue="transported",
    diag_kind="kde",
    corner=True,
)


In [None]:
from collections import Counter


# For every categorical feature, draw how many passengers fall into each
# category, split by the "transported" label.
#
# sns.countplot performs the tallying itself, which replaces the hand-built
# Counter dictionaries and the intermediate long-form frame. `data` is passed
# by keyword so the call does not depend on seaborn's positional-argument
# order.
for c in cat_cols:
    fig, ax = plt.subplots()
    sns.countplot(data=train_df, x=c, hue="transported", ax=ax)
    ax.set_title(f"{c} by transported")
    plt.show()


In [None]:
# categorical vars
from sklearn.preprocessing import OrdinalEncoder
import numpy as np


def encode(df: pd.DataFrame) -> pd.DataFrame:
    """Encode the boolean flags numerically and split ``cabin`` into its parts.

    * ``cryosleep`` and ``vip``: ``False`` -> ``0.0`` and ``True`` -> ``1.0``;
      any other value (including a missing one) becomes ``NaN``.
    * ``cabin``: split on ``"/"`` into ``cabin0``, ``cabin1`` and ``cabin2``.
      A part that is not present (the ``"none"`` placeholder, a missing cabin,
      or a value with fewer than three parts) is set to ``"none"``.

    Args:
        df: Frame with ``cryosleep``, ``vip`` and ``cabin`` columns.

    Returns:
        A new frame with both flags encoded, the three ``cabin*`` columns
        added and ``cabin`` removed. ``df`` itself is not modified.
    """
    out = df.copy()

    # A fixed mapping replaces the former OrdinalEncoder call. That call
    # reshaped each column to (1, n), i.e. one sample with n features, so every
    # feature had a single category and every value was encoded as 0. A fixed
    # mapping also guarantees identical codes for the train and test frames.
    flag_codes = {False: 0.0, True: 1.0}
    for flag in ("cryosleep", "vip"):
        out[flag] = out[flag].map(flag_codes)

    # Split the cabin string into its "/"-separated parts, one column per part.
    parts = out["cabin"].fillna("none").str.split("/")
    for pos in range(3):
        out[f"cabin{pos}"] = [p[pos] if len(p) > pos else "none" for p in parts]

    return out.drop(columns="cabin")


# Apply the same encoding to both splits (train first, then test). This cell
# cannot simply be re-run: encode() reads the "cabin" column, which the frames
# it returns no longer have. Finish by inspecting the test-set dtypes.
train_df, test_df = map(encode, (train_df, test_df))
test_df.dtypes


In [None]:
# Hold out 20% of the labelled rows for evaluation, then convert every split
# into a TensorFlow dataset.
SPLIT_SEED = 42  # fixed so the train/eval split is the same on every run

tdf = train_df.sample(n=int(train_df.shape[0] * 0.8), random_state=SPLIT_SEED)
# Everything that was not sampled into the training part is the eval part.
edf = train_df.drop(index=tdf.index)
assert tdf.shape[0] + edf.shape[0] == train_df.shape[0]

# "passengerid" is dropped before conversion so it is not fed to the models.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(tdf.drop(columns="passengerid"), label="transported")
eval_ds = tfdf.keras.pd_dataframe_to_tf_dataset(edf.drop(columns="passengerid"), label="transported")

# The test split is converted without a label argument.
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df.drop(columns="passengerid"))


In [None]:
# Peek at the first rows of the held-out evaluation split.
edf.head()


In [None]:
from sklearn.metrics import accuracy_score

# Fit each candidate learner on the training split, score it on the held-out
# split, and keep the most accurate one in `model` for the prediction cell.
# (To try every available learner instead, iterate over
# tfdf.keras.get_all_models() and skip the "Distributed" variants.)
models = {}  # learner class name -> fitted model
scores = {}  # learner class name -> hold-out accuracy
# Keying by name rather than by accuracy keeps both models when scores tie.

y_true = edf["transported"].astype(bool).to_numpy()

for x in [tfdf.keras.RandomForestModel, tfdf.keras.GradientBoostedTreesModel]:
    print(x.__name__)

    model: tfdf.keras.CoreModel = x(
        hyperparameter_template="benchmark_rank1",
        verbose=0,
    )
    model.fit(train_ds, verbose=0)

    # The first prediction column is thresholded at 0.5 to get a boolean
    # label. NOTE(review): assumes eval_ds yields rows in the same order as
    # edf - confirm.
    preds = model.predict(eval_ds, verbose=0)
    acc = accuracy_score(y_true, preds[:, 0] > 0.5)
    print(acc)

    models[x.__name__] = model
    scores[x.__name__] = acc

# pick the winner
best_name = max(scores, key=scores.get)
max_score = scores[best_name]
model = models[best_name]
model.name


In [None]:
# # (disabled experiment) retrain on the full labelled training set, without a hold-out split
# train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
#     train_df.drop(columns="passengerid"), label="transported"
# )

# model = tfdf.keras.RandomForestModel(
#     hyperparameter_template="benchmark_rank1", verbose=0, max_depth=1024
# )
# model.fit(train_ds, verbose=0)
# print(model.evaluate(train_ds))


In [None]:
# Predict on the test set with the selected model and write the result file.
# A passenger is marked as transported when the first prediction column
# exceeds 0.5.
preds = model.predict(test_ds)

submission = pd.DataFrame(
    {
        "PassengerId": test_df["passengerid"].tolist(),
        "Transported": (preds[:, 0] > 0.5).tolist(),
    }
)
submission.to_csv("data/preds.csv", index=False)
