In [67]:
# building off of trees, the preprocessing can be duplicated
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
import tensorflow_decision_forests as tfdf
import numpy as np
from tqdm import trange

train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
train_df.columns = [x.lower() for x in train_df.columns]
test_df.columns = [x.lower() for x in test_df.columns]
train_df = train_df.drop(columns=["name"])
test_df = test_df.drop(columns=["name"])
train_df = train_df[train_df["transported"].notna()]


def fill_na(df: pd.DataFrame) -> pd.DataFrame:
    df["homeplanet"].fillna("none", inplace=True)
    df["cryosleep"].fillna(False, inplace=True)
    df["cabin"].fillna("none", inplace=True)
    df["destination"].fillna("none", inplace=True)
    df["age"].fillna(df["age"].mean(), inplace=True)
    df["vip"].fillna(False, inplace=True)
    df["roomservice"].fillna(0, inplace=True)
    df["foodcourt"].fillna(0, inplace=True)
    df["shoppingmall"].fillna(0, inplace=True)
    df["spa"].fillna(0, inplace=True)
    df["vrdeck"].fillna(0, inplace=True)
    return df


train_df = fill_na(train_df)
test_df = fill_na(test_df)


def make_vectors(df: pd.DataFrame, test_df: pd.DataFrame):
    labels = df["transported"].to_numpy()
    df.drop(columns="transported")

    # num cols
    num_cols = ["age", "roomservice", "foodcourt", "shoppingmall", "spa", "vrdeck"]
    scaler = StandardScaler()
    vecs = []
    test_vecs = []
    for c in num_cols:
        mean = df[c].mean()
        std = df[c].std()
        x = (df[c] - mean) / std
        vecs.append(x.to_numpy())

        x = (test_df[c] - mean) / std
        test_vecs.append(x)

    # cat cols
    cat_cols = ["homeplanet", "cryosleep",  "destination", "vip"]
    cat_vecs = []
    test_cat_vecs = []
    for c in cat_cols:
        print(c)
        ohe = OneHotEncoder()
        x = ohe.fit_transform(df[c].to_numpy().reshape(-1, 1)).toarray()
        cat_vecs.append(x)
        x = ohe.transform(test_df[c].to_numpy().reshape(-1, 1)).toarray()
        test_cat_vecs.append(x)

    vecs = np.array(vecs).transpose().tolist()
    test_vecs = np.array(test_vecs).transpose().tolist()
    for i in trange(len(vecs)):
        for c in cat_vecs:
            vecs[i].extend(c[i])
    for i in trange(len(test_vecs)):
        for c in test_cat_vecs:
            test_vecs[i].extend(c[i])

    return np.array(vecs), labels, np.array(test_vecs)


train_x, train_y, test_x = make_vectors(train_df, test_df)

print(len(train_x), len(train_y), len(test_x))
assert len(train_x[0]) == len(test_x[0]), (len(train_x[0]), len(test_x[0]))
train_x[:3, :4]


homeplanet
cryosleep
destination
vip


100%|██████████| 8693/8693 [00:00<00:00, 554161.94it/s]
100%|██████████| 4277/4277 [00:00<00:00, 564991.28it/s]

8693 8693 4277





array([[ 0.70939612, -0.3330855 , -0.28101057, -0.28356232],
       [-0.33669797, -0.16806376, -0.27537073, -0.2417568 ],
       [ 2.03444863, -0.26798518,  1.95988491, -0.28356232]])

In [68]:
from sklearn.model_selection import train_test_split

train_x, eval_x, train_y, eval_y = train_test_split(
    train_x, train_y, test_size=0.2, random_state=42
)

train_x[:3, :4]

array([[-0.05773954, -0.3330855 , -0.24654489, -0.28356232],
       [-0.82487521, -0.3330855 ,  0.46783454, -0.23172348],
       [-0.05773954, -0.3330855 , -0.28101057, -0.28356232]])

In [69]:
# train
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()
model.fit(train_x, train_y)
model.score(eval_x,eval_y)

0.7722829212190915

In [72]:
# Evaluate the model.
preds = model.predict(test_x)
d = {"PassengerId": [], "Transported": []}
for p, i in zip(preds, test_df["passengerid"]):
    d["PassengerId"].append(i)
    d["Transported"].append(p)

pd.DataFrame(d).to_csv("data/preds.csv", index=False)
