# Define Data

In [None]:
# Notebook-only setup: upgrade the tensorflow_decision_forests package with
# pip, then clear the cell output so the installer log is not kept.
# NOTE: the `!` line is an IPython shell escape and is not valid in a plain
# Python script.
from IPython.display import clear_output
!pip3 install tensorflow_decision_forests --upgrade
clear_output()

import numpy as np
import pandas as pd
import os,random

import tensorflow as tf
import tensorflow_decision_forests as tfdf

# --- Input / output files ---------------------------------------------------
TRAIN_PATH = "../input/spaceship-titanic/train.csv"
TEST_PATH = "../input/spaceship-titanic/test.csv"
SAMPLE_SUBMISSION_PATH = "../input/spaceship-titanic/sample_submission.csv"
SUBMISSION_PATH = "submission.csv"

# --- Column names and evaluation metrics ------------------------------------
ID = "PassengerId"        # row identifier column
TARGET = "Transported"    # label column
METRICS = ["accuracy"]    # passed to model.compile()

# --- Reproducibility ---------------------------------------------------------
SEED = 2022
def seed_everything(seed=SEED):
    """Seed every random number source this notebook touches.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds Python's
    ``random`` module, NumPy's global generator and TensorFlow.

    Args:
        seed: Integer seed applied to all of the above. Defaults to ``SEED``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

# Apply the default seed (SEED) once, before any data is loaded or any model
# is built.
seed_everything()

# Columns removed from both tables right after loading.
DELETE_COL = ["Name", "Cabin"]
# Columns cast to integers during preprocessing.
BOOL_COL = ["CryoSleep", "VIP"]

# Hyper-parameters forwarded to tfdf.keras.RandomForestModel.
MODEL_NUM_TREES = 2000
MODEL_GROWING_STRATEGY = "BEST_FIRST_GLOBAL"
# Misspelled historical name, kept as an alias so existing references to it
# keep working.
MODEL_GROWING_STRATEGT = MODEL_GROWING_STRATEGY
MODEL_MAX_DEPTH = 8
MODEL_SPLIT_AXIS = "SPARSE_OBLIQUE"
MODEL_CATEGORICAL_ALGORITHM = "RANDOM"

# Directory the fitted model is saved to.
# NOTE(review): "decison" is a typo in the path, left unchanged on purpose so
# the on-disk location stays the same.
MODEL_SAVED_PATH = "tensorflow_decison_forest_model"

# Preprocess Data

In [None]:
# Read the competition tables and immediately discard the columns listed in
# DELETE_COL.
train = pd.read_csv(TRAIN_PATH).drop(DELETE_COL, axis=1)
test = pd.read_csv(TEST_PATH).drop(DELETE_COL, axis=1)

# check null
def checkNull_fillData(df):
    """Fill missing values in every column of ``df``, modifying it in place.

    Numeric (non-boolean) columns are filled with the column median; every
    other column is filled with its most frequent value. Columns without
    missing values are left untouched.

    Args:
        df: DataFrame to impute. It is mutated; nothing is returned.
    """
    for col in df.columns:
        column = df[col]
        missing = column.isna()
        if not missing.any():
            continue

        # Treat every numeric width (not only float64/int64) as numeric, but
        # keep booleans on the "most frequent value" path.
        is_number = pd.api.types.is_numeric_dtype(
            column
        ) and not pd.api.types.is_bool_dtype(column)

        if is_number:
            fill_value = column.median()
        else:
            modes = column.mode()
            if modes.empty:
                # Every value is missing: there is nothing to impute from, so
                # leave the column as it is instead of failing on mode()[0].
                continue
            fill_value = modes.iloc[0]

        df.loc[missing, col] = fill_value
                
# Fill missing values in both tables before the integer casts below.
checkNull_fillData(train)
checkNull_fillData(test)

# Cast the columns listed in BOOL_COL (both tables) and the label (train
# only) to integers.
for col in BOOL_COL:
    train[col] = train[col].astype(int)
    test[col] = test[col].astype(int)
train[TARGET] = train[TARGET].astype(int)

# Drop training rows whose feature values repeat an earlier row; the id and
# the label are not part of the comparison.
feature = [col for col in train.columns if col not in (ID, TARGET)]
train = train[~train[feature].duplicated()]

# Build Model

In [None]:
# The passenger identifier is a row key, not a predictor, so it is removed
# from the model inputs of both datasets.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    train.drop(columns=[ID]), label=TARGET
)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test.drop(columns=[ID]))

# Random forest configured from the MODEL_* constants.
model = tfdf.keras.RandomForestModel(
    num_trees=MODEL_NUM_TREES,
    growing_strategy=MODEL_GROWING_STRATEGT,
    max_depth=MODEL_MAX_DEPTH,
    split_axis=MODEL_SPLIT_AXIS,
    categorical_algorithm=MODEL_CATEGORICAL_ALGORITHM,
)
model.compile(metrics=METRICS)
model.fit(train_ds)

# Save the fitted model, then print its summary.
model.save(MODEL_SAVED_PATH)
model.summary()

# Predict Data

In [None]:
# Score the test set and threshold the model outputs at 0.5.
pred_test = model.predict(test_ds)
is_transported = (pred_test > 0.5).astype(bool)

# Put the boolean predictions into the sample-submission template and save it.
sub = pd.read_csv(SAMPLE_SUBMISSION_PATH)
sub[TARGET] = is_transported
sub.to_csv(SUBMISSION_PATH, index=False)
sub.head()