# Define Data

In [None]:
# Upgrade TensorFlow Decision Forests inside the notebook environment, then
# clear the cell output so pip's install log is not kept in the notebook.
from IPython.display import clear_output
!pip3 install tensorflow_decision_forests --upgrade
clear_output()

import numpy as np
import pandas as pd
import os,random

import tensorflow as tf
import tensorflow_decision_forests as tfdf

# Input CSVs read by the later cells: training rows, rows to score, and the
# submission template.
TRAIN_PATH = "../input/tabular-playground-series-mar-2022/train.csv"
TEST_PATH = "../input/tabular-playground-series-mar-2022/test.csv"
SAMPLE_SUBMISSION_PATH = "../input/tabular-playground-series-mar-2022/sample_submission.csv"
# Output file written by the prediction cell.
SUBMISSION_PATH = "submission.csv"

# Column names used throughout the notebook.
ID = "row_id"
TARGET = "congestion"
TIME = "time"
# Metric names handed to model.compile().
METRICS = ["mae"]

SEED = 2022


def seed_everything(seed=SEED):
    """Seed every random source this notebook touches.

    Exports ``PYTHONHASHSEED`` and seeds Python's ``random`` module, NumPy's
    global RNG and TensorFlow's global seed with the same value.

    Args:
        seed: Integer seed to apply everywhere; defaults to ``SEED``.
    """
    # NOTE(review): an interpreter reads PYTHONHASHSEED at start-up, so this
    # assignment presumably only influences processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)


seed_everything()

# Hyper-parameters passed to tfdf.keras.RandomForestModel in the model cell.
MODEL_NUM_TREES = 1000
MODEL_GROWING_STRATEGY = "BEST_FIRST_GLOBAL"
# Misspelled original name, kept as an alias because existing code refers to it.
MODEL_GROWING_STRATEGT = MODEL_GROWING_STRATEGY
MODEL_MAX_DEPTH = 8
MODEL_SPLIT_AXIS = "SPARSE_OBLIQUE"
MODEL_CATEGORICAL_ALGORITHM = "RANDOM"

# Directory handed to model.save(). The spelling is deliberately left as-is
# so the on-disk location of the saved model does not change.
MODEL_SAVED_PATH = "tensorflow_decison_forest_model"

# Preprocess Data

In [None]:
# Load both CSVs identically, parsing the TIME column into datetimes so the
# `.dt` accessor can be used on it afterwards.
train, test = (
    pd.read_csv(csv_path, parse_dates=[TIME])
    for csv_path in (TRAIN_PATH, TEST_PATH)
)

def addTimeFeature(df, time_col):
    """Return a copy of ``df`` with calendar features replacing ``time_col``.

    Three columns derived from the datetime column are added -- ``weekday``
    (Monday=0 ... Sunday=6), ``hour`` and ``minute`` -- and the datetime
    column itself is removed. The frame passed in is left unmodified.

    Args:
        df: DataFrame that contains ``time_col``.
        time_col: Name of a datetime-typed column (read through ``.dt``).

    Returns:
        A new DataFrame without ``time_col`` and with the three feature
        columns appended.
    """
    timestamps = df[time_col]
    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain the feature columns as a side effect of this call.
    return df.assign(
        weekday=timestamps.dt.weekday,
        hour=timestamps.dt.hour,
        minute=timestamps.dt.minute,
    ).drop(columns=[time_col])

# Replace the raw timestamp with weekday/hour/minute features in both frames.
train, test = (addTimeFeature(frame, TIME) for frame in (train, test))

def checkNull_fillData(df):
    """Fill missing values in ``df`` in place.

    Every column that contains at least one null is imputed: numeric columns
    of any width receive the column median, all other columns receive the
    literal string ``"Missing"``. Columns without nulls are not touched.

    Args:
        df: DataFrame to impute; it is modified in place.

    Returns:
        None.
    """
    for col in df.columns:
        missing = df[col].isna()
        if not missing.any():
            continue
        # is_numeric_dtype covers float32/int32/etc.; comparing against
        # "float64"/"int64" only would write "Missing" into such columns.
        if pd.api.types.is_numeric_dtype(df[col]):
            fill_value = df[col].median()
        else:
            fill_value = "Missing"
        df.loc[missing, col] = fill_value
                
# Impute each frame in place; every frame uses medians computed from itself.
for _frame in (train, test):
    checkNull_fillData(_frame)

# Build Model

In [None]:
# Convert the pandas frames into TensorFlow datasets for TF-DF.
# The ID column is dropped first: it is a row identifier, and leaving it in
# the frame would hand it to the model as an input feature alongside the
# real predictors. errors="ignore" keeps this safe if a frame has no ID column.
# The same drop is applied to both frames so training and scoring see the
# same feature set; the `train` / `test` frames themselves are not modified.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    train.drop(columns=[ID], errors="ignore"),
    label=TARGET,
    task=tfdf.keras.Task.REGRESSION,
)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    test.drop(columns=[ID], errors="ignore"),
    task=tfdf.keras.Task.REGRESSION,
)

# Random-forest regressor configured entirely from the MODEL_* constants.
model_hparams = dict(
    num_trees=MODEL_NUM_TREES,
    growing_strategy=MODEL_GROWING_STRATEGT,
    max_depth=MODEL_MAX_DEPTH,
    split_axis=MODEL_SPLIT_AXIS,
    categorical_algorithm=MODEL_CATEGORICAL_ALGORITHM,
    task=tfdf.keras.Task.REGRESSION,
)
model = tfdf.keras.RandomForestModel(**model_hparams)

# Register the metrics listed in METRICS, then train on the training dataset.
model.compile(metrics=METRICS)
model.fit(train_ds)

# Persist the fitted model under MODEL_SAVED_PATH and print its summary.
model.save(MODEL_SAVED_PATH)
model.summary()

# Predict Data

In [None]:
# Score the rows in test_ds with the fitted model.
pred_test = model.predict(test_ds)

# Fill the submission template with the predictions.
# NOTE(review): assumes the template rows are in the same order as the test
# rows -- the assignment is positional; confirm against the data files.
sub = pd.read_csv(SAMPLE_SUBMISSION_PATH)
sub[TARGET] = pred_test
# Round to the nearest whole number and store as integers before writing.
sub[TARGET] = sub[TARGET].round(0).astype(int)
sub.to_csv(SUBMISSION_PATH, index=False)
sub.head()