# Tabular Playground Series - March 2022

A very simple `ExtraTreesRegressor` baseline that performs well.

# Preliminaries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import ExtraTreesRegressor

In [None]:
# Load the train and test CSVs the same way: `row_id` becomes the index and
# `time` is parsed into datetimes.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='row_id', parse_dates=['time'])
    for csv_path in (
        "/kaggle/input/tabular-playground-series-mar-2022/train.csv",
        "/kaggle/input/tabular-playground-series-mar-2022/test.csv",
    )
)

In [None]:
# Collapse (x, y, direction) into a single categorical "roadway" key.
# The parts are joined with "_" so the key stays unambiguous even if a
# coordinate has more than one digit (without a separator, x=1,y=12 and
# x=11,y=2 would both start with "112").
train_df["roadway"] = (
    train_df["x"].astype(str) + "_" + train_df["y"].astype(str) + "_" + train_df["direction"]
)
test_df["roadway"] = (
    test_df["x"].astype(str) + "_" + test_df["y"].astype(str) + "_" + test_df["direction"]
)

# The raw components are fully represented by "roadway", so drop them in place.
train_df.drop(columns=["x", "y", "direction"], inplace=True)
test_df.drop(columns=["x", "y", "direction"], inplace=True)

In [None]:
# One-hot encoder for the "roadway" key, fitted on the training rows only.
# `fit` returns the encoder itself, so construction and fitting can be chained.
enc = OneHotEncoder().fit(train_df[["roadway"]])
enc

In [None]:
def add_features(df, encoder=None):
    """Return a model-ready copy of ``df`` with engineered features.

    The result contains every column of ``df`` except ``time`` and
    ``roadway``, plus:

    * one one-hot column per roadway category, named ``roadway_<category>``;
    * ``minutes``: minutes since midnight, derived from ``time``;
    * ``dayofweek``: ``time.dt.dayofweek``.

    Args:
        df: Frame with a datetime ``time`` column and a ``roadway`` column.
            It is not modified.
        encoder: Fitted one-hot encoder exposing ``transform`` (returning a
            sparse matrix) and either ``get_feature_names_out`` or
            ``categories_``. Defaults to the notebook-level ``enc``.

    Returns:
        A new DataFrame indexed like ``df``.
    """
    if encoder is None:
        # Fall back to the notebook-level encoder fitted on the training data.
        encoder = enc

    onehot = encoder.transform(df[["roadway"]]).toarray()

    # Give the encoded columns string names. Leaving the default integer
    # labels would mix int and str column names in the final frame, which
    # recent scikit-learn versions refuse when fitting/predicting.
    if hasattr(encoder, "get_feature_names_out"):
        onehot_cols = [str(name) for name in encoder.get_feature_names_out()]
    else:
        onehot_cols = [f"roadway_{category}" for category in encoder.categories_[0]]

    roadways_enc_df = pd.DataFrame(onehot, index=df.index, columns=onehot_cols)
    new_df = pd.concat([df, roadways_enc_df], axis=1)

    new_df["minutes"] = df["time"].dt.hour * 60 + df["time"].dt.minute
    new_df["dayofweek"] = df["time"].dt.dayofweek

    # The raw columns are now represented by the derived features.
    return new_df.drop(columns=["time", "roadway"])

In [None]:
# Expand both splits into feature frames (train first, then test).
train_df_2, test_df_2 = add_features(train_df), add_features(test_df)

# Separate the target from the training features: `pop` returns the
# "congestion" column and removes it from the frame in place.
y_train = train_df_2.pop("congestion")

In [None]:
%%time
model = ExtraTreesRegressor(n_estimators=300, n_jobs=-1, random_state=1)
model.fit(train_df_2, y_train)

In [None]:
# Predict congestion for every row of the engineered test frame.
predictions = model.predict(X=test_df_2)

In [None]:
# Start from the sample submission so the output has the expected columns.
# The absolute path matches how the train/test CSVs are loaded, so this cell
# does not depend on the notebook's working directory.
submission = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv")

# Predictions are assigned by position and rounded to the nearest whole number.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# test.csv — confirm.
submission['congestion'] = np.round(predictions)
submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)