In [None]:
import subprocess
import pandas as pd
from pathlib import Path

from math import sin, cos, pi


from sklearn.ensemble import RandomForestRegressor

In [None]:
data_path = Path('/kaggle/input/tabular-playground-series-mar-2022/')

# train.csv and test.csv are read with identical options: `row_id` becomes the
# index and `time` is parsed into datetimes.
train, test = (
    pd.read_csv(data_path / csv_name, index_col='row_id', parse_dates=['time'])
    for csv_name in ('train.csv', 'test.csv')
)

# The sample submission is also indexed by `row_id`; no date parsing is requested.
submission = pd.read_csv(data_path / 'sample_submission.csv', index_col='row_id')

## Cyclical Features

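Since map directions are cyclical, we can try to capture this by breaking the direction into `sin` and `cos` components. (Here, I hand-code the values for the four axis-aligned directions — `NB`, `EB`, `SB`, `WB` — to avoid floating-point noise; the diagonal directions are computed.)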

In [None]:
# One row per direction label, holding its (sin, cos) pair.  The angles step
# by pi/4 starting from 'NB' at 0, so 'EB' sits at pi/2, 'SB' at pi and 'WB'
# at 3*pi/2.  Those four are written as exact literals; only the diagonal
# directions are computed.
_bearing_components = {
    'NB': (0.0, 1.0),
    'NE': (sin(1 * pi/4), cos(1 * pi/4)),
    'EB': (1.0, 0.0),
    'SE': (sin(3 * pi/4), cos(3 * pi/4)),
    'SB': (0.0, -1.0),
    'SW': (sin(5 * pi/4), cos(5 * pi/4)),
    'WB': (-1.0, 0.0),
    'NW': (sin(7 * pi/4), cos(7 * pi/4)),
}

# Split the table into the two lookup dicts (same key order as the table).
sin_vals = {label: pair[0] for label, pair in _bearing_components.items()}
cos_vals = {label: pair[1] for label, pair in _bearing_components.items()}


# Add the cyclical components to both frames.  Each frame gains a `sin`
# column followed by a `cos` column, looked up from its `direction` label.
for frame in (train, test):
    direction_labels = frame['direction']
    frame['sin'] = direction_labels.map(sin_vals)
    frame['cos'] = direction_labels.map(cos_vals)

Let's still keep the direction as an integer-encoded variable, so you can experiment with its feature importance compared to the cyclical features.

In [None]:
# Each direction label is encoded as its position in this list (NB -> 0 ... NW -> 7).
_direction_order = ['NB', 'NE', 'EB', 'SE', 'SB', 'SW', 'WB', 'NW']

encoded_vals = {label: code for code, label in enumerate(_direction_order)}

# Replace the string direction labels with their integer codes, in place.
# The guard makes this safe to re-run: `Series.map` with a dict returns NaN
# for every value that is not a key, so mapping an already-encoded (numeric)
# column through the string-keyed `encoded_vals` would wipe the whole column.
for frame in (train, test):
    if not pd.api.types.is_numeric_dtype(frame['direction']):
        frame['direction'] = frame['direction'].map(encoded_vals)

Let's make some standard time features from the `time` column.

In [None]:
# Datetime components to expose as separate columns, in the order in which
# the columns are appended to each frame.
time_parts = ('year', 'month', 'day', 'hour', 'minute', 'weekday')

for frame in (train, test):
    stamp = frame['time'].dt
    for part in time_parts:
        frame[part] = getattr(stamp, part)

# The `time` column itself is dropped once its components have been extracted.
train = train.drop(columns='time')
test = test.drop(columns='time')

Let's use a simple Random Forest model.

In [None]:
# Fixed seed: without `random_state` the forest's randomness (e.g. bootstrap
# sampling) differs between runs, so the submission file would not be
# reproducible.
RANDOM_STATE = 42

rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=5,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

# Features are all columns of `train` except the target; popping from a copy
# leaves `train` itself unchanged.
X = train.copy()
y = X.pop('congestion')

rf.fit(X, y)

submission_name = 'rf_with_cyclical.csv'
# Select the test columns by name, in training order, so the model always sees
# the features in the layout it was fitted on.
# NOTE(review): predictions are assigned positionally — this assumes the rows
# of `submission` are in the same order as the rows of `test`; confirm.
submission['congestion'] = rf.predict(test[X.columns])
submission.to_csv(submission_name)