In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import fbprophet as prophet

# Read the raw train and test CSV files; the trailing underscore marks the
# unprocessed frames (the formatted versions are built from them later).
train_ = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv')
test_ = pd.read_csv('../input/tabular-playground-series-mar-2022/test.csv')
# Show the first rows of the raw training frame.
train_.head()

# Approach and Methods

Based on the timestamps, it appears that there is a separate entry for each (x, y, direction) combination at every time step. The directions and coordinates may have some latent relationship, but since each grouping has its own independent observations, it makes sense to treat the dataset as 65 independent time series and model them separately.

As a first pass, we split both the train and test datasets by coordinates and direction, then fit a separate Prophet time-series forecasting model and generate a separate set of predictions for each subset. This should give well-focused results.

# Alternatives and Future Experimentation

If forecasting each time series separately with a basic Prophet model is not sufficient, other methods that try to capture geographic relationships between the coordinates and directions can be employed, for example by adding exogenous regressors to the Prophet model for each series. Hyperparameter tuning for each model is another option.

In [None]:
# Directions kept for modelling. The remaining ones
# ('SB', 'WB', 'NE', 'SW', 'NW', 'SE') are currently switched off.
directions = ['EB', 'NB']
# Integer code for each kept direction: its position in `directions`.
direction_map = {label: code for code, label in enumerate(directions)}


def format_dataset(data: pd.DataFrame):
    """Return a feature-augmented copy of ``data`` indexed by ``row_id``.

    Only rows whose ``direction`` is listed in ``directions`` are kept; the
    input frame itself is not modified.

    Columns added or replaced (preprocessing via bernhardklinger/march-tps-lgbm):
        ds        -- ``time`` parsed as a datetime
        month     -- calendar month of ``ds``
        day       -- day of year of ``ds``
        am        -- True when the hour is strictly between 6 and 12
        wkday     -- weekday number of ``ds``
        time      -- replaced by ``(hour - 12) * 3 + minute / 20``
        xydirday  -- x, y, direction label and day of year concatenated as text
        xydir     -- x, y and direction label concatenated as text
        all       -- ``xydir`` followed by the timestamp rendered as text
        direction -- replaced by its integer code from ``direction_map``
    """
    subset = data.loc[data.direction.isin(directions)].copy()

    stamp = pd.to_datetime(subset['time'])
    hour = stamp.dt.hour
    day_of_year = stamp.dt.dayofyear
    # Text key for the (x, y, direction) series, built while `direction`
    # still holds the original string labels.
    place = subset.x.astype(str) + subset.y.astype(str) + subset.direction

    subset['ds'] = stamp
    subset['month'] = stamp.dt.month
    subset['day'] = day_of_year
    subset['am'] = (hour > 6) & (hour < 12)
    subset['wkday'] = stamp.dt.weekday
    subset['time'] = (hour - 12) * 3 + stamp.dt.minute / 20
    subset['xydirday'] = place + day_of_year.astype(str)
    subset['xydir'] = place
    subset['all'] = place + stamp.astype(str)
    subset['direction'] = [direction_map[label] for label in subset.direction]

    return subset.set_index('row_id')



In [None]:
# Apply the shared preprocessing to the raw train and test frames.
train = format_dataset(train_)
test = format_dataset(test_)

# Summary statistics of the formatted training frame.
train.describe()

In [None]:
# Summary statistics of the formatted test frame, for comparison with train.
test.describe()

In [None]:
def split_datasets(data: pd.DataFrame, x_values=range(1), y_values=range(1),
                   direction_codes=None):
    """Split ``data`` into one frame per (x, y, direction) combination.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with ``x``, ``y`` and integer-coded ``direction`` columns.
    x_values, y_values : iterable of int, optional
        Coordinate values to look for. The defaults cover only ``x == 0`` and
        ``y == 0``; pass e.g. ``range(3)`` and ``range(4)`` to split every
        location on a 3 x 4 grid.
    direction_codes : iterable of int, optional
        Direction codes to look for. Defaults to the values of the
        module-level ``direction_map``.

    Returns
    -------
    dict
        Maps ``"{x}_{y}_{direction}"`` to a copy of the matching rows, in
        direction / x / y iteration order. Combinations with no rows are
        omitted. In each frame the coordinate columns are renamed to ``x_``
        and ``y_`` and, when present, ``congestion`` is renamed to ``y``.
    """
    if direction_codes is None:
        direction_codes = direction_map.values()
    # Materialise so one-shot iterables survive the nested loops.
    x_values = tuple(x_values)
    y_values = tuple(y_values)

    # Rename 'y' out of the way so that 'congestion' can take the name 'y'.
    # Keys absent from the frame (e.g. 'congestion' in test data) are ignored
    # by rename.
    renames = {'y': 'y_', 'x': 'x_', 'congestion': 'y'}

    datasets = {}
    for code in direction_codes:
        by_direction = data[data.direction == code]
        for x_ in x_values:
            by_x = by_direction[by_direction.x == x_]
            for y_ in y_values:
                series = by_x[by_x.y == y_]
                if series.empty:
                    continue
                # rename returns a new frame, leaving `data` untouched.
                datasets[f"{x_}_{y_}_{code}"] = series.rename(columns=renames)
    return datasets

# Build the per-series dictionaries, keyed "{x}_{y}_{direction code}".
train_sets = split_datasets(train)
test_sets = split_datasets(test)

In [None]:
# Sanity check: number of training series, row count of the series at
# x=0, y=0 with direction code 0, and number of test series.
print(len(train_sets))
print(len(train_sets['0_0_0']))
print(len(test_sets))

In [None]:
# Peek at one test series (no 'y' target column is expected unless the
# input frame carried 'congestion').
test_sets['0_0_0'].head()

In [None]:
# Peek at the matching training series; 'congestion' appears here as 'y'.
train_sets['0_0_0'].head()

In [None]:
def add_features(model: 'prophet.Prophet'):
    """Register US holidays and custom seasonalities on an unfitted model.

    Must be called before ``model.fit``. The model is modified in place and
    nothing is returned.

    Prophet measures a seasonality's ``period`` in days. The periods are
    therefore given in days rather than as a number of 20-minute
    observations (504 / 2196 / 3 steps would be read as 504-, 2196- and
    3-day cycles).
    """
    model.add_country_holidays('US')
    # A custom seasonality named 'weekly' takes the place of Prophet's
    # built-in weekly component.
    model.add_seasonality(name='weekly', period=7, fourier_order=3)
    model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
    # One hour expressed in days. With only three observations per hour the
    # second-order Fourier terms coincide with the first-order ones, so a
    # single order carries all the information available.
    model.add_seasonality(name='hourly', period=1 / 24, fourier_order=1)

In [None]:
from fbprophet.diagnostics import cross_validation, performance_metrics

# One fitted model and one table of cross-validation metrics per series,
# both keyed like `train_sets`.
models = {}
diagnostic = {}
for series_key, series_frame in train_sets.items():
    # Fresh model for this series, configured before fitting.
    forecaster = prophet.Prophet()
    add_features(forecaster)
    forecaster.fit(series_frame)

    # Cross-validation: 90 days of initial history, then a 30-day forecast
    # horizon with cutoffs spaced 30 days apart, run in parallel processes.
    folds = cross_validation(
        forecaster,
        initial='90 days',
        period='30 days',
        horizon='30 days',
        parallel='processes',
    )

    # rolling_window=1 is passed so the metrics are aggregated over the
    # whole set of cross-validation forecasts.
    diagnostic[series_key] = performance_metrics(folds, rolling_window=1)
    models[series_key] = forecaster

In [None]:

# Display the cross-validation performance metrics collected for each series.
diagnostic

In [None]:
# One (row_id, congestion) frame per series, concatenated afterwards.
result_set = []
for nm, test_set in test_sets.items():
    model = models[nm]
    # Forecast at the timestamps actually present in the test series.
    # make_future_dataframe defaults to a daily frequency, which would
    # produce one prediction per day after the end of the training data
    # instead of one per (sub-daily) test row.
    # Prophet orders its output by 'ds', so sort first (stably) to keep the
    # row ids aligned with the predictions.
    ordered = test_set.sort_values('ds', kind='mergesort')
    future = ordered[['ds']].reset_index(drop=True)
    predictions = model.predict(future)
    result_set.append(pd.DataFrame({
        'row_id': ordered.index.to_numpy(),
        # Positional values, so alignment does not depend on index labels.
        'congestion': np.round(predictions['yhat'].to_numpy(), 0),
    }))
    


In [None]:
# Stack the per-series prediction frames into a single submission frame.
result = pd.concat(result_set)

In [None]:
# Write the submission file; index=False keeps only the frame's columns.
result.to_csv('submission_prophet.csv', index=False)