Problem statement

For the March edition of the 2022 Tabular Playground Series you're challenged to forecast twelve hours of traffic flow in a U.S. metropolis. The time series in this dataset are labelled with both location coordinates and a direction of travel -- a combination of features that will test your skill at spatio-temporal forecasting within a highly dynamic traffic network.

Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

List available input files

In [None]:
import os

# Walk the Kaggle input directory tree and print the full path of every file.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

Read files

In [None]:
# All three competition files are read from the same Kaggle input folder.
input_dir = "/kaggle/input/tabular-playground-series-mar-2022"

train = pd.read_csv(f"{input_dir}/train.csv")
test = pd.read_csv(f"{input_dir}/test.csv")
submission = pd.read_csv(f"{input_dir}/sample_submission.csv")

In [None]:
# Display the training frame as the cell output.
train

In [None]:
# Display the test frame as the cell output.
test

In [None]:
# Display the sample submission frame as the cell output.
submission

Analyse target

In [None]:
# Distribution plot of the target column `congestion`.
sns.displot(train['congestion'])

In [None]:
# Violin plot of the target column `congestion`.
# NOTE(review): the series is passed positionally; which parameter it binds to
# depends on the installed seaborn version -- confirm the plot orientation.
sns.violinplot(train['congestion'])

In [None]:
# Number of missing values in each column of the training frame.
train.isnull().sum()

Drop target

In [None]:
# Remove the `congestion` column from `train` and keep it as the target series.
# `pop` returns the column and deletes it from the frame in a single step.
target = train.pop("congestion")

train

Combine train and test

In [None]:
# Stack the train rows on top of the test rows so both receive identical
# feature engineering. `DataFrame.append` was deprecated in pandas 1.4 and
# removed in pandas 2.0; `pd.concat` is the supported equivalent. The original
# row labels of each frame are kept, exactly as `append` did.
combi = pd.concat([train, test])
combi

Timestamp data

In [None]:
# combi['time'] = pd.to_datetime(combi['time'], format="%Y-%m-%d %H:%M:%S")
# combi

Month

In [None]:
# combi['month'] = pd.DatetimeIndex(combi['time']).month
# combi["weekday"] = combi['time'].dt.dayofweek
# combi

Day of Week

Hour

In [None]:
# combi['hour'] = pd.to_datetime(combi['time'], format='%Y-%m-%d %H:%M:%S').dt.hour
# combi

Number of week

In [None]:
# combi["week"] = combi['time'].dt.week
# combi

In [None]:
# sns.displot(combi['week'])

In [None]:
# combi['hourminute'] = combi['time'].dt.hour *60 + combi['time'].dt.minute

Analyse direction

In [None]:
# Distribution plot of the `direction` column over the combined train + test rows.
sns.displot(combi['direction'])

Replace direction with numbers

In [None]:
# Encode each travel direction as an integer code. Values that are not keys of
# the mapping are left unchanged by `replace`.
direction_codes = {'EB': 1, 'NB': 2, 'SB': 3, 'WB': 4, 'NE': 5, 'SW': 6, 'NW': 7, 'SE': 8}

# Assign the result back to the column. Calling `replace(..., inplace=True)` on
# `combi['direction']` is chained assignment: pandas warns about it and, with
# Copy-on-Write enabled, it modifies a temporary and leaves `combi` unchanged.
combi['direction'] = combi['direction'].replace(direction_codes)
combi

Features

In [None]:
def feature_engineering(data):
    """Derive calendar features and location/direction/hour interaction keys.

    The work is done on a copy, so the frame passed in by the caller is left
    untouched (previously the new columns were added to the caller's frame and
    its ``time`` column was overwritten before a different frame was returned).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``time`` (anything ``pd.to_datetime``
        accepts), ``x``, ``y`` and ``direction``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns plus the engineered ones, with
        ``time`` dropped.

    Notes
    -----
    The interaction keys are plain string concatenations without a separator.
    NOTE(review): with multi-character ``x``/``y``/``direction`` values two
    different combinations could yield the same key -- confirm the value
    ranges before reusing this on other data.
    """
    data = data.copy()
    data['time'] = pd.to_datetime(data['time'])
    stamp = data['time'].dt

    # Calendar components.
    data['month'] = stamp.month
    data['weekday'] = stamp.weekday
    data['hour'] = stamp.hour
    data['minute'] = stamp.minute
    data['is_month_start'] = stamp.is_month_start.astype('int')
    data['is_month_end'] = stamp.is_month_end.astype('int')
    # Despite its name this column is the number of minutes since midnight.
    data['hour+minute'] = stamp.hour * 60 + stamp.minute
    # dayofweek counts Monday as 0, so values above 4 are Saturday and Sunday.
    data['is_weekend'] = (stamp.dayofweek > 4).astype('int')
    # Strictly greater than 12: the 12:00-12:59 hour is not flagged.
    data['is_afternoon'] = (stamp.hour > 12).astype('int')

    # String forms of the key parts, converted once and reused below.
    x_s = data['x'].astype('str')
    y_s = data['y'].astype('str')
    dir_s = data['direction'].astype('str')
    hour_s = data['hour'].astype('str')

    # Interaction keys; assignment order fixes the column order of the result.
    data['x+y'] = x_s + y_s
    data['x+y+direction'] = x_s + y_s + dir_s
    data['hour+direction'] = hour_s + dir_s
    data['hour+x+y'] = hour_s + x_s + y_s
    data['hour+direction+x'] = hour_s + dir_s + x_s
    data['hour+direction+y'] = hour_s + dir_s + y_s
    data['hour+direction+x+y'] = hour_s + dir_s + x_s + y_s
    data['hour+x'] = hour_s + x_s
    data['hour+y'] = hour_s + y_s

    return data.drop(['time'], axis=1)

In [None]:
 combi = feature_engineering(combi)

In [None]:
# combi.columns not in ['row_id', 'time', 'direction']
# Every column of `combi`, in frame order, except the identifier, the raw
# timestamp and the raw direction code.
excluded_columns = ('row_id', 'time', 'direction')
features = [column for column in combi.columns if column not in excluded_columns]
# elements = [e for e in combi.columns if e != 'row_id' and e != 'time' and e != 'direction']

In [None]:
# Display the list of selected feature column names.
features

In [None]:
# features = [combi.columns 'x', 'y', 'direction', 'weekday', 'month', 'hour', 'hourminute']

In [None]:
# features = ['x', 'y', 'direction', 'weekday', 'month', 'hour', 'hourminute']
# combi[features]

Define X and y

In [None]:
# `combi` holds the train rows first and the test rows after them, so a
# positional cut at len(train) separates the two again.
n_train_rows = len(train)
feature_frame = combi[features]

y = target
X = feature_frame.iloc[:n_train_rows]
X_test = feature_frame.iloc[n_train_rows:]

Split into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation: shuffled with a fixed seed
# and stratified on the target values.
split_options = dict(test_size=0.10, random_state=42, stratify=y, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# Shapes of every split, shown as the cell output.
X_train.shape, X_val.shape, y_train.shape, y_val.shape, X_test.shape

Select and train model

In [None]:
from catboost import CatBoostRegressor, Pool

# Validation data wrapped in a Pool, with every feature declared categorical.
eval_dataset = Pool(X_val, y_val, cat_features=features)

# Model configuration: optimises RMSE, reports MAE on the evaluation set.
model = CatBoostRegressor(
    n_estimators=12500,
    learning_rate=0.125,
    cat_features=features,
    random_state=42,
    verbose=1000,
    loss_function='RMSE',
    # early_stopping_rounds=199,
    task_type="GPU",
    eval_metric='MAE',
)

# Train against the validation pool, keeping the best iteration found on it.
model.fit(
    X_train,
    y_train,
    eval_set=eval_dataset,
    use_best_model=True,
)

# Score of the fitted model on its own training rows.
print(model.score(X_train, y_train))

Predict on validation set

In [None]:
# Predictions for the held-out validation rows, kept for the metric and plots below.
y_pred = model.predict(X_val)

# Score of the fitted model on the validation rows.
val_score = model.score(X_val, y_val)
print(val_score)

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the validation predictions, shown as the cell output.
val_mae = mean_absolute_error(y_val, y_pred)
val_mae

In [None]:
# Side-by-side table of true and predicted validation values, indexed like `y_val`.
df = y_val.to_frame('Actual').assign(Predicted=y_pred)
df

In [None]:
# Scatter of predicted against measured congestion on the validation split.
fig, ax = plt.subplots()
ax.scatter(y_val, y_pred, edgecolors=(0, 0, 0))

# Dashed diagonal across the full target range: points on it are exact predictions.
target_range = [y.min(), y.max()]
ax.plot(target_range, target_range, 'k--', lw=4)

ax.set(xlabel='Measured', ylabel='Predicted')
plt.show()

Predict on test set

In [None]:
# Predict congestion for the test rows.
preds = model.predict(X_test)

# Round to the nearest integer before casting: `astype(int)` on its own
# truncates toward zero, which pushes every positive prediction downward.
# Negative predictions are clamped to 0, as before.
preds = np.clip(np.rint(preds), 0, None).astype(int)
preds

Prepare submission

In [None]:
# Store the predictions in the `congestion` column. Item assignment is used
# because attribute-style assignment (`submission.congestion = ...`) would
# silently set a plain Python attribute, not a column, if the column were absent.
submission['congestion'] = preds

# Write the submission file without the index column.
submission.to_csv('submission.csv', index=False)

# Read the written file back so the displayed frame is exactly what was saved.
submission = pd.read_csv("submission.csv")
submission