# Introduction

Spatial-temporal time series.

**Acknowledgments:**
* [TPS Mar '22 - Cyclical Features](https://www.kaggle.com/inversion/tps-mar-22-cyclical-features/notebook) by [inversion](https://www.kaggle.com/inversion).
* [TPSMAR22 Without Machine Learning](https://www.kaggle.com/ambrosm/tpsmar22-without-machine-learning?scriptVersionId=89093653) by [AmbrosM](https://www.kaggle.com/ambrosm).

# Libraries

In [None]:
# Core
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)
import matplotlib.pyplot as plt
%matplotlib inline
from itertools import combinations
import math
from math import factorial
from math import sin, cos, pi
import statistics
import scipy.stats
from scipy.stats import pearsonr
import time
from datetime import datetime
import matplotlib.dates as mdates
import dateutil.easter as easter

# Sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, plot_roc_curve, roc_curve
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.decomposition import PCA

# Models
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from lightgbm import LGBMRegressor, LGBMClassifier

# Tensorflow
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks

# Data

**Load data**

In [None]:
# Read the competition CSVs: row_id becomes the index, 'time' is parsed to datetime
train_data = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/train.csv',
    index_col='row_id',
    parse_dates=['time'],
)
test_data = pd.read_csv(
    '../input/tabular-playground-series-mar-2022/test.csv',
    index_col='row_id',
    parse_dates=['time'],
)

# Report (rows, columns) of each frame, then display the first training rows
print('Training data df shape:', train_data.shape)
print('Test data df shape:', test_data.shape)
train_data.head()

**Missing values**

In [None]:
# Total count of null cells across all columns of each frame
print(
    'Number of missing values in training set:',
    train_data.isnull().sum().sum(),
)
print('')
print(
    'Number of missing values in test set:',
    test_data.isnull().sum().sum(),
)

**Duplicates**

In [None]:
# Rows that repeat an earlier row in every column (the index is not compared)
print(f'Duplicates in training set: {train_data.duplicated(keep="first").sum()}')
print('')
print(f'Duplicates in test set: {test_data.duplicated(keep="first").sum()}')

**Time range**

In [None]:
# Earliest and latest timestamp covered by the training frame
print('Training time range:')
print(f'Min:{train_data["time"].min()}, Max:{train_data["time"].max()}')
print('')
# Same for the test frame
print('Test time range:')
print(f'Min:{test_data["time"].min()}, Max:{test_data["time"].max()}')

**Initial thoughts:**
* The test set is much smaller than the training set - we are only predicting the traffic flow during a 12-hour window.
* The overall time range is small so we can ignore long term trends and maybe even seasonality. 
* The spatial aspect of this is interesting. My guess is that traffic flow behaves a bit like diffusion, in that points/cities close to each other will be affected by each other's traffic flows in the near future/past.

# EDA

In [None]:
# To do

# Feature Engineering

**Cyclical encoding**

In [None]:
# From https://www.kaggle.com/inversion/tps-mar-22-cyclical-features/notebook
def cyc_enc(df):
    """Swap the ``direction`` column for a sine/cosine pair.

    Each of the eight heading codes is mapped to a ``(sin, cos)`` pair and
    written to the new columns ``sin`` and ``cos``; the ``direction`` column
    is then removed. Codes that are not in the table map to NaN in both new
    columns.

    The frame is modified in place and the same object is returned.
    """
    # One (sin, cos) entry per heading code. The four cardinal headings are
    # written as literal 0 / +-1 values rather than computed.
    heading_table = {
        'NB': (0.0, 1.0),
        'NE': (sin(1 * pi/4), cos(1 * pi/4)),
        'EB': (1.0, 0.0),
        'SE': (sin(3 * pi/4), cos(3 * pi/4)),
        'SB': (0.0, -1.0),
        'SW': (sin(5 * pi/4), cos(5 * pi/4)),
        'WB': (-1.0, 0.0),
        'NW': (sin(7 * pi/4), cos(7 * pi/4)),
    }
    sine_of = {code: pair[0] for code, pair in heading_table.items()}
    cosine_of = {code: pair[1] for code, pair in heading_table.items()}

    headings = df['direction']
    df['sin'] = headings.map(sine_of)
    df['cos'] = headings.map(cosine_of)

    # The raw code is no longer needed once it has been encoded
    df.drop(columns=['direction'], inplace=True)

    return df

**Time features**

In [None]:
def time_eng(df):
    """Expand the ``time`` column into separate calendar/clock columns.

    Adds ``month``, ``day``, ``weekday``, ``hour`` and ``minute`` (in that
    order), each read from the matching ``.dt`` attribute of ``time``, and
    then removes the ``time`` column.

    The frame is modified in place and the same object is returned.
    """
    stamps = df['time'].dt
    for part in ('month', 'day', 'weekday', 'hour', 'minute'):
        df[part] = getattr(stamps, part)

    # The raw timestamp is dropped once its components are extracted
    df.drop(columns=['time'], inplace=True)
    return df

**Apply feature engineering**

In [None]:
# Work on copies so the frames loaded from disk keep their 'time' column.

# Training features
X = time_eng(train_data.copy())

# Test features
X_test = time_eng(test_data.copy())

# Modelling

We calculate the median congestion for each location and direction at every weekday and time of day (hour and minute), and use these medians to predict the congestion on 30 September 1991 (the test set).

In [None]:
# From https://www.kaggle.com/ambrosm/tpsmar22-without-machine-learning?scriptVersionId=89093653
# Rebuild the time-expanded training frame from a fresh copy
X = time_eng(train_data.copy())

# Median congestion per (x, y, direction, weekday, hour, minute) group,
# cast to int (astype(int) drops any fractional part of the median)
medians = (
    X.groupby(['x', 'y', 'direction', 'weekday', 'hour', 'minute'])['congestion']
    .median()
    .astype(int)
)
medians

In [None]:
# Look up the median for each test row by matching its key columns against
# the index levels of `medians`. merge() defaults to an inner join, so a test
# row with no matching key would be absent from the result.
preds_df = (
    X_test
    .merge(
        medians,
        left_on=['x', 'y', 'direction', 'weekday', 'hour', 'minute'],
        right_index=True,
    )
    .loc[:, ['congestion']]
    .reset_index()  # turn the index back into an ordinary column
)

# Submission

In [None]:
# Write the predictions to disk; the positional index is not written
preds_df.to_csv(path_or_buf='submission.csv', index=False)

# Display the first rows to eyeball the submission layout
preds_df.head(n=5)