In [None]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
import gc

In [None]:
# Read the competition files: training rows, test rows and the
# sample-submission template.
train = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/test.csv')
ss = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv')

# Name of the column to predict. The test frame gets a placeholder 0 in
# that column.
TARGET = 'congestion'
test[TARGET] = 0

# Convert the raw 'time' column to datetimes on both frames, then copy the
# parsed test timestamps onto the submission template.
for frame in (train, test):
    frame['time'] = pd.to_datetime(frame['time'])
ss['time'] = test['time']

# Combine the location columns into string keys on each frame:
#   xydir = "<x>_<y><direction>"   (no separator before the direction)
#   xy    = "<x>_<y>"
# The direction must come from the frame being processed (df), not from
# train: otherwise test rows would receive whatever direction train happens
# to hold at the same index label.
for df in [train, test]:
    xy = df['x'].astype(str) + '_' + df['y'].astype(str)
    df['xydir'] = xy + df['direction']
    df['xy'] = xy
# Copy the test location key onto the submission template (assignment
# aligns on the index).
ss['xydir'] = test['xydir']

In [None]:
# Flag each row's origin, then stack train on top of test into one frame
# with a fresh 0..n-1 index.
train['test'] = False
test['test'] = True
data = pd.concat([train, test], ignore_index=True)

# The separate frames are no longer needed; drop them and collect garbage.
del train, test
gc.collect()

In [None]:
# Calendar parts of the timestamp.
data['day'] = data['time'].dt.weekday
data['hour'] = data['time'].dt.hour
data['minute'] = data['time'].dt.minute

# String keys identifying a time slot:
#   dhm = "<weekday>_<hour>_<minute>"
#   hm  = "<hour>_<minute>"
data['dhm'] = data['day'].astype(str) + '_' + data['hour'].astype(str) + '_' + data['minute'].astype(str)
data['hm'] = data['hour'].astype(str) + '_' + data['minute'].astype(str)

# Time-slot x location keys. 'hm_xy' must use the direction-less 'xy' key;
# building it from 'xydir' would make it an exact duplicate of 'hm_xydir'.
data['hm_xydir'] = data['hm'] + data['xydir']
data['hm_xy'] = data['hm'] + data['xy']

In [None]:
################
# Global stats
################
# For each key column listed below, aggregate the target over the training
# rows (test == False) per key value -- min, max, median, variance and
# mean -- and map each aggregate back onto every row sharing that key.
# Keys that never occur in the training rows map to NaN.
# Modified from https://www.kaggle.com/code/packinman/tps-mar-2022-automl-pycaret-regression
FEATURES = []
for location_time in ['hm_xydir', 'hm_xy']:
    # Training-row target grouped by the current key; only the two columns
    # involved are sliced out.
    train_target_by_key = (
        data.loc[~data['test'], [location_time, TARGET]]
        .groupby(location_time)[TARGET]
    )
    for agg_name in ['min', 'max', 'median', 'var', 'mean']:
        name = f'global_{location_time}_{agg_name}'
        data[name] = data[location_time].map(train_target_by_key.agg(agg_name))
        FEATURES.append(name)

In [None]:
###############################
# Lag stats
###############################
# For every location key in LOCATIONS the target is grouped two ways:
#   * 'dhm' -- rows sharing weekday, hour and minute
#   * 'hm'  -- rows sharing hour and minute
# Inside each group the target is shifted by one row first, so a row's
# features never include its own target, and is then summarised as
#   * rolling windows of 3/5/10 rows  -> '{time}_{stat}_{length}_roll'
#   * an expanding window             -> '{time}_{stat}_expand'
#   * the plain previous value        -> '{time}_shift_1'
# NOTE(review): "previous" means previous row within the group, so this
# relies on `data` being in chronological order -- confirm the inputs are
# sorted by time.
#
# Two fixes compared with the earlier version of this cell:
#   * FEATURES only receives names of columns that are actually created
#     (it used to collect names such as '<time>_<stat>_5_expand' that never
#     existed, plus duplicates of '<time>_shift_1').
#   * transform()/shift() are used instead of apply(): they always return a
#     result aligned with the original index, whereas apply() prepends the
#     group keys on pandas >= 2.0 and the column assignment then fails.
EXPANDING = ['roll', 'expand']
STATS = ['mean','var','median', 'min','max','shift']
TIMES = ['dhm','hm']
LOCATIONS = ['xydir']
LENGTHS = [3,5, 10]

roll_tag, expand_tag = EXPANDING
i = 0
for stat in STATS:
    for time in TIMES:
        grouped = data.groupby([time] + LOCATIONS)[TARGET]
        # Compute every column for this (stat, time) pair before touching
        # `data`, keeping the original column creation order.
        lagged = {}
        if stat == 'shift':
            # The previous value does not depend on a window length, so it
            # is produced exactly once per time key.
            lagged[f'{time}_{stat}_1'] = grouped.shift()
        else:
            for length in LENGTHS:
                lagged[f'{time}_{stat}_{length}_{roll_tag}'] = grouped.transform(
                    lambda x: x.shift().rolling(length, min_periods=1).agg(stat)
                )
                if length == LENGTHS[0]:
                    # The expanding window is length-independent as well;
                    # emit it once, right after the first rolling window.
                    lagged[f'{time}_{stat}_{expand_tag}'] = grouped.transform(
                        lambda x: x.shift().expanding(min_periods=1).agg(stat)
                    )
        for name, values in lagged.items():
            print(i, end=", ")
            data[name] = values
            # Guard keeps FEATURES free of duplicates if the cell is re-run.
            if name not in FEATURES:
                FEATURES.append(name)
            i += 1

In [None]:
# Display the combined frame as a visual check after the feature columns
# have been added.
data

In [None]:
# Spot check of the 'hm' features: list the first 20 rows for location key
# '0_0NB' in the '0_0' hour/minute slot, next to the target they derive from.
FEATURES_TO_SEE = [
    'xydir', 'hm', 'dhm', 'time', TARGET,
    'hm_max_5_roll', 'hm_max_10_roll', 'hm_shift_1',
]
msk = data['xydir'].eq('0_0NB') & data['hm'].eq('0_0')
data.loc[msk, FEATURES_TO_SEE].head(20)

In [None]:
# List every column currently on the combined frame.
data.columns

In [None]:
# Spot check of the 'dhm' features: show all rows for location key '0_0NB'
# in the '0_0_0' weekday/hour/minute slot with their rolling and expanding
# means.
FEATURES_TO_SEE = [
    'xydir', 'dhm', 'time',
    'dhm_mean_3_roll', 'dhm_mean_expand', 'dhm_mean_5_roll', 'dhm_mean_10_roll',
]
msk = data['xydir'].eq('0_0NB') & data['dhm'].eq('0_0_0')
data.loc[msk, FEATURES_TO_SEE]

In [None]:
# Write the combined frame (train and test rows, with all added columns) to
# 'all_rows.parquet' in the current working directory.
# NOTE(review): the FEATURES name list is not saved alongside -- confirm
# downstream code does not need it.
data.to_parquet('all_rows.parquet')