# Wide Data For Geographic Information

The data in this competition is connected through both location and time.  Many notebooks make good use of the time series aspect of this competition, but I have not seen any that try to make geographic connections between the 65 x-y/direction combinations.  This notebook is my attempt at doing so. 

Instead of the rows being a timestamp-direction pair with the target being the congestion, my data are transformed to be only a timestamp with the targets being the congestion of all 65 x-y/direction combinations.  The features I use are the 1 day lag, row-wise statistics (mean, var, etc) of the 1 day lag features, and the daily moving median.  

In [None]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
import optuna

In [None]:
# Read the competition files: training rows, test rows and the submission template.
train_y = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/train.csv')
test_y = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/test.csv')
ss = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv')

# Divide the training target by 100 for NN learning; the test target is a
# zero placeholder that is filled with predictions later on.
TARGET = 'congestion'
train_y[TARGET] = train_y[TARGET] / 100
test_y[TARGET] = 0

# Same preparation for both frames: parse the timestamp and build a single
# location key of the form '<x>_<y><direction>'.
for frame in (train_y, test_y):
    frame['time'] = pd.to_datetime(frame['time'])
    frame['xydir'] = frame['x'].astype(str) + '_' + frame['y'].astype(str) + frame['direction']

# The submission template takes its keys from the test rows (copied by index).
ss['time'] = test_y['time']
ss['xydir'] = test_y['xydir']

In [None]:
def _long_to_wide(long_df):
    """Reshape a long (time, xydir, congestion) frame to one row per timestamp.

    Returns a frame whose first column is 'time', followed by one congestion
    column per distinct 'xydir' value.
    """
    wide = long_df.set_index(['time', 'xydir'])['congestion'].unstack()
    # Remove the 'xydir' label from the column axis so columns are plain names.
    wide = wide.rename_axis(columns=None)
    return wide.reset_index()


#Turning the data from `long` to `wide`
train_y = _long_to_wide(train_y)
test_y = _long_to_wide(test_y)

In [None]:
# Preview the first rows of the wide training frame (one row per timestamp).
train_y.head()

In [None]:
# Feature Engineering
#
# Adds calendar features, row-wise statistics, expanding medians per
# time-of-day / per weekday+time-of-day, and a 3-row rolling mean to train_y.
# These columns become model inputs once they are lagged by one day.

#Feature lists
RAW_FEATURES = [feat for feat in train_y.columns if feat != 'time']
HM_MOVING_MEDIANS = [f'HMMM_{feat}' for feat in RAW_FEATURES] #For daily moving medians
DHM_MOVING_MEDIANS = [f'DHMMM_{feat}' for feat in RAW_FEATURES] #For weekly moving medians
ONE_HOUR_MEAN = ['1hm'+feat for feat in RAW_FEATURES] # for mean over the hour


def _expanding_median(values):
    """Median of all values seen so far (inclusive), in row order."""
    return values.expanding(min_periods=1).median()


#Creating day, hour, and minute features
# Use the real weekday (Monday=0 ... Sunday=6). Day-of-month modulo 7 does not
# line up with weekdays across month boundaries, which would make the
# "weekly" grouping below mix different weekdays.
train_y['day'] = train_y.time.dt.dayofweek
train_y['hour'] = train_y.time.dt.hour
train_y['minute'] = train_y.time.dt.minute
# Grouping keys: weekday + time of day ('dhm') and time of day only ('hm').
train_y['dhm'] = train_y['day'].astype(str) + '_' + train_y['hour'].astype(str) + '_' + train_y['minute'].astype(str)
train_y['hm'] = train_y['hour'].astype(str) + '_' + train_y['minute'].astype(str)
# Rescale the calendar features before they are fed to the network.
train_y['day'] = train_y['day'] / 7
train_y['hour'] = train_y['hour'] / 24
train_y['minute'] = train_y['minute'] / 40

#1) Row wise mean, median, var, kurtosis
train_y['row_kurtosis'] = train_y[RAW_FEATURES].kurtosis(axis=1)
train_y['row_mean'] = train_y[RAW_FEATURES].mean(axis=1)
train_y['row_median'] = train_y[RAW_FEATURES].median(axis=1)
train_y['row_var'] = train_y[RAW_FEATURES].var(axis=1)

#2) moving median
#help from https://stackoverflow.com/questions/36969174/pandas-average-value-for-the-past-n-days
# transform() always returns a result indexed like train_y, whereas
# groupby.apply() prepends the group key to the index on pandas >= 2.0 and
# breaks the column assignment.
train_y[HM_MOVING_MEDIANS] = train_y.groupby('hm')[RAW_FEATURES].transform(_expanding_median)

#3) weekly moving median
train_y[DHM_MOVING_MEDIANS] = train_y.groupby('dhm')[RAW_FEATURES].transform(_expanding_median)

#4) 1 hour mean
# Mean over the current and the two previous rows; the first two rows have no
# full window and are filled with -1.
# NOTE(review): this equals one hour only if consecutive rows are 20 minutes
# apart with no gaps in the timestamps - TODO confirm.
train_y[ONE_HOUR_MEAN] = train_y[RAW_FEATURES].rolling(3).mean().fillna(-1)


In [None]:
# Creating train_X and test_X
# ----------------------------
# Each row of train_X / test_X holds the train_y columns taken from the
# timestamp exactly one day before the matching row of train_y / test_y.
# Note that the 'time' column of train_X / test_X is that *lagged* timestamp.
lag = pd.Timedelta(days=1)

train_X = (
    train_y[['time']]
    .assign(time=lambda frame: frame['time'] - lag)
    .merge(train_y, on='time', how='left')
)

test_X = (
    test_y[['time']]
    .assign(time=lambda frame: frame['time'] - lag)
    .merge(train_y, on='time', how='left')
)

# Rows whose lagged timestamp is absent from train_y come back as NaN from the
# left merge; drop them from the inputs and the targets alike.
missing = train_X['0_0EB'].isnull()
has_lag = ~missing
train_X = train_X.loc[has_lag].reset_index(drop=True)
train_y = train_y.loc[has_lag].reset_index(drop=True)

In [None]:
#Use the train targets exactly 1 week before the test period as validation data
val_times = test_y.time - pd.Timedelta(days=7)
# Select on the *target* timestamps (train_y.time). train_X.time holds the
# feature timestamp, which is one day earlier, so masking on it would hold out
# targets only 6 days before the test period (a different weekday).
# train_X and train_y share the same row index, so one mask serves both.
val_msk = train_y.time.isin(val_times)

val_X = train_X[val_msk].copy()
val_y = train_y[val_msk].copy()

train_X = train_X[~val_msk].copy()
train_y = train_y[~val_msk].copy()

In [None]:
def create_vanilla_nn(FEATURES, RAW_FEATURES, DROPOUT_LEVEL, FIRST_DROPOUT, NUM_NEURONS, NUM_HIDDEN_LAYERS, lr):
    """Build and compile a fully connected multi-output regression network.

    Parameters
    ----------
    FEATURES : sequence
        Input feature names; only its length is used (input width).
    RAW_FEATURES : sequence
        Target names; only its length is used (output width).
    DROPOUT_LEVEL : float
        Dropout rate applied after every hidden layer.
    FIRST_DROPOUT : float
        Dropout rate applied to the batch-normalised inputs.
    NUM_NEURONS : sequence of int
        Width of each hidden layer, indexed 0 .. NUM_HIDDEN_LAYERS - 1.
    NUM_HIDDEN_LAYERS : int
        Number of hidden Dense -> BatchNormalization -> Dropout stacks.
    lr : float
        Adam learning rate.

    Returns
    -------
    tf.keras.Model
        Model with sigmoid outputs, compiled with mean squared error loss.
    """
    layers = tf.keras.layers
    n_inputs = len(FEATURES)
    n_outputs = len(RAW_FEATURES)

    inputs = layers.Input(shape=(n_inputs,))

    # Normalise the raw inputs, then apply input dropout.
    hidden = layers.BatchNormalization()(inputs)
    hidden = layers.Dropout(FIRST_DROPOUT)(hidden)

    for layer_idx in range(NUM_HIDDEN_LAYERS):
        units = NUM_NEURONS[layer_idx]
        hidden = layers.Dense(units, activation='relu')(hidden)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Dropout(DROPOUT_LEVEL)(hidden)

    # One sigmoid output per target column.
    outputs = layers.Dense(n_outputs, activation='sigmoid')(hidden)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss=tf.keras.losses.MeanSquaredError(),
    )
    return model

Below is my Optuna study to find both the optimal feature set and the optimal hyperparameters for the neural network.  It is disabled by being wrapped in a triple-quoted string; remove the surrounding triple quotes if you want to run it yourself.  It takes around 1 hour for 100 trials.

In [None]:
# Disabled Optuna study: everything below is a single string literal, so none
# of it is executed. The objective picks which feature groups to include and
# the network hyper-parameters (dropout rates, depth, layer widths, learning
# rate), trains with early stopping on a 5% validation split, and returns
# 100 * mean absolute error on val_X / val_y, which the study minimises.
'''
def objective(trial):
    ###################################
    # Generate our trial model.
    ###################################
    #Model Architecture specifications
    NN_params = {}
    FEATURES = RAW_FEATURES.copy()
    if trial.suggest_categorical("hm_mm", [True, False]):
        FEATURES += HM_MOVING_MEDIANS
    if trial.suggest_categorical("dhm_mm", [True, False]):
        FEATURES += DHM_MOVING_MEDIANS
    if trial.suggest_categorical("one_hour_mean", [True, False]):
        FEATURES += ONE_HOUR_MEAN
    if trial.suggest_categorical("row_stats", [True, False]):
        FEATURES += ['row_kurtosis', 'row_mean', 'row_median', 'row_var']
    if trial.suggest_categorical("time_base", [True, False]):
        FEATURES += ['day', 'hour', 'minute']
    NN_params['FEATURES'] = FEATURES
    NN_params['RAW_FEATURES'] = RAW_FEATURES
    NN_params["DROPOUT_LEVEL"] = trial.suggest_float("dropout", 0.00,0.6)
    NN_params["FIRST_DROPOUT"] = trial.suggest_float("first_dropout", 0.00,0.9)
    NN_params["NUM_HIDDEN_LAYERS"] = trial.suggest_int("depth", 1,8)
    NN_params["lr"] = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)

    num_neurons = []
    for i in range(NN_params["NUM_HIDDEN_LAYERS"]):
        if i==0:
            num_neurons.append(trial.suggest_int(f"num_neurons_l{i}", len(RAW_FEATURES),5000))
        else:
            num_neurons.append(trial.suggest_int(f"num_neurons_l{i}", len(RAW_FEATURES), num_neurons[-1]))
    NN_params["NUM_NEURONS"] = num_neurons
    
    model = create_vanilla_nn(**NN_params)
    BATCH_SIZE = 256
    ES = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', min_delta=0, patience=20, restore_best_weights=True)

    H = model.fit(train_X[NN_params["FEATURES"]], train_y[RAW_FEATURES], batch_size= BATCH_SIZE, epochs=200, 
              validation_split=.05, callbacks = [ES], verbose = 0)
    
    
        #Val Score
    val_preds = model.predict(val_X[NN_params["FEATURES"]], batch_size=1000)
    score = 100 * np.mean(np.abs(val_y[RAW_FEATURES].values - val_preds))
    
    
    #Stopping Memory Leaks
    del model
    tf.keras.backend.clear_session()

    return score
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
'''

In [None]:
# Final feature set and network hyper-parameters used for the submission.
FEATURES = RAW_FEATURES.copy() + HM_MOVING_MEDIANS + ONE_HOUR_MEAN + ['row_kurtosis', 'row_mean', 'row_median', 'row_var'] + ['day', 'hour', 'minute']
params = {'FEATURES': FEATURES, 
          'RAW_FEATURES': RAW_FEATURES, 
          'DROPOUT_LEVEL': 0.12292871613987645, 
          'FIRST_DROPOUT': 0.4302064801787497, 
          'NUM_NEURONS': [4526], 
          'NUM_HIDDEN_LAYERS': 1,
          'lr': 0.0030322842074454745, }

#Run the model 10 times and average preds
# Each "fold" is an independent re-training of the same model on the same
# data; only the random initialisation / shuffling differs between runs.
n_folds = 10
BATCH_SIZE = 256
val_cum = 0
# Accumulate test predictions locally and write them to test_y once at the
# end. Adding into test_y inside the loop would stack a second set of
# predictions on top of the first if this cell were executed again.
test_cum = np.zeros((len(test_X), len(RAW_FEATURES)))
for i in range(n_folds):
    tf.keras.backend.clear_session()
    model = create_vanilla_nn(**params)
    # A fresh callback per model: early stopping monitors the validation loss
    # and restores the best weights seen.
    ES = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', min_delta=0, patience=20, restore_best_weights=True)

    H = model.fit(train_X[FEATURES], train_y[RAW_FEATURES], batch_size=BATCH_SIZE, epochs=200,
                  validation_data=(val_X[FEATURES], val_y[RAW_FEATURES]), callbacks=[ES], verbose=0)
    val_preds = model.predict(val_X[FEATURES], batch_size=1000)
    test_cum += model.predict(test_X[FEATURES]) / n_folds

    #Val Score
    # Mean absolute error, multiplied by 100 to undo the target scaling.
    val_cum += val_preds / n_folds
    val_score = 100 * np.mean(np.abs(val_y[RAW_FEATURES].values - val_preds))
    print(f'FOLD {i+1} of {n_folds}: {val_score}')

# Averaged predictions over all runs, row-aligned with test_y.
test_y[RAW_FEATURES] = test_cum

cumulative_score = 100 * np.mean(np.abs(val_y[RAW_FEATURES].values - val_cum))
print(f'Cumulative validation: {cumulative_score}')

In [None]:
# Reshape the wide predictions to one row per (time, xydir) pair, then attach
# them to the submission rows through those two keys.
test_y = test_y.melt(
    id_vars='time',
    value_vars=RAW_FEATURES,
    var_name='xydir',
    value_name='congestion',
)
ss = ss[['row_id', 'time', 'xydir']].merge(test_y, on=['time', 'xydir'], how='left')

In [None]:
#####################
#Post Processing
#####################
#Idea from https://www.kaggle.com/code/ambrosm/tpsmar22-generalizing-the-special-values
# Clip each prediction to the [15%, 70%] quantile band of the congestion
# observed for the same weekday / time of day / location in recent history.

# train_y no longer contains the rows that were split off for validation;
# add them back so the quantiles are computed from all known history.
history = pd.concat([train_y, val_y], ignore_index=True)
train_melt = pd.melt(history, id_vars = 'time', value_vars=RAW_FEATURES, var_name = 'xydir',value_name = 'congestion')

# Only keep history from the 60 days before the first submission timestamp.
last_month = ss.time[0] - pd.Timedelta(days=60)
lm = train_melt[train_melt.time > last_month].copy()
# Real weekday (Monday=0). Day-of-month modulo 7 would group different
# weekdays together whenever the window spans more than one month.
lm['day_of_week'] = lm.time.dt.dayofweek
lm['hour'] = lm.time.dt.hour
lm['minute'] = lm.time.dt.minute
ss['day_of_week'] = ss.time.dt.dayofweek
ss['hour'] = ss.time.dt.hour
ss['minute'] = ss.time.dt.minute

#Getting quantiles
on = ['day_of_week','hour','minute', 'xydir']
lower = lm.groupby(on).congestion.quantile(0.15).reset_index()
upper = lm.groupby(on).congestion.quantile(0.7).reset_index()

#place quantiles in ss
# A left merge against unique keys keeps the row order and count of ss, so the
# result can be assigned positionally.
ss['lower'] = ss[on].merge(lower, on = on, how = 'left')['congestion'].to_numpy()
ss['upper'] = ss[on].merge(upper, on = on, how = 'left')['congestion'].to_numpy()

#Clip by quantiles
# A key combination with no history yields NaN bounds; treat those as
# "no bound" so the prediction is kept instead of becoming NaN.
clipped = np.clip(ss['congestion'],
                  ss['lower'].fillna(-np.inf),
                  ss['upper'].fillna(np.inf))

#Submission
# Build the output in a separate frame so re-running this cell does not scale
# ss['congestion'] a second time.
submission = pd.DataFrame({'row_id': ss['row_id'],
                           'congestion': clipped * 100}) #I scored 50+ when I forgot to do this.  Watch out!
submission.to_csv('ss.csv',index=False)