## Impact of Training Data Window and Shuffling on Model Performance 


The point of this notebook is just to look into the impact of the size of the training history on model performance. A related experiment is whether or not shuffling of the training history has any effect.

In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import time
import gc

import numpy as np
from google.cloud import bigquery
from google.cloud import storage

import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)


from tensorflow import keras
from tensorflow.keras import layers
import seaborn as sns
from pandas.tseries.offsets import BDay

from tensorflow.keras.layers import Embedding
from tensorflow.keras import activations
from tensorflow.keras import backend as K
from tensorflow.keras import initializers
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from sklearn import preprocessing
from datetime import datetime
import matplotlib.pyplot as plt
import pickle5 as pickle


from ficc.utils.nelson_siegel_model import *
from ficc.utils.diff_in_days import *
from ficc.utils.auxiliary_functions import sqltodf


from IPython.display import display, HTML
import os


from ficc.data.process_data import process_data
from ficc.utils.auxiliary_variables import PREDICTORS, NON_CAT_FEATURES, BINARY, CATEGORICAL_FEATURES, IDENTIFIERS, PURPOSE_CLASS_DICT, NUM_OF_DAYS_IN_YEAR
from ficc.utils.gcp_storage_functions import upload_data, download_data
from ficc.utils.auxiliary_variables import RELATED_TRADE_BINARY_FEATURES, RELATED_TRADE_NON_CAT_FEATURES, RELATED_TRADE_CATEGORICAL_FEATURES

pd.set_option('display.float_format', lambda x: '%.3f' % x)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


2023-04-26 03:38:51.550660: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-26 03:38:51.696013: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-26 03:38:51.697727: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


Initializing pandarallel with 8.0 cores
INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
print(tf.__version__)

2.7.0


In [4]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [47]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/home/jupyter/ficc/isaac_creds.json"
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
pd.options.mode.chained_assignment = None

bq_client = bigquery.Client()
storage_client = storage.Client()

TRAIN_TEST_SPLIT = 0.85
LEARNING_RATE = 0.0001
BATCH_SIZE = 1000
NUM_EPOCHS = 35

DROPOUT = 0.01
SEQUENCE_LENGTH = 5
NUM_FEATURES = 7

#### Data Preparation
We grab the data from a GCP bucket. The data is prepared using the ficc python package. More insight on how the data is prepared can be found [here](https://github.com/Ficc-ai/ficc/blob/ahmad_ml/ml_models/sequence_predictors/data_prep/data_preparation.ipynb)

In [6]:
# # %%time

# import gcsfs
# fs = gcsfs.GCSFileSystem(project='eng-reactor-287421')
# # with fs.open('ahmad_data/processed_data_2022-10-24-20:56.pkl') as f:
# # with fs.open('ahmad_data/processed_data_2022-10-24-20:56_c_spread.pkl') as f:
# # with fs.open('ahmad_data/processed_data_2022-12-02-17:52.pkl') as f:
# with fs.open('isaac_data/processed_file_FULL_2023-04-12-20:44.pkl') as f:
#     data = pd.read_pickle(f)

In [7]:
if os.path.isfile('processed_file_FULL_2023-04-12-20:44.pkl'):
    with open('processed_file_FULL_2023-04-12-20:44.pkl', 'rb') as f:
        data = pickle.load(f)
else:
    with open('processed_file_FULL_2023-04-12-20:44.pkl', 'wb') as f:
        import gcsfs
        fs = gcsfs.GCSFileSystem(project='eng-reactor-287421')
        with fs.open('isaac_data/processed_file_FULL_2023-04-12-20:44.pkl') as f:
            data = pd.read_pickle(f)
            pickle.dump(data, f)

In [8]:
data.trade_date.max(), data.trade_date.min() 

(Timestamp('2023-02-28 00:00:00'), Timestamp('2022-09-01 00:00:00'))

In [9]:
data['new_ys'] = data['yield'] - data['new_ficc_ycl']
data['new_ys_realtime'] = data['yield'] - data['new_real_time_ficc_ycl']
data.dropna(subset=['new_ys', 'new_ys_realtime'], inplace=True)

In [10]:
auxiliary_features = ['dollar_price',
                      'last_calc_date',
                     'calc_date', 
                     'trade_date',
                      'last_trade_date',
                     'trade_datetime', 
                     'purpose_sub_class', 
                     'called_redemption_type', 
                     'calc_day_cat',
                     'yield',
                     'ficc_ycl',
                     #'same_ys',
                     #'trade_history_sum',
                     'new_ficc_ycl',
                      'new_real_time_ficc_ycl',
                     'days_to_refund',
                      'last_dollar_price',
                      'last_rtrs_control_number',
                     'is_called',
                     ]

In [11]:
if 'target_attention_features' not in PREDICTORS:
    PREDICTORS.append('target_attention_features')
    
if 'ficc_treasury_spread' not in PREDICTORS:
    PREDICTORS.append('ficc_treasury_spread')
    NON_CAT_FEATURES.append('ficc_treasury_spread')
    
for col in ['new_ficc_ycl', 'new_real_time_ficc_ycl']:     
    if col not in PREDICTORS:
        PREDICTORS.append(col)
        NON_CAT_FEATURES.append(col)

for col in ['extraordinary_make_whole_call', 'make_whole_call', 'has_unexpired_lines_of_credit']:     
    if col not in data.columns:
        try: 
            print(f'Removing {col} from PREDICTORS and BINARY')
            BINARY.remove(col)
            PREDICTORS.remove(col) 
        except:
            continue

Removing extraordinary_make_whole_call from PREDICTORS and BINARY
Removing make_whole_call from PREDICTORS and BINARY
Removing has_unexpired_lines_of_credit from PREDICTORS and BINARY


In [12]:
def process_data(data): 
    data['ted-rate'] = (data['t_rate_10'] - data['t_rate_2']) * 100
    
    # Here is a list of exclusions that we will be experimenting with. The model is trained with these exclusions. These exclusions were discussed with a team member.
    # Callable less than a year in the future
    # Maturity less than a year in the future and more than 30 years in the future
    
    data = data[(data.days_to_call == 0) | (data.days_to_call > np.log10(400))]
    data = data[(data.days_to_refund == 0) | (data.days_to_refund > np.log10(400))]
    data = data[(data.days_to_maturity == 0) | (data.days_to_maturity > np.log10(400))]
    data = data[data.days_to_maturity < np.log10(30000)]
    data['trade_history_sum'] = data.trade_history.parallel_apply(lambda x: np.sum(x))
    data.issue_amount = data.issue_amount.replace([np.inf, -np.inf], np.nan)
    data.dropna(inplace=True, subset=PREDICTORS+['trade_history_sum'])
    data.purpose_sub_class.fillna(0, inplace=True)
    
    # data['calc_date_duration'] = data[['last_calc_date','last_trade_date']].parallel_apply(get_calc_date_duration, axis=1)
    # data['new_ficc_ycl_fixed_shape'] = data[['trade_date', 'calc_date_duration']].parallel_apply(lambda x: calculate_ycl(x, new_yc_params), axis = 1)
    # data['new_ficc_ycl_prev_day'] = data[['last_calc_date', 'last_trade_date' ,'calc_date_duration','trade_date']].parallel_apply(get_yield_for_last_duration, axis=1)
    
    return data

In [13]:
%%time

processed_data = process_data(data) 
# processed_data = processed_data[IDENTIFIERS + PREDICTORS + auxiliary_features]

CPU times: user 40.6 s, sys: 12.6 s, total: 53.2 s
Wall time: 58.7 s


In [14]:
encoders = {}
fmax = {}
for f in CATEGORICAL_FEATURES:
    print(f)
    fprep = preprocessing.LabelEncoder().fit(processed_data[f].drop_duplicates()) #note that there are apparently no trades with CC 
    fmax[f] = np.max(fprep.transform(fprep.classes_))
    encoders[f] = fprep
    
with open('encoders.pkl','wb') as file:
    pickle.dump(encoders,file)

rating
incorporated_state_code
trade_type
purpose_class


In [15]:
train_dataframe = processed_data[(processed_data.trade_date <
                                  '2023-02-01')].sort_values(by='trade_date', ascending=True).reset_index(drop=True)

test_dataframe = processed_data[(processed_data.trade_date >'2023-02-01')].sort_values(by='trade_date', ascending=True).reset_index(drop=True)

In [16]:
def create_input(df):
    global encoders
    datalist = []
    datalist.append(np.stack(df['trade_history'].to_numpy()))
    datalist.append(np.stack(df['target_attention_features'].to_numpy()))

    noncat_and_binary = []
    for f in NON_CAT_FEATURES + BINARY:
        noncat_and_binary.append(np.expand_dims(df[f].to_numpy().astype('float32'), axis=1))
    datalist.append(np.concatenate(noncat_and_binary, axis=-1))
    
    for f in CATEGORICAL_FEATURES:
        encoded = encoders[f].transform(df[f])
        datalist.append(encoded.astype('float32'))
    
    return datalist

In [17]:
%%time
x_train = create_input(train_dataframe)
x_train[0] = x_train[0][:,:,[0,2,3,4,5,6]]
y_train = train_dataframe.new_ys

x_test = create_input(test_dataframe)
x_test[0] = x_test[0][:,:,[0,2,3,4,5,6]]
y_test = test_dataframe.new_ys

CPU times: user 18.2 s, sys: 874 ms, total: 19.1 s
Wall time: 19.1 s


In [18]:
cutoffs = ['2022-09-01', '2022-11-01', '2022-12-01']

dates = train_dataframe[['trade_date']].reset_index().set_index('trade_date')

indices = [dates.loc[cutoff:]['index'].min() for cutoff in cutoffs]

## Model Training and Testing

In [19]:
# Normalization layer for the trade history
trade_history_normalizer = Normalization(name='Trade_history_normalizer')
trade_history_normalizer.adapt(x_train[0],batch_size=BATCH_SIZE)

# Normalization layer for the non-categorical and binary features
noncat_binary_normalizer = Normalization(name='Numerical_binary_normalizer')
noncat_binary_normalizer.adapt(x_train[2], batch_size = BATCH_SIZE)

tf.keras.utils.set_random_seed(10)

2023-04-26 03:41:53.819279: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-26 03:41:53.821524: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-26 03:41:53.825029: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-26 03:41:53.827699: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

#### Implementation of the model

In [21]:
def generate_model(SEQUENCE_LENGTH = SEQUENCE_LENGTH ,NUM_FEATURES = NUM_FEATURES, trade_history_normalizer = trade_history_normalizer):
    inputs = []
    layer = []

    ############## INPUT BLOCK ###################
    trade_history_input = layers.Input(name="trade_history_input", 
                                       shape=(SEQUENCE_LENGTH,NUM_FEATURES), 
                                       dtype = tf.float32) 

    target_attention_input = layers.Input(name="target_attention_input", 
                                       shape=(SEQUENCE_LENGTH, 3), 
                                       dtype = tf.float32) 


    inputs.append(trade_history_input)
    inputs.append(target_attention_input)

    inputs.append(layers.Input(
        name="NON_CAT_AND_BINARY_FEATURES",
        shape=(len(NON_CAT_FEATURES + BINARY),)
    ))


    layer.append(noncat_binary_normalizer(inputs[2]))
    ####################################################


    ############## TRADE HISTORY MODEL #################

    lstm_layer = layers.LSTM(50, 
                             activation='tanh',
                             input_shape=(SEQUENCE_LENGTH,NUM_FEATURES),
                             return_sequences = True,
                             name='LSTM')

    lstm_attention_layer = CustomAttention(50)

    lstm_layer_2 = layers.LSTM(100, 
                               activation='tanh',
                               input_shape=(SEQUENCE_LENGTH,50),
                               return_sequences = False,
                               name='LSTM_2')


    features = lstm_layer(trade_history_normalizer(inputs[0]))
    features = lstm_attention_layer(features, features, inputs[1])
    features = layers.BatchNormalization()(features)
    # features = layers.Dropout(DROPOUT)(features)

    features = lstm_layer_2(features)
    features = layers.BatchNormalization()(features)
    # features = layers.Dropout(DROPOUT)(features)

    trade_history_output = layers.Dense(100, 
                                        activation='relu')(features)

    ####################################################

    ############## REFERENCE DATA MODEL ################
    global encoders
    for f in CATEGORICAL_FEATURES:
        fin = layers.Input(shape=(1,), name = f)
        inputs.append(fin)
        embedded = layers.Flatten(name = f + "_flat")( layers.Embedding(input_dim = fmax[f]+1,
                                                                        output_dim = max(30,int(np.sqrt(fmax[f]))),
                                                                        input_length= 1,
                                                                        name = f + "_embed")(fin))
        layer.append(embedded)


    reference_hidden = layers.Dense(400,
                                    activation='relu',
                                    name='reference_hidden_1')(layers.concatenate(layer, axis=-1))

    reference_hidden = layers.BatchNormalization()(reference_hidden)
    reference_hidden = layers.Dropout(DROPOUT)(reference_hidden)

    reference_hidden2 = layers.Dense(200,activation='relu',name='reference_hidden_2')(reference_hidden)
    reference_hidden2 = layers.BatchNormalization()(reference_hidden2)
    reference_hidden2 = layers.Dropout(DROPOUT)(reference_hidden2)

    reference_output = layers.Dense(100,activation='tanh',name='reference_hidden_3')(reference_hidden2)

    ####################################################

    feed_forward_input = layers.concatenate([reference_output, trade_history_output])

    hidden = layers.Dense(300,activation='relu')(feed_forward_input)
    hidden = layers.BatchNormalization()(hidden)
    hidden = layers.Dropout(DROPOUT)(hidden)

    hidden2 = layers.Dense(100,activation='tanh')(hidden)
    hidden2 = layers.BatchNormalization()(hidden2)
    hidden2 = layers.Dropout(DROPOUT)(hidden2)

    final = layers.Dense(1)(hidden2)

    model = keras.Model(inputs=inputs, outputs=final)
    
    return model

In [22]:
fit_callbacks = [
    #WandbCallback(save_model=False),
    keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=10,
        verbose=0,
        mode="auto",
        restore_best_weights=True,
    ),
    time_callback
]

tb_callback = tf.keras.callbacks.TensorBoard(log_dir='logs', update_freq='epoch', profile_batch=0, write_steps_per_second=True)

In [40]:
def create_tf_data(x_train, y_train, shuffle=False, shuffle_buffer=1):

    train_size = int(0.8*len(x_train[0]))
                     
    X=()
    for x in x_train:
        X += (tf.data.Dataset.from_tensor_slices(x),)
        

    temp = tf.data.Dataset.zip((X))
    del X
    dataset = tf.data.Dataset.zip((temp,
                        tf.data.Dataset.from_tensor_slices(y_train)))
    del temp
    if shuffle:
        shuffle_buffer = int(len(x_train[0])*shuffle_buffer)
        dataset = dataset.shuffle(shuffle_buffer)
            
    train_ds = dataset.take(train_size)
    val_ds = dataset.skip(train_size)                 
    return train_ds, val_ds

In [109]:
def train_model(x_train, y_train, shuffle, shuffle_buffer=1):
    tf.keras.backend.clear_session()
    gc.collect()

    timestamp = datetime.now().strftime('%Y-%m-%d %H-%M')
    
    fit_callbacks = fit_callbacks = [
    keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=10,
        verbose=0,
        mode="auto",
        restore_best_weights=True),
        # time_callback,
        CSVLoggerTimeHistory(timestamp+'_training_logs.csv', separator=",", append=False)]
    
    with tf.device('/cpu:0'):
        train_ds, val_ds = create_tf_data(x_train, y_train, shuffle, shuffle_buffer)
        train_ds = train_ds.batch(BATCH_SIZE).prefetch(2).cache()
        val_ds = val_ds.batch(BATCH_SIZE).prefetch(2).cache()

    model_new_ys = generate_model(5, 6, trade_history_normalizer)

    fit_callbacks = fit_callbacks = [
    keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=10,
        verbose=0,
        mode="auto",
        restore_best_weights=True),
        # time_callback,
        CSVLoggerTimeHistory('_'.join([model_new_ys.name,timestamp,'training_logs.csv'])
                             , separator=",", 
                             append=False)
    ]
    
    
    model_new_ys.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001),
          loss=keras.losses.MeanAbsoluteError(),
          metrics=[keras.metrics.MeanAbsoluteError()])

    history_new_ys = model_new_ys.fit(train_ds,
                                      validation_data=val_ds,
                                        epochs=NUM_EPOCHS,     
                                        verbose=1, 
                                        callbacks=fit_callbacks,
                                        use_multiprocessing=True,
                                        workers=8)
    
    return history_new_ys, model_new_ys

<b>Testing 3 cutoffs: </b>

['2022-09-01': 5 months, 
'2022-11-01': 3 months, 
'2022-12-01': 2 months]

Shuffled

In [27]:
experiment_models = {}

In [29]:
for i, cutoff in enumerate(cutoffs):
    print(f'Running model for cutoff on {cutoff} onwards')
    cutoff_idx = indices[i]
    new_x_train = [x[cutoff_idx:] for x in x_train]
    new_y_train = y_train[cutoff_idx:]
   
    # Normalization layer for the trade history
    trade_history_normalizer = Normalization(name='Trade_history_normalizer')
    trade_history_normalizer.adapt(new_x_train[0],batch_size=BATCH_SIZE)

    # Normalization layer for the non-categorical and binary features
    noncat_binary_normalizer = Normalization(name='Numerical_binary_normalizer')
    noncat_binary_normalizer.adapt(new_x_train[2], batch_size = BATCH_SIZE)
    
    temp_model = train_model(new_x_train, new_y_train, True)
    
    experiment_models[cutoff] = temp_model
    print('\n\n')

Running model for cutoff on 2022-09-01 onwards
Epoch 1/35


2023-04-26 03:48:07.812549: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 723226 of 4425824
2023-04-26 03:48:17.812536: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 1648386 of 4425824
2023-04-26 03:48:27.812539: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 2732057 of 4425824
2023-04-26 03:48:37.812534: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 3803395 of 4425824
2023-04-26 03:48:43.618657: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:405] Shuffle buffer filled.
2023-04-26 03:48:47.734621: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8200




2023-04-26 03:49:55.070959: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 1066397 of 4425824
2023-04-26 03:50:05.070976: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 1939527 of 4425824
2023-04-26 03:50:15.070969: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 2696209 of 4425824
2023-04-26 03:50:25.070968: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 3762029 of 4425824
2023-04-26 03:50:31.293418: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:405] Shuffle buffer filled.


Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35



Running model for cutoff on 2022-11-01 onwards
Epoch 1/35


2023-04-26 04:22:40.993235: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 1060021 of 2731538
2023-04-26 04:22:50.993231: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 2123965 of 2731538


   5/2186 [..............................] - ETA: 30s - loss: 56.7090 - mean_absolute_error: 56.7090     

2023-04-26 04:22:56.682901: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:405] Shuffle buffer filled.




2023-04-26 04:23:45.453985: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 1072046 of 2731538
2023-04-26 04:23:55.453995: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 2141768 of 2731538
2023-04-26 04:24:00.917463: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:405] Shuffle buffer filled.


Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35



Running model for cutoff on 2022-12-01 onwards
Epoch 1/35


2023-04-26 04:42:50.732321: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 1092744 of 1707057


   5/1366 [..............................] - ETA: 18s - loss: 60.9581 - mean_absolute_error: 60.9581    

2023-04-26 04:42:56.297209: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:405] Shuffle buffer filled.




2023-04-26 04:43:32.692823: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 1088807 of 1707057
2023-04-26 04:43:38.326357: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:405] Shuffle buffer filled.


Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35





In [32]:
results = pd.DataFrame(columns=['cutoff', 'MAE'])
for key, model in experiment_models.items():
    predictions = model.predict(x_test, batch_size=BATCH_SIZE)
    results = results.append({'cutoff':key, 
                   'MAE': round(np.mean(np.abs(np.array(y_test) - np.array(predictions).reshape(-1,))), 3)},
                   ignore_index=True)

In [33]:
results

Unnamed: 0,cutoff,MAE
0,2022-09-01,10.528
1,2022-11-01,10.988
2,2022-12-01,11.36


<b>Testing 3 cutoffs: </b>

['2022-09-01': 5 months, 
'2022-11-01': 3 months, 
'2022-12-01': 2 months]

UNshuffled

In [None]:
experiment_models_unshuffled = {}

for i, cutoff in enumerate(cutoffs):
    print(f'Running model for cutoff on {cutoff} onwards')
    cutoff_idx = indices[i]
    new_x_train = [x[cutoff_idx:] for x in x_train]
    new_y_train = y_train[cutoff_idx:]
   
    # Normalization layer for the trade history
    trade_history_normalizer = Normalization(name='Trade_history_normalizer')
    trade_history_normalizer.adapt(new_x_train[0],batch_size=BATCH_SIZE)

    # Normalization layer for the non-categorical and binary features
    noncat_binary_normalizer = Normalization(name='Numerical_binary_normalizer')
    noncat_binary_normalizer.adapt(new_x_train[2], batch_size = BATCH_SIZE)
    
    temp_model = train_model(new_x_train, new_y_train, False)
    
    experiment_models_unshuffled[cutoff] = temp_model
    print('\n\n')

Running model for cutoff on 2022-09-01 onwards


In [None]:
results_unshuffled = pd.DataFrame(columns=['cutoff', 'MAE'])
for key, model in experiment_models_unshuffled.items():
    predictions = model.predict(x_test, batch_size=BATCH_SIZE)
    results_unshuffled = results_unshuffled.append({'cutoff':key, 
                   'MAE': round(np.mean(np.abs(np.array(y_test) - np.array(predictions).reshape(-1,))), 3)},
                   ignore_index=True)

In [36]:
results_unshuffled

Unnamed: 0,cutoff,MAE
0,2022-09-01,13.144
1,2022-11-01,12.699
2,2022-12-01,12.255


<b>Testing 3 cutoffs: </b>

['2022-09-01': 5 months, 
'2022-11-01': 3 months, 
'2022-12-01': 2 months]

50% Shuffle Buffer

In [None]:
import logging

logging.basicConfig(filename='app.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s')
logging.warning('This will get logged to a file')


In [41]:
experiment_models_partially_shuffled = {}

for i, cutoff in enumerate(cutoffs):
    print(f'Running model for cutoff on {cutoff} onwards')
    cutoff_idx = indices[i]
    new_x_train = [x[cutoff_idx:] for x in x_train]
    new_y_train = y_train[cutoff_idx:]
   
    # Normalization layer for the trade history
    trade_history_normalizer = Normalization(name='Trade_history_normalizer')
    trade_history_normalizer.adapt(new_x_train[0],batch_size=BATCH_SIZE)

    # Normalization layer for the non-categorical and binary features
    noncat_binary_normalizer = Normalization(name='Numerical_binary_normalizer')
    noncat_binary_normalizer.adapt(new_x_train[2], batch_size = BATCH_SIZE)
    
    temp_model = train_model(new_x_train, new_y_train, True, 0.5)
    
    experiment_models_partially_shuffled[cutoff] = temp_model
    print('\n\n')

Running model for cutoff on 2022-09-01 onwards
Epoch 1/35


2023-04-26 06:23:30.810145: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 1083302 of 2212912
2023-04-26 06:23:40.810138: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 2165214 of 2212912


   1/3541 [..............................] - ETA: 24:11:23 - loss: 52.4213 - mean_absolute_error: 52.4213

2023-04-26 06:23:41.249566: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:405] Shuffle buffer filled.




2023-04-26 06:25:11.683213: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 556359 of 2212912
2023-04-26 06:25:21.683208: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 1633948 of 2212912
2023-04-26 06:25:27.026515: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:405] Shuffle buffer filled.


Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35



Running model for cutoff on 2022-11-01 onwards
Epoch 1/35


2023-04-26 06:55:43.178720: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 1091271 of 1365769


   5/2186 [..............................] - ETA: 29s - loss: 52.2220 - mean_absolute_error: 52.2220     

2023-04-26 06:55:45.682214: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:405] Shuffle buffer filled.




2023-04-26 06:56:46.778473: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 1092857 of 1365769
2023-04-26 06:56:49.277211: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:405] Shuffle buffer filled.


Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35



Running model for cutoff on 2022-12-01 onwards
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35





In [42]:
results_partially_shuffled = pd.DataFrame(columns=['cutoff', 'MAE'])
for key, model in experiment_models_partially_shuffled.items():
    predictions = model.predict(x_test, batch_size=BATCH_SIZE)
    results_partially_shuffled = results_partially_shuffled.append({'cutoff':key, 
                   'MAE': round(np.mean(np.abs(np.array(y_test) - np.array(predictions).reshape(-1,))), 3)},
          fit_callbacks   ignore_index=True)

In [43]:
results_partially_shuffled

Unnamed: 0,cutoff,MAE
0,2022-09-01,10.989
1,2022-11-01,10.998
2,2022-12-01,11.292


In [113]:
print('Unshuffled:')
display(results_unshuffled)
print('='*50)
print('Shuffled:')
display(results)
print('='*50)
print('Partially Shuffled:')
display(results_partially_shuffled)

Unshuffled:


Unnamed: 0,cutoff,MAE
0,2022-09-01,13.144
1,2022-11-01,12.699
2,2022-12-01,12.255


Shuffled:


Unnamed: 0,cutoff,MAE
0,2022-09-01,10.528
1,2022-11-01,10.988
2,2022-12-01,11.36


Partially Shuffled:


Unnamed: 0,cutoff,MAE
0,2022-09-01,10.989
1,2022-11-01,10.998
2,2022-12-01,11.292
