In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import time
import json
import sys
import os
import gc

In [None]:
import tensorflow as tf
# Connect this runtime to a TPU and build the distribution strategy that the
# model is later created and compiled under (see the `strategy.scope()` cell).
# NOTE(review): tpu='' presumably lets the resolver discover the TPU from the
# environment — confirm against the platform this notebook runs on.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.TPUStrategy(resolver)

In [None]:
# Column name -> dtype mapping for the training table.
# This dict is not referenced by any other cell visible in this notebook.
# NOTE(review): 'VWAP' is the only float column declared as float16 while every
# other price column is float64 — confirm that the reduced range/precision is
# intended before using this mapping to load data.
dtypes = {'Asset_ID':'int8', 'row_id':'int32', 'Count':'int32', 'Open':'float64', 'High':'float64',
          'Low':'float64', 'Close':'float64', 'Volume':'float64', 'VWAP':'float16',
          "Upper_Shadow":'float64', "Lower_Shadow":'float64', "Gap":'float64'}

In [None]:
# Number of consecutive rows that make up one model input window.
WINDOW = 3
# Batch size used when building and fitting the model.
BATCH_SIZE = 41536
# Epochs passed to each `model.fit` call in the training loop.
EPOCHS = 50

# Asset metadata, sorted by Asset_ID so that list position follows the ID order.
ASSET_DETAILS = pd.read_csv(
    "../input/g-research-crypto-forecasting/asset_details.csv").sort_values(by=['Asset_ID'])
# Per-asset weights, used as the "Weight" feature (in place of VWAP).
# feature_engineering() looks these up with WEIGHTS[int(Asset_ID)], which
# assumes the sorted Asset_IDs are exactly 0..N-1 — TODO confirm.
WEIGHTS = ASSET_DETAILS.Weight.values.tolist()
# Feature columns fed to the model, in this order. The model's feature input is
# declared with a hard-coded width of 10, which matches the length of this list.
TRAIN_COLS = ["Count", "Open", "High", "Low", "Close", "Volume",
              "Weight", "Upper_Shadow", "Lower_Shadow", "Gap"]

def reindexer(df):
    """Place one frame on a regular 60-unit timestamp grid and fill the gaps.

    The frame is ordered by its 'timestamp' column, which then becomes the
    index. The index is expanded to every multiple of 60 between the first and
    last timestamp (both inclusive), and the rows introduced by that expansion
    are filled by linear interpolation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an integer 'timestamp' column. Not modified.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by the regular timestamp grid.
    """
    by_time = df.sort_values(by=['timestamp'])
    by_time = by_time.set_index('timestamp')

    first_ts = by_time.index[0]
    last_ts = by_time.index[-1]
    # "+ 60" makes the final timestamp part of the (half-open) range.
    full_grid = range(first_ts, last_ts + 60, 60)

    regular = by_time.reindex(full_grid)
    return regular.interpolate(method='linear')

def feature_engineering(df):
    """Add the derived feature columns and regularise each asset's timeline.

    Four columns are written onto `df` itself (the caller's frame is mutated):
    'Weight' (looked up in WEIGHTS by integer Asset_ID), 'Upper_Shadow',
    'Lower_Shadow' and 'Gap'. Each Asset_ID group is then passed through
    `reindexer`, the group level is dropped from the resulting index, and the
    rows are ordered by 'row_id'.

    Note: any feature added here must also be added to TRAIN_COLS.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide Asset_ID, Open, High, Low, Close, timestamp and row_id.

    Returns
    -------
    pandas.DataFrame
        The re-indexed frame sorted by 'row_id'.
    """
    opens = df['Open']
    closes = df['Close']

    # Keep this assignment order: it fixes the order of the new columns.
    df['Weight'] = df['Asset_ID'].apply(lambda asset: WEIGHTS[int(asset)])
    df['Upper_Shadow'] = df['High'] - np.maximum(opens, closes)
    df['Lower_Shadow'] = np.minimum(opens, closes) - df['Low']
    df["Gap"] = np.abs(closes - opens)

    per_asset = df.groupby('Asset_ID').apply(reindexer)
    # Level 0 of the grouped index is the Asset_ID key; discard it.
    flattened = per_asset.reset_index(0, drop=True)
    return flattened.sort_values(by='row_id')

def rescale_values(df):
    """Min-max scale the training columns of a single-asset frame.

    The asset is identified from the first row's Asset_ID and its per-column
    'min' / 'max' values are read from SCALER (keyed by the asset id as a
    string). Every column in TRAIN_COLS except 'Weight' is replaced by
    (value - min) / (max - min).

    The scaling is done with vectorised column arithmetic rather than a
    per-element Python callback. A column whose recorded min and max are equal
    is set to 0.0 instead of dividing by zero, and an empty frame is returned
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of exactly one asset; modified in place and also returned.

    Returns
    -------
    pandas.DataFrame
        The same frame with rescaled feature columns.
    """
    # Nothing to identify the asset from, and nothing to scale.
    if df.shape[0] == 0:
        return df

    asset_id = str(int(df.Asset_ID.values[0]))
    asset_scaler = SCALER[asset_id]

    for col in TRAIN_COLS:
        if col == 'Weight':
            continue
        low = asset_scaler[col]['min']
        span = asset_scaler[col]['max'] - low
        if span == 0:
            # Constant column in the fitted statistics: no spread to scale by.
            df[col] = 0.0
        else:
            df[col] = (df[col] - low) / span
    return df

In [None]:
# Load the saved scaling statistics. rescale_values() reads them as
# SCALER[<asset id as str>][<column>]['min' | 'max'].
with open("../input/data-cleaning/scale_saved_values.json", 'r') as f:
    SCALER = json.load(f)

In [None]:
class TimeSeriesLoader():
    '''Creates a windowed dataset for Time Series Model.

    The source (a DataFrame or the path of a CSV file) is processed in
    N_SLICES row slices. For every slice and every asset in it, one window of
    `window` consecutive feature rows is produced per input row; the first
    `window - 1` rows of a slice are completed with the last window saved for
    that asset by the previous slice (or zeros when none exists).

    After construction:
      X   -> array of shape (rows, window, len(TRAIN_COLS))
      ID  -> Asset_ID of each window
      y   -> 'Target' of each window (only filled when train=True)
    '''

    # Rows at or after this timestamp are excluded from file-backed training
    # data so that training does not overlap with the test period.
    TEST_START_TIMESTAMP = 1623542400
    # Directory where the most recent window of every asset is persisted.
    LAST_WINDOW_DIR = "./last_windows/"
    # Read-only location searched when LAST_WINDOW_DIR has no saved window.
    FALLBACK_WINDOW_DIR = "../input/window-creation/"
    # Number of slices the source is processed in (to save on RAM).
    N_SLICES = 10
    # Rows per slice for file sources (about 10% of the file, per the
    # original author's note: "10% is 25939823 - from previous notebook").
    FILE_SLICE_ROWS = 2600000

    def __init__(self, df, window=3, train=True):
        '''
        df         => DataFrame, or path of a CSV file, where data will be extracted
        window     => Length of window for Time Series
        train      => If True, targets are collected as well

        The test-period cut-off is applied per slice to file sources (see
        `_load_slice`). DataFrame sources are used as given, which matches the
        previous behaviour; the old constructor-level filter could only ever
        run on a file path, where it raised a TypeError.
        '''
        self.df = df
        self.window = window
        self.is_train = train
        self.is_file = isinstance(self.df, str)

        self.X = np.array([])
        self.ID = np.array([])
        self.y = np.array([])

        if self.is_file:
            n = self.FILE_SLICE_ROWS
        else:
            n = int(np.ceil(self.df.shape[0] / self.N_SLICES))

        for i in range(self.N_SLICES):
            self.create_windows(i*n, (i+1)*n) #Initialize data with windows

    def __len__(self):
        '''Number of rows: loaded windows for a file source, frame rows otherwise.'''
        if self.is_file:
            # A path has no shape; count what was actually loaded.
            return self.ID.shape[0]
        return self.df.shape[0]

    def __getitem__(self, idx):
        '''Fetch data at index idx extending |window| steps back. If train, fetch the targets.'''
        if self.is_train:
            return self.X[idx], self.ID[idx], self.y[idx]
        else:
            return self.X[idx], self.ID[idx]

    def _load_slice(self, start, end):
        '''Return data rows [start, end) of the source as a DataFrame.'''
        if not self.is_file:
            return self.df.iloc[start:end].copy()

        # Line 0 of the file is the header, so data row k is on line k + 1:
        # skipping lines 1..start makes the slice begin at data row `start`
        # (skipping only 1..start-1 re-read the last row of the previous slice).
        inter_df = pd.read_csv(self.df, skiprows=range(1, start + 1), nrows=end - start)

        if self.is_train:
            # Drop the tail that overlaps with the test period.
            if 'timestamp' in inter_df.columns:
                inter_df = inter_df[inter_df.timestamp < self.TEST_START_TIMESTAMP]
            else:
                inter_df = inter_df[inter_df.index < self.TEST_START_TIMESTAMP]
        return inter_df

    def _load_last_window(self, asset_id):
        '''Return the last saved window of `asset_id`, or zeros when none can be read.'''
        fname = f"last_window_id{asset_id}_{self.window}.npz"
        for directory in (self.LAST_WINDOW_DIR, self.FALLBACK_WINDOW_DIR):
            try:
                # The context manager closes the archive once 'x' is in memory.
                with np.load(directory + fname) as saved:
                    return saved['x']
            except Exception:
                # Best effort: a missing or unreadable file means "no history".
                continue
        return np.zeros((self.window, len(TRAIN_COLS)))

    def create_windows(self, start, end):
        '''Build the windows for source rows [start, end) and append them.

        Windows are produced asset by asset, then ordered by `row_id` before
        being appended to X / ID (and y when training). The last window of
        each asset is written to LAST_WINDOW_DIR so that the next slice can
        continue from it. `self.row` holds the row ids of the slice just
        processed and `self.last_window` the most recently saved window.
        '''
        n_features = len(TRAIN_COLS)
        # Make sure that the data shape is of this pattern
        needed_shape = (-1, self.window, n_features)

        os.makedirs(self.LAST_WINDOW_DIR, exist_ok=True)

        inter_df = self._load_slice(start, end)

        # Every list starts with an empty float array so that the final
        # concatenation works for an empty slice and yields float arrays.
        X_parts = [np.empty((0, self.window, n_features))]
        ID_parts = [np.array([])]
        row_parts = [np.array([])]
        y_parts = [np.array([])]

        #Loop through each asset_id
        for asset_id in set(inter_df.Asset_ID):
            df = rescale_values(inter_df[inter_df.Asset_ID == asset_id].copy())
            features = df[TRAIN_COLS].values
            n_rows = features.shape[0]

            if self.is_train:
                if 'Target' not in df.columns:
                    raise ValueError("training data must contain a 'Target' column")
                y_parts.append(df['Target'].values)

            #Fetch the last saved window. Otherwise, initialize with zeros.
            temp = self._load_last_window(asset_id)

            # The first window-1 rows have no full history inside this slice:
            # roll the saved window forward by one row at a time.
            warmup = []
            for i in range(min(n_rows, self.window - 1)):
                temp[:self.window-1] = temp[1:]
                temp[-1] = features[i]
                warmup.append(temp.copy())

            # Row indices of every full window that fits inside this slice.
            n_full = max(n_rows - self.window + 1, 0)
            full_idx = np.arange(n_full)[:, None] + np.arange(self.window)

            asset_X = np.concatenate((
                np.array(warmup).reshape(needed_shape), #Below windows
                features[full_idx].reshape(needed_shape)), #Windows
                axis=0
            )

            X_parts.append(asset_X)
            ID_parts.append(df.Asset_ID.values)
            row_parts.append(df.row_id.values)

            #Save last window for a moving updated window historical data
            self.last_window = asset_X[-1]
            np.savez(self.LAST_WINDOW_DIR + f"last_window_id{asset_id}_{self.window}.npz",
                     x=self.last_window)

            del df, temp

        self.row = np.concatenate(row_parts, axis=0)

        #Sort Mini Batch
        sorter = np.argsort(self.row)
        #Make sure you are not sorting empty array
        if len(sorter):
            X_i = np.concatenate(X_parts, axis=0)[sorter]
            ID_i = np.concatenate(ID_parts, axis=0)[sorter]

            #Append mini batch to full
            self.X = np.concatenate((self.X.reshape(needed_shape), X_i), axis=0)
            self.ID = np.concatenate((self.ID, ID_i), axis=0)

            if self.is_train:
                y_i = np.concatenate(y_parts, axis=0)[sorter]
                self.y = np.concatenate((self.y, y_i), axis=0)

        del X_parts, ID_parts, row_parts, y_parts
        gc.collect()

In [None]:
class NBatchLogger(tf.keras.callbacks.Callback):
    """
    A Logger that log average performance per `display` steps.

    Metric values reported at the end of every batch are summed; every
    `display` batches the sums are divided by `display`, printed on one line,
    and the accumulator is cleared.
    """
    def __init__(self, display):
        """
        display => number of batches between two printed lines
        """
        # Let the base class set up the attributes Keras expects on a callback.
        super().__init__()
        self.step = 0
        self.display = display
        self.metric_cache = {}

    def on_batch_end(self, batch, logs=None):
        """Accumulate this batch's metrics and print the running averages when due."""
        # A shared mutable default ({}) is avoided; None means "no logs".
        logs = logs or {}
        params = self.params or {}
        self.step += 1

        # Use the metric list from params when it is provided; otherwise fall
        # back to whatever the logs contain, minus the bookkeeping entries.
        metric_names = params.get('metrics') or [
            k for k in logs if k not in ('batch', 'size')]
        for k in metric_names:
            if k in logs:
                self.metric_cache[k] = self.metric_cache.get(k, 0) + logs[k]

        if self.step % self.display == 0:
            metrics_log = ''
            for (k, v) in self.metric_cache.items():
                val = v / self.display
                # Small magnitudes are easier to read in scientific notation.
                if abs(val) > 1e-3:
                    metrics_log += ' - %s: %.4f' % (k, val)
                else:
                    metrics_log += ' - %s: %.4e' % (k, val)
            print('step: {}/{} ... {}'.format(self.step,
                                          params.get('steps'),
                                          metrics_log))
            self.metric_cache.clear()

In [None]:
#train_data_loader = TimeSeriesLoader("../input/gresearchcustomfetrain/train_with_FE.csv")
class NPZTrainIterator():
    """Serves fixed-size training batches from a saved .npz archive.

    The archive must contain the arrays 'X', 'ID' and 'y'. 'ID' is reshaped
    to a single column. Every batch has exactly `batch_size` rows: when fewer
    rows remain, the batch is completed with zeros.
    """
    def __init__(self, batch_size, path='../input/window-creation/train_data_ready.npz'):
        """
        batch_size => number of rows in every batch
        path       => .npz archive to read (defaults to the prepared training data)
        """
        self.batch = batch_size
        # The context manager closes the archive once the arrays are in memory.
        with np.load(path) as data:
            self.X = data['X']
            self.ID = data['ID'].reshape((-1, 1))
            self.y = data['y']
        self.len = self.y.shape[0]

        gc.collect()

    def __getitem__(self, idx):
        """Return ((X, ID), y) for the batch starting at row `idx`.

        `idx` may be an integer (including NumPy integers) or a slice, in
        which case only the slice's start is used.
        """
        if isinstance(idx, slice):
            idx = idx.start or 0 #get start of slice

        stop = idx + self.batch
        if stop <= self.len:
            return ((self.X[idx:stop],
                     self.ID[idx:stop]),
                    self.y[idx:stop])

        # Partial batch: copy what is left into zero-filled arrays that keep
        # the per-row shape and dtype of the stored data (ID stays a column).
        n = max(self.len - idx, 0)
        X = np.zeros((self.batch,) + self.X.shape[1:], dtype=self.X.dtype)
        ID = np.zeros((self.batch,) + self.ID.shape[1:], dtype=self.ID.dtype)
        y = np.zeros((self.batch,) + self.y.shape[1:], dtype=self.y.dtype)

        X[:n] = self.X[idx:idx+n]
        ID[:n] = self.ID[idx:idx+n]
        y[:n] = self.y[idx:idx+n]
        return ((X, ID), y)

    def __iter__(self):
        """Yield ({'fea_in': X, 'emb_in': ID}, y) for consecutive batches."""
        for start in range(0, self.len, self.batch):
            (x, ids), y = self[start]
            yield {'fea_in':x, 'emb_in':ids}, y
        return

In [None]:
# Load the prepared training windows; the context manager closes the archive
# once the three arrays have been read into memory.
with np.load('../input/window-creation/train_data_ready.npz') as npz:
    X, ID, y = npz['X'], npz['ID'].reshape(-1, 1), npz['y']
#get residual
rmd = X.shape[0]%BATCH_SIZE
# Rows needed to reach the next multiple of BATCH_SIZE. The outer modulo makes
# this 0 (instead of a whole extra batch of zeros) when the data already fits.
pad_rows = (BATCH_SIZE - rmd) % BATCH_SIZE
print(f"Train rows: {X.shape[0]}, Residual rows: {rmd}, Rows to add: {pad_rows}")
#Pad data with zeros
# NOTE(review): the padded rows are all-zero samples with target 0 and are
# trained on like real data — confirm this is acceptable.
if pad_rows:
    # The per-row shape is taken from X itself rather than hard-coded.
    X = np.concatenate([X, np.zeros((pad_rows,) + X.shape[1:])], axis=0)
    ID = np.concatenate([ID, np.zeros((pad_rows, 1))], axis=0)
    y = np.concatenate([y, np.zeros((pad_rows,))], axis=0)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, concatenate, Dropout

# Fix the NumPy and TensorFlow global seeds before any layer is created, so
# that random initialisation is repeatable between runs.
seed = 703841
np.random.seed(seed)
tf.random.set_seed(seed)

def make_model(batch_size=None):
    """Build the two-branch regression network (not compiled).

    Feature branch: input 'fea_in' of shape (WINDOW, 10) -> three stacked
    128-unit LSTMs -> dropout. Asset branch: input 'emb_in' of shape (1,) ->
    128-dimensional embedding (15 possible ids) -> three stacked 128-unit
    LSTMs -> dropout. Both branches are concatenated and mapped to a single
    output by a Dense layer.

    Parameters
    ----------
    batch_size : int or None
        Fixed batch size declared on both inputs; None leaves it unspecified.

    Returns
    -------
    tf.keras.Model
        Model with inputs [fea_in, emb_in] and one output.
    """
    # Settings shared by the LSTM layers of both branches.
    lstm_base = dict(units=128, bias_initializer="random_normal")
    lstm_sequence = dict(lstm_base, return_sequences=True, dropout=0.1)
    lstm_final = dict(lstm_base, return_sequences=False)

    # --- branch over the windowed features ---
    fea_in = Input(shape=(WINDOW, 10), batch_size=batch_size, dtype=tf.float64, name='fea_in')
    feature_branch = LSTM(name='LSTM_fea1', **lstm_sequence)(fea_in)
    feature_branch = LSTM(name='LSTM_fea2', **lstm_sequence)(feature_branch)
    feature_branch = LSTM(name='LSTM_fea3', **lstm_final)(feature_branch)
    feature_branch = Dropout(0.20, name='DropoutFea')(feature_branch)

    # --- branch over the asset id ---
    emb_in = Input(shape=(1,), batch_size=batch_size, dtype=tf.int8, name='emb_in')
    asset_branch = Embedding(input_dim=15, output_dim=128, name='Embedding')(emb_in)
    asset_branch = LSTM(name='LSTM_emb1', **lstm_sequence)(asset_branch)
    asset_branch = LSTM(name='LSTM_emb2', **lstm_sequence)(asset_branch)
    asset_branch = LSTM(name='LSTM_emb3', **lstm_final)(asset_branch)
    asset_branch = Dropout(0.20, name='DropoutEmb')(asset_branch)

    # --- merge and predict ---
    merged = concatenate([feature_branch, asset_branch], name='Concatenate')
    prediction = Dense(1, name='Output')(merged)

    return tf.keras.Model(inputs=[fea_in, emb_in], outputs=[prediction])

# Build one instance outside the TPU strategy scope purely for inspection:
# print the layer summary and render the architecture diagram. The model that
# is actually trained is created separately inside `strategy.scope()`.
training_model = make_model(batch_size=BATCH_SIZE)
training_model.summary()
tf.keras.utils.plot_model(training_model)

In [None]:
def get_lr_metric(optimizer):
    """Create a metric function that reports `optimizer`'s decayed learning rate.

    The returned function is deliberately named `lr`: the training history is
    later read under the key 'lr'.

    Parameters
    ----------
    optimizer : Keras optimizer exposing `_decayed_lr`.

    Returns
    -------
    callable
        Function of (y_true, y_pred) returning the learning rate as float32.
    """
    def lr(y_true, y_pred):
        # Both arguments are required by the metric signature but are unused.
        decayed_rate = optimizer._decayed_lr(tf.float32)
        return decayed_rate

    return lr

In [None]:
# Create and compile the model that is actually trained. Everything is done
# inside the TPU strategy scope created earlier in the notebook.
with strategy.scope():
    model = make_model(batch_size=BATCH_SIZE)
    # Adam with a per-step learning-rate decay; the decayed value is exposed
    # as an extra metric through get_lr_metric(opt).
    opt = tf.keras.optimizers.Adam(learning_rate=0.001, decay=1e-5)
    model.compile(
        # The learning rate starts high on the first training loop and gets
        # lower on further iterations (the same optimizer is reused).
        optimizer=opt,
        loss='mean_squared_error',
        metrics=['mean_absolute_error', get_lr_metric(opt)])

In [None]:
from IPython.display import clear_output

# One History object per review pass; the histories of all parts trained
# within the same pass are merged into it.
full_history = []
# Rows per training part (200 batches).
step = BATCH_SIZE*200
# Number of passes ("reviews") over the whole training set.
revs = 10

# Number of parts per pass, used only for the progress message.
st = int(np.ceil(X.shape[0]/step))
for r in range(revs):
    for c, i in enumerate(range(0, X.shape[0], step)):
        clear_output(wait=True)
        print(f"Review part {r+1}/{revs}, Training on part {i} - {min(X.shape[0], i+step)} / {X.shape[0]} ({c+1}/{st}):")
        hist = model.fit(
            {'fea_in':X[i:i+step], 'emb_in':ID[i:i+step]}, y[i:i+step],
            batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1)
        if r < len(full_history):
            # Later part of this pass: extend the pass's history in place.
            merged = full_history[r].history
            for k, values in hist.history.items():
                merged.setdefault(k, []).extend(values)
        else:
            # First part of this pass: its History becomes the pass's entry.
            # (Explicit check instead of a bare `except`, which also hid
            # unrelated errors and keyboard interrupts.)
            full_history.append(hist)

model.save_weights("./model_weights.hdf5")

In [None]:
# Plot the training curves: one line per entry of `full_history` (one entry
# per review pass), loss on the left axis and mean absolute error on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(25, 6))
ax[0].title.set_text("Train Loss:")
ax[1].title.set_text("Train MAE:")
# Legend labels, collected in plotting order.
loss_leg = []
mae_leg = []
for i, h in enumerate(full_history):
    # Number of recorded epochs in this history (x axis length).
    n = len(h.history['loss'])
    ax[0].plot(range(n), h.history['loss'])
    loss_leg.append(f"loss_{i}")
    
    ax[1].plot(range(n), h.history['mean_absolute_error'])
    mae_leg.append(f"mae_{i}")
    
# Fixed y ranges; values outside them are clipped from view.
ax[0].set_ylim([0, 0.001])
ax[0].legend(loss_leg, loc='upper right')

ax[1].set_ylim([0, 0.02])
ax[1].legend(mae_leg, loc='upper right')

# Save before show(): the figure is written to disk, displayed, then released.
plt.savefig('loss_and_mae_plots.png')
plt.show()
plt.close()

In [None]:
# Plot the learning-rate metric recorded during training, one line per entry
# of `full_history`. The 'lr' key comes from the metric function returned by
# get_lr_metric(), whose inner function is named `lr`.
lr_leg = []
for i, h in enumerate(full_history):
    n = len(h.history['lr'])
    plt.plot(range(n), h.history['lr'])
    lr_leg.append(f"LR_{i}")
# Upper limit equals the optimizer's initial learning rate (0.001).
plt.ylim([0, 0.001])
plt.legend(lr_leg)
plt.title("Learning rate decay schedule:")

# Save before show(): the figure is written to disk, displayed, then released.
plt.savefig("lr_decay.png")
plt.show()
plt.close()

In [None]:
import shutil

# Copy every file found under a 'last_windows' path of the window-creation
# input into this notebook's output directory.
tgt_dir = '../working/last_windows/'
# Create the directory that is actually copied into. Previously
# './last_windows/' was created instead, which is only the same place when the
# working directory is '../working', and any failure was silently ignored.
os.makedirs(tgt_dir, exist_ok=True)

for dirname, _, filenames in os.walk('/kaggle/input/window-creation'):
    for filename in filenames:
        f = os.path.join(dirname, filename)
        # Only files whose full path mentions 'last_windows' are copied.
        if 'last_windows' in f:
            shutil.copy(f, tgt_dir)
            print('copied:', f)

## Since TPUs can't be used in this competition, prediction is done in another notebook! (>,0)/