In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datatable as dt # Fast data reading/writing
import seaborn as sns
import matplotlib.pyplot as plt

First, we load the asset details so that we can load the data in the order of their asset id. Filenames are based on asset names.

In [None]:
# Asset metadata keyed by Asset_ID; sorting once keeps `names` and `ids` aligned position by position.
asset_details = pd.read_csv('/kaggle/input/g-research-crypto-forecasting/asset_details.csv', index_col='Asset_ID')
assets_by_id = asset_details.sort_index()
names = assets_by_id['Asset_Name'].values  # asset names ordered by ascending Asset_ID
ids = assets_by_id.index  # the matching Asset_IDs in the same order

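Since the dataframes are too big to load, train on and evaluate all at once, we define a function `train` that loads a single asset's data and fits its model in one call. A second function, `global_train`, fits one additional model on the data of all assets combined.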

In [None]:
from lightgbm import LGBMRegressor
from scipy.stats import pearsonr 
from glob import iglob
from datetime import datetime
# LightGBM hyperparameters, shared by every per-asset model and by the global model.
params = dict(
    lambda_l1=0.004498875792752676,
    lambda_l2=0.03243290696956152,
    num_leaves=60,
    max_depth=6,
    min_data_in_leaf=2496,
    learning_rate=0.18502752618241153,
    n_estimators=100,
    boosting_type='goss',
    random_state=1,
)

# Feature columns fed to the models, in the order the models are trained on.
used_features = [
    'RSI',
    'MACD_crossover_norm',
    'stochastic_crossover',
    'log_ret1',
    'log_ret30',
    'log_ret240',
    'log_ret1440',
    'mfi',
]

def train(asset_name):
    """Fit a LightGBM regressor on the engineered features of a single asset.

    Parameters
    ----------
    asset_name : str
        Asset name as listed in the asset details; it is lower-cased and spaces
        are replaced by underscores to build the ``.jay`` file name.

    Returns
    -------
    LGBMRegressor
        Model fitted on the ``used_features`` columns against ``Target``.
    """
    file_stem = asset_name.lower().replace(' ', '_')
    frame = dt.fread(f"/kaggle/input/crypto-challenge-mlii-project-feature-eng-2/{file_stem}.jay").to_pandas()
    # Drop the stored 'index' column and use the timestamp as the row index.
    frame = frame.drop('index', axis=1).set_index('timestamp')

    # Split into the feature matrix and the regression labels.
    feature_matrix = frame.drop(['Target'], axis=1)[used_features]
    labels = frame.Target

    regressor = LGBMRegressor(**params)
    regressor.fit(feature_matrix, labels)

    print(f'Trained model for {asset_name}')

    return regressor

def global_train():
    """Fit one LightGBM regressor on the engineered data of all assets combined.

    Every ``.jay`` file in the feature-engineering dataset is loaded and the
    frames are stacked before fitting, so all of them are held in memory at once.

    Returns
    -------
    LGBMRegressor
        Model fitted on the ``used_features`` columns against ``Target``.
    """
    pattern = "/kaggle/input/crypto-challenge-mlii-project-feature-eng-2/*.jay"
    per_asset_frames = [dt.fread(path).to_pandas() for path in iglob(pattern)]
    combined = pd.concat(per_asset_frames)

    X_all = combined.drop('Target', axis=1)[used_features]
    y_all = combined.Target

    shared_model = LGBMRegressor(**params)
    shared_model.fit(X_all, y_all)

    return shared_model

### Feature engineering functions

In [None]:
from collections import deque

class DynamicSimpleMovingAverage:
    """Simple moving average that is advanced one value at a time.

    The last ``window_size`` values of ``column`` seed the window; every value
    passed to :meth:`get_sma` afterwards pushes the oldest value out.
    """

    def __init__(self, column, window_size):
        """Seed the window from training data.

        Parameters
        ----------
        column : pandas.Series
            Values from the training data; only the last ``window_size`` are kept.
        window_size : int
            Number of values averaged over.
        """
        self.window_size = window_size
        self.window = deque(column.iloc[-window_size:].to_numpy())
        self.sum = np.sum(self.window)

    def get_sma(self, new_values):
        """Slide the window over ``new_values`` and return one average per value.

        Parameters
        ----------
        new_values : iterable of float
            New observations, oldest first.

        Returns
        -------
        numpy.ndarray
            Moving average after each new value has entered the window.
        """
        mas = np.full(len(new_values), np.nan)
        for i, value in enumerate(new_values):
            self.sum -= self.window.popleft()
            self.sum += value
            self.window.append(value)
            # A NaN/inf that entered the running sum can never be subtracted out
            # again (nan - nan is nan), which would make every later average NaN.
            # Rebuild the sum from the window so it recovers as soon as the bad
            # value has left the window.
            if not np.isfinite(self.sum):
                self.sum = np.sum(self.window)
            mas[i] = self.sum / self.window_size
        return mas

In [None]:
class DynamicExponentialMovingAverage:
    """Exponential moving average that is advanced one value at a time.

    Unlike the simple moving average, only the previous EMA value has to be
    remembered between updates.
    """

    def __init__(self, column, window_size):
        """Seed the EMA from training data.

        Parameters
        ----------
        column : array-like
            Previously computed EMA values from the training data; only the last
            one is used. May be a NumPy array, a list or a pandas Series.
        window_size : int
            EMA span; the smoothing factor is ``2 / (1 + window_size)``.
        """
        # Positional access: `column[-1]` on a pandas Series is a *label* lookup
        # and fails (or relies on a deprecated fallback) unless -1 is a label.
        self.prev_ema = np.asarray(column)[-1]
        self.alpha = 2/(1+window_size)

    def get_ema(self, new_values):
        """Advance the EMA over ``new_values`` and return one EMA per value.

        Parameters
        ----------
        new_values : array-like of float
            New observations, oldest first.

        Returns
        -------
        numpy.ndarray
            EMA after each new value has been folded in.
        """
        values = np.asarray(new_values)  # positional iteration regardless of any pandas index
        emas = np.full(len(values), np.nan)
        for i, value in enumerate(values):
            self.prev_ema = self.alpha * value + self.prev_ema * (1-self.alpha)
            emas[i] = self.prev_ema
        return emas

In [None]:
class Feature:
    """Common interface of the incrementally updated features.

    Subclasses keep whatever state they need between calls and override
    :meth:`get`.
    """

    def __init__(self, name):
        # Name of the column the computed feature is stored under.
        self.name = name

    def get(self, new_values):
        """Compute the feature for a dataframe of new rows.

        The base implementation does nothing and returns ``None``.
        """
        return None

In [None]:
class RSI(Feature):
    """Relative Strength Index, continued from the state at the end of training."""

    def __init__(self, name, close_col, gain_mean, loss_mean, period):
        """Seed the running averages from training data.

        Parameters
        ----------
        name : str
            Column name of the feature.
        close_col : pandas.Series
            Close prices of the training data; the last one is the reference for
            the first new price.
        gain_mean, loss_mean : array-like
            Smoothed average gains / losses from the training data; only the last
            value of each is used.
        period : int
            Smoothing period of the running averages.
        """
        super().__init__(name)
        # Positional access: `series[-1]` on a timestamp-indexed Series is a label
        # lookup that only works through a deprecated positional fallback.
        self.gain_mean = np.asarray(gain_mean)[-1]
        self.loss_mean = np.asarray(loss_mean)[-1]
        self.last_val = close_col.to_numpy()[-1]
        self.period = period

    def get_diff(self, new_value):
        """Return ``(gain, loss)`` of ``new_value`` relative to the previous price.

        Exactly one of the two is non-zero (both are zero for an unchanged
        price). The previous price is replaced by ``new_value``.
        """
        previous = self.last_val
        self.last_val = new_value
        if new_value < previous:
            return 0, abs(new_value - previous)
        return new_value - previous, 0

    def get(self, new_values):
        """Compute the RSI for every row of ``new_values``.

        Parameters
        ----------
        new_values : pandas.DataFrame
            New rows, oldest first, with a ``Close`` column.

        Returns
        -------
        numpy.ndarray
            One RSI value per row.
        """
        close_col = new_values['Close'].to_numpy()

        rsis = np.zeros(len(close_col))
        for i in range(len(close_col)):
            gain, loss = self.get_diff(close_col[i])
            # Running averages in which the newest observation gets weight 1/period.
            self.gain_mean = (self.gain_mean * (self.period-1) + gain)/self.period
            self.loss_mean = (self.loss_mean * (self.period-1) + loss)/self.period
            # NOTE(review): a zero average loss is not special-cased; with NumPy
            # floats the ratio becomes inf (RSI 100), or NaN when both averages
            # are zero. Confirm this matches how the training column was built.
            rs = self.gain_mean / self.loss_mean
            rsis[i] = (100 - 100/(1+rs))

        return rsis

In [None]:
class MFI(Feature):
    """Money Flow Index over a sliding window of signed money flows."""

    def __init__(self, name, close_col, volume_col, period):
        """Seed the money-flow window from training data.

        Parameters
        ----------
        name : str
            Column name of the feature.
        close_col : pandas.Series
            Price series of the training data. New prices are read from the
            ``VWAP`` column in :meth:`get`, so the caller should pass VWAP here.
        volume_col : pandas.Series
            Volumes of the training data, aligned with ``close_col``.
        period : int
            Number of money flows kept in the window.
        """
        super().__init__(name)
        used_close = close_col.to_numpy()[-(period+1):]
        used_vol = volume_col.to_numpy()[-(period+1):]
        sign_diffs = np.sign(used_close[1:] - used_close[:-1])
        # Float dtype so that new flows written into the window are never truncated.
        self.money_flow = np.asarray(sign_diffs * used_close[1:] * used_vol[1:], dtype=float)
        self.pos = self.money_flow.clip(min=0).sum()
        self.neg = -1 * self.money_flow.clip(max=0).sum()
        self.last_val = used_close[-1]

    def get(self, new_values):
        """Compute the MFI for every row of ``new_values``.

        Parameters
        ----------
        new_values : pandas.DataFrame
            New rows, oldest first, with ``VWAP`` and ``Volume`` columns.

        Returns
        -------
        numpy.ndarray
            One MFI value per row; 50 when the window holds no negative flow.
        """
        close_col = new_values['VWAP'].to_numpy()
        vol_col = new_values['Volume'].to_numpy()
        mfis = np.zeros(len(close_col))

        for i in range(len(close_col)):
            # Slide the window: the oldest flow wraps to the end and is overwritten below.
            self.money_flow = np.roll(self.money_flow, -1)

            curr_close = close_col[i]
            curr_vol = vol_col[i]
            flow = curr_close * curr_vol
            # A falling price counts as negative flow, anything else as positive.
            self.money_flow[-1] = -flow if curr_close < self.last_val else flow
            # The next row must be compared against this price, not against the
            # last training price.
            self.last_val = curr_close

            # Recompute the totals from the window instead of updating them
            # incrementally: this avoids floating-point drift (which would break
            # the exact `== 0` test below) and lets a NaN flow drop out again once
            # it leaves the window.
            self.pos = self.money_flow.clip(min=0).sum()
            self.neg = -1 * self.money_flow.clip(max=0).sum()

            if self.neg == 0: # to prevent div by zero
                mfis[i] = 50
            else:
                mfis[i] = 100 - 100/(1 + self.pos/self.neg)

        return mfis

In [None]:
class MACD(Feature):
    """Normalized distance between the MACD line and its signal line."""

    def __init__(self, name, macd_long_period, macd_long_col, macd_short_period, macd_short_col, signal_period, macd_col):
        """Seed the three EMAs from training data.

        Each ``*_col`` argument is a pandas Series whose last value becomes the
        starting point of the corresponding EMA: the long EMA, the short EMA and
        the signal-line EMA (seeded from ``macd_col``).
        """
        super().__init__(name)
        make_ema = DynamicExponentialMovingAverage
        self.macd_long_ema = make_ema(macd_long_col.to_numpy(), macd_long_period)
        self.macd_short_ema = make_ema(macd_short_col.to_numpy(), macd_short_period)
        self.signal_ema = make_ema(macd_col.to_numpy(), signal_period)

    def get(self, new_values):
        """Return ``(MACD - signal) / signal`` for every row of ``new_values``.

        ``new_values`` is a dataframe of new rows, oldest first, with a ``Close``
        column. A signal value of zero is not special-cased.
        """
        closes = new_values['Close'].to_numpy()
        short_line = self.macd_short_ema.get_ema(closes)
        long_line = self.macd_long_ema.get_ema(closes)
        macd_line = short_line - long_line
        signal_line = self.signal_ema.get_ema(macd_line)
        # Normalize the crossover distance by the signal line.
        return (macd_line - signal_line) / signal_line

In [None]:
class Stochastic(Feature):
    """Stochastic oscillator crossover: %K minus its 3-value moving average (%D)."""

    def __init__(self, name, close_col, k_col, period):
        """Seed the price window and the %D average from training data.

        Parameters
        ----------
        name : str
            Column name of the feature.
        close_col : pandas.Series
            Close prices of the training data; the last ``period`` are kept.
        k_col : pandas.Series
            %K values of the training data, used to seed the %D moving average.
        period : int
            Number of close prices in the look-back window.
        """
        super().__init__(name)
        recent_closes = close_col.iloc[-period:].to_numpy()
        self.window = deque(recent_closes)
        self.low = recent_closes.min()
        self.high = recent_closes.max()
        self.d_sma = DynamicSimpleMovingAverage(k_col, 3) # needs K% column of training data

    def get(self, new_values):
        """Return %K - %D for every row of ``new_values``.

        ``new_values`` is a dataframe of new rows, oldest first, with a ``Close``
        column. A window whose high equals its low is not special-cased.
        """
        closes = new_values['Close'].to_numpy()

        k = np.zeros(len(closes))
        for i, close in enumerate(closes):
            # Replace the oldest close by the newest one, then rescan the window.
            self.window.popleft()
            self.window.append(close)
            self.low = np.min(self.window)
            self.high = np.max(self.window)
            k[i] = (close - self.low)/(self.high - self.low) * 100

        return k - self.d_sma.get_sma(k)

In [None]:
class CumLogReturns(Feature):
    """Cumulative log return over the last ``period`` steps, updated incrementally."""

    def __init__(self, name, close_col, period):
        """Seed the window of one-step log returns from training data.

        Parameters
        ----------
        name : str
            Column name of the feature.
        close_col : pandas.Series
            Close prices of the training data; the last ``period + 1`` prices
            yield the ``period`` one-step log returns in the window.
        period : int
            Number of one-step returns summed.
        """
        super().__init__(name)
        used_col = close_col.to_numpy()[-(period+1):]
        self.window = np.log(used_col[1:] / used_col[:-1])
        self.sum = self.window.sum()
        self.last_val = used_col[-1]

    def get(self, new_values):
        """Return the cumulative log return for every row of ``new_values``.

        Parameters
        ----------
        new_values : pandas.DataFrame
            New rows, oldest first, with a ``Close`` column.

        Returns
        -------
        numpy.ndarray
            Sum of the last ``period`` one-step log returns after each row.
        """
        close_col = new_values['Close'].to_numpy()
        ret = np.zeros(len(close_col))
        for i in range(len(close_col)):
            self.sum -= self.window[0]
            new_ret = np.log(close_col[i] / self.last_val)
            self.sum += new_ret
            self.window = np.roll(self.window, -1)
            self.window[-1] = new_ret
            self.last_val = close_col[i]
            # A NaN/inf return (e.g. from a zero or missing close) can never be
            # subtracted out of the running sum again. Rebuild the sum from the
            # window so the feature recovers once the bad return has left it.
            if not np.isfinite(self.sum):
                self.sum = self.window.sum()
            ret[i] = self.sum
        return ret

### Functions for processing incoming data

In [None]:
def get_last_train_rows():
    """Return the last engineered training row of every asset in one dataframe.

    Returns
    -------
    pandas.DataFrame
        One row per asset, indexed by Asset_ID in ascending order (``names`` is
        ordered by ascending Asset_ID, so rows and index line up).
    """
    last_rows = []
    for name in names:
        path = f"/kaggle/input/crypto-challenge-mlii-project-feature-eng-2/{name.lower().replace(' ', '_')}.jay"
        # Slice [-1:] keeps a one-row DataFrame; concatenating Series instead
        # would glue all rows end-to-end into a single long Series.
        last_rows.append(dt.fread(path).to_pandas().iloc[-1:])
    concat = pd.concat(last_rows)
    concat.index = sorted(asset_details.index)
    return concat

In [None]:
def get_window(asset_dfs, window_size):
    """Return the current window (the last ``window_size`` rows) of every asset.

    Parameters
    ----------
    asset_dfs : iterable
        One dataframe (or other sliceable sequence) per asset.
    window_size : int
        Number of trailing rows to keep; frames shorter than this are returned
        whole and a size of 0 gives empty slices.

    Returns
    -------
    list
        The trailing slice of every element of ``asset_dfs``, in the same order.
    """
    # `[:-window_size]` would return everything *except* the window. An explicit
    # start offset also avoids `[-0:]`, which would return the whole frame.
    return [asset_df[max(len(asset_df) - window_size, 0):] for asset_df in asset_dfs]

In [None]:
from datetime import timedelta
def interpolate(test_batch, prev_timestamp_rows):
    """Fill per-asset gaps between the previously seen rows and a new test batch.

    For every asset in ``prev_timestamp_rows`` the new rows are put on a
    one-minute grid that starts at the asset's previous row; missing minutes and
    missing values are linearly interpolated. Rows created this way get
    ``row_id == -1`` so they can be skipped when predicting.

    Parameters
    ----------
    test_batch : pandas.DataFrame
        New rows indexed by timestamp, with at least the columns ``Asset_ID``
        and ``Volume``. It is not modified.
    prev_timestamp_rows : pandas.DataFrame
        The latest known row of every asset, indexed by timestamp, with at least
        the columns ``Asset_ID`` and ``Volume``. It is not modified.

    Returns
    -------
    dict
        Maps every Asset_ID of ``prev_timestamp_rows`` to a dataframe holding the
        asset's new (and interpolated) rows, without the previous row.
    """
    one_minute = timedelta(minutes=1)
    # Add the dummy row_id on a copy so the columns match the test batch without
    # altering the caller's frame.
    prev_rows = prev_timestamp_rows.assign(row_id=-1)
    batch_asset_ids = set(test_batch['Asset_ID'].tolist())

    asset_dfs = {}
    for asset_id in prev_rows['Asset_ID'].unique():
        prev_row = prev_rows[prev_rows['Asset_ID'] == asset_id]
        prev_time = prev_row.index[0]
        if asset_id not in batch_asset_ids:
            # Asset absent from the batch: create an all-NaN row one minute after
            # *this asset's* previous row, to be filled from that row below.
            placeholder_index = pd.DatetimeIndex([prev_time + one_minute], name=prev_row.index.name)
            asset_df = pd.DataFrame(np.nan, index=placeholder_index, columns=prev_rows.columns)
            asset_df['Asset_ID'] = asset_id
        else:
            # Copy so the cleaning below never writes into a slice of test_batch.
            asset_df = test_batch.loc[test_batch['Asset_ID'] == asset_id, :].copy()

        asset_df = asset_df.replace([np.inf, -np.inf], 0) # Replace infs with zeros
        # Zero volume seems unlikely, so interpolate this instead
        asset_df['Volume'] = asset_df['Volume'].mask(asset_df['Volume'] == 0.0)

        if asset_df.index[0] == prev_time:
            # If the timestamp did not advance, push the first row one minute
            # forward so asfreq() sees a unique index.
            stamps = list(asset_df.index)
            stamps[0] = stamps[0] + one_minute
            asset_df.index = pd.DatetimeIndex(stamps, name=asset_df.index.name)
        if asset_df.index[0] >= prev_time:
            # Put the previous row in front and expand to a one-minute grid.
            asset_df = pd.concat([prev_row, asset_df]).asfreq(freq='min')
            asset_df['row_id'] = asset_df['row_id'].fillna(-1) # Marks interpolated rows so prediction skips them
            asset_df['Asset_ID'] = asset_df['Asset_ID'].fillna(asset_id) # This should not be interpolated
            asset_df = asset_df.interpolate(method='linear', axis=0)
            asset_df = asset_df.iloc[1:] # Remove the previous row again
        asset_dfs[asset_id] = asset_df.ffill()
    return asset_dfs

In [None]:
def engineer_features(batch_assets, features):
    """Compute all features for the new rows of every asset.

    Parameters
    ----------
    batch_assets : dict
        Maps Asset_ID to a dataframe of new rows containing the raw columns
        ``Count``, ``High``, ``Low``, ``Open``, ``Close``, ``Volume``, ``VWAP``.
        The frames are not modified.
    features : dict
        Maps Asset_ID to a list of feature objects exposing ``name`` and
        ``get(df)``; calling ``get`` advances the state kept by the object.

    Returns
    -------
    dict
        Maps Asset_ID to a copy of its frame with one column per feature added
        (in list order) and the raw columns removed.
    """
    raw_columns = ['Count', 'High', 'Low', 'Open', 'Close', 'Volume', 'VWAP']
    engineered = {}
    for asset_id, raw_df in batch_assets.items():
        # Work on a copy: the caller keeps using these frames as raw windows, so
        # they must not pick up the feature columns.
        df = raw_df.copy()
        for feature in features[asset_id]:
            df[f'{feature.name}'] = feature.get(df)
        engineered[asset_id] = df.drop(raw_columns, axis=1)
    return engineered

In [None]:
def get_new_windows(old_windows, curr_batch, window_size):
    """Append the current batch to each asset's window and keep the newest rows.

    Parameters
    ----------
    old_windows : dict
        Maps Asset_ID to the dataframe holding the asset's current window.
    curr_batch : dict
        Maps Asset_ID to the dataframe of new rows; when empty, ``old_windows``
        is returned unchanged.
    window_size : int
        Number of trailing rows kept per asset.

    Returns
    -------
    dict
        Maps every Asset_ID of ``old_windows`` to its updated window.
    """
    if len(curr_batch) == 0:
        return old_windows

    refreshed = {}
    for asset_id in old_windows.keys():
        extended = pd.concat([old_windows[asset_id], curr_batch[asset_id]])
        refreshed[asset_id] = extended.iloc[-window_size:]
    return refreshed

In [None]:
def predict_targets(asset_dfs, models, global_model, global_weight=0.5):
    """Blend per-asset and global predictions and return them ordered by row id.

    Parameters
    ----------
    asset_dfs : dict
        Maps Asset_ID to a dataframe with a ``row_id`` column; every column other
        than ``row_id``, ``Asset_ID``, ``Target`` and ``group_num`` is passed to
        the models, in frame order.
    models : dict
        Maps Asset_ID to that asset's fitted model.
    global_model
        Model applied to every asset.
    global_weight : float
        Weight of the global prediction; the asset model gets ``1 - global_weight``.

    Returns
    -------
    list
        Blended predictions of all rows with ``row_id >= 0``, sorted by row id.
    """
    non_feature_columns = ['row_id', 'Asset_ID', 'Target', 'group_num']
    scored_rows = []
    for asset_id, asset_df in asset_dfs.items():
        asset_model = models[asset_id]
        feature_matrix = asset_df.drop(columns=non_feature_columns, errors='ignore').to_numpy()
        row_ids = asset_df['row_id'].to_numpy()
        shared_part = global_weight * global_model.predict(feature_matrix)
        own_part = (1 - global_weight) * asset_model.predict(feature_matrix)
        scored_rows.extend(zip(row_ids, shared_part + own_part))

    # Interpolated rows carry a negative row id and must not be predicted.
    kept_rows = [pair for pair in scored_rows if pair[0] >= 0]
    kept_rows.sort(key=lambda pair: pair[0])
    return [prediction for _, prediction in kept_rows]

Training the models:

In [None]:
# One model per asset, keyed by Asset_ID (`ids` and `names` are aligned).
models = {asset_id: train(asset_name) for asset_id, asset_name in zip(ids, names)}

# One additional model fitted on the data of all assets together.
global_model = global_train()

In [None]:
# example_rsi = [[26.9, 2.7, -7.5/37_595.2]] # Bitcoin - 29 jan 18:36 UTC
# prediction = models[1].predict(example_rsi, pred_contrib = True)
# print(prediction)

In [None]:
# 1441 rows: the longest feature (log_ret1440) needs period + 1 close prices.
window_size = 1441
# Initial raw window and stateful feature objects of every asset, keyed by Asset_ID.
windows = {}
features = {}
sma_dict = {}
for asset_name, asset_id in zip(names, ids):
    file_stem = asset_name.lower().replace(' ', '_')

    # Tail of the engineered training data: seeds the running feature state.
    engineered_window = (
        dt.fread(f"/kaggle/input/crypto-challenge-mlii-project-feature-eng-2/{file_stem}.jay")
        .to_pandas()
        .set_index('timestamp')
        .iloc[-window_size:]
    )
    # Tail of the preprocessed (raw) training data: becomes the first window.
    preprocessed_window = (
        dt.fread(f"../input/crypto-challenge-mlii-project-preprocessing-2/{file_stem}.jay")
        .to_pandas()
        .set_index('timestamp')
        .iloc[-window_size:]
    )
    windows[asset_id] = preprocessed_window

    close_col = preprocessed_window['Close']
    vwap_col = preprocessed_window['VWAP']
    vol_col = preprocessed_window['Volume']

    # Keep this order: it is the column order in which the features are added.
    asset_features = [
        RSI('RSI', close_col, engineered_window['gain_mean'], engineered_window['loss_mean'], 28),
        MACD('MACD_crossover_norm', 52, engineered_window['ema_52'], 24, engineered_window['ema_24'], 18, engineered_window['MACD_signal']),
        Stochastic('stochastic_crossover', close_col, engineered_window['stochastic_k'], 28),
        CumLogReturns('log_ret1', close_col, 1),
        CumLogReturns('log_ret30', close_col, 30),
        CumLogReturns('log_ret240', close_col, 240),
        CumLogReturns('log_ret1440', close_col, 1440),
        MFI('mfi', vwap_col, vol_col, 28),
    ]
    features[asset_id] = asset_features

In [None]:
import gresearch_crypto
# Competition environment: `iter_test` yields (test_batch, sample_preds) pairs
# and `env.predict` is used below to submit the predictions of each batch.
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

In [None]:
# For testing without using API

# copy1 = dummy_test.copy()
# copy2 = dummy_test.copy()
# copy2['timestamp'] += 60
# copy2.drop(copy2.index[copy2['Asset_ID'] == 2], inplace=True)
# copy3 = dummy_test.copy()
# copy3['timestamp'] += 180
# copy3.drop(copy3.index[copy3['Asset_ID'] == 2], inplace=True) # Test what happens when assets are not all provided
# test_data = [(copy1, pd.DataFrame()), (copy2, pd.DataFrame()), (copy3, pd.DataFrame())]
# for i, (test_batch, sample_preds) in enumerate(test_data):
#     start_time = time.time()
#     test_batch['timestamp'] = pd.to_datetime(test_batch['timestamp'], unit='s')
#     test_batch.set_index('timestamp', inplace=True)
#     #TODO last rows should include interpolated ones
#     last_rows = pd.concat([asset.iloc[-1:] for asset in windows.values()]) # Slice [-1:] so we get a DataFrame instead of Series
#     asset_dfs = interpolate(test_batch, last_rows) # Use the final rows from the previous time to determine if there are any gaps
#     engineered_dfs = engineer_features(asset_dfs, features, sma_dict)
#     print(engineered_dfs[1])
#     sns.lineplot(data=engineered_dfs[3], x='timestamp', y='RSI')
#     windows = get_new_windows(windows, asset_dfs, window_size)
#     targets = predict_targets(engineered_dfs, models)
#     sample_preds['Target'] = predict_targets(engineered_dfs, models)
#     print(f'Predicted {len(test_batch)} values! Took {time.time()-start_time} seconds.')

In [None]:
#TODO: train on new data
import time
#dummy_test = None
# Submission loop: for every batch handed out by the API, fill gaps, update the
# stateful features, predict and submit.
for test_batch, sample_preds in iter_test:
    start_time = time.time()
    # The batch carries timestamps as seconds; convert them and use them as the index.
    test_batch['timestamp'] = pd.to_datetime(test_batch['timestamp'], unit='s')
    test_batch.set_index('timestamp', inplace=True)
    test_batch.index = test_batch.index.ceil('min') # Round up to nearest minute
    
    last_rows = pd.concat([asset.iloc[-1:] for asset in windows.values()]) # Slice [-1:] so we get a DataFrame instead of Series
    asset_dfs = interpolate(test_batch, last_rows) # Use the final rows from the previous time to determine if there are any gaps
    engineered_dfs = engineer_features(asset_dfs, features)
    # Only the latest row per asset is kept (window size 1); it is all that the
    # next iteration reads from `windows`.
    windows = get_new_windows(windows, asset_dfs, 1)
    targets = predict_targets(engineered_dfs, models, global_model)
    # Replace NaN predictions by 0 and limit all predictions to [-0.99, 0.99].
    targets = np.clip(np.nan_to_num(targets), -0.99, 0.99)
    sample_preds['Target'] = targets
    env.predict(sample_preds) # Call the predict function to pass it through the API.
    #print(f'Predicted {len(test_batch)} values! Took {time.time()-start_time} seconds.')

In [None]:
# start = datetime.fromtimestamp(1623542340)
# end = datetime.fromtimestamp(1623542520)
# df = dt.fread(f"/kaggle/input/crypto-challenge-mlii-project-feature-eng-2/bitcoin.jay").to_pandas().set_index('timestamp').loc[start:end]
# display(df)

In [None]:
# from datetime import datetime
# from datatable import dt
# import pandas as pd
# display(dt.fread(f"/kaggle/input/crypto-challenge-mlii-project-preprocessing-2/bitcoin.jay").to_pandas().set_index('timestamp').loc[datetime.fromtimestamp(1623540000):datetime.fromtimestamp(1623542520)])
# display(dt.fread(f"/kaggle/input/crypto-challenge-mlii-project-feature-eng-2/bitcoin.jay").to_pandas().set_index('timestamp').loc[datetime.fromtimestamp(1623540000):datetime.fromtimestamp(1623542520)])
# orig = pd.read_csv('../input/g-research-crypto-forecasting/train.csv')
