# Submitting Lagged Features via API

In this notebook we submit lagged features via the API.

The API works by providing a single row for each Asset - one timestamp at a time - to prevent using future data in predictions.

In order to utilise lagged features in our model, we must store the outputs from the API so we can calculate features using past data.

In [None]:
import os
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
import gresearch_crypto
import time
import datetime

# Competition input directory; both CSV paths are derived from it.
INPUT_DIR = '/kaggle/input/g-research-crypto-forecasting'
TRAIN_CSV = f'{INPUT_DIR}/train.csv'
ASSET_DETAILS_CSV = f'{INPUT_DIR}/asset_details.csv'

In [None]:
# Load the full training set from the path configured in TRAIN_CSV and preview the first rows.
df_train = pd.read_csv(TRAIN_CSV)
df_train.head()

In [None]:
# Load the per-asset metadata, ordered by Asset_ID, and display the whole table.
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV).sort_values("Asset_ID")
df_asset_details

In [None]:
def get_features(df, 
                 asset_id, 
                 train=True):
    '''
    Build the feature frame for a single asset from a frame holding all assets.

    Parameters
    ----------
    df : DataFrame with all assets. Must contain 'Asset_ID', 'timestamp' and
         'Close'; additionally 'Target' when train is true, or 'row_id' otherwise.
    asset_id : value matched against the 'Asset_ID' column (0-13 in this notebook).
    train : truthy  - training mode: keeps timestamp/Target and adds 'train_flg'
                      (1 = training row, 0 = validation row on/after 12/03/2021 UTC).
            falsy   - submission mode: rows are ordered by 'row_id' and only
                      Asset_ID/Close/row_id are kept.

    Returns
    -------
    DataFrame with the selected base columns plus, for each of 15/60/240 rows:
      sma<N>    = rolling mean of Close over N rows / Close - 1
      return<N> = Close N rows earlier / Close - 1
    Every NaN (warm-up rows, and any other column such as Target) is replaced by 0.
    '''
    df = df[df['Asset_ID'] == asset_id]
    df = df.sort_values('timestamp')
    if train:
        # Validation cutoff as epoch seconds. The date is interpreted explicitly as
        # UTC so the train/validation split does not depend on the machine timezone.
        valid_start = int(
            datetime.datetime.strptime("12/03/2021", "%d/%m/%Y")
            .replace(tzinfo=datetime.timezone.utc)
            .timestamp()
        )
        df_feat = df[['timestamp', 'Asset_ID', 'Close', 'Target']].copy()
        # train_flg splits the data: 1 before the cutoff, 0 from the cutoff onwards
        df_feat['train_flg'] = np.where(df_feat['timestamp'] >= valid_start, 0, 1)
    else:
        df = df.sort_values('row_id')
        df_feat = df[['Asset_ID', 'Close', 'row_id']].copy()

    # Create your features here, they can be lagged or not
    close = df_feat['Close']
    lookbacks = (15, 60, 240)
    for n_rows in lookbacks:
        df_feat[f'sma{n_rows}'] = close.rolling(n_rows).mean() / close - 1
    for n_rows in lookbacks:
        # shift() moves by position; dividing two differently-sliced Series would
        # instead align on the index and yield a constant for every overlapping row.
        df_feat[f'return{n_rows}'] = close.shift(n_rows) / close - 1

    # Warm-up rows have no full window yet; they (and any other NaN) become 0.
    df_feat = df_feat.fillna(0)

    return df_feat

In [None]:
# Create the feature dataframe for each asset and concatenate them once.
# A single concat avoids re-copying the growing frame on every loop iteration.
feature_df = pd.concat(
    [get_features(df_train, asset_id, train=True) for asset_id in range(14)]
)

In [None]:
# Attach each asset's Weight to its feature rows (left join on Asset_ID).
asset_weights = df_asset_details[['Asset_ID', 'Weight']]
feature_df = feature_df.merge(asset_weights, how='left', on=['Asset_ID'])

In [None]:
# define features for LGBM
# NOTE(review): every name listed here has to be a column of feature_df; confirm that
# get_features actually emits return15/return60/return240 before training on this list.
features = ['Asset_ID','sma15','sma60','sma240','return15','return60','return240']
# Columns to be treated as categorical — presumably passed to LightGBM; TODO confirm at the call site.
categoricals = ['Asset_ID']

In [None]:
# define the evaluation metric
def weighted_correlation(a, train_data):
    """Weighted Pearson correlation between predictions and labels.

    a          - predictions (flattened before use)
    train_data - object providing ``get_label()`` and an ``add_w`` attribute whose
                 ``.values`` hold the per-row weights (presumably attached to the
                 LightGBM Dataset by the training code, which is not shown here).

    Returns the tuple ('eval_wcorr', correlation, True); the trailing True follows
    the LightGBM custom-metric convention for "higher is better".
    """
    sample_w = np.ravel(train_data.add_w.values.flatten())
    y_true = np.ravel(train_data.get_label())
    y_pred = np.ravel(a)

    w_total = np.sum(sample_w)

    # weighted first and second moments of both series
    mu_pred = np.sum(y_pred * sample_w) / w_total
    mu_true = np.sum(y_true * sample_w) / w_total
    var_pred = np.sum(sample_w * np.square(y_pred - mu_pred)) / w_total
    var_true = np.sum(sample_w * np.square(y_true - mu_true)) / w_total

    # E_w[pred * true] - E_w[pred] * E_w[true]
    covariance = np.sum(y_pred * y_true * sample_w) / w_total - mu_pred * mu_true

    return 'eval_wcorr', covariance / np.sqrt(var_pred * var_true), True

# Editing starts here
Some kernels keep the entire history dataframe around for prediction, which is wasteful: the memory usage becomes far too high. Here, instead of saving all previous samples, we store only what is needed to compute the features for the next sample (for example, just the most recent Close prices for the longest rolling window). This saves a lot of memory and avoids running into compute issues.

[Note] For the sake of an example, I submit one of the created features as the prediction rather than training a new model.

# Functions :-

In [None]:
# For the rolling averages we only need the most recent samples of the longest
# window (e.g. 240 minutes); anything older can never be used again, so it is dropped.
# Further streaming helpers (exponentially weighted average, rate of change) follow below.
class RollingAverage():
    '''
    Streaming simple-moving-average features with a bounded memory footprint.

    Only the last ``max(windows)`` Close values are retained. For every window
    ``w`` in ``windows``, ``compute`` adds the feature ``sma<w>``:

        mean(last w Close values) / current Close - 1

    While fewer than ``w`` samples have been seen, the mean is taken over the
    samples available so far.
    '''
    def __init__(self, windows=(15, 60, 240)):
        '''
        windows - iterable of positive window lengths (in samples).

        Raises ValueError when ``windows`` is empty or contains a non-positive length.
        '''
        windows = [int(w) for w in windows]
        if not windows or min(windows) < 1:
            raise ValueError("windows must contain at least one positive window length")
        self.windows = windows
        self.max_length = max(windows)
        # Most recent Close values, oldest first (attribute name kept for compatibility).
        self.dataframes = []

    def compute(self, current):
        '''
        Record ``current['Close']`` and write the sma features into ``current``.

        ``current`` is modified in place (the sma<w> entries are added to it) and
        the same object is returned.
        '''
        self.dataframes.append(float(current['Close']))

        if len(self.dataframes) > self.max_length:
            # The oldest sample falls outside the longest window and is not needed anymore.
            del self.dataframes[0]

        for window in self.windows:
            # A negative slice start is clamped by Python, so a buffer shorter than
            # the window simply yields every sample seen so far.
            window_mean = np.mean(self.dataframes[-window:])
            current[f'sma{window}'] = window_mean / current['Close'] - 1

        return current
    

        
        
        
        
        
        
        
        
        
        
      
        
        
                
        
    
    

In [None]:
# Keep only the rows of one asset (Asset_ID 0) to demonstrate the streaming feature classes.
dfcrop=df_train[df_train['Asset_ID']==0]
dfcrop.head()

In [None]:
# Demo tracker for the 15/60/240-sample windows.
rollingavg=RollingAverage([15,60,240])


In [None]:
# Longest requested window, i.e. the maximum number of Close values the tracker retains.
rollingavg.max_length

In [None]:
# Feed the first sample; the returned row carries the sma15/sma60/sma240 features.
rollingavg.compute(dfcrop.iloc[0])

In [None]:
# Feed the second sample; the averages now cover the two Close values seen so far.
rollingavg.compute(dfcrop.iloc[1])

In [None]:

# Feed the third sample.
rollingavg.compute(dfcrop.iloc[2])

In [None]:
# NOTE(review): this list is rebuilt in a later cell together with the EWMA and rate trackers.
avgs=[RollingAverage([15,60,240]) for _ in range(14)] # Create 14 different objects one for each asset

In [None]:
# Exponentially weighted average tracker
class ExponentiallyWeightedAverage:
    """Running exponentially weighted average, updated one observation at a time.

    The state ``v1`` starts at 0. Each update gives the new observation weight
    ``beta`` and the previous state weight ``1 - beta``.
    """

    v1 = 0  # running average; class-level default so a fresh instance starts from 0

    def __init__(self, beta):
        self.beta = beta

    def compute(self, value):
        """Blend ``value`` into the running average and return the updated average."""
        carried_over = (1 - self.beta) * self.v1
        self.v1 = self.beta * value + carried_over
        return self.v1
# Demo: stream the first three rows of the single-asset slice through the tracker.
ewm = ExponentiallyWeightedAverage(0.75)
for position in range(3):
    print(ewm.compute(dfcrop.iloc[position]))

In [None]:
# Preview the single-asset slice again for reference.
dfcrop.head()

In [None]:
# Measures the rate of change of a feature over time
class RateCalculator:
    """Element-wise ratio between the current row and the previously seen row.

    The first call has nothing to compare against, so it returns a row with the
    same labels where every value equals ``default``.
    """

    def __init__(self, default=0.0):
        # Value reported for every field on the first call.
        self.default = default
        # Last row seen; None until the first call.
        self.previous = None

    def compute(self, row):
        """Return ``row / previous row`` (or the default-filled row on the first call).

        ``row`` must support ``copy()``, ``keys()`` and element-wise division
        (e.g. a pandas Series). The input is never modified, so callers can keep
        reading fields such as identifiers from it afterwards.
        """
        if self.previous is None:
            self.previous = row.copy()
            first = row.copy()
            first[list(first.keys())] = self.default
            return first

        final = row / self.previous
        # Store a copy so later changes to the caller's row cannot alter our state.
        self.previous = row.copy()
        return final
    
# Demo: stream the first three rows of the single-asset slice through the calculator.
rate = RateCalculator()
for position in range(3):
    print(rate.compute(dfcrop.iloc[position]))

        
        
        
        

In [None]:
# One independent tracker of each kind per asset, indexed by Asset_ID.
N_ASSETS = 14
avgs = [RollingAverage([15, 60, 240]) for _ in range(N_ASSETS)]
ewms = [ExponentiallyWeightedAverage(0.75) for _ in range(N_ASSETS)]
rates = [RateCalculator(1.0) for _ in range(N_ASSETS)]


# Submit

In [None]:
start = time.time()

env = gresearch_crypto.make_env()
iter_test = env.iter_test()

# The API serves one batch per timestamp: df_test holds the new rows and df_pred
# is the prediction template to fill in and send back.
for df_test, df_pred in iter_test:

    for _, row in df_test.iterrows():
        # Read the identifiers before the trackers run: the trackers may modify
        # `row` in place, so it must not be used for lookups afterwards.
        row_id = row['row_id']
        asset_id = int(row['Asset_ID'])

        # Update the per-asset streaming state and obtain this row's features.
        row_features1 = avgs[asset_id].compute(row)
        row_features2 = ewms[asset_id].compute(row)
        row_features3 = rates[asset_id].compute(row)

        y_pred = float(row_features1['sma15']) # Giving a naive submission for now
        if not np.isfinite(y_pred):
            # A missing or zero Close yields NaN/inf; submit a neutral 0 instead.
            y_pred = 0.0

        df_pred.loc[df_pred['row_id'] == row_id, 'Target'] = y_pred

    # Send submissions
    env.predict(df_pred)
stop = time.time()
print(stop-start)