# Submission Inference

__Here I use a primitive version of the lagged target and market information, together with other rolling aggregations.__



In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input mount and print the full path of every file.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
import gresearch_crypto
import time
import datetime
from numba import jit

In [None]:
# Locations of the competition data files (relative to the notebook's working directory).
TRAIN_CSV = '../input/g-research-crypto-forecasting/train.csv'
ASSET_DETAILS_CSV = '../input/g-research-crypto-forecasting/asset_details.csv'
SUPPLEMENT_CSV = '../input/g-research-crypto-forecasting/supplemental_train.csv'
# Saved LightGBM model that is loaded below as the main booster (`model`).
MODEL_FILE = '../input/modelfiles/20220126_cv2.txt'

In [None]:
# Two additional saved LightGBM boosters that are blended with the main model
# at prediction time.
MODEL_OLDEST_FILE = '../input/modelfiles/20220126_cv1.txt'
MODEL_FEWR_FILE = '../input/modelfiles/20220126_cv3.txt'

model_oldest, model_fewer = (
    lgb.Booster(model_file=model_path)
    for model_path in (MODEL_OLDEST_FILE, MODEL_FEWR_FILE)
)
# Feature names each booster was saved with.
features_oldest, features_fewer = (
    booster.feature_name() for booster in (model_oldest, model_fewer)
)

In [None]:
# Main booster and the ordered list of feature names it expects.
model = lgb.Booster(model_file=MODEL_FILE)
features = model.feature_name()

# Asset metadata and the supplemental history used to seed the rolling buffers.
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV)
df_sup = pd.read_csv(SUPPLEMENT_CSV)

# Define Original Features

In [None]:
def calcHullMA(price: pd.Series, N=50):
    """Return a Hull-style moving average of *price* built from simple rolling means.

    The result is the rolling mean (window ``int(sqrt(N))``) of
    ``2 * SMA(N // 2) - SMA(N)``. Leading positions without a full window
    are NaN.

    Args:
        price: Price series to smooth.
        N: Length of the slow window.

    Returns:
        A series with the same index as *price*.
    """
    slow_sma = price.rolling(N).mean()
    fast_sma = price.rolling(int(N / 2)).mean()
    smoothing_window = int(np.sqrt(N))
    detrended = 2 * fast_sma - slow_sma
    return detrended.rolling(smoothing_window).mean()

def get_features(df_feat):
    """Add return, moving-average, Hull and log-return columns for one asset.

    The new columns are written onto *df_feat* itself (the argument is
    mutated); the returned frame is a copy in which every remaining NaN has
    been replaced with 0.

    Args:
        df_feat: Frame with at least ``Close``, ``Count`` and ``Volume``
            columns. Rows are presumably one asset in chronological order --
            TODO confirm against the caller.

    Returns:
        The feature-augmented frame with NaNs filled with 0.
    """
    close = df_feat["Close"]
    count = df_feat["Count"]
    volume = df_feat["Volume"]
    windows = (15, 60, 240)

    # Recreated target: return between 16 and 1 steps ago.
    df_feat["target_return"] = close.shift(1) / close.shift(16) - 1

    # Rolling mean of Close relative to the current Close.
    for window in windows:
        df_feat[f"sma{window}"] = close.rolling(window).mean() / close - 1

    # Simple return over the window.
    for window in windows:
        df_feat[f"return{window}"] = close / close.shift(window) - 1

    # NOTE(review): the rolling mean of Count is divided by Close, not Count --
    # confirm this is intended.
    for window in windows:
        df_feat[f"sma{window}_count"] = count.rolling(window).mean() / close - 1

    # NOTE(review): despite the "_count" suffix these are computed from Volume.
    for window in windows:
        df_feat[f"return{window}_count"] = volume / volume.shift(window) - 1

    # Distance of Close from Hull-style averages of different lengths.
    for column, span in (("hull", 240), ("hull2", 76), ("hull3", 800)):
        df_feat[column] = close - calcHullMA(close, span)

    # One-step log return and its rolling means; gaps are forward- then
    # backward-filled.
    log_ret = np.log(close).diff()
    df_feat["log_return"] = log_ret.ffill().bfill()
    for span in (55, 210, 340, 890, 3750):
        df_feat[f"log_return_{span}"] = log_ret.rolling(span).mean().ffill().bfill()

    return df_feat.fillna(0)

def inference_target_recreation(tt, weights, asset_id):
    """Project the latest weighted-average row value onto each column of *tt*.

    Args:
        tt: 2-D array; rows are averaged across columns using *weights*.
            Presumably rows are time steps and columns are assets -- TODO
            confirm.
        weights: One weight per column of *tt*.
        asset_id: Index of the column whose value is returned.

    Returns:
        Tuple ``(value, mean_value)``: ``beta[asset_id] * m[-1]`` and the
        mean of ``beta * m[-1]`` over all columns, where ``m`` is the weighted
        row average and ``beta = mean(tt * m) / mean(m * m)``.
    """
    market = np.average(tt, axis=1, weights=weights)
    cross_moment = np.mean(np.multiply(tt, market.reshape(-1, 1)), axis=0)
    market_moment = np.mean(np.multiply(market, market), axis=0)
    beta = cross_moment / market_moment
    projected = (beta * market[-1]).T
    return projected[asset_id], np.mean(projected)

#@jit(nopython=True)
def moving_average(a, n=3):
    """Return the trailing *n*-point moving average of *a*, same length as *a*.

    Computed from a cumulative sum. The first ``n - 1`` positions, where no
    full window exists, are filled with 1.0.
    """
    running = np.cumsum(a, dtype=float)
    window_sums = running.copy()
    # Sum over a window = cumulative sum minus the cumulative sum n steps earlier.
    window_sums[n:] = running[n:] - running[:-n]
    padded = np.concatenate((np.ones(n), window_sums[n - 1:] / n))
    return padded[1:]

#@jit(nopython=True)
def calcHullMA_inference(series, N=50):
    """Return the most recent Hull-style moving-average value of *series*.

    Array counterpart of ``calcHullMA``: combines ``moving_average`` with
    windows ``N`` and ``int(N / 2)`` and averages the last ``int(sqrt(N))``
    entries of ``2 * fast - slow``.
    """
    slow = moving_average(series, N)
    fast = moving_average(series, int(N / 2))
    tail_length = int(np.sqrt(N))
    detrended = 2 * fast - slow
    return np.mean(detrended[-tail_length:])

def calculate_target(data: pd.DataFrame, details: pd.DataFrame, price_column: str):
    """Recreate per-asset targets with a rolling market component removed.

    For every asset in *details* the raw target is
    ``price.shift(1) / price.shift(16) - 1`` on the union of all timestamps.
    A weighted average ``m`` of those raw targets (NaNs counted as 0) is
    formed, a rolling beta over 3750 rows is estimated for each column, and
    ``beta * m`` is subtracted.

    Args:
        data: Frame with ``timestamp``, ``Asset_ID`` and *price_column*.
        details: Frame with ``Asset_ID``, ``Asset_Name`` and ``Weight``.
        price_column: Name of the price column in *data*.

    Returns:
        Tuple ``(targets, num, denom, beta)``: the residual targets with the
        timestamp index reset into a column, plus the rolling numerator,
        denominator and beta arrays (non-finite betas are replaced with 0).
    """
    weights = np.array(list(details.Weight))
    timeline = np.sort(data['timestamp'].unique())
    targets = pd.DataFrame(index=timeline)

    for asset_id, asset_name in zip(list(details.Asset_ID), list(details.Asset_Name)):
        per_asset = data[data.Asset_ID == asset_id].set_index(keys='timestamp')
        # Re-align this asset's prices onto the common timeline.
        price = pd.Series(index=timeline, data=per_asset[price_column])
        targets[asset_name] = price.shift(periods=1) / price.shift(periods=16) - 1

    targets['m'] = np.average(targets.fillna(0), axis=1, weights=weights)
    m = targets['m']
    market = m.values

    rolling_window = 3750
    num = targets.multiply(market, axis=0).rolling(rolling_window).mean().values
    denom = m.multiply(market, axis=0).rolling(rolling_window).mean().values
    raw_beta = num.T / denom
    beta = np.nan_to_num(raw_beta, nan=0., posinf=0., neginf=0.)

    residual = targets - (beta * market).T
    residual.drop('m', axis=1, inplace=True)

    return residual.reset_index(), num, denom, beta

# Dictionary of DataFrame for every asset

__This dictionary is only an intermediate step; the per-asset feature dictionary built in the next section is the one that matters for inference.__

In [None]:
# One feature-augmented frame per asset, built from the supplemental data.
# Keys are the Asset_ID values in order of first appearance in df_sup.
# (The former per-asset Asset_Name lookup was never used and has been dropped.)
df_dict = {
    asset: get_features(df_sup.loc[df_sup.Asset_ID == asset].reset_index(drop=True))
    for asset in df_sup['Asset_ID'].unique()
}

# Dictionary of Features for every asset

__I will update this dictionary with every batch in the prediction phase. For now, it is filled with the latest values from the supplemental file.__

In [None]:
# Per-asset rolling history buffers, seeded with the tail of each asset's frame.
# Buffer lengths: 5000 closes and log returns, 240 Count and Volume values,
# 3750 recreated target returns.
f = {
    asset_id: {
        "all_close": frame["Close"].values[-5000:],
        "log_return": frame["log_return"].values[-5000:],
        "Count": frame["Count"].values[-240:],
        "Volume": frame["Volume"].values[-240:],
        "target_return": frame["target_return"].values[-3750:],
    }
    for asset_id, frame in df_dict.items()
}

In [None]:
import gc

# The full frames are no longer needed once the history buffers exist;
# drop the references and trigger a collection.
del df_dict, df_sup
gc.collect()

In [None]:
# Create the competition environment and the iterator that yields the test batches;
# the prediction loop below unpacks each item as (df_test, df_pred).
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

In [None]:
def inference_target_recreation2(tt, weights):
    """Project the latest weighted-average row value onto every column of *tt*.

    Same computation as ``inference_target_recreation`` but returns the values
    for all columns at once.

    Args:
        tt: 2-D array; rows are averaged across columns using *weights*.
        weights: One weight per column of *tt*.

    Returns:
        Tuple ``(values, mean_value)``: ``beta * m[-1]`` for every column and
        its mean, where ``m`` is the weighted row average and
        ``beta = mean(tt * m) / mean(m * m)``.
    """
    market = np.average(tt, axis=1, weights=weights)
    cross_moment = np.mean(np.multiply(tt, market.reshape(-1, 1)), axis=0)
    market_moment = np.mean(np.multiply(market, market), axis=0)
    beta = cross_moment / market_moment
    projected = (beta * market[-1]).T
    return projected, np.mean(projected)

In [None]:
%%time
# Online inference loop. For every batch served by the competition API:
#   1. roll each asset's history buffers in `f` forward by one observation,
#   2. rebuild that row's features from the buffers,
#   3. submit a 0.5 / 0.25 / 0.25 blend of the three boosters.
fibo_list = [55, 210, 340, 890, 3750]
market_list = [1.]  # seed so that the first batch has a defined `market_p`
for df_test, df_pred in iter_test:
    pred_array = np.zeros((len(df_test), len(features)))
    rownums = df_pred.row_id.values
    # Market proxy: mean of the `target_return` values computed for the previous batch.
    market_p = np.mean(market_list)
    market_list = []
    for jj, (_, row) in enumerate(df_test.iterrows()):
        asset = row['Asset_ID']
        history = f[asset]
        last_close = row["Close"]
        last_log_return = np.log(last_close) - np.log(history["all_close"][-1])

        # Slide every buffer by one step: drop the oldest value, append the newest.
        history["all_close"] = np.append(history["all_close"][1:], [last_close])
        history["log_return"] = np.append(history["log_return"][1:], [last_log_return])
        history["Count"] = np.append(history["Count"][1:], [row["Count"]])
        history["Volume"] = np.append(history["Volume"][1:], [row["Volume"]])
        history["log_return_1"] = last_log_return

        # Log-return features.
        row["log_return"] = last_log_return
        for window in fibo_list:
            row[f"log_return_{window}"] = np.mean(history["log_return"][-window:])

        closes = history["all_close"]
        counts = history["Count"]
        volumes = history["Volume"]
        for window in (15, 60, 240):
            row[f"sma{window}"] = np.mean(closes[-window:]) / last_close - 1
            # NOTE(review): the buffers already contain the current row, so
            # index [-window] is window - 1 steps back, whereas get_features
            # uses shift(window). Confirm against the training pipeline before
            # changing, since it alters model inputs.
            row[f"return{window}"] = last_close / closes[-window] - 1
            # NOTE(review): get_features divides the Count rolling mean by
            # Close, not by Count -- confirm which one the models were trained on.
            row[f"sma{window}_count"] = np.mean(counts[-window:]) / row["Count"] - 1
            row[f"return{window}_count"] = row["Volume"] / volumes[-window] - 1

        row["hull"] = last_close - calcHullMA_inference(closes[-260:], 240)

        # NOTE(review): get_features defines target_return as
        # Close.shift(1) / Close.shift(16) - 1; here the current close is the
        # numerator. Confirm the intended lag.
        row["target_return"] = (last_close / closes[-16]) - 1
        market_list.append(row["target_return"])
        row["market_p"] = market_p
        row["train_flg"] = 1
        pred_array[jj, :] = row[features].values

    # Weighted blend. Each booster is limited to its OWN best iteration
    # (previously `model.best_iteration` was passed to all three).
    # NOTE(review): every booster is fed columns ordered by `features`;
    # features_oldest / features_fewer are never checked against it.
    preds = model.predict(pred_array, num_iteration=model.best_iteration) * 0.5
    preds += model_oldest.predict(pred_array, num_iteration=model_oldest.best_iteration) * 0.25
    preds += model_fewer.predict(pred_array, num_iteration=model_fewer.best_iteration) * 0.25
    # `rownums` comes from df_pred itself, so this mask selects every row; the
    # assignment is positional and relies on df_pred and df_test sharing row order.
    df_pred.loc[df_pred["row_id"].isin(rownums), "Target"] = preds
    env.predict(df_pred)