## Target Computation

Based on https://www.kaggle.com/vi2018/g-research-crypto-repro-target-computation

This notebook attempts to reproduce the competition's `Target` column, following the computation described here:

https://www.kaggle.com/c/g-research-crypto-forecasting/discussion/286778

Version 2.0 improves readability by avoiding some unnecessary shift operations.

## Target Computation

Based on https://www.kaggle.com/vi2018/g-research-crypto-repro-target-computation

This notebook attempts to reproduce the competition's `Target` column, following the computation described here:

https://www.kaggle.com/c/g-research-crypto-forecasting/discussion/286778

Version 2.0 improves readability by avoiding some unnecessary shift operations.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import time

# Directory holding the competition CSV files that the cells below read.
INPUT = Path("../input/g-research-crypto-forecasting")

In [None]:
def ResidualizeMarket(df, mktColumn, window):
  """Remove the rolling market component from every column of ``df``.

  For each column a rolling, no-intercept regression coefficient against the
  market column is computed as ``mean(col * mkt) / mean(mkt * mkt)`` over the
  trailing ``window`` rows, and ``beta * mkt`` is subtracted from the column.
  Wherever the coefficient is NaN or infinite (incomplete window, zero
  denominator) a beta of 0 is used, i.e. the column is left unadjusted there.

  Parameters
  ----------
  df : pandas.DataFrame
      Numeric columns, including the market series.
  mktColumn : str
      Label of the market column inside ``df``.
  window : int
      Window length handed to ``DataFrame.rolling``.

  Returns
  -------
  tuple of (pandas.DataFrame, pandas.DataFrame)
      ``(residualized, beta)``, both indexed like ``df`` and without the
      ``mktColumn`` column. Beta entries are non-finite wherever the matching
      ``df`` entry is, because the beta frame is built as ``0 * df + beta``.
  pandas.DataFrame
      If ``mktColumn`` is not a column of ``df``, ``df`` itself is returned
      unchanged -- a single frame, NOT a tuple. Callers that unpack two values
      must make sure the column exists.
  """
  if mktColumn not in df.columns:
    return df

  mkt = df[mktColumn]
  num = df.multiply(mkt.values, axis=0).rolling(window).mean()  # numerator of linear regression coefficient
  denom = mkt.multiply(mkt.values, axis=0).rolling(window).mean()  # denominator of linear regression coefficient

  # 0/0 and x/0 are expected here and are mapped to beta = 0 right below, so
  # silence numpy's divide/invalid RuntimeWarnings for this one division only.
  with np.errstate(divide="ignore", invalid="ignore"):
    raw_beta = num.values.T / denom.values  # shape: (n_columns, n_rows)
  beta = np.nan_to_num(raw_beta, nan=0., posinf=0., neginf=0.)  # if regression fell over, use beta of 0

  resultRet = df - (beta * mkt.values).T  # perform residualization
  resultBeta = 0.*df + beta.T  # wrap beta in a frame with df's index and columns

  return resultRet.drop(columns=[mktColumn]), resultBeta.drop(columns=[mktColumn])


In [None]:
# log_return_ahead returns the forward *simple* return
#   R_t = P_{t+1+periods} / P_{t+1} - 1
# (with periods=15: P_{t+16} / P_{t+1} - 1). It is computed via log prices,
# but the value returned is exp(log-return) - 1, not the log return itself.
def log_return_ahead(series, periods=1):
    """Return ``P[t+1+periods] / P[t+1] - 1`` for every row ``t`` of ``series``.

    ``series`` may be a pandas Series or DataFrame of prices; the result has
    the same shape, with NaN in the trailing rows where the look-ahead runs
    past the end of the data.
    """
    log_prices = np.log(series)
    # diff with a negative period gives log(P_t) - log(P_{t+periods});
    # negating turns it into the forward log change over `periods` rows.
    forward_log_change = -log_prices.diff(periods=-periods)
    # Start the look-ahead window one row after t.
    shifted = forward_log_change.shift(-1)
    return np.exp(shifted) - 1

In [None]:
# Load the full training table; later cells pivot its Close and Target columns.
train_df = pd.read_csv(INPUT / "train.csv")

In [None]:
# Wide table of Close prices: one row per timestamp, one column per Asset_ID.
prices = train_df.pivot(index="timestamp", columns="Asset_ID", values="Close")

# Label the columns from the Asset_IDs actually present ("A0", "A1", ...)
# instead of assuming there are exactly 14 of them.
prices.columns = [f"A{asset_id}" for asset_id in prices.columns]

# Put the table on a gap-free 60-second grid; a timestamp that is absent from
# train.csv takes the most recent earlier row (forward fill).
prices = prices.reindex(range(prices.index[0], prices.index[-1] + 60, 60), method="ffill")

# Epoch seconds -> naive UTC timestamps. Vectorised, and avoids
# datetime.utcfromtimestamp, which is deprecated since Python 3.12.
prices.index = pd.to_datetime(prices.index, unit="s")

# remove first minute of each month (day 1, 00:00)
first_minute_of_month = (
    (prices.index.day == 1) & (prices.index.hour == 0) & (prices.index.minute == 0)
)
prices = prices[~first_minute_of_month]
prices = prices.sort_index()


In [None]:
# Wide table of the provided Target values: one row per timestamp, one column per Asset_ID.
target = train_df.pivot(index="timestamp", columns="Asset_ID", values="Target")

# Label the columns from the Asset_IDs actually present ("A0", "A1", ...)
# instead of assuming there are exactly 14 of them.
target.columns = [f"A{asset_id}" for asset_id in target.columns]

# Put the table on a gap-free 60-second grid.
# NOTE(review): method="ffill" copies the previous row's Target into every
# timestamp that is absent from train.csv -- confirm this is intended for the
# comparison against the recomputed values.
target = target.reindex(range(target.index[0], target.index[-1] + 60, 60), method="ffill")

# Epoch seconds -> naive UTC timestamps. Vectorised, and avoids
# datetime.utcfromtimestamp, which is deprecated since Python 3.12.
target.index = pd.to_datetime(target.index, unit="s")
target = target.sort_index()

In [None]:
# Forward return of every asset over 15 rows of `prices`:
# P_{t+16} / P_{t+1} - 1 (see log_return_ahead).
log_returns_15min = log_return_ahead(prices, 15)

In [None]:
# Asset metadata indexed by Asset_ID and ordered by it; its Weight column
# feeds the weighted market average below.
assets_df = pd.read_csv(INPUT / "asset_details.csv", index_col="Asset_ID").sort_index()

In [None]:
# Per-asset weights in Asset_ID order (assets_df is sorted by its Asset_ID index).
weights = assets_df.Weight.values

# Work on the asset columns only. A later cell adds a "market" column to
# log_returns_15min; without this guard, re-running this cell afterwards fails
# because the frame then has one more column than there are weights.
asset_returns = log_returns_15min.drop(columns="market", errors="ignore")

# Total weight of the assets that actually have a return at each timestamp.
available_weight = asset_returns.notnull().mul(weights, axis="columns").sum(axis=1)

# Weighted average over the available assets. Missing returns are skipped by
# the final sum, so a row with no returns at all evaluates to 0.
weighted_avg_market_log_returns = (
    asset_returns.mul(weights, axis="columns")
    .div(available_weight, axis=0)
    .sum(axis=1)
)

In [None]:
# Append the market series as an extra column, then regress it out of every
# asset column over a trailing window of 3750 rows.
log_returns_15min["market"] = weighted_avg_market_log_returns
residualized_market_returns, beta = ResidualizeMarket(
    log_returns_15min, mktColumn="market", window=3750
)

In [None]:
# Element-wise gap between the recomputed values and the provided Target
# (pandas aligns the two frames on index and column labels).
target_diffs = residualized_market_returns - target

# NaN entries are skipped by the nan-aware reductions, so no dropna is needed.
abs_target_diffs = np.abs(target_diffs.values)
print(np.nanmean(abs_target_diffs))  # mean absolute difference
print(np.nanmax(abs_target_diffs))  # largest absolute difference