## Target Computation

This notebook attempts to compute the target as described here:

https://www.kaggle.com/c/g-research-crypto-forecasting/discussion/286778

Version 2.0 improves readability by avoiding some unnecessary shift operations.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import time
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"]=(20,8)
INPUT=Path("../input/g-research-crypto-forecasting")

In [None]:
def ResidualizeMarket(df, mktColumn, window):
    """Remove the rolling market component from every column of ``df``.

    For each column ``c`` and market column ``m`` the rolling regression
    coefficient is ``beta = mean(c * m) / mean(m * m)``, where both means are
    taken over the trailing ``window`` rows. The residual is ``c - beta * m``.

    Args:
        df: DataFrame whose columns are regressed against ``df[mktColumn]``.
        mktColumn: Name of the market column inside ``df``.
        window: Number of rows in the rolling window.

    Returns:
        ``(residuals, betas)``: two DataFrames shaped like ``df`` without the
        market column. Beta values that come out NaN or infinite are replaced
        by 0 before residualizing; the returned betas are NaN wherever ``df``
        itself is NaN.

        NOTE(review): when ``mktColumn`` is not a column of ``df`` the input
        frame is returned unchanged as a single object, not as a pair, so
        callers that unpack two values will fail on that path.
    """
    # Nothing to regress against: hand the input back untouched.
    if mktColumn not in df.columns:
        return df

    market = df[mktColumn]
    market_values = market.values

    # Rolling E[c * m] for every column (numerator of the regression coefficient).
    cross_moment = df.multiply(market_values, axis=0).rolling(window).mean().values
    # Rolling E[m * m] (denominator of the regression coefficient).
    market_moment = market.multiply(market_values, axis=0).rolling(window).mean().values

    # Shape (columns, rows); if the regression fell over, use a beta of 0.
    raw_beta = cross_moment.T / market_moment
    beta = np.nan_to_num(raw_beta, nan=0., posinf=0., neginf=0.)

    # Subtract the market-explained part of each column.
    market_component = (beta * market_values).T
    resultRet = df - market_component
    # 0.*df supplies the index/column labels for the beta values.
    resultBeta = 0. * df + beta.T

    residuals = resultRet.drop(columns=[mktColumn])
    betas = resultBeta.drop(columns=[mktColumn])
    return residuals, betas

def reduce_memory(df):
    """Downcast columns of ``df`` in place to shrink its memory footprint.

    * Every ``float64`` column whose minimum and maximum lie strictly inside
      the float32 range is converted to ``float32``. Columns that are entirely
      NaN (or out of range) stay ``float64``.
    * The columns ``Asset_ID``, ``Count`` and ``timestamp`` are cast to
      ``int8``, ``int32`` and ``uint32`` respectively when they are present;
      frames without them are accepted instead of raising ``KeyError``.

    The memory usage before and after, and the percentage saved, are printed.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    before = df.memory_usage().sum()

    float32_limits = np.finfo(np.float32)
    for col in df.columns:
        if df[col].dtype != 'float64':
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        # Comparisons against NaN are False, so an all-NaN column is left alone.
        if c_min > float32_limits.min and c_max < float32_limits.max:
            df[col] = df[col].astype(np.float32)

    # Fixed integer widths for the known identifier/counter columns.
    fixed_dtypes = (('Asset_ID', 'int8'), ('Count', 'int32'), ('timestamp', 'uint32'))
    for col, dtype in fixed_dtypes:
        if col in df.columns:
            df[col] = df[col].astype(dtype)

    after = df.memory_usage().sum()

    print('Memory taken before transformation : ', before)
    print('Memory taken after transformation : ', after)
    print('Memory taken reduced by : ',( before - after) * 100/ before, '%')

    return df

In [None]:
def log_return_ahead(series, periods=1):
    """Forward log return that starts one row ahead.

    For row ``t`` the result is ``log(P[t + 1 + periods] / P[t + 1])``; with
    ``periods=15`` this is ``log(P[t+16] / P[t+1])``. The trailing rows for
    which the future prices do not exist are NaN.

    Args:
        series: Price Series or DataFrame (one row per time step).
        periods: Length of the return horizon in rows.

    Returns:
        Object of the same shape as ``series`` holding the forward log returns.
    """
    log_price = np.log(series)
    # diff with a negative period gives log P[t] - log P[t + periods];
    # negating it turns that into the forward-looking return.
    forward_return = -log_price.diff(periods=-periods)
    # Shift up by one row so the horizon starts at t + 1 rather than t.
    return forward_return.shift(-1)

In [None]:
# Load the main training file, then shrink its dtypes
# (reduce_memory works on the frame it is given and returns that same frame).
train_df = pd.read_csv(INPUT / "train.csv")
train_df = reduce_memory(train_df)
train_df.head()

In [None]:
# Load the supplemental training file and shrink its dtypes the same way.
sub_train_df = pd.read_csv(INPUT / "supplemental_train.csv")
sub_train_df = reduce_memory(sub_train_df)
sub_train_df.head()

In [None]:
# Earliest timestamp in the supplemental file; compare it with the latest
# timestamp of train.csv (next cell) to check that the two files are contiguous.
sub_train_df.timestamp.min()

In [None]:
# Latest timestamp in train.csv.
train_df.timestamp.max()

In [None]:
# Stack the supplemental rows underneath the main training rows.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent and likewise keeps the original row labels.
# NOTE: run this cell once per kernel session - running it again appends the
# supplemental rows a second time.
train_df = pd.concat([train_df, sub_train_df])

In [None]:
# Asset metadata, indexed by Asset_ID.
assets = pd.read_csv(INPUT / "asset_details.csv").set_index("Asset_ID")

In [None]:
# Add a human-readable "datetime" column derived from the epoch-second timestamps.
# pd.to_datetime converts the whole column at once and always reads the values
# as UTC, whereas datetime.fromtimestamp() was called once per row and used the
# local time zone of whatever machine ran the notebook.
train_df["datetime"] = pd.to_datetime(train_df["timestamp"].astype("int64"), unit="s")

### Price of assets
$$P^a$$

In [None]:
# Mapping Asset_ID -> Asset_Name, used to label the per-asset columns.
asset_names = assets["Asset_Name"].to_dict()

# Wide table of close prices: one row per timestamp, one column per asset.
# The pivot produces a ("Close", Asset_ID) column MultiIndex; the constant
# first level is dropped and the ids are replaced by asset names.
prices = (
    train_df
    .pivot(index=["timestamp"], columns=["Asset_ID"], values=["Close"])
    .droplevel(0, axis=1)
    .rename(columns=asset_names)
)

In [None]:
# Put the prices on a complete 60-second timestamp grid; rows for minutes that
# are absent from the data are filled from the last available row.
minute_grid = range(prices.index[0], int(prices.index[-1] + 60), 60)
prices = prices.reindex(minute_grid, method='ffill')

In [None]:
# fillna automatically
# prices = prices.fillna(method="ffill")

In [None]:
# prices.index = prices.index.map(lambda x: datetime.fromtimestamp(x))

In [None]:
# Make sure the rows are in ascending timestamp order, then preview.
prices = prices.sort_index()
prices.head()

### Log Returns over 15 Minutes

$$R^a(t) = \log \left( P^a(t+16)\ /\ P^a(t+1) \right)$$


In [None]:
# 15-row forward log return per asset: log(P(t+16) / P(t+1)).
# Rows of `prices` are 60 seconds apart, so 15 rows correspond to 15 minutes.
log_returns_15min = log_return_ahead(prices, periods=15)

### Weighted Average Market Returns

$$M(t) = \frac{\sum_a w^a R^a(t)}{\sum_a w^a}  $$

In [None]:
# Asset metadata ordered by Asset_ID - presumably so that its rows line up
# position-by-position with the per-asset columns of the pivoted frames.
assets_df = pd.read_csv(INPUT / "asset_details.csv", index_col="Asset_ID").sort_index()

In [None]:
# Per-asset weights as a plain array, in Asset_ID order.
weights = assets_df["Weight"].to_numpy()
weights

In [None]:
# Market return M(t) = sum_a w^a R^a(t) / sum_a w^a.
# The weighted returns are summed across assets (missing returns contribute
# nothing) and divided by the total weight. The previous `.mean(axis=1)`
# divided by the number of assets that had a return in that row instead, so the
# scale of M(t) changed whenever an asset was missing.
# min_count=1 keeps M(t) as NaN for rows in which no asset has a return.
weighted_avg_market_log_returns = (
    log_returns_15min.mul(weights, axis='columns').sum(axis=1, min_count=1)
    / weights.sum()
)

In [None]:
# log_returns_15min.mul(weights, axis='columns')[-200:].plot()
# weighted_avg_market_log_returns[-200:].plot(style="k8", grid=True)

In [None]:
# Regress each asset's return on the market return over a rolling window of
# 3750 rows (one row per minute) and keep the residuals and the betas.
# assign() builds a temporary frame that carries the extra "market" column, so
# log_returns_15min itself is not modified and the cells that multiply it by
# the per-asset weights can be re-run safely.
residualized_market_returns, beta = ResidualizeMarket(
    log_returns_15min.assign(market=weighted_avg_market_log_returns),
    "market",
    window=3750,
)

In [None]:
# residualized_market_returns[-200:].plot(grid=True)

### Compare computed with provided target

In [None]:
# Wide table of the provided targets: one row per timestamp, one column per
# asset name (the constant "Target" column level is dropped).
target = (
    train_df
    .pivot(index=["timestamp"], columns=["Asset_ID"], values=["Target"])
    .droplevel(0, axis=1)
    .rename(columns=asset_names)
)
# Put the targets on a complete 60-second grid; method='pad' carries the last
# available row forward into minutes that are absent from the data.
target_grid = range(target.index[0], int(target.index[-1] + 60), 60)
target = target.reindex(target_grid, method='pad').sort_index()

In [None]:
# Last 500 rows for Bitcoin: computed target (solid) against the provided
# target (red dashed), drawn on the same axes.
ax = residualized_market_returns["Bitcoin"].iloc[-500:].plot(grid=True)
target["Bitcoin"].iloc[-500:].plot(ax=ax, style='r--', grid=True)

In [None]:
# Last 500 rows for Bitcoin Cash: computed target (solid) against the provided
# target (red dashed), drawn on the same axes.
ax = residualized_market_returns["Bitcoin Cash"].iloc[-500:].plot()
target["Bitcoin Cash"].iloc[-500:].plot(ax=ax, style='r--', grid=True)

In [None]:
# Absolute difference between the computed and the provided target,
# per timestamp and asset.
target_diffs = (residualized_market_returns - target).abs()

In [None]:
# Histogram of the absolute differences over all rows and assets,
# with the x-axis limited to [-0.01, 0.01].
all_diffs = target_diffs.to_numpy().ravel()
plt.hist(all_diffs, bins=1000)
plt.xlim((-0.01, 0.01))
plt.grid()
plt.show()

In [None]:
# Same histogram restricted to the last 100000 rows.
recent_diffs = target_diffs.iloc[-100000:].to_numpy().ravel()
plt.hist(recent_diffs, bins=1000)
plt.xlim((-0.01, 0.01))
plt.grid()
plt.show()

In [None]:
# Print, for every asset, the correlation between the computed and the
# provided target. Assets without a computed column only get their name printed.
computed_assets = residualized_market_returns.columns
for asset in target.columns:
    print(asset)
    if asset not in computed_assets:
        continue
    print(residualized_market_returns[asset].corr(target[asset]))

In [None]:
# Last 500 rows for Maker: computed target (solid) against the provided
# target (red dashed), drawn on the same axes.
ax = residualized_market_returns["Maker"].iloc[-500:].plot()
target["Maker"].iloc[-500:].plot(ax=ax, style='r--', grid=True)

# Now we replicate the Target and re-construct the datasets for easy access.

In [None]:
# Wide feature table: one row per timestamp, columns keyed by (field, Asset_ID).
feature_fields = ["Close", "Count", "Open", "High", "Low", "Volume", "VWAP"]
features = train_df.pivot(index=["timestamp"], columns=["Asset_ID"], values=feature_fields)

In [None]:
# Preview the computed targets (one column per asset).
residualized_market_returns.head()

In [None]:
# Replace the numeric Asset_ID column labels with the asset names.
features = features.rename(columns=asset_names)

In [None]:
# Put the features on a complete 60-second timestamp grid; rows for minutes
# that are absent from the data are filled from the last available row.
feature_grid = range(features.index[0], int(features.index[-1] + 60), 60)
features = features.reindex(feature_grid, method='ffill')

In [None]:
# Keep a copy of the close prices as they are before the NaN forward-fill
# applied to `features` in the next step, labelled ("CloseN", asset).
non_fill_price = features["Close"].copy()
unfilled_labels = [("CloseN", asset) for asset in non_fill_price.columns]
non_fill_price.columns = pd.MultiIndex.from_tuples(unfilled_labels)

In [None]:
# Forward-fill remaining NaNs with the last known value of each column.
# DataFrame.ffill() is the direct equivalent of fillna(method="ffill"), whose
# `method` argument is deprecated in recent pandas releases.
features = features.ffill()

In [None]:
# Relabel the computed targets as ("Target", asset) so they fit the two-level
# column layout of `features`. This changes the frame's labels in place, so the
# cell is meant to be run once.
target_labels = [("Target", asset) for asset in residualized_market_returns.columns]
residualized_market_returns.columns = pd.MultiIndex.from_tuples(target_labels)

In [None]:
# Combine the features, the computed targets and the unfilled close prices
# side by side into one wide frame.
# NOTE: this rebinds `train_df`, which until now held the raw long-format rows.
train_df = pd.concat([features, residualized_market_returns, non_fill_price], axis=1)

In [None]:
# Preview the combined wide frame.
train_df.head()

In [None]:
# Correlation between Maker's forward-filled close ("Close") and its
# unfilled copy ("CloseN").
train_df[("Close", "Maker")].corr(train_df[("CloseN", "Maker")])

In [None]:
# train_df[("Close", "Maker")].corr(train_df[("CloseN", "Maker")])

In [None]:
# Inspect Maker's unfilled close series.
train_df[("CloseN", "Maker")]

In [None]:
# del features
# del residualized_market_returns
# del prices
# del target

In [None]:
# Ask the garbage collector to reclaim unreferenced objects. The `del`
# statements in the previous cell are commented out, so the large intermediate
# frames are still bound to their names when this runs.
import gc
gc.collect()

In [None]:
# train_df.index = train_df.index.values.astype(int) // int(1e9)
# train_df.index.name = "timestamp"

In [None]:
# Move the asset level of the column MultiIndex into the row index, giving one
# row per (timestamp, asset); dropna=False keeps rows whose values are all NaN.
train_df = train_df.stack(level=1, dropna=False)

In [None]:
# Turn the row index levels back into ordinary columns.
train_df = train_df.reset_index()

In [None]:
# Discard the early history of three assets: a row is removed when its
# timestamp is at or before the asset's cutoff (epoch seconds).
early_history_cutoffs = {
    "Maker": 1596513600,    # 2020-08-04 04:00 UTC
    "Monero": 1541394000,   # 2018-11-05 05:00 UTC
    "Stellar": 1531540800,  # 2018-07-14 04:00 UTC
}
for asset_name, cutoff in early_history_cutoffs.items():
    too_early = (train_df["timestamp"] <= cutoff) & (train_df["Asset_ID"] == asset_name)
    train_df.drop(train_df.loc[too_early].index, inplace=True)

In [None]:
# Renumber the rows after the drops (DataFrame.to_feather expects a default
# index), then write the result to disk.
train_df = train_df.reset_index(drop=True)
train_df.to_feather("./train.feather")

In [None]:
# Display the final long-format frame.
train_df

In [None]:
# All Maker rows, indexed by timestamp.
is_maker = train_df["Asset_ID"] == "Maker"
maker = train_df.loc[is_maker].set_index("timestamp")

In [None]:
# Maker rows that have no missing value in any column.
maker.dropna()

# Conclusion

The supplemental file is used to construct some rolling features.