# Target reconstruction - quick summary of SOTA

This notebook summarises community contributions, many found in and around [alexfir's notebook](https://www.kaggle.com/alexfir/recreating-target) - upvote there if these insights help you in this competition!

## Reconstruction statistics for the train set (absolute error between `Target` and the reconstructed target):

* Mean abs error: 0.00000000000000082
* Max abs error:  0.00000000000008532
* Std abs error:  0.00000000000000276


In [None]:
import os
import numpy as np
import pandas as pd

# Root of the competition dataset on the Kaggle input mount.
data_dir = '../input/g-research-crypto-forecasting/'

# Columns to load from train.csv and the dtype each one is parsed as; every
# other column in the file is skipped via `usecols`.
dtypes = dict(
    timestamp=np.int64,
    Asset_ID=np.int8,
    Close=np.float64,
    Target=np.float64,
)

# Per-asset metadata; the cells below read Asset_ID, Asset_Name and Weight from it.
asset_details = pd.read_csv(data_dir + 'asset_details.csv')

# Load the training rows and attach the asset metadata to each one.
crypto_df = pd.read_csv(
    data_dir + 'train.csv',
    usecols=list(dtypes),
    dtype=dtypes,
).merge(asset_details, on='Asset_ID')

# The timestamps are read as integers in seconds; convert them to datetimes.
crypto_df['timestamp'] = pd.to_datetime(crypto_df['timestamp'], unit='s')

In [None]:
# essentially verbatim from https://www.kaggle.com/alexfir/recreating-target/
def reconstruct_targets(
    data: pd.DataFrame,
    details: pd.DataFrame,
    price_column: str,
    use_log: bool = False,
    window: int = 3750,
) -> pd.DataFrame:
    """Rebuild the per-asset target from a price column.

    For every asset a raw return is computed on a shared timestamp grid from
    the price 16 rows ahead relative to the price 1 row ahead. The weighted
    average ``m`` of those returns across assets (missing returns counted as
    0) is then subtracted from each asset's return after scaling by a rolling
    ``beta`` = rolling_mean(return * m) / rolling_mean(m * m).

    Parameters
    ----------
    data : pd.DataFrame
        Price rows; must contain ``'timestamp'``, ``'Asset_ID'`` and
        ``price_column``.
    details : pd.DataFrame
        One row per asset with ``Asset_ID``, ``Asset_Name`` and ``Weight``.
    price_column : str
        Name of the column in ``data`` holding the price.
    use_log : bool, default False
        If True use the difference of log prices; otherwise use the price
        ratio minus 1.
    window : int, default 3750
        Number of rows in the rolling window used for ``beta``.

    Returns
    -------
    pd.DataFrame
        Indexed by the sorted unique timestamps, one column per asset name.
    """
    # Build the grid from the `data` argument (not from a notebook global) so
    # the function depends only on its inputs. 2018-01-01 00:00 is prepended so
    # that this timestamp is always part of the grid.
    anchor = pd.Series([pd.Timestamp("2018-01-01 00:00")])
    timestamp_series = pd.concat([anchor, data['timestamp']], ignore_index=True)
    all_timestamps = np.sort(timestamp_series.unique())

    targets = pd.DataFrame(index=all_timestamps)

    for asset_id, asset_name in zip(details.Asset_ID, details.Asset_Name):
        asset = data[data.Asset_ID == asset_id].set_index(keys='timestamp')
        # Align this asset's prices to the shared grid; timestamps the asset
        # lacks become NaN.
        price = pd.Series(index=all_timestamps, data=asset[price_column])
        # Shifts are by rows of the grid, not by elapsed time.
        price_far = price.shift(periods=-16)
        price_near = price.shift(periods=-1)
        if use_log:
            targets[asset_name] = np.log(price_far) - np.log(price_near)
        else:
            targets[asset_name] = price_far / price_near - 1

    # Weighted cross-asset return; missing returns contribute 0 to the average.
    weights = details.Weight.to_numpy()
    targets['m'] = np.average(targets.fillna(0), axis=1, weights=weights)
    m = targets['m']

    num = targets.multiply(m.values, axis=0).rolling(window).mean().values
    denom = m.multiply(m.values, axis=0).rolling(window).mean().values
    # NaN and +/-inf betas (incomplete windows, zero denominators) are replaced
    # by 0, so the floating-point warnings raised by the division are expected
    # and silenced here.
    with np.errstate(divide='ignore', invalid='ignore'):
        beta = np.nan_to_num(num.T / denom, nan=0., posinf=0., neginf=0.)

    recon_targets = targets - (beta * m.values).T
    return recon_targets.drop(columns='m')

In [None]:
def merge_recon_with_train():
    """Attach the reconstructed target to the training rows.

    Reconstructs targets from the 'Close' column of the notebook-level
    `crypto_df` / `asset_details`, reshapes the wide (timestamp x asset name)
    result into long form, and merges it onto `crypto_df` by asset name and
    timestamp. The reconstructed value lands in a 'recon_Target' column.
    """
    wide = reconstruct_targets(data=crypto_df, details=asset_details, price_column='Close')

    # reset_index() exposes the timestamp index as a column named 'index';
    # melt() then yields one row per (timestamp, asset) pair.
    long_form = wide.reset_index().melt(id_vars='index')
    long_form = long_form.rename(
        columns={'index':'timestamp', 'variable':'Asset_Name', 'value':'recon_Target'}
    )

    return crypto_df.merge(long_form, on=['Asset_Name', 'timestamp'])

In [None]:
%%time
df = merge_recon_with_train()

print("Is it true that all NaN's match between Target and recon_Target?")
print(f"{all(df.Target.isna() == df.recon_Target.isna())}\n")

print("Is it true that none of the Targets have been dropped?")
print(f"{len(crypto_df.Target)==len(df.Target)}\n")

print("Statistics for absolute error between Target and reconstructed Target for the entirety of train data:")
df['abserror'] = abs(df['Target'] - df['recon_Target'])
print(df.abserror.describe())