## メモ

ラグやマーケットリターンなどを組み合わせて適当に特徴量作り、Ridge回帰でTargetを予測したnotebook。
提出がタイムアウトになるので、提出部分の高速化が必要。

コードは公開していないが、俺が運用している仮想通貨ボットで使っている特徴量 + ridge回帰(alphaは適当)のスコア(相関)は0.013程度だった。
このnotebookだと相関が0.04くらい。

## Notes

This notebook builds a few simple features by combining lagged returns and market returns, then predicts the target with ridge regression.
The submission currently times out, so the submission part needs to be sped up.

For comparison, the features used in my crypto trading bot (code not public) combined with ridge regression (arbitrary alpha) scored a correlation of about 0.013.
With this notebook, the correlation is about 0.04.

## TODO

- ffill missing bars
- evaluation metrics (use the asset weights; also try other metrics such as Sharpe and double Sharpe)
- model and feature improvements
- robust CV (nested CV or BBC-CV) plus hyperparameter tuning

In [None]:
import os
import lzma

import cloudpickle
import datatable as dt
import gresearch_crypto
import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from sklearn.base import BaseEstimator, clone
from sklearn.ensemble import VotingRegressor, BaggingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer, StandardScaler

In [None]:
def get_feature_columns(df):
    """Return the sorted names of all columns of ``df`` that start with 'feature'."""
    is_feature = df.columns.str.startswith('feature')
    return sorted(df.columns[is_feature])

def save_model(model, path):
    """Serialize ``model`` with cloudpickle, LZMA-compress it and write it to ``path``."""
    # Build the full payload first so nothing is written if serialization fails.
    payload = lzma.compress(cloudpickle.dumps(model))
    with open(path, 'wb') as out_file:
        out_file.write(payload)
        
def process_data(df, df_asset):
    """Normalize a raw bar frame into the layout used by the rest of the notebook.

    The input is not modified. The returned frame has:
      * 'timestamp' converted from epoch seconds to tz-aware (UTC) datetimes,
      * the raw column names replaced by short lower-case names,
      * a 'weight' column looked up in ``df_asset`` via the 'market' column,
      * a ('timestamp', 'market') MultiIndex.
    """
    column_names = {
        'Asset_ID': 'market',
        'Open': 'op',
        'High': 'hi',
        'Low': 'lo',
        'Close': 'cl',
        'Volume': 'volume',
        'VWAP': 'vwap',
        'Count': 'trade_count',
        'Target': 'target',
    }
    # assign() returns a new frame, so the caller's frame stays untouched.
    converted = df.assign(
        timestamp=pd.to_datetime(df['timestamp'], unit='s', utc=True),
    )
    return (
        converted
        .rename(columns=column_names)
        .join(df_asset[['weight']], on='market', how='left')
        .set_index(['timestamp', 'market'])
    )

def sort_and_remove_duplicates(df):
    """Sort ``df`` by its index and keep only the last row for each index value."""
    # mergesort is requested so rows sharing an index value keep their input order
    # and keep='last' selects the row that appeared last in the input.
    ordered = df.sort_index(kind='mergesort')
    # https://stackoverflow.com/questions/13035764/remove-rows-with-duplicate-indices-pandas-dataframe-and-timeseries
    is_superseded = ordered.index.duplicated(keep='last')
    return ordered.loc[~is_superseded]

def my_purge_kfold(n, n_splits=5, purge=3750 * 14):
    """Build contiguous K-fold splits with a purged gap around each validation block.

    Args:
        n: number of samples (positions 0..n-1).
        n_splits: number of contiguous validation blocks.
        purge: number of positions excluded from training on each side of the
            validation block.

    Returns:
        A list of ``(train_idx, val_idx)`` tuples of integer position arrays.
    """
    positions = np.arange(n)
    folds = []
    for fold in range(n_splits):
        block_start = fold * n // n_splits
        block_end = (fold + 1) * n // n_splits
        # Training rows lie strictly outside the validation block widened by `purge`.
        before_gap = positions < block_start - purge
        after_gap = positions >= block_end + purge
        folds.append((positions[before_gap | after_gap], positions[block_start:block_end]))
    return folds

In [None]:
# preprocess asset data

# Load the asset table, switch to the short column names used in this notebook,
# index it by market id and cache it for the later cells.
asset_column_names = {
    'Asset_ID': 'market',
    'Weight': 'weight',
    'Asset_Name': 'name',
}
df = (
    dt.fread('../input/g-research-crypto-forecasting/asset_details.csv')
    .to_pandas()
    .rename(columns=asset_column_names)
    .set_index('market')
    .sort_values('market')
)
df.to_pickle('/tmp/df_asset.pkl')
display(df)

In [None]:
# preprocess train data

# supplemental_train seems to be the data that keeps growing after submission,
# so concatenating train and supplemental_train into one training set should be fine.
csv_paths = [
    '../input/g-research-crypto-forecasting/train.csv',
    '../input/g-research-crypto-forecasting/supplemental_train.csv',
]
df = pd.concat([dt.fread(csv_path).to_pandas() for csv_path in csv_paths])
df_asset = pd.read_pickle('/tmp/df_asset.pkl')
# Normalize the columns/index, then sort and drop duplicated (timestamp, market) rows.
df = sort_and_remove_duplicates(process_data(df, df_asset))
df.to_pickle('/tmp/df.pkl')
display(df)

In [None]:
# check interval is 1 min

# Prints, for each candidate gap of 60..360 seconds, the share of rows whose
# distance to the previous row of the same market equals that gap.
df = pd.read_pickle('/tmp/df.pkl')
df = df.reset_index()
# astype('int64') gives nanoseconds since the epoch with an explicit 64-bit width;
# Series.view is deprecated in recent pandas and plain `int` is platform dependent.
df['timestamp'] = df['timestamp'].astype('int64') / 10 ** 9
df['interval'] = df['timestamp'] - df.groupby('market')['timestamp'].shift(1)
# Only the first row of each market has no interval. Restrict dropna to that column so
# NaNs in unrelated columns do not remove rows from the statistic.
df = df.dropna(subset=['interval'])
for interval in range(60, 361, 60):
    print('{} {}'.format(interval, np.mean(df['interval'] == interval)))

In [None]:
# calc features

def calc_features(df):
    """Add return-based feature columns to a (timestamp, market)-indexed frame.

    Reads the 'cl' and 'weight' columns and groups by the 'timestamp' and
    'market' index levels. The input frame is not modified.

    Returns:
        A copy of ``df`` with these extra columns:
          * ln_cl: log of 'cl'.
          * feature_cl_diff1 / raw_return_causal: ln_cl minus ln_cl 15 rows
            earlier within the same market.
          * market_return_causal / feature_cl_diff1_mean_weight:
            weight-averaged return across markets at each timestamp.
          * feature_beta_causal: ratio of the per-market rolling means
            (window 3750 rows, min_periods 1) of return * market return and
            market return squared.
          * feature_cl_diff1_mean_simple: unweighted mean return per timestamp.
          * feature_cl_diff1_resid: return minus beta * weighted market return.
          * feature_cl_diff1_rank: rank of the return within each timestamp.
    """
    df = df.copy()

    df['ln_cl'] = np.log(df['cl'])

#     df['feature_upper_shadow'] = df['hi'] - np.maximum(df['op'], df['cl'])
#     df['feature_lower_shadow'] = np.minimum(df['cl'], df['op']) - df['lo']

    # shift is faster than diff
    df['feature_cl_diff1'] = df['ln_cl'] - df.groupby('market')['ln_cl'].shift(15)
    # Identical to feature_cl_diff1, so reuse it instead of repeating the
    # groupby/shift (this function runs on every iteration of the submit loop).
    df['raw_return_causal'] = df['feature_cl_diff1']

    inv_weight_sum = 1.0 / df.groupby('timestamp')['weight'].transform('sum')

    df['market_return_causal'] = (df['raw_return_causal'] * df['weight']).groupby('timestamp').transform('sum') * inv_weight_sum

    df['beta_causal'] = (
        (df['raw_return_causal'] * df['market_return_causal']).groupby('market').transform(lambda x: x.rolling(3750, 1).mean())
        / (df['market_return_causal'] ** 2).groupby('market').transform(lambda x: x.rolling(3750, 1).mean())
    )

    df['feature_cl_diff1_mean_simple'] = df['feature_cl_diff1'].groupby('timestamp').transform('mean')
    # The weighted cross-sectional mean of feature_cl_diff1 is exactly
    # market_return_causal (same inputs, same formula); reuse it.
    df['feature_cl_diff1_mean_weight'] = df['market_return_causal']
    df['feature_cl_diff1_resid'] = df['feature_cl_diff1'] - df['beta_causal'] * df['feature_cl_diff1_mean_weight']

    df['feature_cl_diff1_rank'] = df.groupby('timestamp')['feature_cl_diff1'].transform('rank')

    # Expose beta as a model feature (feature columns are selected by name prefix).
    df = df.rename(columns={
        'beta_causal': 'feature_beta_causal',
    })

    return df

# Compute the features on the cached training frame and cache only the columns
# needed for modelling: the features plus 'target' and 'weight'.
df = calc_features(pd.read_pickle('/tmp/df.pkl'))
features = get_feature_columns(df)
df = df[[*features, 'target', 'weight']]
df.to_pickle('/tmp/df_features.pkl')

In [None]:
# Print the Pearson correlation of every feature with the target,
# first pooled over all rows and then separately for each market.
df = pd.read_pickle('/tmp/df_features.pkl').dropna()
features = get_feature_columns(df)

for feature in features:
    pooled_corr = pearsonr(df[feature], df['target'])
    print('{} {}'.format(feature, pooled_corr))

for market, df_market in df.groupby('market'):
    for feature in features:
        market_corr = pearsonr(df_market[feature], df_market['target'])
        print('{} {} {}'.format(market, feature, market_corr))

In [None]:
# cv
# Cross-validate the model on the cached features and report/plot the
# out-of-fold prediction quality.
df = pd.read_pickle('/tmp/df_features.pkl')
features = get_feature_columns(df)
# Drop every row that has a NaN in any column (features, target or weight).
df = df.dropna()
# df = df.loc[df.index.get_level_values(0) < pd.to_datetime('2019-01-01 00:00:00Z')]
# Keep only rows whose timestamp (index level 0) is before 2021-01-01 UTC.
df = df.loc[df.index.get_level_values(0) < pd.to_datetime('2021-01-01 00:00:00Z')]

model = Ridge()
# model = lgb.LGBMRegressor(n_jobs=-1, random_state=1)

# Standardize the features before the regressor.
model = Pipeline([
    ('scaler', StandardScaler()),
    ('model', model)
])

# model = BaggingRegressor(
#     model,
#     n_estimators=1,
#     random_state=1,
# )

# Purged K-fold over row positions; the purge gap is counted in rows.
cv = my_purge_kfold(df.shape[0])
# Out-of-fold predictions for every row.
df['y_pred'] = cross_val_predict(
    model,
    X=df[features],
    y=df['target'], 
    cv=cv, 
#     n_jobs=-1,
)

# Overall R^2, Pearson correlation (statistic and p-value) and target std.
print(r2_score(df['target'], df['y_pred']))
print(pearsonr(df['target'], df['y_pred']))
print(df['target'].std())

print('pearsonr by market')
display(df.groupby('market').apply(lambda x: pearsonr(x['target'], x['y_pred'])[0]))

# Rolling correlation over all markets together. The window and the sampling step
# are counted in rows (minutes * market_count).
# NOTE(review): this equals 3 months only if every market has a row for every minute — confirm.
df2 = df.reset_index().set_index('timestamp')
market_count = df2['market'].unique().size
df2['target'].rolling(3 * 30 * 24 * 60 * market_count).corr(df2['y_pred']).iloc[::24 * 60 * market_count].plot()
plt.title('3 month rolling pearsonr')
plt.show()

# Disabled: the same rolling-correlation plot drawn separately for each market.
if False:
    for market, df_market in df.groupby('market'):
        df2 = df_market.reset_index().set_index('timestamp')
        df2['target'].rolling(3 * 30 * 24 * 60).corr(df2['y_pred']).iloc[::24 * 60].plot()
        plt.title('3 month rolling pearsonr {}'.format(market))
        plt.show()

# Scores of the features used in my bot (heavy ones removed) + ridge
# r2 0.00012130795512599324
# pearsonr (0.013457722434615444, 0.0)
# target std 0.005677241985371386

In [None]:
# refit
# Fit the model on all rows kept by the CV cell and store it for the submit cell.
feature_columns = get_feature_columns(df)
model.fit(df[feature_columns], df['target'])
save_model(model, 'model.xz')

In [None]:
# submit

# Length of history (in seconds) kept for feature calculation on each iteration.
recent_sec = 4000 * (5 * 60)
df = pd.read_pickle('/tmp/df.pkl')
df_asset = pd.read_pickle('/tmp/df_asset.pkl')
# NOTE: joblib.load unpickles the file, so only load model files produced by this notebook.
model = joblib.load('model.xz')

env = gresearch_crypto.make_env() 
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    # Append the newest bars to the history
    test_df2 = process_data(test_df, df_asset)
    if 'row_id' in df.columns:
        # Just in case: clear row_id on the history so that only the rows of the
        # current batch carry one (guards against duplicated row_id values).
        df = df.drop(columns=['row_id'])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
    df = pd.concat([df, test_df2])
    
    # Keep only recent data (for performance)
    test_min_timestamp = test_df2.index.get_level_values(0).min()
    df = df.loc[test_min_timestamp - pd.to_timedelta(recent_sec, unit='s') <= df.index.get_level_values(0)]
    df = sort_and_remove_duplicates(df)
    
    # Feature calculation
    df_features = calc_features(df)
    
    # Prediction: only rows of the current batch have a row_id.
    # .copy() so that adding the 'Target' column does not write into a slice.
    df_features = df_features.loc[~df_features['row_id'].isna()].copy()
    df_features['Target'] = model.predict(df_features[get_feature_columns(df_features)].values)
    # NOTE(review): sample_prediction_df presumably already has a placeholder 'Target'
    # column; if so a plain merge would emit Target_x/Target_y. Dropping it first is a
    # no-op when the column is absent. Confirm against the competition API.
    sample_prediction_df = (
        sample_prediction_df
        .drop(columns=['Target'], errors='ignore')
        .merge(df_features[['row_id', 'Target']], how='left', on='row_id')
    )
    
    if False:
        display(test_df)
        display(sample_prediction_df)
    
    env.predict(sample_prediction_df)