In [None]:
!pip install scikit-learn==0.24.2

## Motivation

I'm trying to solo this competition in less than a month, and this is the pipeline I'm going to follow.

## Pipeline
- **Data Loading from a preprocessed dataset (compressed and target reconstructed)** (https://www.kaggle.com/axzhang/fork-of-dataprepare2)
- **Define the Cross-Validation Pipeline since LB is meaningless** (This notebook)
- **Define features for each asset** (**TODO**)
  - Continuous features such as technical analysis.
  - Most features depend on the length of a look-back window (e.g. a moving average). We should choose the window according to whether the feature captures
    * 1) **momentum**: the feature has a positive correlation with the target.
    * 2) **mean-reversion**: the feature has a negative correlation with the target.
  - Sparse features such as some break event.
    - Bear-Bull Market indicator: This kind of indicator is too sparse for minute-level data. Better train a different model for each type of market.
    - Trendy Oscillator
     
- **Train an asset-specific model using GappedTimeSeries to avoid overfitting to a specific time period.** (**TODO**)
 - Each model should be weighted according to some weights (considering time and performance in validation set)
 - For some assets like **Maker**, I found that there are lots of gaps in the dataset. Currently, we skip them.
     - Using external datasets from Binance to fill the **NaN**s
- **Train an asset-agnostic model using the same cross-validation method.** (**TODO**)
 - Reason 1) For minute-level data, I believe there are common patterns across all assets because the micro-structure of the market is universally similar.
 - Reason 2) The target is constructed using all assets, so they are somehow correlated.
 - Avoid overfitting: we should do factor-neutralization for each feature since 1) each feature has different scale and 2) we don't want to let the model overfit one asset.
 
- **Massive Parameter Tuning** (**TODO**)
    * Using HPO package

In [None]:
# --- Cross-validation layout (durations are expressed in days) ---
TEST_DAY = 3 * 30  # size of each validation fold
# train_day = 6 * 30
TRAIN_DAY = -1  # negative value => the training window is not capped
GAP_DAY = 15  # gap between the end of a training window and its validation fold
N_SPLIT = 5  # number of cross-validation folds

# --- Output location and assets excluded from training ---
CKPT = "ckpt"
SKIPS = ["Maker", "Monero", "Stellar"]

# --- LightGBM parameters handed to every per-asset model ---
MODEL_PARAMS = dict(
    n_estimators=1000,
    early_stopping_round=50,
    # Choose a very shallow depth to avoid overfitting.
    max_depth=4,
    random_seed=2021,
    learning_rate=1e-3,
    # Most of the time a trader only looks at <= 5 features to make a decision,
    # so we limit the per-tree feature sample accordingly.
    colsample_bytree=0.3,
    subsample=0.8,
    metric="regression_l2",
    verbosity=-1,
    min_data_in_leaf=100,
    device="gpu",
)

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb 
import sklearn
import os
import json
from scipy.stats import pearsonr
import logging

def pearson_eval(preds, train_data):
    """Custom LightGBM ``feval``: Pearson correlation of predictions vs. labels.

    NaNs in both the labels and the predictions are replaced via
    ``np.nan_to_num`` before the correlation is computed.

    Returns
    -------
    tuple
        ``('corr', correlation, True)`` -- metric name, value and the
        "higher is better" flag.
    """
    y_true = np.nan_to_num(train_data.get_label())
    y_pred = np.nan_to_num(preds)
    result = pearsonr(y_true, y_pred)
    return 'corr', result[0], True

# Route LightGBM's log messages through Python's root logger at INFO level.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
lgb.register_logger(logger)
def weighted_correlation(a, b, weights):
    """Weighted Pearson correlation between ``a`` and ``b``.

    All three inputs are flattened with ``np.ravel``. A small constant
    (1e-12) is added to the denominator so a zero variance does not divide by zero.
    """
    a, b, w = (np.ravel(values) for values in (a, b, weights))
    total_weight = np.sum(w)

    def weighted_mean(values):
        return np.sum(values * w) / total_weight

    mean_a, mean_b = weighted_mean(a), weighted_mean(b)
    var_a = weighted_mean(np.square(a - mean_a))
    var_b = weighted_mean(np.square(b - mean_b))

    # cov(a, b) = E[a * b] - E[a] * E[b]
    covariance = weighted_mean(a * b) - mean_a * mean_b
    return covariance / (np.sqrt(var_a * var_b) + 1e-12)

def validate_one_symble(model, features, label):
    """Score ``model`` on one asset: correlation between its predictions and ``label``.

    Every sample receives the same weight (an array of ones shaped like the predictions).
    """
    predictions = model.predict(features)
    return weighted_correlation(label, predictions, np.ones_like(predictions))

def neutralize_series(series : pd.Series, by : pd.Series, proportion=1.0):
    """
    Remove from ``series`` the part that is linearly explained by ``by``
    (originally from the Numerai Tournament).

    NaNs in both inputs are replaced via ``np.nan_to_num``. ``series`` is
    regressed (least squares) on two columns -- ``by`` and a constant column
    holding the mean of ``series`` -- and ``proportion`` times the fitted
    values is subtracted. The result keeps the index of ``series``.
    """
    clean_scores = np.nan_to_num(series.values)
    scores = clean_scores.reshape(-1, 1)

    by_column = np.nan_to_num(by.values).reshape(-1, 1)
    constant_column = np.full((len(by_column), 1), np.mean(clean_scores))
    exposures = np.hstack((by_column, constant_column))

    coefficients = np.linalg.lstsq(exposures, scores)[0]
    residual = scores - proportion * exposures.dot(coefficients)
    return pd.Series(residual.ravel(), index=series.index)

def feature_exposures(df, prediction_name = 'Target', feature_names=None):
    """Correlation of the ``prediction_name`` column with each feature column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the prediction column and the feature columns.
    prediction_name : str
        Column correlated against every feature.
    feature_names : iterable of str, optional
        Columns to use as features. When omitted, a notebook-level ``features``
        list is used if one exists (the original behaviour); otherwise every
        numeric column of ``df`` except ``prediction_name`` is used, so the
        function no longer fails with ``NameError`` on a fresh kernel.

    Returns
    -------
    np.ndarray
        One Pearson correlation per feature, in the order of ``feature_names``.
        NaNs in the inputs are replaced via ``np.nan_to_num`` first.
    """
    if feature_names is None:
        try:
            feature_names = features  # legacy: optional notebook-level global
        except NameError:
            numeric_columns = df.select_dtypes(include=[np.number]).columns
            feature_names = [c for c in numeric_columns if c != prediction_name]

    prediction = np.nan_to_num(df[prediction_name].values)
    return np.array([
        np.corrcoef(prediction, np.nan_to_num(df[f].values))[0, 1]
        for f in feature_names
    ])

def max_feature_exposure(df):
    """Largest absolute value among the exposures returned by ``feature_exposures(df)``."""
    exposures = feature_exposures(df)
    return np.max(np.abs(exposures))
def feature_exposure(df):
    """Root-mean-square of the exposures returned by ``feature_exposures(df)``."""
    exposures = feature_exposures(df)
    mean_square = np.mean(np.square(exposures))
    return np.sqrt(mean_square)

In [None]:
# Two new features from the competition tutorial
def upper_shadow(df):
    """Upper wick of a candle: ``High`` minus the larger of ``Close`` and ``Open``."""
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top

def lower_shadow(df):
    """Lower wick of a candle: the smaller of ``Close`` and ``Open`` minus ``Low``."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

# A utility function to build features from the original df.
# It works for single rows too, so we can reuse it.
def get_features(df, row=False):
    """Build the model features from raw candle data.

    Parameters
    ----------
    df : pd.DataFrame or pd.Series
        Must provide ``Count``, ``Open``, ``High``, ``Low``, ``Close``,
        ``Volume``, ``VWAP`` and ``timestamp`` (seconds since the epoch).
    row : bool
        Pass ``True`` when ``df`` is a single row (a Series) rather than a frame.

    Returns
    -------
    Same kind of object as the input selection, holding the seven raw columns
    plus the derived price/volume features and ``hour`` / ``dayofweek`` / ``day``.
    """
    df_feat = df[['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']].copy()
    df_feat['Upper_Shadow'] = upper_shadow(df_feat)
    df_feat['Lower_Shadow'] = lower_shadow(df_feat)

    df_feat["Close/Open"] = df_feat["Close"] / df_feat["Open"]
    df_feat["Close-Open"] = df_feat["Close"] - df_feat["Open"]
    df_feat["High-Low"] = df_feat["High"] - df_feat["Low"]
    df_feat["High/Low"] = df_feat["High"] / df_feat["Low"]

    # Average of the four prices: per row for a frame, a single scalar for one row.
    ohlc = df_feat[['Open', 'High', 'Low', 'Close']]
    df_feat['Mean'] = ohlc.mean() if row else ohlc.mean(axis=1)

    df_feat['High/Mean'] = df_feat['High'] / df_feat['Mean']
    df_feat['Low/Mean'] = df_feat['Low'] / df_feat['Mean']
    # +1 keeps the denominator away from zero when Count is 0.
    df_feat['Volume/Count'] = df_feat['Volume'] / (df_feat['Count'] + 1)

    ## possible seasonality, datetime features (unlikely to be meaningful, given very short time-frames)
    ### to do: add cyclical features for seasonality
    # `infer_datetime_format` is deprecated in pandas 2.x and is not needed when `unit` is given.
    times = pd.to_datetime(df["timestamp"], unit="s")
    # A single Timestamp exposes the fields directly; a Series needs the `.dt` accessor.
    calendar = times if row else times.dt
    df_feat["hour"] = calendar.hour
    df_feat["dayofweek"] = calendar.dayofweek
    df_feat["day"] = calendar.day

    return df_feat


def get_Xy_and_model_for_asset(df_train, asset_id):
    """Fit a small baseline LightGBM regressor for one asset.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training frame with an ``Asset_ID`` column, a ``Target`` column and the
        raw columns consumed by ``get_features``.
    asset_id
        Value of ``Asset_ID`` selecting the asset.

    Returns
    -------
    (X, y, model)
        Feature frame, target series (rows containing NaN/inf removed) and the
        fitted regressor.
    """
    df = df_train[df_train["Asset_ID"] == asset_id]

    # TODO: Try different features here!
    df_proc = get_features(df)
    df_proc['y'] = df['Target']
    # Drop any row where a feature or the target is NaN or infinite.
    df_proc = df_proc.replace([np.inf, -np.inf], np.nan).dropna(how="any")

    X = df_proc.drop("y", axis=1)
    y = df_proc["y"]

    # TODO: Try different models here!
    # `LGBMRegressor` is never imported by name in this notebook; reach it via `lgb`.
    model = lgb.LGBMRegressor(n_estimators=10)
    model.fit(X, y)
    return X, y, model

In [None]:
# Load the preprocessed training data and add a human-readable `datetime` column
# derived from `timestamp` (interpreted as seconds since the epoch).
df = pd.read_feather("../input/filledtraindata/train.feather")
df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
# Asset metadata indexed by asset name; its `Weight` column is used later for the final score.
asset_df = pd.read_csv("../input/g-research-crypto-forecasting/asset_details.csv", index_col="Asset_Name")

In [None]:
from sklearn.model_selection import TimeSeriesSplit
# Time-ordered CV splitter. Sizes are given in rows and are computed as
# days * 24 * 60, i.e. one row per minute is assumed.
cv = TimeSeriesSplit(
    n_splits=N_SPLIT,
    test_size=TEST_DAY * 24 * 60,
    gap=GAP_DAY * 24 * 60,
    # A negative TRAIN_DAY means "do not cap the training window".
    max_train_size=TRAIN_DAY * 24 * 60 if TRAIN_DAY >= 0 else None,
)

In [None]:
def _drop_invalid_rows(features, target):
    """Drop rows whose features hold NaN/inf and drop the same rows from ``target``."""
    cleaned = features.replace([np.inf, -np.inf], np.nan)
    keep = cleaned.notna().all(axis=1).to_numpy()
    return cleaned[keep], target[keep]


def get_score_for_one_symbol(all_df, asset_id, dry_run=False, model_params=None, dump_root="ckpt"):
    """Cross-validate a LightGBM model for one asset and checkpoint every fold.

    Relies on the notebook-level ``cv`` splitter and ``N_SPLIT`` constant.

    Parameters
    ----------
    all_df : pd.DataFrame
        Frame with at least ``Asset_ID``, ``timestamp``, ``Target`` and the raw
        columns consumed by ``get_features``.
    asset_id
        Value of ``Asset_ID`` to select; also used as the checkpoint sub-directory name.
    dry_run : bool
        Stop after the first fold.
    model_params : dict, optional
        Passed to ``lgb.train`` as ``params`` and stored in the metadata file.
        Defaults to an empty dict.
    dump_root : str
        Directory (relative to the current working directory) that receives
        ``<asset_id>/<fold>/lgb.ckpt`` and ``<asset_id>/lgb_meta.json``.

    Returns
    -------
    (avg_train_score, avg_test_score, meta)
        Scores averaged over the folds that were actually run, plus the
        metadata dict that was written to disk.

    Raises
    ------
    AssertionError
        If a fold's rows are not exactly 60 seconds apart.
    """
    if model_params is None:
        model_params = {}

    # Filter the frame that was passed in, not the notebook-global `df`.
    symbol_df = all_df[all_df.Asset_ID == asset_id].ffill().dropna()

    asset_dir = os.path.join(os.getcwd(), dump_root, str(asset_id))
    os.makedirs(asset_dir, exist_ok=True)

    train_score_by_cv = [0] * N_SPLIT
    test_score_by_cv = [0] * N_SPLIT
    train_size_by_cv = [0] * N_SPLIT
    test_size_by_cv = [0] * N_SPLIT
    iter_by_cv = [0] * N_SPLIT
    folds_done = 0

    for i, (train_index, test_index) in enumerate(cv.split(symbol_df)):
        train_size = len(train_index)
        test_size = len(test_index)

        # Build features on one block that is continuous in time (train + gap + test).
        # The end bound is inclusive: excluding the last test row would shift the
        # test features by one row relative to the test targets.
        start_index, end_index = train_index[0], test_index[-1]
        block = symbol_df.iloc[start_index: end_index + 1]
        first_ts = int(block["timestamp"].iloc[0])
        last_ts = int(block["timestamp"].iloc[-1])
        # continuous: one row per 60-second step
        assert block.shape[0] == len(range(first_ts, last_ts + 60, 60))

        df_proc = get_features(block)
        # Rows with NaN/inf features are removed from features AND targets so they stay aligned.
        train_features, train_target = _drop_invalid_rows(
            df_proc.iloc[:train_size], symbol_df["Target"].iloc[train_index]
        )
        test_features, test_target = _drop_invalid_rows(
            df_proc.iloc[-test_size:], symbol_df["Target"].iloc[test_index]
        )
        assert len(train_features) == len(train_target)
        assert len(test_features) == len(test_target)

        train_set = lgb.Dataset(train_features, label=train_target)
        test_set = lgb.Dataset(test_features, label=test_target)
        booster = lgb.train(train_set=train_set, params=model_params, valid_sets=[test_set], feval=pearson_eval,)

        train_score_by_cv[i] = float(validate_one_symble(booster, train_features, train_target))
        test_score_by_cv[i] = float(validate_one_symble(booster, test_features, test_target))
        train_size_by_cv[i] = int(train_size)
        test_size_by_cv[i] = int(test_size)
        iter_by_cv[i] = booster.best_iteration

        fold_dir = os.path.join(asset_dir, str(i))
        os.makedirs(fold_dir, exist_ok=True)
        with open(os.path.join(fold_dir, "lgb.ckpt"), "w") as f:
            f.write(booster.model_to_string())

        folds_done += 1
        if dry_run:
            break

    # Average over the folds that actually ran (a dry run completes only one).
    n_scored = max(folds_done, 1)
    avg_train_score = sum(train_score_by_cv) / n_scored
    avg_test_score = sum(test_score_by_cv) / n_scored
    meta = {
        "train_score": train_score_by_cv,
        "test_score": test_score_by_cv,
        "train_size_by_cv": train_size_by_cv,
        "test_size_by_cv": test_size_by_cv,
        "model_params": model_params,
        "avg_train_score": avg_train_score,
        "avg_test_score": avg_test_score,
        "iter_by_cv": iter_by_cv,
    }

    meta_path = os.path.join(asset_dir, "lgb_meta.json")
    with open(meta_path, "w") as f:
        f.write(json.dumps(meta, indent=2))
    return avg_train_score, avg_test_score, meta

In [None]:
# Per-asset weights rescaled so that they sum to 1 over every asset in `asset_df`.
weights = asset_df["Weight"].div(asset_df["Weight"].sum())

In [None]:
# Average cross-validation scores per asset, keyed by the asset's `Asset_ID` value.
train_score_by_symbol = {}
test_score_by_symbol = {}

# Cross-validate one model per asset; assets listed in SKIPS are left out entirely.
for asset_id in df.Asset_ID.unique():
    if asset_id in SKIPS:
        print("Skip ", asset_id)
        continue
    print(asset_id + "\n***")
    # Full run over every fold; checkpoints and metadata are written under CKPT.
    train_score, test_score, meta = get_score_for_one_symbol(df, asset_id, dry_run=False, model_params=MODEL_PARAMS, dump_root=CKPT)
    train_score_by_symbol[asset_id] = train_score
    test_score_by_symbol[asset_id] = test_score
    
    print(meta)
    print("\n")

In [None]:
# Weighted sum of the per-asset scores.
# NOTE(review): `weights` is normalized over all assets, while assets in SKIPS have
# no score here, so the weights actually used sum to less than 1 -- confirm whether
# the scores should be renormalized over the scored assets.
final_train_score = sum(score * weights[s] for s, score in train_score_by_symbol.items())
final_test_score = sum(score * weights[s] for s, score in test_score_by_symbol.items())
print(f"avg. model score on train: {final_train_score:.4f}")
print(f"avg. model score on test: {final_test_score:.4f}")

In [None]:
# One row per asset with its average train/test score, ordered by train score (ascending).
score_by_symbol = (
    pd.DataFrame(
        {
            "train_score": train_score_by_symbol,
            "test_score": test_score_by_symbol,
        }
    )
    .sort_values(by="train_score")
)

In [None]:
# Display the per-asset score table.
score_by_symbol