In [None]:
!pip install scikit-learn==0.24.2
!pip install talib-binary

## CrossValidation Method
- For time series, `TimeSeriesSplit` is the usual choice for cross-validation. However, it limits the amount of data that can be used for training in each fold.
- According to "Mörke, Mathis. "Marcos López de Prado: Advances in financial machine learning." (2019): 491-493.", we can also use PurgedKFold for cross-validation. In this scheme, the purpose is to measure the model performance under different market conditions ("market temperature"), so we need to manually split the history into market periods and test the model on each of them. Also, in order to prevent look-ahead bias, we leave a one-month gap before and after each test period.

## Feature Engineering
- The engineering techniques are described here https://www.kaggle.com/axzhang/pipeline-building-featureengineering-stage1

In [None]:
# Cross-validation / bookkeeping constants.
# NOTE(review): TEST_DAY, TRAIN_DAY, GAP_DAY and SKIPS are not referenced in the visible cells - confirm whether they are still needed.
TEST_DAY = 3 * 30
# train_day = 6 * 30
TRAIN_DAY = -1
GAP_DAY = 15
N_SPLIT = 6  # number of CV folds; sizes the per-fold result lists and divides the averaged scores
CKPT = "ckpt"  # output directory (relative to the working directory) for models, metadata and score CSVs
SKIPS = []
EPS = 1e-18  # added to denominators in the feature functions to avoid division by zero

# Parameters handed to lgb.train for every asset/fold.
MODEL_PARAMS = {
    "n_estimators": 1000,
    "early_stopping_round": 50,
    "max_depth": 4,  # choose a very shallow depth to avoid overfitting.
    "random_seed": 2021,
    "learning_rate": 1e-3,
    "colsample_bytree": 0.3,  # Most of the time a trader only looks at <= 5 features to make a decision, so we limit the feature-wise sample size accordingly.
    "subsample": 0.3,
    "metric": "custom",  # the score is supplied through feval=pearson_eval at training time
    "verbosity": -1,
    "min_data_in_leaf": 100,
    "device": "gpu"
}

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb 
import sklearn
import re
import os
import json
import talib
import gc
import pickle
from tqdm.notebook import tqdm
from scipy.stats import pearsonr
import logging
pd.set_option('display.max_rows', 200)
def pearson_eval(preds, train_data):
    """Custom LightGBM evaluation hook based on the Pearson correlation.

    NaNs in both the labels (read through ``train_data.get_label()``) and the
    predictions are replaced with ``np.nan_to_num`` before correlating.

    Returns:
        A ``(metric_name, value, is_higher_better)`` tuple, i.e.
        ``('corr', <pearson r>, True)``.
    """
    clean_labels = np.nan_to_num(train_data.get_label())
    clean_preds = np.nan_to_num(preds)
    correlation = pearsonr(clean_labels, clean_preds)[0]
    return 'corr', correlation, True

# logger = logging.getLogger()
# logger.setLevel(logging.INFO)
# lgb.register_logger(logger)
def weighted_correlation(a, b, weights):
    """Weighted Pearson correlation between ``a`` and ``b``.

    All three inputs are flattened first. A small constant (1e-12) is added to
    the denominator so that a zero-variance input yields 0 instead of a
    division error.
    """
    flat_w = np.ravel(weights)
    flat_a = np.ravel(a)
    flat_b = np.ravel(b)
    total_w = np.sum(flat_w)

    def _weighted_mean(values):
        # Weighted average of `values` under `flat_w`.
        return np.sum(values * flat_w) / total_w

    mu_a = _weighted_mean(flat_a)
    mu_b = _weighted_mean(flat_b)
    variance_a = _weighted_mean(np.square(flat_a - mu_a))
    variance_b = _weighted_mean(np.square(flat_b - mu_b))

    covariance = _weighted_mean(flat_a * flat_b) - mu_a * mu_b
    return covariance / (np.sqrt(variance_a * variance_b) + 1e-12)

def validate_one_symble(model, features, label):
    """Score ``model`` on one symbol.

    Predicts on ``features`` and returns the correlation between ``label`` and
    the predictions, using equal (all-ones) weights.
    """
    predictions = model.predict(features)
    uniform_weights = np.ones_like(predictions)
    return weighted_correlation(label, predictions, uniform_weights)

def neutralize_series(series : pd.Series, by : pd.Series, proportion=1.0):
    """
    Neutralize a pandas series against another one (originally from the Numerai Tournament).

    ``series`` is regressed (least squares) on two columns: ``by`` and a constant
    column holding the mean of ``series``. ``proportion`` times the fitted
    values is then subtracted from ``series``. NaNs in either input are
    replaced via ``np.nan_to_num`` first.

    Returns:
        A ``pd.Series`` of neutralized values carrying ``series.index``.
    """
    target_col = np.nan_to_num(series.values).reshape(-1, 1)
    by_col = np.nan_to_num(by.values).reshape(-1, 1)
    # Constant regressor: the (NaN-cleaned) mean of the series, repeated per row.
    mean_col = np.full((len(by_col), 1), np.mean(np.nan_to_num(series.values)))
    design = np.hstack((by_col, mean_col))

    coefficients = np.linalg.lstsq(design, target_col)[0]
    residual = target_col - proportion * (design.dot(coefficients))
    return pd.Series(residual.ravel(), index=series.index)

def feature_exposures(df, prediction_name = 'Target', feature_names=None):
    """Correlation of the prediction column with every feature column.

    Args:
        df: DataFrame holding the prediction column and the feature columns.
        prediction_name: name of the prediction column.
        feature_names: columns to measure exposure against. When omitted, a
            notebook-level ``features`` list is used if one exists (the
            previous behaviour); otherwise every column of ``df`` except
            ``prediction_name`` is used. The previous implementation raised
            ``NameError`` when no global ``features`` had been defined.

    Returns:
        ``np.ndarray`` with one Pearson correlation per feature, in the order
        of ``feature_names``. NaNs are replaced via ``np.nan_to_num`` before
        correlating.
    """
    if feature_names is None:
        feature_names = globals().get("features")
    if feature_names is None:
        feature_names = [col for col in df.columns if col != prediction_name]

    prediction = np.nan_to_num(df[prediction_name].values)
    exposures = [
        np.corrcoef(prediction, np.nan_to_num(df[f].values))[0, 1]
        for f in feature_names
    ]
    return np.array(exposures)

def max_feature_exposure(df):
    """Largest absolute value among the feature exposures of ``df``."""
    exposures = feature_exposures(df)
    return np.max(np.abs(exposures))
def feature_exposure(df):
    """Root-mean-square of the feature exposures of ``df``."""
    exposures = feature_exposures(df)
    return np.sqrt(np.mean(np.square(exposures)))

In [None]:
def get_feature_name(func_name, meta):
    """Build the feature name for ``func_name`` from its settings.

    * no ``"timeperiod"`` key  -> ``func_name``
    * string time period        -> evaluated and its items joined with ``_``
      (``"(2, 18)"`` gives ``<func_name>_2_18``)
    * any other time period     -> ``<func_name>_<timeperiod>``
    """
    if "timeperiod" not in meta:
        return func_name

    period = meta["timeperiod"]
    if isinstance(period, str):
        # NOTE(review): eval() runs whatever the config string contains; only
        # use this with trusted configuration files.
        suffix = "_".join(str(part) for part in eval(period))
    else:
        suffix = str(period)
    return func_name + "_" + suffix

def read_feature_config(symbol):
    """Load the per-symbol feature configuration.

    Reads ``../input/bestparameter/best_period_<symbol>.json`` (a mapping of
    function name -> settings), stores the function name inside each settings
    dict under ``"func_name"`` and re-keys the mapping by the feature name
    produced by ``get_feature_name``.
    """
    config_path = os.path.join("../input/bestparameter", "best_period_{}.json".format(symbol))
    with open(config_path) as f:
        raw_config = json.load(f)

    keyed_config = {}
    for func_name, setting in raw_config.items():
        setting["func_name"] = func_name
        keyed_config[get_feature_name(func_name, setting)] = setting
    return keyed_config

In [None]:
def _column_return(df, column, n):
    """Percentage change of ``df[column]`` over ``n`` rows."""
    return df[column].pct_change(periods=n)


def RET(df, n):
    """``n``-row percentage change of Close."""
    return _column_return(df, 'Close', n)


def RET_C(df, n):
    """``n``-row percentage change of Close (same computation as ``RET``)."""
    return _column_return(df, 'Close', n)


def RET_H(df, n):
    """``n``-row percentage change of High."""
    return _column_return(df, 'High', n)


def RET_L(df, n):
    """``n``-row percentage change of Low."""
    return _column_return(df, 'Low', n)


def RET_O(df, n):
    """``n``-row percentage change of Open."""
    return _column_return(df, 'Open', n)


def RET_V(df, n):
    """``n``-row percentage change of Volume."""
    return _column_return(df, 'Volume', n)


def RET_VWAP(df, n):
    """``n``-row percentage change of VWAP."""
    return _column_return(df, 'VWAP', n)


def RET_Cnt(df, n):
    """``n``-row percentage change of Count."""
    return _column_return(df, 'Count', n)


def STD(df, n):
    """Rolling standard deviation (window ``n``) of the 1-row Close returns."""
    one_step_returns = df['Close'].pct_change(1)
    return one_step_returns.rolling(n).std()


def RET_STD(df, n):
    """Product of the ``n``-row Close return and the ``n``-row rolling return std."""
    period_return = RET(df, n)
    return_std = STD(df, n)
    return period_return * return_std

def RSI(df, n):
    """TA-Lib RSI of Close with time period ``n``."""
    close = df['Close']
    return talib.RSI(close, n)


def ATR(df, n):
    """TA-Lib ATR computed from High, Low and Close with time period ``n``."""
    high, low, close = df["High"], df["Low"], df["Close"]
    return talib.ATR(high, low, close, n)


def MFI(df, n):
    """TA-Lib MFI computed from High, Low, Close and Volume with time period ``n``."""
    high, low = df['High'], df['Low']
    close, volume = df['Close'], df['Volume']
    return talib.MFI(high, low, close, volume, n)

def VOL(df, n):
    """Root of the rolling mean (window ``n``) of squared 1-row Close returns."""
    squared_returns = df['Close'].pct_change(1) ** 2
    mean_squared = squared_returns.rolling(n).mean()
    return np.sqrt(mean_squared)

def TRIX(df, n):
    """TA-Lib TRIX of Close with time period ``n``."""
    close = df['Close']
    return talib.TRIX(close, n)


def MACD(df, fast, slow):
    """First output of ``talib.MACD`` on Close for the given fast/slow periods."""
    outputs = talib.MACD(df.Close, fast, slow)
    return outputs[0]


def MACD_HIST(df, fast, slow):
    """Third output of ``talib.MACD`` on Close for the given fast/slow periods."""
    outputs = talib.MACD(df.Close, fast, slow)
    return outputs[2]

def DEMA(df, n1, n2):
    """Log ratio of the ``n1``-row to the ``n2``-row rolling mean of Close."""
    fast_mean = df['Close'].rolling(n1).mean()
    slow_mean = df['Close'].rolling(n2).mean()
    # EPS keeps the denominator away from zero.
    return np.log(fast_mean / (slow_mean + EPS))


def EFFICIENCY(df, n):
    """Net Close move over ``n`` rows divided by the summed absolute 1-row moves."""
    net_move = df.Close.diff(n)
    path_length = df.Close.diff(1).abs().rolling(n).sum()
    return net_move / (path_length + EPS)


def EI(df, n):
    """Distance to the rolling High maximum over distance to the rolling Low minimum.

    Both extremes use a window of ``n`` rows; the ratio is clipped to
    ``[-100, 100]``.
    """
    close = df.Close
    above_low = close - df.Low.rolling(n).min()
    below_high = df.High.rolling(n).max() - close
    ratio = below_high / (above_low + EPS)
    return np.clip(ratio, -100, 100)

def EVEMT_BIGGER_VOLUME(df):
    """1.0 where Volume is greater than on the previous row, else 0.0."""
    previous_volume = df.Volume.shift(1)
    return df.Volume.gt(previous_volume).astype(float)


def EVENT_MACROSS(df, fast, slow):
    """1.0 where the ``fast`` rolling mean of Close is above the ``slow`` one, else 0.0."""
    fast_mean = df.Close.rolling(fast).mean()
    slow_mean = df.Close.rolling(slow).mean()
    is_above = fast_mean > slow_mean
    return is_above.astype(float)

def upper_shadow(df):
    """High minus the larger of Close and Open."""
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top


def lower_shadow(df):
    """The smaller of Close and Open minus Low."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

In [None]:
def _matrix_return(df, key, n):
    """Percentage change of ``df[key]`` over ``n`` rows."""
    return df[key].pct_change(periods=n)


def RET_VEC(df, n):
    """``n``-row percentage change of ``df['Close']``."""
    return _matrix_return(df, 'Close', n)


def RET_C_VEC(df, n):
    """``n``-row percentage change of ``df['Close']`` (same computation as ``RET_VEC``)."""
    return _matrix_return(df, 'Close', n)


def RET_H_VEC(df, n):
    """``n``-row percentage change of ``df['High']``."""
    return _matrix_return(df, 'High', n)


def RET_L_VEC(df, n):
    """``n``-row percentage change of ``df['Low']``."""
    return _matrix_return(df, 'Low', n)


def RET_O_VEC(df, n):
    """``n``-row percentage change of ``df['Open']``."""
    return _matrix_return(df, 'Open', n)


def RET_V_VEC(df, n):
    """``n``-row percentage change of ``df['Volume']``."""
    return _matrix_return(df, 'Volume', n)


def RET_VWAP_VEC(df, n):
    """``n``-row percentage change of ``df['VWAP']``."""
    return _matrix_return(df, 'VWAP', n)


def RET_Cnt_VEC(df, n):
    """``n``-row percentage change of ``df['Count']``."""
    return _matrix_return(df, 'Count', n)


def STD_VEC(df, n):
    """Rolling standard deviation (window ``n``) of the 1-row Close returns."""
    one_step_returns = df['Close'].pct_change(1)
    return one_step_returns.rolling(n).std()


def RET_STD_VEC(df, n):
    """Product of the ``n``-row Close return and the ``n``-row rolling return std."""
    period_return = RET_VEC(df, n)
    return_std = STD_VEC(df, n)
    return period_return * return_std

def RSI_VEC(df, n):
    """TA-Lib RSI (time period ``n``) applied to every column of ``df['Close']``."""
    close = df["Close"]
    feat = pd.DataFrame(index=close.index, columns=close.columns)
    for asset in close.columns:
        asset_close = close[asset]
        feat[asset] = talib.RSI(asset_close, n)
    return feat


def ATR_VEC(df, n):
    """TA-Lib ATR (time period ``n``) computed column by column from High, Low and Close."""
    high, low, close = df["High"], df["Low"], df["Close"]
    feat = pd.DataFrame(index=close.index, columns=close.columns)
    for asset in close.columns:
        feat[asset] = talib.ATR(high[asset], low[asset], close[asset], n)
    return feat


def MFI_VEC(df, n):
    """TA-Lib MFI (time period ``n``) computed column by column from High, Low, Close and Volume."""
    high, low = df["High"], df["Low"]
    close, volume = df["Close"], df["Volume"]
    feat = pd.DataFrame(index=close.index, columns=close.columns)
    for asset in close.columns:
        feat[asset] = talib.MFI(high[asset], low[asset], close[asset], volume[asset], n)
    return feat

def VOL_VEC(df, n):
    """Root of the rolling mean (window ``n``) of squared 1-row Close returns."""
    squared_returns = df['Close'].pct_change(1) ** 2
    mean_squared = squared_returns.rolling(n).mean()
    return np.sqrt(mean_squared)

def TRIX_VEC(df, n):
    """TA-Lib TRIX (time period ``n``) applied to every column of ``df['Close']``."""
    close = df["Close"]
    feat = pd.DataFrame(index=close.index, columns=close.columns)
    for asset in close.columns:
        asset_close = close[asset]
        feat[asset] = talib.TRIX(asset_close, n)
    return feat


def MACD_VEC(df, fast, slow):
    """First output of ``talib.MACD`` for every column of ``df['Close']``."""
    close = df["Close"]
    feat = pd.DataFrame(index=close.index, columns=close.columns)
    for asset in close.columns:
        outputs = talib.MACD(close[asset], fast, slow)
        feat[asset] = outputs[0]
    return feat


def MACD_HIST_VEC(df, fast, slow):
    """Third output of ``talib.MACD`` for every column of ``df['Close']``."""
    close = df["Close"]
    feat = pd.DataFrame(index=close.index, columns=close.columns)
    for asset in close.columns:
        outputs = talib.MACD(close[asset], fast, slow)
        feat[asset] = outputs[2]
    return feat

def DEMA_VEC(df, n1, n2):
    """Log ratio of the ``n1``-row to the ``n2``-row rolling mean of Close."""
    fast_mean = df['Close'].rolling(n1).mean()
    slow_mean = df['Close'].rolling(n2).mean()
    # EPS keeps the denominator away from zero.
    return np.log(fast_mean / (slow_mean + EPS))


def EFFICIENCY_VEC(df, n):
    """Net Close move over ``n`` rows divided by the summed absolute 1-row moves."""
    close = df["Close"]
    net_move = close - close.shift(n)
    path_length = (close - close.shift(1)).abs().rolling(n).sum()
    return net_move / (path_length + EPS)


def EI_VEC(df, n):
    """Distance to the rolling High maximum over distance to the rolling Low minimum.

    Both extremes use a window of ``n`` rows; the ratio is clipped to
    ``[-100, 100]``.
    """
    close = df["Close"]
    above_low = close - df["Low"].rolling(n).min()
    below_high = df["High"].rolling(n).max() - close
    ratio = below_high / (above_low + EPS)
    return np.clip(ratio, -100, 100)

def EVEMT_BIGGER_VOLUME_VEC(df):
    """1.0 where Volume is greater than on the previous row, else 0.0."""
    volume = df["Volume"]
    is_bigger = volume > volume.shift(1)
    return is_bigger.astype(float)


def EVENT_MACROSS_VEC(df, fast, slow):
    """1.0 where the ``fast`` rolling mean of Close is above the ``slow`` one, else 0.0."""
    close = df["Close"]
    is_above = close.rolling(fast).mean() > close.rolling(slow).mean()
    return is_above.astype(float)

def upper_shadow_VEC(df):
    """High minus the larger of Close and Open."""
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top


def lower_shadow_VEC(df):
    """The smaller of Close and Open minus Low."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

In [None]:
def get_features(df, row=False, config=None):
    """Build the per-asset feature frame from raw OHLCV data.

    Args:
        df: data with the columns Count, Open, High, Low, Close, Volume, VWAP,
            Target and timestamp (seconds since the epoch).
        row: when True, ``df["timestamp"]`` is treated as a single value
            (``.hour``); otherwise as a Series (``.dt.hour``).
        config: optional mapping of feature name -> settings. Each settings
            dict needs ``"func_name"`` (name of a feature function defined in
            this notebook) and may carry ``"timeperiod"``: a number passed as
            the single extra argument, or a string such as ``"(2, 18)"`` that
            is evaluated into the extra arguments. The mapping is copied, so
            the caller's dict is no longer modified.

    Returns:
        The feature frame. The raw Open/High/Low/Close/Volume/VWAP columns are
        removed; ``Count`` is kept.
    """
    raw_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
    df_feat = df[raw_columns].copy()
    return_funcs = {"Count": "RET_Cnt",
                    "Open": "RET_O",
                    "Close": "RET_C",
                    "Volume": "RET_V",
                    "VWAP": "RET_VWAP",
                    "High": "RET_H",
                    "Low": "RET_L",
                    }

    # Copy before adding the built-in 60-row returns: the original code wrote
    # these entries straight into the dict owned by the caller.
    config = dict(config) if config is not None else {}
    for col in raw_columns:
        config["RET60_{}".format(col)] = {"func_name": return_funcs[col], "timeperiod": 60}

    df_feat['Upper_Shadow'] = upper_shadow(df_feat)
    df_feat['target_lag'] = df["Target"].shift(16)
    df_feat['Lower_Shadow'] = lower_shadow(df_feat)
    df_feat["MoM15"] = df_feat["VWAP"] - df_feat["VWAP"].shift(15)

    ## possible seasonality, datetime features (unlikely to be meaningful, given very short time-frames)
    ### to do: add cyclical features for seasonality
    # `infer_datetime_format` was dropped: it only affects string parsing and
    # is deprecated in recent pandas releases.
    times = pd.to_datetime(df["timestamp"], unit="s")
    df_feat["hour"] = times.hour if row else times.dt.hour

    # Configured features.
    # NOTE(review): eval() executes the function name and the time-period
    # string taken from the config; only use trusted configuration.
    for feature_name, setting in config.items():
        func = eval(setting["func_name"])
        period = setting.get("timeperiod")
        if "timeperiod" not in setting:
            # no argument
            feature = func(df_feat)
        elif isinstance(period, str):
            feature = func(df_feat, *eval(period))
        else:
            feature = func(df_feat, period)
        df_feat[feature_name] = feature

    # Raw price/volume levels are not used as model inputs (Count is kept, as before).
    for raw in ("Close", "High", "Low", "Open", "VWAP", "Volume"):
        df_feat.pop(raw)
    return df_feat

In [None]:
def get_features_vec(df, config=None):
    """Build all features for all assets at once.

    Args:
        df: mapping of raw column name -> DataFrame; the keys "Target",
            "Close", "timestamp" plus whatever the ``*_VEC`` feature functions
            read are required.
        config: optional mapping of feature name -> settings with
            ``"func_name"`` (the ``_VEC`` suffix is appended to find the
            function) and an optional ``"timeperiod"`` (number, or a string
            such as ``"(2, 18)"`` that is evaluated into the arguments). The
            mapping is copied, so the caller's dict is no longer modified.

    Returns:
        dict of feature name -> feature object (a DataFrame for every feature
        computed from DataFrame inputs).
    """
    # Regular expressions of feature names to skip.
    # NOTE(review): re.match only anchors at the start, so e.g. "RSI_8" also
    # removes "RSI_80".."RSI_89". The list spells out both "RET_9" and
    # "RET_90", which suggests exact names were intended - confirm before
    # switching to re.fullmatch, since that changes the selected feature set.
    removed_features = [
    'Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', "VOL_(.+?)",
    "RET_6","RET_5","RET_24","RET_9","RET_90","RET_92","RET_81","RET_96","RET_40",
    "STD_41","STD_82","STD_43","STD_30","STD_11","STD_6","STD_5","STD_8",
    "RET_STD_86","RET_STD_69","RET_STD_80","RET_STD_10","RET_STD_72","RET_STD_45","RET_STD_48","RET_STD_26","RET_STD_29","RET_STD_25","RET_STD_6",
    "RSI_11","RSI_32","RSI_37","RSI_38","RSI_8","RSI_42","RSI_6","RSI_5","RSI_94",
    "ATR_(.+?)",
    "MFI_21","MFI_13","MFI_48","MFI_38","MFI_10","MFI_69","MFI_8","MFI_6","MFI_96","MFI_99","MFI_5",
    "VOL_34","VOL_82","VOL_42","VOL_96","VOL_14","VOL_10","VOL_6","VOL_5","VOL_2","VOL_4",
    "TRIX_5","TRIX_9","TRIX_12","TRIX_14","TRIX_32","TRIX_34","TRIX_41","TRIX_44","TRIX_49",
    "EFFICIENCY_15","EFFICIENCY_14","EFFICIENCY_13","EFFICIENCY_72","EFFICIENCY_81","EFFICIENCY_92","EFFICIENCY_96","EFFICIENCY_97",
    "EI_11","EI_19","EI_23","EI_35","EI_97","EI_98","EI_96","EI_79","EI_83","EI_64",
    "MACD_HIST_2_133","MACD_HIST_2_183","MACD_HIST_22_53","MACD_HIST_2_53","MACD_HIST_32_63","MACD_HIST_22_183","MACD_HIST_2_33","MACD_HIST_32_183","MACD_HIST_32_193","MACD_HIST_42_163","MACD_HIST_42_183",
    "DEMA_2_28","DEMA_2_63","DEMA_2_108","DEMA_2_138","DEMA_2_143","DEMA_2_153","DEMA_8_13","DEMA_16_158","DEMA_42_98","DEMA_32_98","DEMA_48_133","DEMA_48_143","DEMA_44_63",
    "MACD_4_8","MACD_6_18","MACD_12_13","MACD_2_78","MACD_2_83","MACD_2_98","MACD_2_113","MACD_2_138","MACD_2_153","MACD_8_143","MACD_48_118",
    "EVENT_MACROSS_1_98","EVENT_MACROSS_1_93","EVENT_MACROSS_1_133","EVENT_MACROSS_3_48","EVENT_MACROSS_1_178","EVENT_MACROSS_27_53","EVENT_MACROSS_23_83","EVENT_MACROSS_49_68","EVENT_MACROSS_49_103","EVENT_MACROSS_47_123","EVENT_MACROSS_41_143","EVENT_MACROSS_43_163","EVENT_MACROSS_49_183",
    "RET1_Count","RET1_Open","RET1_High","RET1_Low","RET1_Close","RET1_Volume","RET1_VWAP"]
    df_feat = {}
    return_funcs = {"Count": "RET_Cnt", "Volume": "RET_V", "VWAP": "RET_VWAP"}

    # Copy before adding the built-in 60/240-row returns: the original code
    # wrote these entries straight into the dict owned by the caller.
    config = dict(config) if config is not None else {}
    for col in ['Count', 'Volume', 'VWAP']:
        config["RET60_{}".format(col)] = {"func_name": return_funcs[col], "timeperiod": 60}
        config["RET240_{}".format(col)] = {"func_name": return_funcs[col], "timeperiod": 240}

    df_feat['Upper_Shadow'] = upper_shadow_VEC(df)
    df_feat['target_lag'] = df["Target"].shift(16)
    df_feat["MoM1"] = df["Close"] - df["Close"].shift(1)
    ## possible seasonality, datetime features (unlikely to be meaningful, given very short time-frames)
    ### to do: add cyclical features for seasonality
    # The hour is read from the first column of the timestamp matrix and
    # broadcast to every column.
    # NOTE(review): if that first column has missing or forward-filled
    # timestamps the hour is missing/stale for every asset - confirm the input
    # has no gaps there.
    # `infer_datetime_format` was dropped: it only affects string parsing and
    # is deprecated in recent pandas releases.
    timestamps = df["timestamp"]
    times = pd.to_datetime(timestamps.iloc[:, 0], unit="s")
    df_feat["hour"] = pd.DataFrame(
        np.broadcast_to(times.dt.hour.values.reshape(-1, 1), timestamps.shape),
        index=timestamps.index,
        columns=timestamps.columns,
    )

    # Configured features.
    # NOTE(review): eval() executes the function name and the time-period
    # string taken from the config; only use trusted configuration.
    for feature_name, setting in tqdm(config.items()):
        if any(re.match(pattern, feature_name) for pattern in removed_features):
            continue
        period = setting.get("timeperiod")
        if "timeperiod" not in setting:
            # no argument
            args = ()
        elif isinstance(period, str):
            args = eval(period)
        else:
            args = (period, )

        func = eval(setting["func_name"] + "_VEC")
        df_feat[feature_name] = func(df, *args)

    # Final sweep: also drop any built-in feature whose name matches a removal pattern.
    for pattern in removed_features:
        for key in list(df_feat.keys()):
            if re.match(pattern, key):
                df_feat.pop(key)
                print("remove ", key)
    return df_feat

In [None]:
# Load the training data and index it by (Asset_ID, datetime).
# drop=False keeps both index levels available as ordinary columns as well.
# NOTE(review): later cells look assets up by name (df.loc[asset_name]), so Asset_ID presumably holds asset names in this dataset - confirm.
df = pd.read_feather("../input/filleddataset/train.feather")
df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
df = df.set_index(["Asset_ID", "datetime"], drop=False)

In [None]:
# Asset metadata indexed by asset name; `assets` is the list of names iterated by the later cells.
# NOTE(review): the "c/c/" segment of the path looks unusual - confirm it matches the mounted dataset.
asset_df = pd.read_csv("../input/c/c/g-research-crypto-forecasting/asset_details.csv", index_col="Asset_Name")
assets = list(asset_df.index)

In [None]:
# Asset weights rescaled so that they sum to 1.
weights = asset_df["Weight"] / asset_df["Weight"].sum()

In [None]:
# Manually chosen market periods; each one is the test window of one CV fold.
# The bull / bear / neutral label of a period is assigned inside get_score_for_one_symbol_new_cv.
# NOTE(review): period5 stops and period6 starts on the same date ("2021-10-1") - confirm the overlap is intended.
period1 = slice("2018-1-1", "2018-11-1")  # 178560
# weighted_ret.loc[period1].plot()
period2 = slice("2019-1-1", "2019-7-1")
# weighted_ret.loc[period2].plot()
period3 = slice("2019-8-1", "2019-12-1")
# weighted_ret.loc[period3].plot()
period4 = slice("2020-1-1", "2021-5-1")
# weighted_ret.loc[period4].plot()
period5 = slice("2021-6-1", "2021-10-1")
# weighted_ret.loc[period5].plot()
period6 = slice("2021-10-1", "2022-1-25")
# weighted_ret.loc[period6].plot()
# weighted_ret.loc[period6].plot()

In [None]:
# Preprocess the data, one asset at a time:
# 1) delete the unused early history of some assets
# 2) forward fill, then drop rows that still contain missing values
# 3) normalize prices by the first Close, and Volume / Count by their own first value
new_dfs = {}
for asset_name in assets:
    sub_df = df.loc[asset_name]
    if asset_name == "Maker":
        sub_df = sub_df.loc["2020-08-04":]
    elif asset_name == "Monero":
        sub_df = sub_df.loc["2018-11-05":]
    elif asset_name == "Stellar":
        sub_df = sub_df.loc["2018-07-14":]
#     sub_df = sub_df.loc["2020-1-1":]
    sub_df = sub_df.ffill().dropna()
    if sub_df.size == 0:
        raise ValueError("no rows left for asset {!r} after forward fill / dropna".format(asset_name))

    # Read the reference price once, BEFORE rescaling anything. The previous
    # loop re-read sub_df["Close"].iloc[0] for every column, so every price
    # column processed after Close (e.g. VWAP) was divided by 1.0 and stayed
    # unnormalized.
    first_close = sub_df["Close"].iloc[0]
    for c in sub_df.columns:
        if c in ["Close", "High", "Low", "Open", "VWAP"]:
            sub_df[c] = sub_df[c] / first_close
        elif c in ["Volume", "Count"]:
            # NOTE(review): a zero first Volume/Count produces inf/NaN here.
            sub_df[c] = sub_df[c] / sub_df[c].iloc[0]
    new_dfs[asset_name] = sub_df

df = pd.concat(new_dfs, axis=0, names=["Asset_ID"])
del new_dfs

In [None]:
# Feature configuration: feature name -> settings.
#   'func_name'  : base name of the feature function (get_features_vec appends "_VEC")
#   'timeperiod' : a number (single argument) or a string such as '(2, 23)' that is evaluated into the arguments
#   'correlation': not read by any visible cell
all_configs = {
#  "CHuWhiteCUMSUM_1200": {"func_name": "CHuWhiteCUMSUM", "timeperiod": "(1200, 3600)"},
#  "CHuWhiteCUMSUM_240": {"func_name": "CHuWhiteCUMSUM", "timeperiod": "(240, 3600)"},
#  "CHuWhiteCUMSUM_90": {"func_name": "CHuWhiteCUMSUM", "timeperiod": "(90, 3600)"},
#  "HighLowVolatilityEstimator_240": {"func_name": "HighLowVolatilityEstimator", "timeperiod": 240},
#  "HighLowVolatilityEstimator_90": {"func_name": "HighLowVolatilityEstimator", "timeperiod": 90},
#  "HigLowLiquidityEstimator_240": {"func_name": "HigLowLiquidityEstimator", "timeperiod": 240},
#  "HigLowLiquidityEstimator_90": {"func_name": "HigLowLiquidityEstimator", "timeperiod": 90},
    
 'MFI_23': {'timeperiod': 23, 'correlation': 0, 'func_name': 'MFI'},
 'DEMA_2_23': {'timeperiod': '(2, 23)', 'correlation': 0, 'func_name': 'DEMA'},
 'RSI_16': {'timeperiod': 16, 'correlation': 0, 'func_name': 'RSI'},
 'MACD_2_18': {'timeperiod': '(2, 18)', 'correlation': 0, 'func_name': 'MACD'},
 'STD_7': {'timeperiod': 7, 'correlation': 0, 'func_name': 'STD'},
 'EFFICIENCY_84': {'timeperiod': 84, 'correlation': 0,'func_name': 'EFFICIENCY'},
 'MACD_HIST_2_163': {'timeperiod': '(2, 163)','correlation': 0,'func_name': 'MACD_HIST'},
 'EVENT_MACROSS_1_83': {'timeperiod': '(1, 83)','correlation': 0,'func_name': 'EVENT_MACROSS'},
 'RET_STD_90': {'timeperiod': 90, 'correlation': 0, 'func_name': 'RET_STD'},
 'RSI_96': {'timeperiod': 96, 'correlation': 0, 'func_name': 'RSI'},
 'TRIX_81': {'timeperiod': 81, 'correlation': 0, 'func_name': 'TRIX'},
 'EI_20': {'timeperiod': 20, 'correlation': 0, 'func_name': 'EI'},
 'RET_STD_23': {'timeperiod': 23, 'correlation': 0, 'func_name': 'RET_STD'},
 'MFI_74': {'timeperiod': 74, 'correlation': 0, 'func_name': 'MFI'},
 'EI_77': {'timeperiod': 77, 'correlation': 0, 'func_name': 'EI'},
 'RET_15': {'timeperiod': 15, 'correlation': 0, 'func_name': 'RET'},
 'STD_96': {'timeperiod': 96, 'correlation': 0, 'func_name': 'STD'},
 'TRIX_4': {'timeperiod': 4, 'correlation': 0, 'func_name': 'TRIX'},
 'EFFICIENCY_16': {'timeperiod': 16,'correlation': 0,'func_name': 'EFFICIENCY'},
}

In [None]:
# Pivot every column into a (timestamp x asset) matrix and forward-fill the
# gaps; the matrices are stored in a dictionary keyed by column name.
dv = {}
for feature in tqdm(df.columns):
    dv[feature] = df.pivot(index="timestamp", columns="Asset_ID", values=feature).fillna(method="ffill")

In [None]:
# The long-format frame is no longer needed once the pivoted matrices exist; release it.
del df

In [None]:
# Compute every configured feature for all assets at once (dict: feature name -> matrix).
dv_features = get_features_vec(dv, all_configs)

In [None]:
# Names of the computed features (the keys of dv_features); a later cell appends the "_residualized" names.
feature_names = list(dv_features)

In [None]:
# Keep the pivoted (forward-filled) target matrix, then release the raw pivoted inputs.
target = dv["Target"]
del dv

In [None]:
# Reclaim the memory of the objects deleted above (gc is already imported in the import cell).
import gc; gc.collect()

In [None]:
# Features that additionally get a "<name>_residualized" variant (the feature
# minus its weighted cross-asset sum). All other feature matrices are dropped
# from memory once they have been written to disk.
averaged_feature = ["DEMA_2_23", "RET60_VWAP", "RSI_16", "RET_15", 
                    "RSI_96", "RET_STD_23", "RET_STD_90", "TRIX_4",
                   "RET240_VWAP", "EFFICIENCY_84", "EFFICIENCY_16", 
                    "MACD_2_18",
                   ]

In [None]:
# Write every feature matrix to "<feature>.parquet" in the working directory.
# Nothing is released here; the following cell drops what is no longer needed.
for feat in dv_features:
    path = "{}.parquet".format(feat)
    dv_features[feat].to_parquet(path)

In [None]:
# Keep only the matrices that still need residualizing; everything else is
# already on disk and can be released to save memory.
rev = list(set(dv_features.keys()) - set(averaged_feature))
for feat in rev:
    print("delete ", feat)
    dv_features.pop(feat, None)  # save memory

In [None]:
# Reclaim the memory of the feature matrices dropped in the previous cell.
gc.collect()

In [None]:
# Build the "<feature>_residualized" variants and write them to disk.
drop_original = False
for feat in tqdm(averaged_feature):
    new_name = feat + "_residualized"
    feature_names.append(new_name)
    # dv_features[feat] is a (T, K) matrix: one feature for all K assets over
    # all T timestamps. Subtract the weighted cross-asset sum (the weights sum
    # to 1) from every asset's column.
    # NOTE(review): .sum() skips NaNs, so timestamps where some assets are
    # missing use a partial weighted sum - confirm this is acceptable.
    dv_features[new_name] = dv_features[feat].subtract(dv_features[feat].multiply(weights, axis="columns").sum(axis="columns"), axis="index")
    path = os.path.join("{}.parquet".format(new_name))
    # Save the residualized matrix. The previous code wrote dv_features[feat]
    # here, so every "*_residualized.parquet" file held the raw feature.
    dv_features[new_name].to_parquet(path)
    if drop_original:
        feature_names.remove(feat)

    # Both matrices are on disk now; free the memory.
    del dv_features[new_name]
    if feat in dv_features:
        del dv_features[feat]

In [None]:
# Score tables filled in by get_score_for_one_symbol_new_cv: one row per asset, one column per fold (keyed by the period start date).
TEST_SCORE_DF = pd.DataFrame(index=asset_df.index, columns=[period1.start, period2.start, period3.start, period4.start, period5.start, period6.start])
TRAIN_SCORE_DF = pd.DataFrame(index=asset_df.index, columns=[period1.start, period2.start, period3.start, period4.start, period5.start, period6.start])

In [None]:
# Feature-importance tables filled in by get_score_for_one_symbol_new_cv: one row per feature, one "<asset>_<period start>" column per trained model.
FEAT_SPLIT_DF = pd.DataFrame()
FEAT_GAIN_DF = pd.DataFrame()

In [None]:
def get_score_for_one_symbol_new_cv(all_df, asset_id, feature_names, dry_run=False, model_params={}, dump_root="ckpt"):
    """Train and score one LightGBM model per market period for a single asset.

    Every period (period1 .. period6) is used once as the test window. The
    training data is everything earlier than 30 days before the window start
    plus everything later than 30 days after the window end, which purges the
    samples adjacent to the test window.

    Side effects:
        * fills TEST_SCORE_DF / TRAIN_SCORE_DF and FEAT_SPLIT_DF / FEAT_GAIN_DF
        * writes ``<cwd>/<dump_root>/<asset_id>/<fold>/lgb.ckpt`` and
          ``used_features.pickle`` per fold, and ``lgb_meta.json`` per asset

    Args:
        all_df: DataFrame with a DatetimeIndex, the feature columns and "Target".
        asset_id: asset label; used for the score tables and output paths.
        feature_names: list of feature columns to train on.
        dry_run: stop after the first fold that was actually trained.
        model_params: parameters forwarded to ``lgb.train`` (not modified here).
        dump_root: output directory, relative to the working directory.

    Returns:
        ``(avg_train_score, avg_test_score, meta)``. The averages are taken
        over the folds that were actually trained (0.0 when none was).
        Previously the sums were always divided by N_SPLIT, so skipped folds
        and ``dry_run`` diluted the reported averages.
    """
    periods = [period1, period2, period3, period4, period5, period6]
    n_folds = len(periods)
    train_score_by_cv = [0] * n_folds
    test_score_by_cv = [0] * n_folds
    train_size_by_cv = [0] * n_folds
    test_size_by_cv = [0] * n_folds
    test_period_by_cv = [0] * n_folds
    test_type_by_cv = [0] * n_folds
    iter_by_cv = [0] * n_folds
    evaluated_folds = []  # indices of the folds that were actually trained

    df_proc = all_df[feature_names]
    target_all = all_df["Target"]
    bulls = [period4, period2]
    bears = [period3]
    asset_dir = os.path.join(os.getcwd(), dump_root, asset_id)

    for i, period in enumerate(periods):
        print(period)
        test_features, test_target = df_proc.loc[period], target_all.loc[period]
        if test_features.size == 0:
            continue

        # Purged training set: drop 30 days on either side of the test window.
        part1 = pd.Timestamp(period.start) - pd.Timedelta("30D")
        part2 = pd.Timestamp(period.stop) + pd.Timedelta("30D")
        dfs = []
        targets = []
        for chunk in (slice(None, part1), slice(part2, None)):
            chunk_features = df_proc.loc[chunk]
            if chunk_features.size > 0:
                dfs.append(chunk_features)
                targets.append(target_all.loc[chunk])
        if not dfs:
            continue
        if len(dfs) == 2:
            train_features = pd.concat(dfs)
            train_target = pd.concat(targets)
        else:
            train_features = dfs[0]
            train_target = targets[0]

        # Sizes are recorded before rows with missing values are removed.
        train_size = len(train_features)
        test_size = len(test_features)

        # Treat +/-inf as missing and drop every row with a missing feature.
        train_features = train_features.replace([np.inf, -np.inf], np.nan)
        test_features = test_features.replace([np.inf, -np.inf], np.nan)
        train_nan_mask = train_features.isnull().any(axis=1)
        test_nan_mask = test_features.isnull().any(axis=1)
        train_features, train_target = train_features.loc[~train_nan_mask], train_target.loc[~train_nan_mask]
        test_features, test_target = test_features.loc[~test_nan_mask], test_target.loc[~test_nan_mask]
        if len(train_features) == 0 or len(test_features) == 0:
            # Nothing usable is left for this fold after cleaning.
            continue

        assert len(train_features) == len(train_target), "{}_{}".format(len(train_features), len(train_target))
        assert len(test_features) == len(test_target), "{}_{}".format(len(test_features), len(test_target))

        train_set = lgb.Dataset(train_features, label=train_target, feature_name=feature_names)
        test_set = lgb.Dataset(test_features, label=test_target, feature_name=feature_names)

        # NOTE(review): the test window is also the validation set passed to
        # lgb.train, so any early stopping configured in model_params is
        # tuned on the data that is scored below.
        booster = lgb.train(train_set=train_set, params=model_params, valid_sets=[test_set], feval=pearson_eval)
        corr_train = validate_one_symble(booster, train_features, train_target)
        corr_test = validate_one_symble(booster, test_features, test_target)
        TEST_SCORE_DF.loc[asset_id, period.start] = float(corr_test)
        TRAIN_SCORE_DF.loc[asset_id, period.start] = float(corr_train)

        # Query the importances once per model instead of once per feature.
        importance_column = "{}_{}".format(asset_id, period.start)
        split_by_feature = dict(zip(feature_names, booster.feature_importance("split")))
        gain_by_feature = dict(zip(feature_names, booster.feature_importance("gain")))
        for name in feature_names:
            FEAT_SPLIT_DF.loc[name, importance_column] = split_by_feature[name]
            FEAT_GAIN_DF.loc[name, importance_column] = gain_by_feature[name]

        train_score_by_cv[i] = float(corr_train)
        test_score_by_cv[i] = float(corr_test)
        train_size_by_cv[i] = int(train_size)
        test_size_by_cv[i] = int(test_size)
        test_period_by_cv[i] = [period.start, period.stop]
        if period in bulls:
            test_type_by_cv[i] = "bull"
        elif period in bears:
            test_type_by_cv[i] = "bear"
        else:
            test_type_by_cv[i] = "neutral"
        iter_by_cv[i] = booster.best_iteration
        evaluated_folds.append(i)

        # Persist the model and the feature list of this fold.
        str_path = os.path.join(asset_dir, str(i))
        os.makedirs(str_path, exist_ok=True)
        model_str = booster.model_to_string()
        with open(os.path.join(str_path, "lgb.ckpt"), "w") as f:
            f.write(model_str)
        with open(os.path.join(str_path, "used_features.pickle"), "wb") as f:
            pickle.dump(feature_names, f)
        if dry_run:
            break

    if evaluated_folds:
        avg_train_score = sum(train_score_by_cv[i] for i in evaluated_folds) / len(evaluated_folds)
        avg_test_score = sum(test_score_by_cv[i] for i in evaluated_folds) / len(evaluated_folds)
    else:
        avg_train_score = 0.0
        avg_test_score = 0.0

    meta = {
            "train_score": train_score_by_cv,
            "test_score": test_score_by_cv,
            "train_size_by_cv": train_size_by_cv,
            "test_size_by_cv": test_size_by_cv,
            "test_type_by_cv": test_type_by_cv,
            "test_period_by_cv": test_period_by_cv,
            "model_params": model_params,
            "avg_train_score": avg_train_score,
            "avg_test_score": avg_test_score,
            "iter_by_cv": iter_by_cv
        }

    # The asset directory does not exist yet when no fold was trained.
    os.makedirs(asset_dir, exist_ok=True)
    meta_path = os.path.join(asset_dir, "lgb_meta.json")
    with open(meta_path, "w") as f:
        f.write(json.dumps(meta, indent=2))
    return avg_train_score, avg_test_score, meta

In [None]:
# Show the final feature list (original names plus the "_residualized" variants).
print(feature_names)

In [None]:
# Number of features every model is trained on.
print(len(feature_names))

In [None]:
# Train and evaluate the models asset by asset.
# For each asset the target column and one column per feature are read back
# from the parquet files written earlier, combined into a single frame and
# handed to get_score_for_one_symbol_new_cv.
train_score_by_symbol = {}
test_score_by_symbol = {}

for asset_id in assets:
    df = {"Target": target[asset_id]}
    for feat in tqdm(feature_names):
        # Only this asset's column is loaded, which keeps memory usage low.
        feature_df = pd.read_parquet(os.path.join("{}.parquet".format(feat)), columns=[asset_id])[asset_id]
        df[feat] = feature_df
        del feature_df
    df = pd.concat(df, axis=1)
    # The pivoted matrices are indexed by timestamp in seconds; convert to datetimes so the period slices apply.
    df.index = pd.to_datetime(df.index, unit='s')
    df.dropna(subset=feature_names+["Target"], inplace=True)
    print(asset_id + "\n***")
    # NOTE(review): dry_run=True stops after the first trained fold, so the score tables and the fold statistics below only cover that fold - confirm this is intended.
    train_score, test_score, meta = get_score_for_one_symbol_new_cv(df, asset_id, feature_names, dry_run=True, model_params=MODEL_PARAMS, dump_root=CKPT)
    train_score_by_symbol[asset_id] = train_score
    test_score_by_symbol[asset_id] = test_score
    
    print(meta)
    print("\n")

In [None]:
# Train correlation per asset (rows) and fold (columns).
TRAIN_SCORE_DF

In [None]:
# Test correlation per asset (rows) and fold (columns).
TEST_SCORE_DF

In [None]:
# Persist the score and feature-importance tables next to the model checkpoints.
# The CKPT directory is created by get_score_for_one_symbol_new_cv when it writes its outputs.
TRAIN_SCORE_DF.to_csv(os.path.join(os.getcwd(), CKPT, "train_score_df.csv"))
TEST_SCORE_DF.to_csv(os.path.join(os.getcwd(), CKPT, "test_score_df.csv"))
FEAT_SPLIT_DF.to_csv(os.path.join(os.getcwd(), CKPT, "FEAT_SPLIT_DF.csv"))
FEAT_GAIN_DF.to_csv(os.path.join(os.getcwd(), CKPT, "FEAT_GAIN_DF.csv"))

In [None]:
# Overall score: per-asset average scores combined with the normalized asset weights.
final_train_score = sum(score * weights[s] for s, score in train_score_by_symbol.items())
final_test_score = sum(score * weights[s] for s, score in test_score_by_symbol.items())
print("avg. model score on train: {:.4f}".format(final_train_score))
print("avg. model score on test: {:.4f}".format(final_test_score))

In [None]:
# Per-asset average train/test scores side by side, ordered by train score.
score_by_symbol = pd.DataFrame({"train_score": train_score_by_symbol, "test_score": test_score_by_symbol}).sort_values(by="train_score")

In [None]:
# Compare against the baseline model: read the baseline test scores and the
# scores written by this run, then take the element-wise difference
# (positive values mean this run scored higher than the baseline).
# NOTE(review): new_meta_root is not used by any visible cell.
new_meta_root = CKPT
old_scores = pd.read_csv(os.path.join("../input/baselineresult/ckpt", "test_score_df.csv"), index_col="Asset_Name")
new_scores = pd.read_csv(os.path.join(CKPT, "test_score_df.csv"), index_col="Asset_Name")
score_dif = new_scores - old_scores
# show_dif = pd.DataFrame(index=new_scores.index, columns=new_scores.columns, dtype=float)
# for i in new_scores.index:
#     for j in new_scores.columns:
#         diff = new_scores.loc[i, j] - old_scores.loc[i, j]
#         if diff > 0:
#             show_dif.loc[i,j] = "{:.2%}(+{:.2%})".format(new_scores.loc[i, j], diff)
#         else:
#             show_dif.loc[i,j] = "{:.2%}({:.2%})".format(new_scores.loc[i, j], diff)
# show_dif

In [None]:
# Weighted score per fold (asset weights applied row-wise, summed over assets)
# for this run and for the baseline, followed by the spread and the mean of
# those per-fold scores.
new_fold_scores = new_scores.multiply(weights, axis="index").sum(axis=0)
old_fold_scores = old_scores.multiply(weights, axis="index").sum(axis=0)

print("new fold result")
print(new_fold_scores)
print("baseline fold result")
print(old_fold_scores)

print("new fold std result")
print(new_fold_scores.std())
print("baseline fold std result")
print(old_fold_scores.std())

print("new fold mean result")
print(new_fold_scores.mean())
print("baseline fold mean result")
print(old_fold_scores.mean())

In [None]:
# One-sample t-test of the score differences (this run minus baseline) against a mean of 0.
# All (asset, fold) differences are pooled; missing entries are dropped first.
# NOTE(review): this import would normally live in the import cell at the top of the notebook.
from scipy.stats import ttest_1samp
score_dif = score_dif.astype(float)
dif = np.ravel(score_dif.values)
dif = dif[~np.isnan(dif)]
ttest_1samp(dif, 0)

In [None]:
# Mean score difference per asset (averaged over folds).
score_dif.mean(axis=1)

In [None]:
# Mean score difference per fold (averaged over assets, unweighted).
score_dif.mean(axis=0)

In [None]:
# Overall mean score difference, ignoring missing entries.
np.nanmean(score_dif)

In [None]:
# Rank the features by gain within every model (column), then show how much each feature's rank varies across models.
FEAT_GAIN_DF.rank().std(axis=1).sort_values()

In [None]:
# Same rank-stability view, based on split counts instead of gain.
FEAT_SPLIT_DF.rank().std(axis=1).sort_values()

In [None]:
# Average gain rank of every feature across models (a larger rank means a larger gain within a model).
FEAT_GAIN_DF.rank().mean(axis=1).sort_values()

In [None]:
# Average split-count rank of every feature across models.
FEAT_SPLIT_DF.rank().mean(axis=1).sort_values()