In [None]:
!pip install --no-index --find-links ../input/talibbinary/talib_binary-0.4.19-cp37-cp37m-manylinux1_x86_64.whl talib-binary

## Very Fast and Accurate feature generator

In [None]:
import numpy as np
import pandas as pd
import pickle
import lightgbm as lgb
import os
from datetime import datetime
import talib
from tqdm.notebook import tqdm
import time
import json
EPS = 1e-18

In [None]:
# Asset metadata keyed by Asset_ID, plus lookup tables in both directions.
ASSET_INFO = pd.read_csv(
    "../input/g-research-crypto-forecasting/asset_details.csv",
    index_col="Asset_ID",
)
ASSET_ID_TO_NAME = ASSET_INFO["Asset_Name"].to_dict()
ASSET_NAME_TO_ID = dict(zip(ASSET_ID_TO_NAME.values(), ASSET_ID_TO_NAME.keys()))
# Number of minutes in a day; used as the rolling-buffer length later in the notebook.
one_day = 24 * 60

In [None]:
# Training bars indexed by timestamp.
df = pd.read_feather("../input/filleddataset/train.feather").set_index(["timestamp"])
# Asset table keyed by asset name, with a name -> Asset_ID mapping and the list of names.
asset_infos = pd.read_csv(
    "../input/g-research-crypto-forecasting/asset_details.csv",
    index_col="Asset_Name",
)
asset_infos_dict = asset_infos["Asset_ID"].to_dict()
assets = asset_infos.index.tolist()
asset_feature_df = dict()

In [None]:
def get_feature_name(func_name, meta):
    """Derive the feature (column) name for an indicator setting.

    Args:
        func_name: Name of the indicator function.
        meta: Setting dict; may contain "timeperiod", either a single value or a
            string that evaluates to a sequence of values (e.g. "(12, 26)").

    Returns:
        ``func_name`` when no time period is configured, otherwise ``func_name``
        followed by the period value(s) joined with underscores.
    """
    if "timeperiod" not in meta:
        return func_name

    period = meta["timeperiod"]
    if not isinstance(period, str):
        return func_name + "_" + str(period)

    # NOTE(review): eval() runs arbitrary code from the config text; this is only
    # acceptable while the config files are trusted.
    parts = [str(item) for item in eval(period)]
    return func_name + "_" + "_".join(parts)

def read_feature_config(symbol):
    """Load the per-asset indicator settings and re-key them by feature name.

    Reads ``../input/bestparameter/best_period_<symbol>.json``, whose top-level keys
    are indicator function names. Each setting dict gets a "func_name" entry added
    and is stored under the name produced by ``get_feature_name``.

    Args:
        symbol: Value substituted into the config file name.

    Returns:
        Dict mapping feature name -> setting dict (including "func_name").
    """
    config_path = os.path.join("../input/bestparameter", "best_period_{}.json".format(symbol))
    with open(config_path) as handle:
        raw_config = json.load(handle)

    keyed_by_feature = {}
    for func_name in raw_config:
        setting = raw_config[func_name]
        setting["func_name"] = func_name
        keyed_by_feature[get_feature_name(func_name, setting)] = setting
    return keyed_by_feature

In [None]:
def RET(df, n):
    """Fractional change of 'Close' over ``n`` rows (same computation as RET_C)."""
    close = df['Close']
    return close.pct_change(periods=n)

def RET_C(df, n):
    """Fractional change of 'Close' over ``n`` rows."""
    close = df['Close']
    return close.pct_change(periods=n)

def RET_H(df, n):
    """Fractional change of 'High' over ``n`` rows."""
    high = df['High']
    return high.pct_change(periods=n)

def RET_L(df, n):
    """Fractional change of 'Low' over ``n`` rows."""
    low = df['Low']
    return low.pct_change(periods=n)

def RET_O(df, n):
    """Fractional change of 'Open' over ``n`` rows."""
    open_ = df['Open']
    return open_.pct_change(periods=n)

def RET_V(df, n):
    """Fractional change of 'Volume' over ``n`` rows."""
    volume = df['Volume']
    return volume.pct_change(periods=n)

def RET_VWAP(df, n):
    """Fractional change of 'VWAP' over ``n`` rows."""
    vwap = df['VWAP']
    return vwap.pct_change(periods=n)

def RET_Cnt(df, n):
    """Fractional change of 'Count' over ``n`` rows."""
    trade_count = df['Count']
    return trade_count.pct_change(periods=n)

def STD(df, n):
    """Rolling standard deviation (window ``n``) of the one-row 'Close' returns."""
    one_step_returns = df['Close'].pct_change(periods=1)
    window = one_step_returns.rolling(window=n)
    return window.std()

def RET_STD(df, n):
    """Product of the ``n``-row 'Close' return (RET) and the rolling return std (STD)."""
    momentum = RET(df, n)
    dispersion = STD(df, n)
    return momentum * dispersion

def RSI(df, n):
    """TA-Lib RSI of 'Close' with time period ``n``."""
    return talib.RSI(df['Close'], timeperiod=n)

def ATR(df, n):
    """TA-Lib ATR computed from 'High', 'Low' and 'Close' with time period ``n``."""
    high, low, close = df['High'], df['Low'], df['Close']
    return talib.ATR(high, low, close, timeperiod=n)

def MFI(df, n):
    """TA-Lib MFI computed from 'High', 'Low', 'Close' and 'Volume' with time period ``n``."""
    high, low, close, volume = df['High'], df['Low'], df['Close'], df['Volume']
    return talib.MFI(high, low, close, volume, timeperiod=n)

def VOL(df, n):
    """Root of the rolling mean (window ``n``) of squared one-row 'Close' returns."""
    squared_returns = df['Close'].pct_change(periods=1).pow(2)
    mean_square = squared_returns.rolling(window=n).mean()
    return np.sqrt(mean_square)

def TRIX(df, n):
    """TA-Lib TRIX of 'Close' with time period ``n``."""
    return talib.TRIX(df['Close'], timeperiod=n)

def MACD(df, fast, slow):
    """First output of TA-Lib MACD on 'Close' for the given fast/slow periods."""
    outputs = talib.MACD(df['Close'], fastperiod=fast, slowperiod=slow)
    return outputs[0]

def MACD_HIST(df, fast, slow):
    """Third output of TA-Lib MACD on 'Close' for the given fast/slow periods."""
    outputs = talib.MACD(df['Close'], fastperiod=fast, slowperiod=slow)
    return outputs[2]

def DEMA(df, n1, n2):
    """Log ratio of two rolling means of 'Close' (windows ``n1`` and ``n2``).

    Despite the name this uses plain rolling means; EPS guards the denominator.
    """
    close = df['Close']
    mean_n1 = close.rolling(window=n1).mean()
    mean_n2 = close.rolling(window=n2).mean()
    return np.log(mean_n1 / (mean_n2 + EPS))

def EFFICIENCY(df, n):
    """Net 'Close' move over ``n`` rows divided by the summed absolute one-row moves.

    EPS is added to the denominator so a flat window does not divide by zero.
    """
    close = df['Close']
    net_move = close - close.shift(n)
    path_length = close.diff(1).abs().rolling(n).sum()
    return net_move / (path_length + EPS)

def EI(df, n):
    """Ratio of (rolling max High - Close) to (Close - rolling min Low), clipped to [-100, 100].

    Both rolling extremes use window ``n``; EPS guards the denominator.
    """
    close = df['Close']
    dist_to_low = close - df['Low'].rolling(n).min()
    dist_to_high = df['High'].rolling(n).max() - close
    ratio = dist_to_high / (dist_to_low + EPS)
    return np.clip(ratio, -100, 100)

def EVEMT_BIGGER_VOLUME(df):
    """1.0 where 'Volume' exceeds the previous row's 'Volume', else 0.0."""
    volume = df['Volume']
    previous_volume = volume.shift(1)
    return volume.gt(previous_volume).astype(float)

def EVENT_MACROSS(df, fast, slow):
    """1.0 where the ``fast``-window rolling mean of 'Close' is above the ``slow``-window one, else 0.0."""
    close = df['Close']
    fast_mean = close.rolling(window=fast).mean()
    slow_mean = close.rolling(window=slow).mean()
    is_above = fast_mean > slow_mean
    return is_above.astype(float)

def upper_shadow(df):
    """'High' minus the larger of 'Close' and 'Open'."""
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top

def lower_shadow(df):
    """The smaller of 'Close' and 'Open' minus 'Low'."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

In [None]:
def get_features(df, row=False, config=None):
    """Build the offline (batch) feature set from OHLCV-style columns.

    Args:
        df: Object indexable by column name that provides 'Count', 'Open', 'High',
            'Low', 'Close', 'Volume', 'VWAP' and 'timestamp' (parsed as seconds
            since the epoch).
        row: When False (default) ``Mean`` is averaged across columns (``axis=1``)
            and the hour is read through ``.dt``. When True ``.mean()`` is called
            without an axis and the hour is read directly from the parsed
            timestamp (presumably for a single-row input -- confirm with callers).
        config: Optional dict mapping feature name -> setting dict with
            "func_name" and an optional "timeperiod" (a single value, or a string
            that evaluates to a tuple of arguments). The caller's dict is not
            modified.

    Returns:
        A copy of the seven input columns extended with the engineered features,
        with 'Close', 'High', 'Low', 'Open', 'VWAP', 'Volume' and 'Mean' removed
        ('Count' is kept).
    """
    base_cols = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
    df_feat = df[base_cols].copy()
    ret_func_by_col = {
        "Count": "RET_Cnt",
        "Open": "RET_O",
        "Close": "RET_C",
        "Volume": "RET_V",
        "VWAP": "RET_VWAP",
        "High": "RET_H",
        "Low": "RET_L",
    }
    # Work on a shallow copy so the default 1- and 15-row returns added below do
    # not leak into the dict owned by the caller.
    config = dict(config) if config is not None else {}
    for col in base_cols:
        config["RET1_{}".format(col)] = {"func_name": ret_func_by_col[col], "timeperiod": 1}
        config["RET15_{}".format(col)] = {"func_name": ret_func_by_col[col], "timeperiod": 15}

    df_feat['Upper_Shadow'] = upper_shadow(df_feat)
    df_feat['Lower_Shadow'] = lower_shadow(df_feat)
    df_feat["MoM1"] = df_feat["Close"] - df_feat["Close"].shift(1)
    df_feat["MoM2"] = df_feat["High"] - df_feat["Close"].shift(1)
    df_feat["MoM3"] = df_feat["High"] - df_feat["Low"].shift(1)

    df_feat["Close-Open"] = df_feat["Close"] - df_feat["Open"]
    df_feat["High-Low"] = df_feat["High"] - df_feat["Low"]
    if row:
        df_feat['Mean'] = df_feat[['Open', 'High', 'Low', 'Close']].mean()
    else:
        df_feat['Mean'] = df_feat[['Open', 'High', 'Low', 'Close']].mean(axis=1)

    df_feat['High_div_Mean'] = df_feat['High'] / df_feat['Mean']
    df_feat['Low_div_Mean'] = df_feat['Low'] / df_feat['Mean']
    df_feat['Volume_div_Count'] = df_feat['Volume'] / (df_feat['Count'] + 1)

    ## possible seasonality, datetime features (unlikely to be meaningful, given very short time-frames)
    ### to do: add cyclical features for seasonality
    # `unit="s"` fully specifies the parsing, so no format inference is requested.
    times = pd.to_datetime(df["timestamp"], unit="s")
    if row:
        df_feat["hour"] = times.hour
    else:
        df_feat["hour"] = times.dt.hour

    # Config-driven features: each setting names a module-level function taking
    # (df_feat, *args).
    for feature_name, setting in config.items():
        # NOTE(review): eval() on config-provided text executes arbitrary code;
        # only acceptable while the config files are trusted.
        func = eval(setting["func_name"])
        timeperiod = setting.get("timeperiod")
        if "timeperiod" not in setting:
            # no argument
            feature = func(df_feat)
        elif isinstance(timeperiod, str):
            feature = func(df_feat, *eval(timeperiod))
        else:
            feature = func(df_feat, timeperiod)
        df_feat[feature_name] = feature

    # Raw price/volume columns and the helper 'Mean' are not returned as features.
    for raw_col in ("Close", "High", "Low", "Open", "VWAP", "Volume", "Mean"):
        df_feat.pop(raw_col)
    return df_feat

In [None]:
def RET_stream(df, n):
    """Fractional change of the latest "C" value versus ``n`` steps earlier (same as RET_C_stream)."""
    closes = df["C"]
    base = closes[-(1+n)]
    return (closes[-1] - base) / (base + EPS)

def RET_H_stream(df, n):
    """Fractional change of the latest "H" value versus ``n`` steps earlier."""
    highs = df["H"]
    base = highs[-(1+n)]
    return (highs[-1] - base) / (base + EPS)

def RET_C_stream(df, n):
    """Fractional change of the latest "C" value versus ``n`` steps earlier."""
    closes = df["C"]
    base = closes[-(1+n)]
    return (closes[-1] - base) / (base + EPS)

def RET_L_stream(df, n):
    """Fractional change of the latest "L" value versus ``n`` steps earlier."""
    lows = df["L"]
    base = lows[-(1+n)]
    return (lows[-1] - base) / (base + EPS)

def RET_O_stream(df, n):
    """Fractional change of the latest "O" value versus ``n`` steps earlier."""
    opens = df["O"]
    base = opens[-(1+n)]
    return (opens[-1] - base) / (base + EPS)

def RET_V_stream(df, n):
    """Fractional change of the latest "V" value versus ``n`` steps earlier."""
    volumes = df["V"]
    base = volumes[-(1+n)]
    return (volumes[-1] - base) / (base + EPS)

def RET_VWAP_stream(df, n):
    """Fractional change of the latest "VWAP" value versus ``n`` steps earlier."""
    vwaps = df["VWAP"]
    base = vwaps[-(1+n)]
    return (vwaps[-1] - base) / (base + EPS)

def RET_Cnt_stream(df, n):
    """Fractional change of the latest "Cnt" value versus ``n`` steps earlier."""
    counts = df["Cnt"]
    base = counts[-(1+n)]
    return (counts[-1] - base) / (base + EPS)
    
def STD_stream(df, n):
    """Sample standard deviation (ddof=1, NaNs ignored) of the last ``n`` one-step "C" returns."""
    closes = df["C"]
    previous = closes[:-1]
    step_returns = (closes[1:] - previous) / previous
    # pandas form of the same value, kept for reference:
    #     pd.Series(df["C"]).pct_change(1).rolling(n).std().iloc[-1]
    return np.nanstd(step_returns[-n:], ddof=1)

def RET_STD_stream(df, n):
    """Product of RET_stream and STD_stream for the same ``n``."""
    momentum = RET_stream(df, n)
    dispersion = STD_stream(df, n)
    return momentum * dispersion

def RSI_stream(df, n):
    """Latest value of TA-Lib RSI over the "C" buffer with time period ``n``."""
    series = talib.RSI(df['C'], timeperiod=n)
    return series[-1]

def ATR_stream(df, n):
    """Latest value of TA-Lib ATR over the "H"/"L"/"C" buffers with time period ``n``."""
    series = talib.ATR(df["H"], df["L"], df["C"], timeperiod=n)
    return series[-1]

def MFI_stream(df, n):
    """Latest value of TA-Lib MFI over the "H"/"L"/"C"/"V" buffers with time period ``n``."""
    series = talib.MFI(df['H'], df['L'], df['C'], df['V'], timeperiod=n)
    return series[-1]

def VOL_stream(df, n):
    """Root mean square (NaNs ignored) of the last ``n`` one-step "C" returns."""
    closes = df["C"]
    previous = closes[:-1]
    step_returns = (closes[1:] - previous) / (previous + EPS)
    recent = step_returns[-n:]
    return np.sqrt(np.nanmean(recent ** 2))

def TRIX_stream(df, n):
    """Latest value of TA-Lib TRIX over the "C" buffer with time period ``n``."""
    series = talib.TRIX(df['C'], timeperiod=n)
    return series[-1]

def MACD_stream(df, fast, slow):
    """Latest value of the first TA-Lib MACD output over the "C" buffer."""
    outputs = talib.MACD(df["C"], fastperiod=fast, slowperiod=slow)
    return outputs[0][-1]

def MACD_HIST_stream(df, fast, slow):
    """Latest value of the third TA-Lib MACD output over the "C" buffer."""
    outputs = talib.MACD(df["C"], fastperiod=fast, slowperiod=slow)
    return outputs[2][-1]

def DEMA_stream(df, n1, n2):
    """Log ratio of the means of the last ``n1`` and last ``n2`` "C" values (NaNs ignored).

    EPS is added to the denominator, matching the offline ``DEMA``: a zero
    long-window mean yields ``log(0) = -inf`` instead of the NaN produced by 0/0.

    Args:
        df: Mapping with a 1-D "C" array, most recent value last.
        n1: Length of the numerator window.
        n2: Length of the denominator window.

    Returns:
        The log ratio as a float.
    """
    short_mean = np.nanmean(df['C'][-n1:])
    long_mean = np.nanmean(df['C'][-n2:])
    return np.log(short_mean / (long_mean + EPS))

def EFFICIENCY_stream(df, n):
    """Net "C" move over the last ``n`` steps divided by the summed absolute one-step moves.

    EPS is added to the denominator, matching the offline ``EFFICIENCY``: a flat
    window returns 0.0 instead of the NaN produced by 0/0.

    Args:
        df: Mapping with a 1-D "C" array, most recent value last.
        n: Number of steps in the window.

    Returns:
        The efficiency ratio as a float.
    """
    closes = df["C"]
    speed = closes[-1] - closes[-(1+n)]
    volatility = np.nansum(np.abs(closes[1:] - closes[:-1])[-n:])
    return speed / (volatility + EPS)

def EI_stream(df, n):
    """(max of last ``n`` "H" - latest "C") / (latest "C" - min of last ``n`` "L"), clipped to [-100, 100]."""
    latest_close = df["C"][-1]
    dist_to_low = latest_close - df["L"][-n:].min()
    dist_to_high = df["H"][-n:].max() - latest_close
    ratio = dist_to_high / (dist_to_low + EPS)
    return np.clip(ratio, -100, 100)

def EVEMT_BIGGER_VOLUME_stream(df):
    """1.0 if the latest "V" value exceeds the one before it, else 0.0."""
    volumes = df["V"]
    latest, previous = volumes[-1], volumes[-2]
    return 1.0 if latest > previous else 0.0

def EVENT_MACROSS_stream(df, fast, slow):
    """1.0 if the mean of the last ``fast`` "C" values is above that of the last ``slow``, else 0.0."""
    closes = df["C"]
    fast_mean = np.nanmean(closes[-fast:])
    slow_mean = np.nanmean(closes[-slow:])
    return 1.0 if fast_mean > slow_mean else 0.0

def upper_shadow_stream(df) -> float:
    """Latest "H" minus the larger of the latest "C" and "O"."""
    body_top = np.maximum(df['C'][-1], df['O'][-1])
    return float(df['H'][-1] - body_top)

def lower_shadow_stream(df) -> float:
    """The smaller of the latest "C" and "O" minus the latest "L"."""
    body_bottom = np.minimum(df['C'][-1], df['O'][-1])
    return float(body_bottom - df['L'][-1])

In [None]:
# Vectorized versions.
# Every function takes feature matrices of shape (T, K): T is the history length,
# K is len(assets).
def RET_stream_vec(df, n):
    """Per-asset fractional change of the latest "C" row versus ``n`` rows earlier.

    df: mapping of (T, K) arrays; returns an array of shape (K,).
    """
    closes = df["C"]
    base = closes[-(1+n)]
    return (closes[-1] - base) / (base + EPS)

def RET_H_stream_vec(df, n):
    """Per-asset fractional change of the latest "H" row versus ``n`` rows earlier."""
    highs = df["H"]
    base = highs[-(1+n)]
    return (highs[-1] - base) / (base + EPS)

def RET_C_stream_vec(df, n):
    """Per-asset fractional change of the latest "C" row versus ``n`` rows earlier."""
    closes = df["C"]
    base = closes[-(1+n)]
    return (closes[-1] - base) / (base + EPS)

def RET_L_stream_vec(df, n):
    """Per-asset fractional change of the latest "L" row versus ``n`` rows earlier."""
    lows = df["L"]
    base = lows[-(1+n)]
    return (lows[-1] - base) / (base + EPS)

def RET_O_stream_vec(df, n):
    """Per-asset fractional change of the latest "O" row versus ``n`` rows earlier."""
    opens = df["O"]
    base = opens[-(1+n)]
    return (opens[-1] - base) / (base + EPS)

def RET_V_stream_vec(df, n):
    """Per-asset fractional change of the latest "V" row versus ``n`` rows earlier."""
    volumes = df["V"]
    base = volumes[-(1+n)]
    return (volumes[-1] - base) / (base + EPS)

def RET_VWAP_stream_vec(df, n):
    """Per-asset fractional change of the latest "VWAP" row versus ``n`` rows earlier."""
    vwaps = df["VWAP"]
    base = vwaps[-(1+n)]
    return (vwaps[-1] - base) / (base + EPS)

def RET_Cnt_stream_vec(df, n):
    """Per-asset fractional change of the latest "Cnt" row versus ``n`` rows earlier."""
    counts = df["Cnt"]
    base = counts[-(1+n)]
    return (counts[-1] - base) / (base + EPS)
    
def STD_stream_vec(df, n):
    """Per-asset sample std (ddof=1, NaNs ignored) of the last ``n`` one-step "C" returns."""
    closes = df["C"]
    previous = closes[:-1]
    step_returns = (closes[1:] - previous) / previous  # one row per step, one column per asset
    recent = step_returns[-n:]
    return np.nanstd(recent, ddof=1, axis=0)

def RET_STD_stream_vec(df, n):
    """Element-wise product of RET_stream_vec and STD_stream_vec for the same ``n``."""
    momentum = RET_stream_vec(df, n)
    dispersion = STD_stream_vec(df, n)
    return momentum * dispersion

def VOL_stream_vec(df, n):
    """Per-asset root mean square (NaNs ignored) of the last ``n`` one-step "C" returns."""
    closes = df["C"]
    previous = closes[:-1]
    step_returns = (closes[1:] - previous) / (previous + EPS)
    recent = step_returns[-n:]
    return np.sqrt(np.nanmean(recent ** 2, axis=0))

def DEMA_stream_vec(df, n1, n2):
    """Per-asset log ratio of the means of the last ``n1`` and last ``n2`` "C" rows (NaNs ignored).

    EPS is added to the denominator, matching the offline ``DEMA``: a zero
    long-window mean yields ``log(0) = -inf`` instead of the NaN produced by 0/0.

    Args:
        df: Mapping with a "C" array whose first axis is time, most recent row last.
        n1: Length of the numerator window.
        n2: Length of the denominator window.

    Returns:
        Array with one log ratio per column of ``df["C"]``.
    """
    short_mean = np.nanmean(df['C'][-n1:], axis=0)
    long_mean = np.nanmean(df['C'][-n2:], axis=0)
    return np.log(short_mean / (long_mean + EPS))

def EFFICIENCY_stream_vec(df, n):
    """Per-asset net "C" move over the last ``n`` rows divided by the summed absolute one-row moves.

    EPS is added to the denominator, matching the offline ``EFFICIENCY``: a flat
    window returns 0.0 instead of the NaN produced by 0/0.

    Args:
        df: Mapping with a "C" array whose first axis is time, most recent row last.
        n: Number of steps in the window.

    Returns:
        Array with one efficiency ratio per column of ``df["C"]``.
    """
    closes = df["C"]
    speed = closes[-1] - closes[-(1+n)]
    volatility = np.nansum(np.abs(closes[1:] - closes[:-1])[-n:], axis=0)
    return speed / (volatility + EPS)

def EI_stream_vec(df, n):
    """Per-asset (max of last ``n`` "H" - latest "C") / (latest "C" - min of last ``n`` "L"), clipped to [-100, 100]."""
    latest_close = df["C"][-1]
    dist_to_low = latest_close - df["L"][-n:].min(axis=0)
    dist_to_high = df["H"][-n:].max(axis=0) - latest_close
    ratio = dist_to_high / (dist_to_low + EPS)
    return np.clip(ratio, -100, 100)

def EVEMT_BIGGER_VOLUME_stream_vec(df):
    """Per-asset 1.0 where the latest "V" row exceeds the row before it, else 0.0."""
    volumes = df["V"]
    grew = volumes[-1] > volumes[-2]
    return grew.astype(float)

def EVENT_MACROSS_stream_vec(df, fast, slow):
    """Per-asset 1.0 where the mean of the last ``fast`` "C" rows is above that of the last ``slow``, else 0.0."""
    closes = df["C"]
    fast_mean = np.nanmean(closes[-fast:], axis=0)
    slow_mean = np.nanmean(closes[-slow:], axis=0)
    is_above = fast_mean > slow_mean
    return is_above.astype(float)

def upper_shadow_stream_vec(df) -> np.ndarray:
    """Per-asset latest "H" minus the larger of the latest "C" and "O".

    Args:
        df: Mapping of arrays whose first axis is time, most recent row last.

    Returns:
        Float array with one value per column (not a scalar, hence the
        ``np.ndarray`` annotation).
    """
    body_top = np.maximum(df['C'][-1], df['O'][-1])
    return (df['H'][-1] - body_top).astype(float)

def lower_shadow_stream_vec(df) -> np.ndarray:
    """Per-asset smaller of the latest "C" and "O" minus the latest "L".

    Args:
        df: Mapping of arrays whose first axis is time, most recent row last.

    Returns:
        Float array with one value per column (not a scalar, hence the
        ``np.ndarray`` annotation).
    """
    body_bottom = np.minimum(df['C'][-1], df['O'][-1])
    return (body_bottom - df['L'][-1]).astype(float)

In [None]:
def get_features_stream(array_dict, config=None) -> dict:
    """Compute the features of the most recent bar for every asset.

    Args:
        array_dict: Mapping with keys "O", "H", "L", "C", "V", "VWAP", "Cnt" and "T".
            Each value has shape (T, K) where K = len(assets); the most recent bar
            is the last row. "T" holds timestamps in seconds since the epoch.
        config: Optional dict mapping feature name -> setting dict with "func_name"
            and an optional "timeperiod" (a single value, or a string that evaluates
            to a tuple of arguments). The caller's dict is not modified.

    Returns:
        Dict mapping feature name -> array of shape (K,). The names listed in
        ``removed_features`` are computed as intermediates and then dropped.
    """
    removed_features = ["Mean", 'day', 'High_div_Low', 'dayofweek', 'Close_div_Open', "timestamp"]
    ret_func_by_col = {
        "Count": "RET_Cnt",
        "Open": "RET_O",
        "Close": "RET_C",
        "Volume": "RET_V",
        "VWAP": "RET_VWAP",
        "High": "RET_H",
        "Low": "RET_L",
    }
    # Work on a shallow copy so the default 1- and 15-step returns added below do
    # not leak into the dict owned by the caller.
    config = dict(config) if config is not None else {}
    for col in ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']:
        config["RET1_{}".format(col)] = {"func_name": ret_func_by_col[col], "timeperiod": 1}
        config["RET15_{}".format(col)] = {"func_name": ret_func_by_col[col], "timeperiod": 15}

    features = {}
    features['Count'] = array_dict["Cnt"][-1]
    features['Upper_Shadow'] = upper_shadow_stream_vec(array_dict)
    features['Lower_Shadow'] = lower_shadow_stream_vec(array_dict)
    features["MoM1"] = array_dict["C"][-1] - array_dict["C"][-2]
    features["MoM2"] = array_dict["H"][-1] - array_dict["C"][-2]
    features["MoM3"] = array_dict["H"][-1] - array_dict["L"][-2]

    features["Close_div_Open"] = array_dict["C"][-1] / (array_dict["O"][-1] + EPS)
    features["Close-Open"] = array_dict["C"][-1] - array_dict["O"][-1]
    features["High-Low"] = array_dict["H"][-1] - array_dict["L"][-1]
    features["High_div_Low"] = array_dict["H"][-1] / (array_dict["L"][-1] + EPS)
    features["Mean"] = (array_dict["H"][-1] + array_dict["L"][-1] + array_dict["C"][-1] + array_dict["O"][-1]) / 4
    features["High_div_Mean"] = array_dict["H"][-1] / (features["Mean"] + EPS)
    features["Low_div_Mean"] = array_dict["L"][-1] / (features["Mean"] + EPS)
    features["Volume_div_Count"] = array_dict["V"][-1] / (array_dict["Cnt"][-1] + 1)

    # Only the latest timestamp (first asset column) is needed, so convert that
    # single value instead of the whole history column.
    last_time = pd.Timestamp(int(array_dict["T"][-1, 0]), unit="s")
    features["hour"] = float(last_time.hour)
    features["dayofweek"] = float(last_time.dayofweek)
    features["day"] = float(last_time.day)

    n_assets = array_dict["C"].shape[1]
    for feature_name, setting in config.items():
        timeperiod = setting.get("timeperiod")
        if "timeperiod" not in setting:
            # no argument
            args = ()
        elif isinstance(timeperiod, str):
            # NOTE(review): eval() on config-provided text executes arbitrary code;
            # only acceptable while the config files are trusted.
            args = eval(timeperiod)
        else:
            args = (timeperiod, )

        vec_func_name = setting["func_name"] + "_stream_vec"
        if vec_func_name in globals():
            features[feature_name] = globals()[vec_func_name](array_dict, *args)
        else:
            # No vectorized implementation: fall back to the per-asset function
            # and fill the result one column at a time.
            func = eval(setting["func_name"] + "_stream")
            per_asset = np.zeros(n_assets)
            for i in range(n_assets):
                sub_array_dict = {k: v[:, i] for k, v in array_dict.items()}
                per_asset[i] = func(sub_array_dict, *args)
            features[feature_name] = per_asset

    for feature in removed_features:
        features.pop(feature, None)

    # Normalize every feature to a flat vector of length n_assets; scalars (e.g.
    # "hour") are broadcast to all assets.
    for name in list(features):
        value = features[name]
        if np.isscalar(value):
            features[name] = np.broadcast_to(value, n_assets)
        else:
            features[name] = np.ravel(value)
    return features

In [None]:
class BarData:
    """One bar of market data for all tracked assets at a single timestamp.

    Each field is stored as given; the class performs no reordering and no length
    validation, so callers must pass the arrays already aligned with the asset
    order they use elsewhere.

    Args:
        symbol_names: Asset identifiers. Accepted for call-site compatibility but
            not stored or used.
        O, H, L, C: Open, high, low and close values, one entry per asset.
        V: Volume per asset.
        VWAP: Volume-weighted average price per asset.
        Cnt: Trade count per asset.
        T: Timestamp per asset.
    """

    def __init__(self, symbol_names,
                 O: np.ndarray,
                 H: np.ndarray,
                 L: np.ndarray,
                 C: np.ndarray,
                 V: np.ndarray,
                 VWAP: np.ndarray,
                 Cnt: np.ndarray,
                 T: np.ndarray):
        # The sorted copy of symbol_names that used to be built here was never
        # used; dropping it avoids a TypeError on mixed-type names and removes the
        # false impression that the arrays get reordered.
        self.O = O
        self.H = H
        self.L = L
        self.C = C
        self.V = V
        self.VWAP = VWAP
        self.Cnt = Cnt
        self.T = T

class ArrayManager(object):
    """Fixed-length rolling buffers of bar data for a set of assets.

    Every field is kept as a (size, len(asset_names)) array with the most recent
    bar in the last row. ``update_bar`` shifts the buffers by one row, writes the
    new bar (forward-filling NaNs from the previous bar) and optionally recomputes
    ``feature_dict`` through ``get_features_stream``.
    """

    # Buffer attribute names; each is also the attribute read from the incoming bar.
    _FIELDS = ("O", "H", "L", "C", "V", "VWAP", "Cnt", "T")

    def __init__(self, size, asset_names, feature_config=None):
        """Allocate zero-filled buffers.

        Args:
            size: Number of bars kept per asset.
            asset_names: Sequence of asset identifiers; its length is the number of
                buffer columns.
            feature_config: Optional config dict forwarded to ``get_features_stream``.
        """
        self.count: int = 0
        self.ndim = len(asset_names)
        self.asset_names = asset_names
        self.size: int = size
        # Becomes True once at least `size` bars have been pushed.
        self.inited: bool = False
        self.O: np.ndarray = np.zeros((size, self.ndim))
        self.H: np.ndarray = np.zeros((size, self.ndim))
        self.L: np.ndarray = np.zeros((size, self.ndim))
        self.C: np.ndarray = np.zeros((size, self.ndim))
        self.V: np.ndarray = np.zeros((size, self.ndim))
        self.Cnt: np.ndarray = np.zeros((size, self.ndim))
        self.VWAP: np.ndarray = np.zeros((size, self.ndim))
        # int64 so epoch-second timestamps beyond 2**31 - 1 are stored without wrapping.
        self.T: np.ndarray = np.zeros((size, self.ndim), dtype=np.int64)

        self.feature_config = feature_config
        # Defined up front so it can be read before the first update_bar call.
        self.feature_dict = {}  # Dict[str, vector of size K]

    def update_bar(self, bar: "BarData", calculate_feature=True) -> None:
        """Push a new bar into the buffers.

        Args:
            bar: Object exposing O, H, L, C, V, VWAP, Cnt and T arrays with one
                entry per asset.
            calculate_feature: When True, recompute ``feature_dict`` from the
                updated buffers; when False, ``feature_dict`` is reset to ``{}``.
        """
        self.count += 1
        if not self.inited and self.count >= self.size:
            self.inited = True

        for field in self._FIELDS:
            buffer = getattr(self, field)
            # Shift history up by one row; the last row still holds the previous
            # bar and serves as the forward-fill source for NaNs below.
            buffer[:-1] = buffer[1:]
            incoming = getattr(bar, field)
            buffer[-1] = np.where(np.isnan(incoming), buffer[-1], incoming)

        self.feature_dict = {}  # Dict[str, vector of size K]
        if calculate_feature:
            bar_data = {field: getattr(self, field) for field in self._FIELDS}
            self.feature_dict = get_features_stream(bar_data, self.feature_config)


### Test base features

In [None]:
# test base features
# Build the list of asset names so that position k holds the name of asset id k.
asset_names = [0] * len(ASSET_ID_TO_NAME)
for k, v in ASSET_ID_TO_NAME.items():
    asset_names[k] = v
# NOTE(review): a naive datetime is converted with the machine's local timezone -- confirm
# this lines up with the timestamps stored in `df`.
start_time = int(datetime(2021, 6, 1).timestamp())
am = ArrayManager(one_day, asset_names)
# Push four days of bars, 60 seconds apart, without computing features (second argument False).
for _ in tqdm(range(one_day * 4)):
    # Rows for this timestamp, ordered to match asset_names.
    data = df.loc[start_time].set_index("Asset_ID").loc[asset_names]
    bar = BarData(asset_names, 
        O=data.Open.values, 
        C=data.Close.values, 
        L=data.Low.values, 
        H=data.High.values, 
        VWAP=data.VWAP.values, 
        V=data.Volume.values,
        T=np.array([start_time] * len(asset_names)).astype(int),
        Cnt=data.Count.values)
    am.update_bar(bar, False)
    start_time += 60
    

# Number of decimals compared by assert_almost_equal below.
prevision = 7
prev_data = None
# Walk the buffer rows and compare each against the source rows for that timestamp,
# after filling missing values from the previous (already filled) row.
for i in range(am.T.shape[0]):
    t = am.T[i][0]
    data = df.loc[t].set_index("Asset_ID").loc[asset_names]
    if prev_data is not None:
        data = data.fillna(prev_data)
        # Only compare when the previous row had no NaN left to propagate.
        if not np.any(np.isnan(prev_data)):
            np.testing.assert_almost_equal(am.O[i], data.Open.values, decimal=prevision)
            np.testing.assert_almost_equal(am.C[i], data.Close.values, decimal=prevision)
            np.testing.assert_almost_equal(am.H[i], data.High.values, decimal=prevision)
            np.testing.assert_almost_equal(am.V[i], data.Volume.values, decimal=prevision)
            np.testing.assert_almost_equal(am.VWAP[i], data.VWAP.values, decimal=prevision)
            np.testing.assert_almost_equal(am.L[i], data.Low.values, decimal=prevision)
            np.testing.assert_almost_equal(am.Cnt[i], data.Count.values, decimal=prevision)
    prev_data = data.copy()

### Test whether we can replicate the features

In [None]:
# we use only one asset
ASSET_NAME = "Binance Coin"
config = read_feature_config(ASSET_NAME)

# get offline features
# Rows of the selected asset, sorted by timestamp, forward-filled, with remaining NaN rows dropped.
sub_df = df[df.Asset_ID == ASSET_NAME]
sub_df = sub_df.sort_index()
sub_df["timestamp"] = sub_df.index
sub_df = sub_df.fillna(method="ffill").dropna()
feat_df = get_features(sub_df, config=config)

# get online features
# Stream four days of bars through an ArrayManager and collect one feature dict per bar.
feat_df2 = []
asset_names = [ASSET_NAME]
start_time = int(datetime(2020, 8, 1).timestamp())
one_day = 60 * 24
am = ArrayManager(one_day, asset_names, config)
count = 0
avg_time = 0
for _ in tqdm(range(one_day * 4)):
    data = df.loc[start_time].set_index("Asset_ID").loc[asset_names]
    # Time only the bar construction and the buffer/feature update.
    start = time.time()
    bar = BarData(asset_names, 
        O=data.Open.values, 
        C=data.Close.values, 
        L=data.Low.values, 
        H=data.High.values, 
        VWAP=data.VWAP.values, 
        V=data.Volume.values,
        T=np.array([start_time] * len(asset_names)).astype(int),
        Cnt=data.Count.values)
    am.update_bar(bar)
    end = time.time()
    feature = am.feature_dict
    feature["timestamp"] = np.broadcast_to(start_time, len(asset_names))
    # Single asset: keep element 0 of every per-asset vector.
    feature = {k: v[0] for k, v in feature.items()}
    feat_df2.append(feature)
    start_time += 60
    count += 1
    avg_time += (end - start)
    
# test
feat_df2 = pd.DataFrame(feat_df2).set_index("timestamp")
feat_df2 = feat_df2.sort_index()
feat_df2 = feat_df2.iloc[-one_day:]  # we only test the latest one day
# Align the offline features to the streamed timestamps.
feat_df = feat_df.reindex(index=feat_df2.index)

# 1) check column diff
print("more feature than needed")
print(set(feat_df2.columns) - set(feat_df.columns))
print("insufficient features")
print(set(feat_df.columns) - set(feat_df2.columns))

# 2) check value diff
# `feature` is the dict left over from the final loop iteration; its keys are the streamed feature names.
for name in feature.keys():
    if name in feat_df.columns:
        try:
            np.testing.assert_almost_equal(feat_df[name].values, feat_df2[name].values)
        except AssertionError as e:
            # Report the mismatch and continue with the remaining features.
            print("****** error for name {}".format(name))
            print(str(e))
    else:
        print("skip ", name)
        
# Mean seconds per streamed bar.
print("time cost = ", str(avg_time / count))

## Start from here: submission

In [None]:
# Merge the per-asset indicator settings into one config and record, for every asset,
# the feature names its models were trained on.
all_configs = {}
features_by_asset = {}
for asset_name in assets:
    config = read_feature_config(asset_name)
    path = os.path.join("../input/febaselinev2result/ckpt/", asset_name, "4", "used_features.pickle")
    # NOTE: pickle.load executes code embedded in the file; only load trusted checkpoints.
    # The context manager closes the file handle, which the bare open() call leaked.
    with open(path, "rb") as feat_file:
        feat_names = pickle.load(feat_file)
    assert feat_names == sorted(feat_names)
    features_by_asset[asset_name] = feat_names
    # Later assets overwrite earlier entries that share a feature name.
    all_configs.update(config)

In [None]:
# Supplemental training bars, indexed by timestamp.
supplement_train = (
    pd.read_csv("../input/g-research-crypto-forecasting/supplemental_train.csv")
    .set_index("timestamp")
)

In [None]:
# Asset ids 0..13; the position in this list is the buffer column used for that asset.
asset_names = [asset_id for asset_id in range(14)]

In [None]:
# initialize the ArrayManager using supplemental data
# Replay the last `init_days` days of supplemental bars (60 s apart) so the rolling
# buffers are filled before inference starts.
end_time = supplement_train.index.unique()[-1]
am = ArrayManager(one_day, asset_names, all_configs)
prev_data = None
init_days = 7
start_time = end_time - init_days * 24 * 60 * 60
for t in tqdm(range(start_time, end_time+60, 60)):
    if t in supplement_train.index:
        # Force the rows into asset_names order (assets missing at this timestamp become NaN rows).
        data = supplement_train.loc[t].set_index("Asset_ID").reindex(index=asset_names).loc[asset_names]
    elif prev_data is not None:
        # No rows for this timestamp: replay the previous bar.
        data = prev_data
    else:
        print("no data at {}".format(t))
        continue
    bar = BarData(asset_names, 
            O=data.Open.values, 
            C=data.Close.values, 
            L=data.Low.values, 
            H=data.High.values, 
            VWAP=data.VWAP.values, 
            V=data.Volume.values,
            T=np.array([t] * len(asset_names)).astype(int),
            Cnt=data.Count.values)
    # Features are not needed during warm-up, so skip computing them.
    am.update_bar(bar, False)
    prev_data = data.copy()

In [None]:
# The warm-up must have pushed at least `size` bars into the ArrayManager.
assert am.inited

In [None]:
# Load the pretrained LightGBM boosters (up to five folds per asset) and give the
# folds found for an asset equal weights that sum to one.
model_ckpt = "../input/febaselinev2result/ckpt"
model_by_asset_id = {}
weight_by_asset_id = {}
for asset_id, asset_name in ASSET_ID_TO_NAME.items():
    boosters = model_by_asset_id.setdefault(asset_id, [])
    raw_weights = weight_by_asset_id.setdefault(asset_id, [])
    for fold in range(5):
        path = os.path.join(model_ckpt, asset_name, str(fold), "lgb.ckpt")
        if not os.path.exists(path):
            # Folds without a checkpoint are skipped.
            continue
        boosters.append(lgb.Booster(model_file=path))
        raw_weights.append(1)
    total_weight = sum(raw_weights)
    weight_by_asset_id[asset_id] = [w / total_weight for w in raw_weights]

In [None]:
# start submitting the data
# `env` supplies the test iterator (`iter_test`) and the `predict` call used by the submission loop.
import gresearch_crypto
env = gresearch_crypto.make_env()

In [None]:
# Submission loop: for every test batch, push the new bar into the ArrayManager,
# recompute the features and write the fold-weighted ensemble prediction per row.
# The time* variables accumulate wall-clock seconds per stage.
avg_inference_time = 0
time1 = 0
time2 = 0
time3 = 0
count = 0  # number of processed batches
for df_test, df_pred in env.iter_test():
    # Stage 1: index the batch by asset and remember which row_id belongs to which asset.
    start_time = time.time()
    df_test = df_test.set_index("Asset_ID")
    row_ids = dict(zip(df_test.row_id, df_test.index))
    t = df_test.timestamp.iloc[0]
    time1 += (time.time() - start_time)

    # Stage 2: force the rows into asset_names order so that asset_id is also the
    # column position in the buffers.
    start_time = time.time()
    try:
        data = df_test.loc[asset_names]
    except KeyError:
        # Some assets are absent from this batch: they become NaN rows, which the
        # ArrayManager forward-fills from the previous bar.
        data = df_test.reindex(index=asset_names)
    time2 += (time.time() - start_time)

    # Stage 3: update the rolling buffers and compute the features.
    start_time = time.time()
    bar = BarData(asset_names,
            O=data.Open.values,
            C=data.Close.values,
            L=data.Low.values,
            H=data.High.values,
            VWAP=data.VWAP.values,
            V=data.Volume.values,
            T=np.array([t] * len(asset_names)).astype(int),
            Cnt=data.Count.values)
    am.update_bar(bar)
    time3 += (time.time() - start_time)

    # Stage 4: asset-specific models.
    start_time = time.time()
    df_pred = df_pred.set_index("row_id")
    for row_id, asset_id in row_ids.items():
        feature_names = features_by_asset[ASSET_ID_TO_NAME[asset_id]]
        val = np.array([am.feature_dict[feature_name][asset_id] for feature_name in feature_names]).astype(float).reshape(1, -1)
        # Weighted average over the folds. Each model is evaluated once, and nothing
        # is added after the loop: the previous code added the last fold's raw
        # prediction on top of the weighted sum.
        ensemble_pred = 0.0
        for model, weight in zip(model_by_asset_id[asset_id], weight_by_asset_id[asset_id]):
            ensemble_pred += weight * model.predict(val)[0]
        # Assign rather than accumulate so the result does not depend on the
        # placeholder value already in the Target column.
        df_pred.loc[row_id, "Target"] = ensemble_pred

    df_pred = df_pred.reset_index()
    avg_inference_time += (time.time() - start_time)
    env.predict(df_pred)
    count += 1

# Per-batch averages; guard against an empty iterator.
n_batches = max(count, 1)
print("avg_infer_time=", avg_inference_time/n_batches)
print("t1=", time1/n_batches)
print("t2=", time2/n_batches)
print("t3=", time3/n_batches)
print("count=", str(count))