In [1]:
import gc  
import os  
import time  
import warnings 
from itertools import combinations  
from warnings import simplefilter 
import joblib  
import lightgbm as lgb  
import catboost as cbt 
from numba import njit, prange
import numpy as np  
import pandas as pd  
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import KFold, TimeSeriesSplit  
from catboost import CatBoostRegressor, EShapCalcType, EFeaturesSelectionAlgorithm
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [4]:
#--------------------------#
# Compute / debug switch: True runs the offline (train + hold-out validation) path,
# False runs the online path (see the split on `split_day` further down).
is_offline = True 
#--------------------------#

is_train = True  
is_infer = True 
max_lookback = np.nan 
# Last date_id that belongs to the training frame when `is_offline` is set.
split_day = 435 
# NOTE(review): `lgb_accelerator` is not read anywhere in the visible cells — confirm it is still needed.
lgb_accelerator = 'cuda' if is_offline else 'gpu'

# NOTE(review): import sits outside the top import cell; consider moving it there.
import logging

# Logger used by reduce_mem_usage(verbose=...); no handler or level is configured here.
logger = logging.getLogger('mylogger')

# NOTE(review): both branches currently point at the same directory; the Kaggle
# input path is kept below as a comment.
if is_offline:
    data_path =r'/usr/src/kaggle_/optiver-trading-at-the-close'
else:
    data_path =r'/usr/src/kaggle_/optiver-trading-at-the-close'
    # data_path = r'/kaggle/input/optiver-trading-at-the-close'
path_train  = data_path+  '/train.csv'
# Raw training table; rebound to derived frames by later statements.
df_train = pd.read_csv(path_train)

# Generate the per-stock sub-prediction (6-step trailing log return of wap)
@njit(parallel = True)
def calculate_stock_return(wap):
    """Return the 6-step trailing log return of ``wap`` scaled by 10_000.

    Element ``i`` is ``log(wap[i] / wap[i - 6]) * 10_000`` for ``i >= 6``;
    the first six elements are 0. The result has the same shape and dtype
    as ``wap``.
    """
    n_obs = len(wap)
    stock_return = np.zeros_like(wap)
    # Positions 0..5 have no value six steps back and keep their zero fill.
    for i in prange(6, n_obs):
        stock_return[i] = np.log(wap[i] / wap[i - 6]) * 10_000
    return stock_return
# Attach `stock_return` per (stock_id, date_id) group; groupby-apply re-emits the rows
# group by group, and reset_index(drop=True) then renumbers them 0..n-1.
df_train = df_train.groupby(["stock_id", "date_id"]).apply(lambda frame: frame.assign(
    stock_return=calculate_stock_return(frame["wap"].values),
)).reset_index(drop=True)
# Difference between the trailing stock return and the provided target.
# NOTE(review): the name suggests this is meant as the index move — confirm the intended sign/horizon.
df_train['index_return']=df_train["stock_return"] - df_train["target"]

# A NaN stock_return or NaN target gives a NaN index_return; those rows are dropped here.
df_train = df_train.dropna(subset= ['index_return'])
print("stocks returns generate finished!.")
# index_return is NaN whenever target is NaN, so this second dropna should not remove further rows.
df = df_train.dropna(subset=["target"])
df.reset_index(drop=True, inplace=True)
# Bare expression in the middle of the cell: evaluated but not displayed.
df.shape
print('Data Loaded!')


KeyboardInterrupt: 

In [None]:

def generate_features(df):
    """Build the baseline feature frame from raw order-book columns.

    Adds to ``df`` (in place) two size-imbalance columns, one ``*_imb`` column
    per pair of price columns and one ``*_imb2`` column per triplet of price
    columns, then returns ``df`` restricted to the raw inputs plus those
    derived columns, in creation order.
    """
    price_cols = ['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap']
    selected = ['seconds_in_bucket', 'imbalance_buy_sell_flag',
                'imbalance_size', 'matched_size', 'bid_size', 'ask_size',
                *price_cols, 'imb_s1', 'imb_s2']

    df['imb_s1'] = df.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    df['imb_s2'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')

    # Pairwise imbalance: the later-listed price is always the first operand.
    for pos, first in enumerate(price_cols):
        for second in price_cols[:pos]:
            pair_name = f'{first}_{second}_imb'
            df[pair_name] = df.eval(f'({first}-{second})/({first}+{second})')
            selected.append(pair_name)

    # Triplet imbalance: (max - mid) / (mid - min) over three distinct prices.
    for pos_a, a in enumerate(price_cols):
        for pos_b, b in enumerate(price_cols[:pos_a]):
            for c in price_cols[:pos_b]:
                trio = df[[a, b, c]]
                highest = trio.max(axis=1)
                lowest = trio.min(axis=1)
                middle = trio.sum(axis=1) - lowest - highest
                trio_name = f'{a}_{b}_{c}_imb2'
                df[trio_name] = (highest - middle) / (middle - lowest)
                selected.append(trio_name)

    return df[selected]

def reduce_mem_usage(df, verbose=0):
    """Downcast numeric columns of ``df`` in place and return ``df``.

    Signed-integer columns are cast to the narrowest of int8/int16/int32/int64
    whose range contains the column's min and max (bounds inclusive). Float
    columns are always cast to float32 (float16 is deliberately not used).
    Every other dtype (object, bool, unsigned int, datetime, categorical and
    other extension dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : int or bool, default 0
        When truthy, memory usage before/after is reported through ``logger``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy signed-int / float columns are candidates. Pushing bool,
        # unsigned, datetime or extension dtypes through the float cast would
        # either corrupt them or raise.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ("i", "f"):
            continue

        if col_type.kind == "i":
            c_min = df[col].min()
            c_max = df[col].max()
            # An empty column yields NaN bounds; every comparison is then False
            # and the column keeps its dtype.
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            df[col] = df[col].astype(np.float32)

    if verbose:
        logger.info(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        logger.info(f"Memory usage after optimization is: {end_mem:.2f} MB")
        # Guard against a zero-sized starting footprint.
        decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        logger.info(f"Decreased by {decrease:.2f}%")
    return df


@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    """Compute ``(max - mid) / (mid - min)`` for each row and column triplet.

    ``df_values`` is indexed as ``[row, column]`` and ``comb_indices`` holds
    one ``(a, b, c)`` column-index triple per output column. The result has
    shape ``(rows, len(comb_indices))``; an entry is NaN when the middle value
    equals the minimum (zero denominator).
    """
    n_rows = df_values.shape[0]
    n_triplets = len(comb_indices)
    out = np.empty((n_rows, n_triplets))
    for t in prange(n_triplets):
        a, b, c = comb_indices[t]
        for r in range(n_rows):
            x = df_values[r, a]
            y = df_values[r, b]
            z = df_values[r, c]
            hi = max(x, y, z)
            lo = min(x, y, z)
            mid = x + y + z - lo - hi
            if mid != lo:
                out[r, t] = (hi - mid) / (mid - lo)
            else:
                # Zero denominator: emit NaN instead of dividing.
                out[r, t] = np.nan
    return out

def calculate_triplet_imbalance_numba(price, df):
    """Return a frame of triplet-imbalance features for the columns in ``price``.

    One ``{a}_{b}_{c}_imb2`` column is produced per 3-combination of ``price``
    (in ``itertools.combinations`` order), computed by
    ``compute_triplet_imbalance`` on ``df[price].values``. The returned frame
    carries a fresh default index, not ``df``'s index.
    """
    triplets = list(combinations(price, 3))
    # Positions of each triplet's columns inside the `price` selection.
    comb_indices = [tuple(price.index(col) for col in trio) for trio in triplets]
    columns = ["_".join(trio) + "_imb2" for trio in triplets]
    features_array = compute_triplet_imbalance(df[price].values, comb_indices)
    return pd.DataFrame(features_array, columns=columns)

def imbalance_features(df):
    """Add order-book imbalance, lag and row-statistic features to ``df``.

    Columns are added to ``df`` in place, in this order: basic size/price
    combinations, pairwise price imbalances, triplet imbalances (prices and
    sizes), spread/pressure features, row-wise statistics over the price and
    size columns, per-stock shift / pct_change features and per-stock diff
    features.

    All lag-type features are grouped by ``stock_id`` only.
    NOTE(review): because ``date_id`` is not part of the group key, the first
    rows of a day are differenced against the previous day's last rows —
    confirm this is intended.

    Returns
    -------
    pandas.DataFrame
        A new frame (result of ``replace``) with +/-inf replaced by 0.
    """
    # Define lists of price and size-related column names
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]
    lag_windows = [1, 2, 3, 7]

    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")

    # Pairwise price imbalance: (a - b) / (a + b)
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    # Triplet imbalance over a price subset and over all size columns
    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        # `.values` sidesteps index alignment (the helper returns a fresh RangeIndex).
        df[triplet_feature.columns] = triplet_feature.values

    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])

    # Row-wise statistics across the price columns and across the size columns.
    # These were previously computed a second time ("V4") from unchanged inputs;
    # the redundant pass is removed, the resulting columns are identical.
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)

    # Per-stock lagged value and relative change
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in lag_windows:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)

    # Per-stock differences. As above, the duplicate "V5" pass that rebuilt the
    # same columns from the same inputs is removed.
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size', 'market_urgency', 'imbalance_momentum', 'size_imbalance']:
        for window in lag_windows:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)

    # Divisions above can yield +/-inf (zero denominators); map those to 0.
    return df.replace([np.inf, -np.inf], 0)

def other_features(df):
    """Add calendar/time columns and per-stock global statistics to ``df``.

    Writes ``dow``, ``seconds``, ``minute`` and one ``global_<key>`` column per
    entry of the module-level ``global_stock_id_feats`` mapping (looked up by
    ``stock_id``). Mutates and returns ``df``.
    """
    bucket_seconds = df["seconds_in_bucket"]
    df["dow"] = df["date_id"].mod(5)  # Day of the week (assuming 5 trading days per week)
    df["seconds"] = bucket_seconds.mod(60)
    df["minute"] = bucket_seconds.floordiv(60)
    for feat_key, per_stock in global_stock_id_feats.items():
        lookup = per_stock.to_dict()
        df[f"global_{feat_key}"] = df["stock_id"].map(lookup)
    return df

def rolling_features(df,):
    """Add per-stock rolling std and median columns for the core book columns.

    For every window in (2, 3, 5, 7) and every base column, writes
    ``<col>_rolling_std_<w>`` and ``<col>_rolling_median_<w>`` computed within
    each ``stock_id`` group with ``min_periods=1``. Mutates and returns ``df``.
    """
    spans = [2, 3, 5, 7]
    # F_rolling
    base_cols = ['bid_size', 'ask_size', 'bid_price', 'ask_price', 'imbalance_size', 'matched_size', 'wap']

    for span in spans:
        for name in base_cols:
            per_stock = df.groupby('stock_id')[name]
            df[f'{name}_rolling_std_{span}'] = per_stock.transform(
                lambda s: s.rolling(window=span, min_periods=1).std()
            )
            df[f'{name}_rolling_median_{span}'] = per_stock.transform(
                lambda s: s.rolling(window=span, min_periods=1).median()
            )
    return df

def relativedelta_features(df):
    """Add expanding-range "relative delta" columns derived from ``mid_price``.

    For each window ``w`` in (2, 3, 5, 7) the running max/min of ``mid_price``
    (expanding window, ``min_periods=w``) are computed once, and for every name
    in the base-column list two columns are written:

    * ``<name>_relativedelta_<w>_upside``   = (running_max - mid_price) / range
    * ``<name>_relativedelta_<w>_downside`` = (mid_price - running_min) / range

    NOTE(review): the base-column name only appears in the output column name;
    the values always come from ``mid_price`` and are not grouped by
    ``stock_id``, so the seven columns of a window are identical. This looks
    unintended, but it is kept as is so existing feature values do not change.

    The expanding max/min used to be recomputed three times per feature; they
    are now computed once per window, with identical output.

    Mutates and returns ``df``.
    """
    # F_expanding calc_relative_delta
    window_size = [2, 3, 5, 7]
    rolling_features = ['bid_size', 'ask_size', 'bid_price', 'ask_price', 'imbalance_size', 'matched_size', 'wap']
    mid_price = df['mid_price']
    for window_size_i in window_size:
        # First positional argument of `expanding` is min_periods.
        running_max = mid_price.expanding(window_size_i).max()
        running_min = mid_price.expanding(window_size_i).min()
        denominator_ = running_max - running_min
        upside = (running_max - mid_price) / denominator_
        downside = (mid_price - running_min) / denominator_
        for feature in rolling_features:
            df[f'{feature}_relativedelta_{window_size_i}_upside'] = upside
            df[f'{feature}_relativedelta_{window_size_i}_downside'] = downside
    return df


def add_TA_features(df):
    """Append RSI, MACD and Bollinger-band columns for each price column, per stock.

    For every ``stock_id`` group the six price columns are passed to three
    numba kernels defined below, and the results are written back into ``df``
    at the group's index labels under the prefixes ``rsi_``, ``macd_``,
    ``macd_sig_``, ``macd_hist_``, ``bband_upper_``, ``bband_mid_`` and
    ``bband_lower_``. Mutates and returns ``df``.

    NOTE(review): the kernels are defined (and therefore compiled) inside this
    function on every call; the only visible call site is commented out.
    """

    @njit(parallel = True)
    def calculate_rsi(prices, period=14):
        # prices is indexed [row, column]; each column is processed independently.
        rsi_values = np.zeros_like(prices)

        for col in prange(prices.shape[1]):
            price_data = prices[:, col]
            # One-step differences; the first row has no predecessor and stays 0.
            delta = np.zeros_like(price_data)
            delta[1:] = price_data[1:] - price_data[:-1]
            gain = np.where(delta > 0, delta, 0)
            loss = np.where(delta < 0, -delta, 0)

            # Seed averages from the first `period` deltas.
            avg_gain = np.mean(gain[:period])
            avg_loss = np.mean(loss[:period])
            
            if avg_loss != 0:
                rs = avg_gain / avg_loss
            else:
                # NOTE(review): with no losses this makes the value below ~0; RSI is
                # conventionally 100 in that case — confirm the intended fallback.
                rs = 1e-9  # or any other appropriate default value
                
            # The first `period` rows all receive the seed value.
            rsi_values[:period, col] = 100 - (100 / (1 + rs))

            # NOTE(review): this loop carries avg_gain/avg_loss from one iteration to
            # the next; confirm numba executes this nested prange sequentially.
            for i in prange(period-1, len(price_data)-1):
                avg_gain = (avg_gain * (period - 1) + gain[i]) / period
                avg_loss = (avg_loss * (period - 1) + loss[i]) / period
                if avg_loss != 0:
                    rs = avg_gain / avg_loss
                else:
                    rs = 1e-9  # or any other appropriate default value
                # Row i+1 is written from averages updated with delta i.
                rsi_values[i+1, col] = 100 - (100 / (1 + rs))
        return rsi_values
    
    @njit(parallel=True)
    def calculate_macd(data, short_window=12, long_window=26, signal_window=9):
        # Returns (macd, signal line, histogram), each shaped like `data`.
        rows, cols = data.shape
        macd_values = np.empty((rows, cols))
        signal_line_values = np.empty((rows, cols))
        histogram_values = np.empty((rows, cols))

        for i in prange(cols):
            # Both EMAs start from 0 at row 0 (not from the first price).
            short_ema = np.zeros(rows)
            long_ema = np.zeros(rows)

            for j in range(1, rows):
                short_ema[j] = (data[j, i] - short_ema[j - 1]) * (2 / (short_window + 1)) + short_ema[j - 1]
                long_ema[j] = (data[j, i] - long_ema[j - 1]) * (2 / (long_window + 1)) + long_ema[j - 1]

            macd_values[:, i] = short_ema - long_ema

            # Signal line: EMA of the MACD series, also seeded with 0.
            signal_line = np.zeros(rows)
            for j in range(1, rows):
                signal_line[j] = (macd_values[j, i] - signal_line[j - 1]) * (2 / (signal_window + 1)) + signal_line[j - 1]

            signal_line_values[:, i] = signal_line
            histogram_values[:, i] = macd_values[:, i] - signal_line

        return macd_values, signal_line_values, histogram_values
    
    @njit(parallel=True)
    def calculate_bband(data, window=20, num_std_dev=2):
        # Returns (upper, mid, lower) bands; rows before `window - 1` stay 0.
        num_rows, num_cols = data.shape
        upper_bands = np.zeros_like(data)
        lower_bands = np.zeros_like(data)
        mid_bands = np.zeros_like(data)

        for col in prange(num_cols):
            for i in prange(window - 1, num_rows):
                # Trailing window of `window` rows ending at row i (inclusive).
                window_slice = data[i - window + 1 : i + 1, col]
                mid_bands[i, col] = np.mean(window_slice)
                std_dev = np.std(window_slice)
                upper_bands[i, col] = mid_bands[i, col] + num_std_dev * std_dev
                lower_bands[i, col] = mid_bands[i, col] - num_std_dev * std_dev

        return upper_bands, mid_bands, lower_bands
    
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    
    # `values` is the price sub-frame of one stock; results are written back by index label.
    for stock_id, values in df.groupby(['stock_id'])[prices]:
        # RSI
        col_rsi = [f'rsi_{col}' for col in values.columns]
        rsi_values = calculate_rsi(values.values)
        df.loc[values.index, col_rsi] = rsi_values
        gc.collect()
        
        # MACD
        macd_values, signal_line_values, histogram_values = calculate_macd(values.values)
        col_macd = [f'macd_{col}' for col in values.columns]
        col_signal = [f'macd_sig_{col}' for col in values.columns]
        col_hist = [f'macd_hist_{col}' for col in values.columns]
        
        df.loc[values.index, col_macd] = macd_values
        df.loc[values.index, col_signal] = signal_line_values
        df.loc[values.index, col_hist] = histogram_values
        gc.collect()
        
        # Bollinger Bands
        bband_upper_values, bband_mid_values, bband_lower_values = calculate_bband(values.values, window=20, num_std_dev=2)
        col_bband_upper = [f'bband_upper_{col}' for col in values.columns]
        col_bband_mid = [f'bband_mid_{col}' for col in values.columns]
        col_bband_lower = [f'bband_lower_{col}' for col in values.columns]
        
        df.loc[values.index, col_bband_upper] = bband_upper_values
        df.loc[values.index, col_bband_mid] = bband_mid_values
        df.loc[values.index, col_bband_lower] = bband_lower_values
        gc.collect()
    
    return df



def generate_all_features(df):
    """Run the full feature pipeline and return only the model feature columns.

    Drops ``row_id``, ``time_id`` and ``target`` from the input, then applies
    the imbalance, calendar/global, rolling and relative-delta builders in that
    order, shrinking dtypes with ``reduce_mem_usage`` after each one. The
    returned frame excludes ``row_id``, ``target``, ``time_id`` and ``date_id``.
    """
    # Select relevant columns for feature generation
    dropped_inputs = ("row_id", "time_id", "target")
    df = df[[name for name in df.columns if name not in dropped_inputs]]

    # Each stage is followed by a dtype downcast to keep memory in check.
    stages = (imbalance_features, other_features, rolling_features, relativedelta_features)
    for build in stages:
        df = reduce_mem_usage(build(df))
    # df = add_TA_features(df)
    gc.collect()

    dropped_outputs = ("row_id", "target", "time_id", "date_id")
    feature_name = [name for name in df.columns if name not in dropped_outputs]
    return df[feature_name]

def select_features(df, method='corr', select_ratio=0.64):
    """Reduce the feature frame to roughly ``select_ratio`` of its columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame (columns are features).
    method : {'corr', 'pca', 'no'}, default 'corr'
        * ``'corr'`` keeps the ``k`` columns with the smallest summed absolute
          correlation against all columns and drops the rest.
        * ``'pca'`` standardises the columns and returns the first ``k``
          principal components as ``Component_1..Component_k``.
        * ``'no'`` returns ``df`` unchanged.
    select_ratio : float, default 0.64
        Fraction of columns to keep; ``k = round(n_columns * select_ratio)``.
        The function used to overwrite this argument with 0.64
        unconditionally; the default now carries that value, so existing calls
        behave as before while an explicit ratio is honoured.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (previously ``None``
        was returned silently).
    """

    def pca_feature_selection(df, n_components):
        from sklearn.decomposition import PCA
        from sklearn.preprocessing import StandardScaler
        # Standardise the feature matrix
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(df.values)
        # Create the PCA object and fit it to the data
        pca = PCA(n_components=n_components)
        X_selected = pca.fit_transform(X_scaled)
        # Build the reduced DataFrame, keeping the caller's row index
        columns = [f"Component_{i+1}" for i in range(n_components)]
        df_selected = pd.DataFrame(X_selected, columns=columns, index=df.index)
        return df_selected

    def corr_feature_selection(df, n_components):
        # Total absolute correlation of each column with every column (incl. itself).
        corr_se = df.corr().abs().sum()
        # Everything after the first `n_components` least-correlated columns is dropped.
        correlated_features = corr_se.sort_values().iloc[n_components:].index
        df_selected = df.drop(correlated_features, axis=1)
        return df_selected

    if method == 'no':
        return df

    # Both selectors need an integer count; PCA and range() reject a float here.
    k = int(np.round(len(df.columns) * select_ratio))

    if method == 'pca':
        return pca_feature_selection(df, k)
    if method == 'corr':
        return corr_feature_selection(df, k)

    raise ValueError(f"Unknown feature selection method: {method!r}")
# Progress marker: all feature-building helpers above are now defined.
print('Feature function Loaded!')


Feature function Loaded!


In [None]:
# Per-stock weights, listed by position.
# NOTE(review): presumably the published index weights with position == stock_id — confirm the source.
weights = np.array([
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
])

# Rebind `weights` from the array to a {position: weight} dict so it can be used
# with Series.map on stock_id.
weights = {int(k):v for k,v in enumerate(weights)}

# Offline: split by date_id at `split_day`; online: train on everything.
# Note that `df_train` is rebound here (it previously held the full prepared table).
if is_offline:
    df_train = df[df["date_id"] <= split_day]
    df_valid = df[df["date_id"] > split_day]
    print("Offline mode")
    print(f"train : {df_train.shape}, valid : {df_valid.shape}")
else:
    df_train = df
    print("Online mode")

if is_train:
    # Per-stock summary statistics computed on the training frame only; read by
    # other_features() through this module-level name.
    # NOTE(review): "ptp_price" subtracts ask_price.min() from bid_price.max(), whereas
    # "ptp_size" uses bid_size for both terms — confirm which is intended.
    global_stock_id_feats = {
        "median_size": df_train.groupby("stock_id")["bid_size"].median() + df_train.groupby("stock_id")["ask_size"].median(),
        "std_size": df_train.groupby("stock_id")["bid_size"].std() + df_train.groupby("stock_id")["ask_size"].std(),
        "ptp_size": df_train.groupby("stock_id")["bid_size"].max() - df_train.groupby("stock_id")["bid_size"].min(),
        "median_price": df_train.groupby("stock_id")["bid_price"].median() + df_train.groupby("stock_id")["ask_price"].median(),
        "std_price": df_train.groupby("stock_id")["bid_price"].std() + df_train.groupby("stock_id")["ask_price"].std(),
        "ptp_price": df_train.groupby("stock_id")["bid_price"].max() - df_train.groupby("stock_id")["ask_price"].min(),
    }
    if is_offline:
        df_train_feats = generate_all_features(df_train)
        print("Build Train Feats Finished.")

        df_valid_feats = generate_all_features(df_valid)
        print("Build Valid Feats Finished.")
        df_valid_feats = reduce_mem_usage(df_valid_feats)

        # NOTE(review): the column subset is chosen from the *validation* features and
        # then applied to the training features — confirm this is not meant to be
        # selected on the training frame instead.
        df_valid_feats = select_features(df_valid_feats)
        target_col = df_valid_feats.columns
        df_train_feats = df_train_feats[target_col]
    else:
        df_train_feats = generate_all_features(df_train)
        df_train_feats = select_features(df_train_feats)
        target_col = df_train_feats.columns
        # Re-indexing a frame by its own columns; leaves the selection unchanged.
        df_train_feats = df_train_feats[target_col]
        print("Build Online Train Feats Finished.")

    df_train_feats = reduce_mem_usage(df_train_feats)

print('Processing of all features in the dataframe (df) is completed!')

Offline mode
train : (4225417, 19), valid : (440951, 19)
Build Train Feats Finished.
Build Valid Feats Finished.
Processing of all features in the dataframe (df) is completed!


In [None]:
# Model registry consumed by the training loop. Each entry provides:
#   'model'     - estimator class to instantiate,
#   'name'      - short tag the training loop branches on ('lgb' / 'cat'),
#   'params'    - keyword arguments for the estimator constructor,
#   'callbacks' - callbacks passed to fit() on the 'lgb' path.
# Only the CatBoost entry is active; the two LightGBM configurations below are
# disabled experiments kept for reference.
model_dict_list = [
    #             {
    #     'model': lgb.LGBMRegressor,
    #     'name': 'lgb',
    #     "params":{
    #     "objective": "mae",
    #     "n_estimators": 6000,
    #     "num_leaves": 256,
    #     "subsample": 0.6,
    #     "colsample_bytree": 0.8,
    #     "learning_rate": 0.00871,
    #     'max_depth': 11,
    #     "n_jobs": 4,
    #     "device": "cuda",
    #     "verbosity": 1,
    #     "importance_type": "gain",}
    #     ,
    #     "callbacks": [
    #     lgb.callback.early_stopping(stopping_rounds=100),
    #     lgb.callback.log_evaluation(period=100),
    #     ]
    # },

    #     {
    #     'model': lgb.LGBMRegressor,
    #     'name': 'lgb',
    #     "params":{
    #     "objective": "mae",
    #     "n_estimators": 6000,
    #     "num_leaves": 256,
    #     "subsample": 0.6,
    #     "colsample_bytree": 0.8,
    #     "learning_rate": 0.00871,
    #     'max_depth': 11,
    #     "n_jobs": 8,
    #     "device": "cuda",
    #     "verbosity": 1,
    #     "importance_type": "gain",
    #     "min_child_samples": 15,  # Minimum number of data points in a leaf
    #     "reg_alpha": 0.1,  # L1 regularization term
    #     "reg_lambda": 0.3,  # L2 regularization term
    #     "min_split_gain": 0.2,  # Minimum loss reduction required for further partitioning
    #     "min_child_weight": 0.001,  # Minimum sum of instance weight (hessian) in a leaf
    #     "bagging_fraction": 0.9,  # Fraction of data to be used for training each tree
    #     "bagging_freq": 5,  # Frequency for bagging
    #     "feature_fraction": 0.9,  # Fraction of features to be used for training each tree
    #     "num_threads": 4,  # Number of threads for LightGBM to use
    #     }
    #     ,
    #     "callbacks": [
    #     lgb.callback.early_stopping(stopping_rounds=100),
    #     lgb.callback.log_evaluation(period=100),
    #     ]
    # },


    {
        'model': cbt.CatBoostRegressor,
        'name': 'cat',
        # MAE objective/metric on GPU; od_type/od_wait configure the overfitting
        # detector; metric_period=100 matches the 100-iteration spacing of the
        # progress lines in the training output.
        'params': dict(iterations=2000,
                       learning_rate=0.05,
                       depth=12,
                       l2_leaf_reg=30,
                       bootstrap_type='Bernoulli',
                       subsample=0.66,
                       loss_function='MAE',
                       eval_metric='MAE',
                       metric_period=100,
                       od_type='Iter',
                       od_wait=30,
                       task_type='GPU',
                       allow_writing_files=False
                       ),
        'callbacks': []
    }
]
print('Params Loaded!')


Params Loaded!


In [None]:
# Column names of the prepared training features, as a plain list.
feature_name = df_train_feats.columns.tolist()
n_features = len(feature_name)
print(f"Feature length = {n_features}")

Feature length = 152


In [None]:
def zero_sum(prices, volumes):
    """Shift ``prices`` so the result sums to zero, weighting by sqrt(volumes).

    The total of ``prices`` is distributed across elements in proportion to
    ``sqrt(volumes)`` and subtracted, i.e.
    ``prices - sqrt(volumes) * sum(prices) / sum(sqrt(volumes))``.
    """
    root_volume = np.sqrt(volumes)
    # Amount removed per unit of sqrt-volume.
    per_unit = np.sum(prices) / np.sum(root_volume)
    return prices - root_volume * per_unit

In [None]:
# Progress marker printed before the training loop starts.
print('Now,we are going to training!~')

Now,we are going to training!~


In [None]:
# Train every configured model: first a validation fit on an internal hold-out
# (last 45 days of the training frame) to find the best iteration, then a
# final "infer" fit on the whole training frame, then (offline only) a score
# on the validation frame.
for model_dict in model_dict_list:

    name = model_dict['name']
    print(f'now model is {name}')
    model_ = model_dict['model']
    model_params = model_dict['params']
    call_back_func  = model_dict['callbacks']
    if is_train:
        feature_name = list(df_train_feats.columns)
        print(f"Feature length = {len(feature_name)}")
        stock_id_arr = df.stock_id.values
        # Rows from the last 45 days before `split_day` form the internal hold-out.
        offline_split = df_train['date_id']>(split_day - 45)
        df_offline_train = df_train_feats[~offline_split].copy(deep = True)
        df_offline_valid = df_train_feats[offline_split].copy(deep = True)
        df_offline_train_target = df_train['stock_return'][~offline_split].copy(deep = True)
        df_offline_valid_target = df_train['stock_return'][offline_split].copy(deep = True)

        # Columns the fitted models actually consume. The CatBoost path narrows this
        # to the selected subset; fit *and* predict must use the same list.
        model_features = feature_name

        print("Valid Model Trainning.")
        _model = model_(**model_params)
        if name == 'lgb':
            _model.fit(
                df_offline_train[feature_name],
                df_offline_train_target,
                eval_set=[( df_offline_valid[feature_name], df_offline_valid_target)],
                callbacks = call_back_func 
            )
        elif name == 'cat':
            summary = _model.select_features(
                df_offline_train[feature_name], df_offline_train_target,
                eval_set=[(df_offline_valid[feature_name], df_offline_valid_target)],
                features_for_select=feature_name,
                num_features_to_select=len(feature_name)-24,    # drop the 24 weakest features
                steps=3,
                algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
                shap_calc_type=EShapCalcType.Regular,
                train_final_model=False,
                plot=True,
            )
            model_features = summary['selected_features_names']
            _model.fit(
                df_offline_train[model_features], df_offline_train_target,
                eval_set=[(df_offline_valid[model_features], df_offline_valid_target)],
                use_best_model=True,
            )
        else:
            _model.fit(
                df_offline_train[feature_name],
                df_offline_train_target,
                eval_set=[(df_offline_valid[feature_name], df_offline_valid_target)],
            )
        
        del df_offline_train, df_offline_train_target
        gc.collect()

        # infer
        df_train_target = df_train['stock_return']
        print("Infer Model Trainning.")
        infer_params = model_params.copy()
        best_iter_n = _model.best_iteration_ if hasattr(_model, "best_iteration_") else _model.best_iteration
        if best_iter_n is not None:
            # Write the tree budget under whichever key the config already uses, so
            # CatBoost never receives both `iterations` and `n_estimators`
            # (it rejects conflicting aliases).
            budget_key = next(
                (key for key in ("iterations", "num_boost_round", "num_trees") if key in infer_params),
                "n_estimators",
            )
            infer_params[budget_key] = max(1, int(1.2 * best_iter_n))
        # Previously this was built from `model_params`, which silently discarded
        # the iteration budget computed above.
        infer__model =  model_(**infer_params)
        if name == 'lgb':
            infer__model.fit(df_train_feats[feature_name],
                            df_train_target,
                            eval_set=[(df_offline_valid[feature_name],
                            df_offline_valid_target)],
                            callbacks = call_back_func 
                            )
        elif name == 'cat':
            infer__model.fit(df_train_feats[model_features], df_train_target)
        else:
            infer__model.fit(df_train_feats[feature_name],
                            df_train_target,
                            eval_set=[(df_offline_valid[feature_name],
                            df_offline_valid_target)],
                            )

        if is_offline:   
            # offline predictions
            df_valid_target = df_valid['stock_return']
            apd_index  = df_valid['index_return']
            # Predict with the same columns the model was fitted on (the full
            # `feature_name` list does not match a CatBoost model trained on the
            # selected subset).
            offline_predictions = infer__model.predict(df_valid_feats[model_features])
            weighted_ = df_valid.stock_id.map(weights).values
            # NOTE(review): predictions are multiplied by the per-stock weight before
            # being compared with the unscaled stock_return, and `apd_index` is
            # unused — confirm how this score is meant to be computed.
            offline_score = mean_absolute_error(offline_predictions*weighted_, df_valid_target)
            print(f"Offline Score {np.round(offline_score, 4)}")

now model is cat
Feature length = 152
Valid Model Trainning.


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Step #1 out of 3
0:	learn: 7.8262533	test: 7.1606446	best: 7.1606446 (0)	total: 35.2ms	remaining: 1m 10s
100:	learn: 5.6892858	test: 5.0579830	best: 5.0579830 (100)	total: 3.33s	remaining: 1m 2s
200:	learn: 4.1200145	test: 3.5548875	best: 3.5548875 (200)	total: 6.39s	remaining: 57.2s
300:	learn: 2.9934928	test: 2.5079768	best: 2.5079768 (300)	total: 9.74s	remaining: 55s
400:	learn: 2.1965677	test: 1.7912426	best: 1.7912426 (400)	total: 13.3s	remaining: 52.9s
500:	learn: 1.6344630	test: 1.3018790	best: 1.3018790 (500)	total: 17.1s	remaining: 51.1s
600:	learn: 1.2364067	test: 0.9674159	best: 0.9674159 (600)	total: 21s	remaining: 48.8s
700:	learn: 0.9547844	test: 0.7381455	best: 0.7381455 (700)	total: 24.9s	remaining: 46.2s
