In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import combinations
import seaborn as sns
import os, sys, warnings
from time import time 

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

ModuleNotFoundError: No module named 'seaborn'

# TO DO:
1. Wrap up one function for feature generation 
2. Wrap up one function for model training 
3. Within the submission API, consider proper concat of cache and new test 
    
    * V1 features are row-based --> they don't need the cache.
    * V2 features are based on sec_in_bucket (cross-sectional features) --> they don't need the cache. (Perhaps the gen_v2_features function should be changed so that it does not use groupby()?)
    * V3 features require time-series data --> the cache is needed.
    * The cache should save the time-series records of all stocks with sufficient length.
    * The rows should be re-ordered after the concat.
    * After calculating the features, only the rows of the current seconds_in_bucket should be returned.
    * Standardization: in the training phase, standardization is performed on a multi-day scale. In the cached dataset, only a limited number of timesteps are available for standardization.
    * Standardization: during training, perhaps we should perform standardization on the cross-section only.


In [4]:
def reduce_mem_usage(df, verbose=0):
    """
    Downcast the numeric columns of a DataFrame to smaller dtypes to save memory.

    The columns of ``df`` are replaced in place and the same object is returned.

    * Signed / unsigned integer columns are cast to the narrowest signed /
      unsigned integer dtype whose (inclusive) range holds the column's min and
      max. A column is never widened.
    * Floating-point columns wider than 32 bits are cast to ``float32``.
      ``float16`` is never used. Values outside the ``float32`` range become
      ``inf``, exactly as an unconditional ``astype(np.float32)`` would.
    * Every other dtype (bool, object/string, datetime, category, pandas
      extension dtypes, ...) is left untouched instead of being forced through
      the float path.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast.
    verbose : int or bool, default 0
        When truthy, print the memory usage before and after the conversion.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with downcast columns.
    """
    # Memory footprint (in MB) before any conversion, for the optional report.
    start_mem = df.memory_usage().sum() / 1024**2

    # Candidate dtypes ordered from narrowest to widest.
    signed_ladder = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ladder = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, strings, ...) are skipped:
        # only plain NumPy integer/float columns are safe to downcast here.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in "iu":
            if df[col].empty:
                continue  # no values -> no range to inspect
            # Plain Python ints make the bound comparisons exact for every width.
            c_min = int(df[col].min())
            c_max = int(df[col].max())

            ladder = signed_ladder if col_type.kind == "i" else unsigned_ladder
            for candidate in ladder:
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    if col_type != candidate:
                        df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == "f" and col_type.itemsize > 4:
            df[col] = df[col].astype(np.float32)

    if verbose:
        print(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        print(f"Memory usage after optimization is: {end_mem:.2f} MB")
        # Guard against a zero-sized frame to avoid a division by zero.
        decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f"Decreased by {decrease:.2f}%")

    return df

In [5]:
def gen_v1_features(df, prices):
    """
    Add the V1 (row-wise) features to ``df``.

    Each feature is a formula evaluated with ``DataFrame.eval`` on a single row.
    Formulas are evaluated in insertion order, so later ones may refer to
    columns created by earlier ones (e.g. ``volume``, ``price_spread``).
    ``df`` is extended in place and then passed through ``reduce_mem_usage``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns referenced by the formulas below.
    prices : sequence of str
        Price column names; one ``"<a>_<b>_imbalance"`` feature is created for
        every unordered pair.

    Returns
    -------
    tuple
        ``(df, numeric_feature_names, categorical_feature_names)``.
    """
    # Numeric V1 features: name -> eval expression.
    formulas = {
        "volume": "ask_size + bid_size",
        "mid_price": "(ask_price + bid_price)/2",
        "liquidity_imbalance": "(bid_size-ask_size)/(bid_size+ask_size)",
        "matched_imbalance": "(imbalance_size - matched_size)/(matched_size+imbalance_size)",
        "size_imbalance": "bid_size / ask_size",
        "imbalance_intensity": "imbalance_size / volume",
        "matched_intensity": "matched_size / volume",
        "price_spread": "ask_price - bid_price",
        "market_urgency": "price_spread * liquidity_imbalance",
        "depth_pressure": "(ask_size - bid_size) * (far_price - near_price)",
        "price_pressure": "imbalance_size * (ask_price - bid_price)",
        "imbalance_with_flag": "imbalance_size * imbalance_buy_sell_flag",
    }

    # Pair-wise price imbalances, appended after the base formulas.
    formulas.update({
        f"{first}_{second}_imbalance": f"({first} - {second}) / ({first} + {second})"
        for first, second in combinations(prices, 2)
    })

    for name, expression in formulas.items():
        df[name] = df.eval(expression)

    # Flag-like V1 features; each one is stored as int8.
    flag_formulas = {
        "minute": "seconds_in_bucket / 60",
        "imb_buy_side": "(imbalance_buy_sell_flag == 1)",
        "imb_sell_side": "(imbalance_buy_sell_flag == -1)",
        "first_half_session": "(seconds_in_bucket <= 240)",
        "second_half_session": "(seconds_in_bucket > 240)",
    }

    for name, expression in flag_formulas.items():
        df[name] = df.eval(expression).astype(np.int8)

    df = reduce_mem_usage(df, verbose=0)

    return df, list(formulas), list(flag_formulas)
        
    

In [6]:
def gen_v2_features(df, v2_feat_cols):
    """
    Add the V2 (cross-sectional) features to ``df``.

    For every ``(date_id, seconds_in_bucket)`` group and every column in
    ``v2_feat_cols`` this adds:

    1. the group statistics ``mean``, ``median``, ``std``, ``min`` and ``max``
       as ``"<stat>_<col>"`` (identical for all rows of the group);
    2. the percentile rank of each row within its group as ``"rank_<col>"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_id``, ``seconds_in_bucket`` and ``v2_feat_cols``.
    v2_feat_cols : list of str
        Columns to aggregate and rank.

    Returns
    -------
    tuple
        ``(df, v2_feature_names)``. The returned frame keeps the input row
        order and carries a fresh ``RangeIndex``.
    """
    group_keys = ['date_id', 'seconds_in_bucket']
    v2_features_stats = ['mean', 'median', 'std', 'min', 'max']
    v2_feat_cols = list(v2_feat_cols)

    # Merging on key columns returns a frame with a brand-new RangeIndex, whereas
    # groupby().rank() keeps the labels of the frame it was computed on. Normalise
    # the index first so both results share the same labels; otherwise the ranks
    # are attached to the wrong rows whenever the incoming index has gaps
    # (e.g. after rows were filtered or dropped).
    df = df.reset_index(drop=True)
    group = df.groupby(group_keys)

    # Group-level statistics, flattened from (column, stat) to "<stat>_<column>".
    df_stats = group[v2_feat_cols].agg(v2_features_stats).reset_index()
    df_stats.columns = group_keys + [f"{stat}_{col}" for col, stat in df_stats.columns[2:]]

    # Within-group percentile rank of every row.
    df_rank = group[v2_feat_cols].rank(pct=True).add_prefix('rank_')

    df = df.merge(df_stats, on=group_keys, how='left')
    df = df.merge(df_rank, left_index=True, right_index=True, how='left')

    df = reduce_mem_usage(df, verbose=0)

    v2_features = \
        [f"{s}_{c}" for c in v2_feat_cols for s in v2_features_stats] + \
        [f"rank_{c}" for c in v2_feat_cols]

    return df, v2_features
    

In [7]:
# !!! Requires at least `window` (default 5) timesteps per (date_id, stock_id)
# group before the rolling statistics are non-NaN. The disabled V3.3 indicators
# at the bottom use periods of up to 11, which is presumably where the original
# "at least 11 timesteps" requirement came from.
def gen_v3_features(df, prices, sizes, v1_features, window=5):
    """
    Add the V3 (time-series) features to ``df``.

    All features are computed per ``(date_id, stock_id)`` group, in row order:

    * V3.1 - relative change versus the previous row (``pct_change(1)``) as
      ``"pct_<col>"``: scaled to basis points (``*1e4``) for ``prices`` and to
      percent (``*1e2``) for ``sizes`` and ``v1_features``.
    * V3.2 - rolling ``mean`` / ``std`` / ``max`` / ``min`` over ``window`` rows
      as ``"<stat>_<col>_<window>"`` for ``prices + sizes + v1_features``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_id``, ``stock_id`` and all listed columns.
    prices, sizes, v1_features : list of str
        Column names to derive features from.
    window : int, default 5
        Length of the rolling window.

    Returns
    -------
    tuple
        ``(df, v3_feature_names)``; the frame keeps the index of the input.
    """
    group_by_stock = df.groupby(['date_id', 'stock_id'])

    # V3.1 relative change of the inputs with respect to the previous timestep.
    relative_price = group_by_stock[prices].pct_change(1).add_prefix('pct_') * 1e4
    relative_others = group_by_stock[sizes + v1_features].pct_change(1).add_prefix('pct_') * 1e2

    df = pd.concat([df, relative_price, relative_others], axis=1)
    v3_features = list(relative_price.columns) + list(relative_others.columns)

    # V3.2 simple rolling statistics that use a single input column each.
    rolled = group_by_stock[prices + sizes + v1_features].rolling(window).agg(['mean', 'std', 'max', 'min'])
    # The result is indexed by (date_id, stock_id, <original row label>). Drop the
    # two group levels by position so the frame is keyed by the original row
    # labels again, whatever the index is called.
    rolled.index = rolled.index.droplevel([0, 1])
    # Flatten (column, stat) into "<stat>_<column>_<window>".
    rolled.columns = [f"{stat}_{col}_{window}" for col, stat in rolled.columns]

    df = df.merge(rolled, left_index=True, right_index=True, how='left')
    v3_features += rolled.columns.tolist()

    # V3.3 TA indicators that use multiple features (disabled; needs TA-Lib as `ta`)
    # def composite_ta(df):
    #     ad_osc = ta.ADOSC(df['ask_price'], df['bid_price'], df['wap'], df['volume'], fastperiod=3, slowperiod=5)
    #     macd, macdsignal, macdhist = ta.MACD(df['wap'], fastperiod=5, slowperiod=11, signalperiod=3)
    #
    #     return pd.DataFrame({
    #         'ema': ta.EMA(df['wap'], timeperiod=5),
    #         'rsi': ta.RSI(df['wap'], timeperiod=5),
    #         'cci': ta.CCI(df['ask_price'], df['bid_price'], df['wap'], timeperiod=5),
    #         'mfi': ta.MFI(df['ask_price'], df['bid_price'], df['wap'], df['volume'], timeperiod=5),
    #         'ad_osc': ad_osc,
    #         'macd': macd,
    #         'macdsignal': macdsignal,
    #         'macdhist': macdhist
    #     })
    #
    # df_v3 = group_by_stock.apply(composite_ta)
    # v3_features += df_v3.columns.tolist()
    #
    # df_v3.reset_index(inplace=True)
    # df_v3.set_index('level_2', inplace=True)
    # df_v3.drop(columns=['date_id', 'stock_id'], inplace=True)
    #
    # df = pd.concat([df, df_v3], axis=1)

    return df, v3_features

In [8]:
# Load the raw training set and keep only the rows that have a target value.
df = pd.read_csv("../../optiver-trading-at-the-close/train.csv")
df = df[df['target'].notnull()]

# Quick sanity summary of what was loaded.
print(df.shape)
print(f"Trading days: {df['date_id'].nunique()}")
print(f"Stocks: {df['stock_id'].nunique()}")

# Missing far/near prices are replaced by 0.
df[['far_price', 'near_price']] = df[['far_price', 'near_price']].fillna(0)

df = reduce_mem_usage(df, verbose=1)

df.head()

(5237892, 17)
Trading days: 481
Stocks: 200
Memory usage of dataframe is 719.32 MB
Memory usage after optimization is: 344.67 MB
Decreased by 52.08%


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180603.0,1,0.999812,13380277.0,0.0,0.0,0.999812,60651.5,1.000026,8493.030273,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.9,-1,0.999896,1642214.25,0.0,0.0,0.999896,3233.040039,1.00066,20605.089844,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.9,-1,0.999561,1819368.0,0.0,0.0,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917680.0,-1,1.000171,18389746.0,0.0,0.0,0.999999,2324.899902,1.000214,479032.40625,1.0,-4.010201,0,0_0_3
4,4,0,0,447550.0,-1,0.999532,17860614.0,0.0,0.0,0.999394,16485.539062,1.000016,434.100006,1.0,-7.349849,0,0_0_4


In [9]:
# Remove three complete (date_id, stock_id) sessions from the training data.
# NOTE(review): the reason for excluding exactly these sessions is not recorded
# in the notebook - confirm and document it.
df = df.drop(
    df[
        ((df['date_id'] == 35) & (df['stock_id'] == 131))
        | ((df['date_id'] == 328) & (df['stock_id'] == 101))
        | ((df['date_id'] == 438) & (df['stock_id'] == 19))
    ].index
)


In [10]:
# Build the full feature matrix (V1 -> V2 -> V3). Estimated running time: 9 min.
# NOTE: this cell reassigns `df`; re-running it on the already-augmented frame
# would merge/concat the generated feature columns a second time.

# Raw input columns, split into price-like and size-like groups.
prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]
categorical_cols = ["stock_id", "seconds_in_bucket", 'imbalance_buy_sell_flag']

# `feature_cols` accumulates the numeric feature names stage by stage.
feature_cols = prices + sizes

# V1: row-wise formulas (adds both numeric and flag-like columns).
df, v1_features, v1_feature_category = gen_v1_features(df, prices)
feature_cols += v1_features
categorical_cols += v1_feature_category

# V2: cross-sectional statistics and ranks of the raw inputs and V1 features.
df, v2_features = gen_v2_features(df, prices+sizes+v1_features)
feature_cols += v2_features

# V3: per-stock relative changes and rolling statistics.
df, v3_features = gen_v3_features(df, prices, sizes, v1_features)
feature_cols += v3_features

# Neutralise missing values (e.g. from shift/rolling warm-up) and infinities
# (e.g. from divisions by zero) by replacing them with 0.
df.fillna(0, inplace=True)
df.replace([np.inf, -np.inf], 0, inplace=True)

df = reduce_mem_usage(df, verbose=1)

Memory usage of dataframe is 12866.55 MB
Memory usage after optimization is: 9170.16 MB
Decreased by 28.73%


In [3]:
# Report the total number of model columns (numeric + categorical) and persist
# the feature frame to CSV.
# NOTE: `feature_cols` / `categorical_cols` come from the feature-generation
# cell; the NameError saved with this cell shows it was last run in a kernel
# where that cell had not been executed.
print(len(feature_cols) + len(categorical_cols))
df.to_csv('features_460.csv', index=False)

NameError: name 'feature_cols' is not defined

In [36]:
# Reload the saved feature frame from disk.
# NOTE: the CSV round trip does not keep the downcast dtypes - the dtypes
# printed below are int64/float64 again.
import pandas as pd
df = pd.read_csv('features_460.csv')
print(df.dtypes)

stock_id                            int64
date_id                             int64
seconds_in_bucket                   int64
imbalance_size                    float64
imbalance_buy_sell_flag             int64
                                   ...   
min_ask_price_wap_imbalance_5     float64
mean_bid_price_wap_imbalance_5    float64
std_bid_price_wap_imbalance_5     float64
max_bid_price_wap_imbalance_5     float64
min_bid_price_wap_imbalance_5     float64
Length: 456, dtype: object


In [37]:
# Keep only the first 100 columns (by position), presumably to keep the
# experiments below tractable.
# NOTE(review): the cut is positional, so which features survive depends purely
# on column order - confirm this is intended.
df = df.iloc[:, :100]
df.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,median_ask_size,std_ask_size,min_ask_size,max_ask_size,mean_imbalance_size,median_imbalance_size,std_imbalance_size,min_imbalance_size,max_imbalance_size,mean_volume
0,0,0,0,3180602.8,1,0.999812,13380277.0,0.0,0.0,0.999812,...,16551.04,64969.004,63.14,483973.72,7625203.5,1603181.5,29044134.0,0.0,365273470.0,67781.164
1,1,0,0,166603.9,-1,0.999896,1642214.2,0.0,0.0,0.999896,...,16551.04,64969.004,63.14,483973.72,7625203.5,1603181.5,29044134.0,0.0,365273470.0,67781.164
2,2,0,0,302879.88,-1,0.999561,1819368.0,0.0,0.0,0.999403,...,16551.04,64969.004,63.14,483973.72,7625203.5,1603181.5,29044134.0,0.0,365273470.0,67781.164
3,3,0,0,11917682.0,-1,1.000171,18389746.0,0.0,0.0,0.999999,...,16551.04,64969.004,63.14,483973.72,7625203.5,1603181.5,29044134.0,0.0,365273470.0,67781.164
4,4,0,0,447549.97,-1,0.999532,17860614.0,0.0,0.0,0.999394,...,16551.04,64969.004,63.14,483973.72,7625203.5,1603181.5,29044134.0,0.0,365273470.0,67781.164


In [15]:
# List the columns stored as object/category dtype.
# NOTE: this rebinds `categorical_cols` to a pandas Index.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
if len(categorical_cols) == 0:
    print("No categorical columns found.")
else:
    print("Categorical columns:", categorical_cols)

Categorical columns: Index(['row_id'], dtype='object')


In [None]:
# Catboost

from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

def catboost_train_and_predict(df, chunk_size=55 * 5, prediction_chunk_size=55):
    """
    Walk-forward training/prediction with CatBoost over the rows of ``df``.

    A fresh ``CatBoostRegressor`` is fitted on ``chunk_size`` consecutive rows
    and used to predict the ``prediction_chunk_size`` rows that follow. The
    window then advances by ``prediction_chunk_size`` rows, so every row from
    position ``chunk_size`` onwards is predicted exactly once, in row order.
    Windows that cannot be filled completely are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``target`` column and a ``row_id`` column; every other
        column is used as a feature.
    chunk_size : int, default 275
        Number of rows in each training window.
    prediction_chunk_size : int, default 55
        Number of rows predicted after each window, and the stride of the
        window.

    Returns
    -------
    list
        Flat list of predictions for rows ``chunk_size .. len(df) - 1``.

    Raises
    ------
    ValueError
        If ``chunk_size`` or ``prediction_chunk_size`` is not positive.
    """
    if chunk_size <= 0 or prediction_chunk_size <= 0:
        raise ValueError("chunk_size and prediction_chunk_size must be positive")

    predictions = []
    total_samples = len(df)
    # NOTE(review): `row_id` looks like a per-row identifier; as a categorical
    # feature it can hardly generalise - consider dropping it instead.
    cat_features = ['row_id']

    # Split features/target once instead of on every window.
    features = df.drop('target', axis=1)
    target = df['target']

    # Advance by one prediction block per step (not one row), and stop as soon
    # as no row is left to predict after the training window.
    for start in range(0, total_samples - chunk_size, prediction_chunk_size):
        end = start + chunk_size

        # Initialize and train the CatBoost model on the current window.
        model = CatBoostRegressor(iterations=10, learning_rate=0.1, depth=6)
        model.fit(features.iloc[start:end], target.iloc[start:end],
                  verbose=False, cat_features=cat_features)

        # Predict the block that immediately follows the window.
        next_end = min(end + prediction_chunk_size, total_samples)
        predictions.extend(model.predict(features.iloc[end:next_end]))

    return predictions

# Run the walk-forward CatBoost routine on the (column-truncated) feature frame.
# `df` must still contain the `target` and `row_id` columns. A new model is
# fitted for every training window, so this can take a long time.
catboost_predictions = catboost_train_and_predict(df)


In [38]:
# Restrict the feature frame to the stocks whose label equals 1 in stock_labels.csv.
stock_labels = pd.read_csv('stock_labels.csv')
filtered_stock_ids = stock_labels.loc[stock_labels['labels'] == 1, 'stock_id']
shenxian_feats = df.loc[df['stock_id'].isin(filtered_stock_ids)]
shenxian_feats

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,median_ask_size,std_ask_size,min_ask_size,max_ask_size,mean_imbalance_size,median_imbalance_size,std_imbalance_size,min_imbalance_size,max_imbalance_size,mean_volume
11,11,0,0,1506120.20,-1,0.999968,2001112.50,0.000000,0.000000,0.999840,...,16551.04,64969.004,63.14,483973.72,7625203.5,1603181.5,29044134.0,0.0,365273470.0,67781.164
31,31,0,0,744025.20,-1,1.000327,936927.25,0.000000,0.000000,0.999436,...,16551.04,64969.004,63.14,483973.72,7625203.5,1603181.5,29044134.0,0.0,365273470.0,67781.164
69,70,0,0,0.00,0,1.000211,1714853.00,0.000000,0.000000,0.999806,...,16551.04,64969.004,63.14,483973.72,7625203.5,1603181.5,29044134.0,0.0,365273470.0,67781.164
77,80,0,0,152362.22,1,0.999589,790290.00,0.000000,0.000000,0.998633,...,16551.04,64969.004,63.14,483973.72,7625203.5,1603181.5,29044134.0,0.0,365273470.0,67781.164
79,82,0,0,0.00,0,0.999760,780396.30,0.000000,0.000000,0.999760,...,16551.04,64969.004,63.14,483973.72,7625203.5,1603181.5,29044134.0,0.0,365273470.0,67781.164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2596451,78,240,320,11359.02,1,1.006259,1382451.10,1.007427,1.006259,1.005092,...,23893.86,48539.668,190.56,270226.00,1576792.9,475212.3,2839659.5,0.0,22964712.0,81507.860
2596453,80,240,320,413663.38,1,1.008405,2471303.20,1.031258,1.019546,1.008119,...,23893.86,48539.668,190.56,270226.00,1576792.9,475212.3,2839659.5,0.0,22964712.0,81507.860
2596455,82,240,320,0.00,0,1.002578,1353229.20,1.002578,1.002578,1.000577,...,23893.86,48539.668,190.56,270226.00,1576792.9,475212.3,2839659.5,0.0,22964712.0,81507.860
2596458,85,240,320,622720.44,1,1.004109,3457989.00,1.023208,1.010432,1.003319,...,23893.86,48539.668,190.56,270226.00,1576792.9,475212.3,2839659.5,0.0,22964712.0,81507.860


In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor

def mlp_train_and_predict(df, num_features, stock_ids=None, chunk_size=55 * 5, prediction_chunk_size=55):
    """
    Per-stock walk-forward training/prediction with an MLP regressor.

    For every stock id, the rows of that stock are taken in their existing
    order. A fresh ``MLPRegressor`` is fitted on ``chunk_size`` consecutive
    rows and predicts the ``prediction_chunk_size`` rows that follow; the
    window then advances by ``prediction_chunk_size`` rows. Only numeric
    columns (except ``target``) are used as features, because the MLP cannot
    consume string columns such as ``row_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``stock_id`` and ``target`` columns.
    num_features : int
        Width of the first hidden layer; the second one has
        ``num_features // 2`` units.
    stock_ids : iterable, optional
        Stock ids to process. Defaults to the notebook-level
        ``filtered_stock_ids``.
    chunk_size : int, default 275
        Number of rows in each training window.
        (Presumably 5 days x 55 rows per stock and day - TODO confirm.)
    prediction_chunk_size : int, default 55
        Number of rows predicted after each window, and the stride of the
        window.

    Returns
    -------
    list of list of numpy.ndarray
        One inner list per stock id (in iteration order) holding one prediction
        array per window. Stocks with too few rows yield an empty list.

    Raises
    ------
    ValueError
        If ``chunk_size`` or ``prediction_chunk_size`` is not positive.
    """
    if chunk_size <= 0 or prediction_chunk_size <= 0:
        raise ValueError("chunk_size and prediction_chunk_size must be positive")
    if stock_ids is None:
        stock_ids = filtered_stock_ids  # defined by the stock-label cell

    predictions = []
    for shenxian_id in stock_ids:
        # Work on the rows of this stock only (not on the whole frame).
        shenxian_df = df[df['stock_id'] == shenxian_id]
        features = shenxian_df.drop('target', axis=1).select_dtypes(include='number')
        target = shenxian_df['target']
        total_samples = len(shenxian_df)
        stock_predictions = []

        # Advance by one prediction block per step and stop as soon as no row
        # is left to predict after the training window.
        for start in range(0, total_samples - chunk_size, prediction_chunk_size):
            end = start + chunk_size

            # Initialize and train the MLP on the current window.
            mlp = MLPRegressor(hidden_layer_sizes=(num_features, num_features // 2), max_iter=100)
            mlp.fit(features.iloc[start:end], target.iloc[start:end])

            # Predict the block that immediately follows the window.
            next_end = min(end + prediction_chunk_size, total_samples)
            stock_predictions.append(mlp.predict(features.iloc[end:next_end]))

        predictions.append(stock_predictions)
    return predictions
    
# Run the MLP walk-forward routine for the stocks in `filtered_stock_ids`.
# The second argument (100) sets the hidden layer widths:
# (num_features, num_features // 2) = (100, 50).
mlp_predictions = mlp_train_and_predict(df, 100)