In [1]:
import os
# Print the kernel's current working directory so relative paths used later
# (e.g. files written next to the notebook) can be located.
print("Current DIR: ", os.getcwd())

# from platform import python_version
# print(python_version())

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

Current DIR:  /kaggle/working


In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from itertools import combinations
from time import time

import gc 

import warnings 
warnings.filterwarnings('ignore')

In [3]:
# print("pandas version:", pd.__version__)

In [4]:
def reduce_mem_usage(df, verbose=0):
    """
    Downcast the numeric columns of ``df`` in place to reduce memory usage.

    * Columns whose dtype name starts with ``"int"`` are cast to the narrowest
      of int8 / int16 / int32 whose range contains the observed [min, max]
      (bounds inclusive). Columns that need int64 are left as they are.
    * Every other numeric column (floats, bools, unsigned ints, ...) is cast
      to float32, but only when all of its finite values are representable in
      float32. Otherwise the column is left unchanged so that large float64
      values are not silently turned into +/-inf.
    * Non-numeric columns (object, string, datetime, category, ...) are
      skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : int or bool, default 0
        When truthy, print the memory usage before and after the conversion
        and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Calculate the initial memory usage of the DataFrame
    start_mem = df.memory_usage().sum() / 1024**2

    int_candidates = (np.int8, np.int16, np.int32)  # narrowest first
    f32 = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Only numeric data can be downcast; min()/max() and the range checks
        # below are meaningless (or raise) for anything else.
        if not pd.api.types.is_numeric_dtype(col_type):
            continue

        if str(col_type)[:3] == "int":
            c_min = df[col].min()
            c_max = df[col].max()
            # For an empty column min/max are NaN, every comparison is False
            # and the dtype is kept.
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                if c_min >= bounds.min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN and +/-inf survive a cast to float32; only finite values
            # outside the float32 range would overflow, so check those.
            values = df[col].astype(np.float64).to_numpy()
            finite = values[np.isfinite(values)]
            if finite.size == 0 or (finite.min() >= f32.min and finite.max() <= f32.max):
                df[col] = df[col].astype(np.float32)

    if verbose:
        print(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        print(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        print(f"Decreased by {decrease:.2f}%")

    return df

In [5]:
def gen_v1_features(df, prices):
    """
    Add the "V1" features to ``df``: quantities computed from a single row.

    Every numeric feature is a ``DataFrame.eval`` expression. The expressions
    are evaluated in insertion order because later ones refer to columns that
    earlier ones create (``volume``, ``price_spread``, ``liquidity_imbalance``).
    On top of the fixed formulas, one ``(a - b) / (a + b)`` imbalance is added
    for every unordered pair of columns in ``prices``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw columns the formulas refer to. New columns are
        assigned onto this object.
    prices : sequence of str
        Names of the price columns to combine pair-wise.

    Returns
    -------
    tuple
        ``(df, numeric_feature_names, category_feature_names)`` where the last
        list starts with ``'minute'`` followed by the int8 indicator columns.
    """
    numeric_formulas = dict(
        volume="ask_size + bid_size",
        mid_price="(ask_price + bid_price)/2",
        liquidity_imbalance="(bid_size-ask_size)/(bid_size+ask_size)",
        matched_imbalance="(imbalance_size - matched_size)/(matched_size+imbalance_size)",
        size_imbalance="bid_size / ask_size",
        imbalance_intensity="imbalance_size / volume",
        matched_intensity="matched_size / volume",
        price_spread="ask_price - bid_price",
        market_urgency="price_spread * liquidity_imbalance",
        depth_pressure="(ask_size - bid_size) * (far_price - near_price)",
        price_pressure="imbalance_size * (ask_price - bid_price)",
        imbalance_with_flag="imbalance_size * imbalance_buy_sell_flag",
    )

    # Pair-wise price imbalances, appended after the fixed formulas.
    numeric_formulas.update(
        (f"{left}_{right}_imbalance", f"({left} - {right}) / ({left} + {right})")
        for left, right in combinations(prices, 2)
    )

    for name in numeric_formulas:
        df[name] = df.eval(numeric_formulas[name])

    # Boolean conditions stored as 0/1 int8 indicator columns.
    indicator_formulas = dict(
        imb_buy_side="(imbalance_buy_sell_flag == 1)",
        imb_sell_side="(imbalance_buy_sell_flag == -1)",
        first_half_session="(seconds_in_bucket <= 240)",
        second_half_session="(seconds_in_bucket > 240)",
    )

    for name, condition in indicator_formulas.items():
        indicator = df.eval(condition)
        df[name] = indicator.astype(np.int8)

    # 'minute' is computed with plain floor division instead of an eval formula.
    df['minute'] = df['seconds_in_bucket'] // 60

    df = reduce_mem_usage(df, verbose=0)

    numeric_names = list(numeric_formulas)
    category_names = ['minute', *indicator_formulas]
    return df, numeric_names, category_names

In [6]:
def gen_feature_cols(feature_dicts):
    """
    Flatten a ``{group_name: [column, ...]}`` mapping into column lists.

    Parameters
    ----------
    feature_dicts : dict
        Maps a group name to an iterable of column names.

    Returns
    -------
    tuple of (list, list)
        ``(feature_cols, category_cols)``. ``feature_cols`` holds the columns
        of *every* group in iteration order; ``category_cols`` holds only the
        columns of the ``'category'`` and ``'v1_feature_category'`` groups,
        so it is a subset of ``feature_cols``.
    """
    categorical_groups = ('category', 'v1_feature_category')

    all_cols, cat_cols = [], []
    for group, cols in feature_dicts.items():
        all_cols.extend(cols)
        if group in categorical_groups:
            cat_cols.extend(cols)

    return all_cols, cat_cols


In [7]:
def gen_features_49(df, feature_dicts):
    """
    Build the model feature frame from raw rows.

    Missing values are replaced by 0 before the V1 features are computed;
    afterwards any NaN or +/-inf produced by the formulas is replaced by 0 as
    well, and the frame is downcast with ``reduce_mem_usage``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. The frame itself is not modified; work happens on the result
        of ``df.fillna(0)``.
    feature_dicts : dict
        Column-group mapping. Its ``'prices'`` entry is read, and the keys
        ``'v1_features'`` and ``'v1_feature_category'`` are (over)written in
        place with the generated column names.

    Returns
    -------
    tuple
        ``(feature_frame, feature_dicts)``.
    """
    base = reduce_mem_usage(df.fillna(0), verbose=0)

    featured, numeric_names, category_names = gen_v1_features(base, feature_dicts['prices'])
    feature_dicts['v1_features'] = numeric_names
    feature_dicts['v1_feature_category'] = category_names

    # Divisions in the formulas can yield NaN (0/0) or +/-inf (x/0).
    featured = featured.fillna(0).replace([np.inf, -np.inf], 0)

    return reduce_mem_usage(featured, verbose=0), feature_dicts

In [8]:
def zero_sum(prices, volumes):
    """
    Shift ``prices`` so that they sum to zero, weighting by ``sqrt(volumes)``.

    The total of ``prices`` is spread over the elements proportionally to the
    square root of their volume and subtracted, so the returned values add up
    to zero (up to floating-point error).

    Parameters
    ----------
    prices : array-like
        Values to re-centre.
    volumes : array-like
        Non-negative weights, same length as ``prices``.

    Returns
    -------
    array-like
        ``prices - sqrt(volumes) * sum(prices) / sum(sqrt(volumes))``.
    """
    weights = np.sqrt(volumes)
    excess_per_weight = np.sum(prices) / np.sum(weights)
    return prices - weights * excess_per_weight

In [9]:
# Competition API module (not a PyPI package).
# NOTE(review): presumably only importable inside the Kaggle competition
# environment — confirm before running elsewhere.
import optiver2023

# Create the evaluation environment and the iterator over test batches.
# The inference loop unpacks each item as
# (test, revealed_targets, sample_prediction) and answers with env.predict().
env = optiver2023.make_env()
iter_test = env.iter_test()

In [10]:
max_ts_len = 12  # max length of ts to keep in cache

# Raw input columns grouped by role; gen_features_49 adds the derived groups
# ('v1_features', 'v1_feature_category') to this mapping.
feature_dicts = dict(
    prices=["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"],
    sizes=["matched_size", "bid_size", "ask_size", "imbalance_size"],
    category=["stock_id", "seconds_in_bucket", 'imbalance_buy_sell_flag'],
)

# Bootstrap df_records (features) and df_revealed (targets) from the most
# recent n_lookback dates of train.csv.
n_lookback = 50
train_csv = "/kaggle/input/optiver-trading-at-the-close/train.csv"

df = pd.read_csv(train_csv)
df = df[df['target'].notna()]  # rows without a target cannot be used for training

date_list = sorted(df['date_id'].unique().tolist())
lookback = date_list[-n_lookback:]

# df_records: feature rows of the lookback window (target removed)
df_records = df[df['date_id'].isin(lookback)].drop(columns=['target'])
df_records, feature_dicts = gen_features_49(df_records, feature_dicts)

df_records = df_records.fillna(0).replace([np.inf, -np.inf], 0)
df_records = reduce_mem_usage(df_records, verbose=1)
df_records = df_records.reset_index(drop=True)

# df_revealed: the targets of the same window, keyed like df_records
df_revealed = df.loc[
    df['date_id'].isin(lookback),
    ['date_id', 'stock_id', 'seconds_in_bucket', 'target'],
].reset_index(drop=True)

# The full training frame is no longer needed; release it.
del df
gc.collect()

Memory usage of dataframe is 93.36 MB
Memory usage after optimization is: 93.36 MB
Decreased by 0.00%


22

In [11]:
# Online inference loop.
#
# For every batch served by the competition API:
#   * on the first batch of a day (counter == 0) the newly revealed targets
#     are appended to df_revealed, the training window (the n_lookback dates
#     before curr_date) is rebuilt, and the scaler + KNN are refit;
#   * features are generated for the batch, appended to df_records and
#     predicted with the KNN fitted for the current day;
#   * predictions are re-centred with zero_sum, clipped to [y_min, y_max] and
#     submitted through env.predict.
# After the 55th batch of a day (counter 0..54) the oldest date is dropped
# from df_records.
counter = 0
n_reveals = 0
y_min, y_max = -64, 64

cache = pd.DataFrame()

merge_keys = ['date_id', 'stock_id', 'seconds_in_bucket']

# Initial training frame; it is rebuilt at the start of every day (including
# the first one) inside the loop.
xy_train_past = df_records.merge(
        df_revealed,
        left_on=merge_keys,
        right_on=merge_keys,
        how='left')

day_begin = time()

feature_cols, category_cols = gen_feature_cols(feature_dicts)

# NOTE(review): gen_feature_cols already puts the category columns into
# feature_cols, so this list contains them twice and they weigh double in the
# KNN distance. Kept as-is because removing the duplicates would change the
# predictions — confirm whether the duplication is intended.
model_cols = feature_cols + category_cols

for (test, revealed_targets, sample_prediction) in iter_test:
    now_time = time()

    curr_date = test['date_id'].unique().tolist()[0]
    print("curr_date", curr_date)

    if counter == 0:
        print(len(df_records))
        n_reveals += 1
        print('Targets revealed for day', revealed_targets['revealed_date_id'].unique().tolist())
        # The first reveal is skipped: df_revealed was seeded from train.csv.
        if n_reveals > 1:
            tmp = revealed_targets[
                ['revealed_date_id', 'stock_id', 'seconds_in_bucket', 'revealed_target']
            ].rename(columns={'revealed_date_id': 'date_id', 'revealed_target': 'target'})
            df_revealed = pd.concat([df_revealed, tmp])

        lookback = np.arange(curr_date-n_lookback, curr_date)
        x_train_past = df_records[df_records['date_id'].isin(lookback)]
        y_train_past = df_revealed[df_revealed['date_id'].isin(lookback)]

        xy_train_past = x_train_past.merge(
            y_train_past,
            left_on=merge_keys,
            right_on=merge_keys,
            how='left')
        xy_train_past = xy_train_past[~xy_train_past['target'].isnull()]

        # The training set only changes here, so the scaler and the KNN are
        # fitted once per day instead of once per batch; the fitted objects
        # are reused for all remaining batches of the day.
        scaler = StandardScaler()
        neigh = KNeighborsRegressor(n_neighbors=50, weights="distance")

        x_train = xy_train_past[model_cols]
        x_train_std = scaler.fit_transform(x_train)
        y_train = xy_train_past['target']
        neigh.fit(x_train_std, y_train)

    # gen test features
    # Original author's note: the `currently_scored` column of the batch must
    # not be dropped, so `test` is kept with all of its columns.
    test, _ = gen_features_49(test, feature_dicts)
    x_pred = test[model_cols]
    x_pred_std = scaler.transform(x_pred)

    # Keep the batch so it can serve as training data on later days.
    df_records = pd.concat([df_records, test])

    # predict target using trained model
    y_pred = neigh.predict(x_pred_std)
    y_pred_zs = zero_sum(y_pred, test['bid_size']+test['ask_size'])
    y_clip = np.clip(y_pred_zs, y_min, y_max)

    if np.any(np.isnan(y_clip)):
        # Dump the offending batch for debugging before aborting.
        test.to_csv("error_iter.csv")
        raise Exception("Error! NaN is found in y_clip.")

    sample_prediction['target'] = y_clip

    env.predict(sample_prediction)

    # after 54 timesteps, a new day starts
    if counter >= 54:
        print(f"New Day! Time used: {time() - day_begin:.2f}s.")
        counter = 0
        day_begin = time()
        cache = pd.DataFrame()

        # kick out the oldest date
        recorded_dates = df_records['date_id'].unique().tolist()
        recorded_dates.sort()
        df_records = df_records[df_records['date_id'].isin(recorded_dates[1:])]
    else:
        counter += 1
        

            

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
curr_date 478
549999
Targets revealed for day [477]
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
curr_date 478
New Day! Time used: 78.434880s.
curr_date 479
549999
Targets reveal

In [12]:
# Load the submission file from the working directory and display it as a
# sanity check of the predictions.
# NOTE(review): presumably 'submission.csv' is written by the competition
# environment (env.predict) — confirm.
df_submission = pd.read_csv('submission.csv')
df_submission

Unnamed: 0,row_id,target
0,478_0_0,-1.027586
1,478_0_1,1.597115
2,478_0_2,9.376052
3,478_0_3,0.472221
4,478_0_4,0.611340
...,...,...
32995,480_540_195,-0.763817
32996,480_540_196,-1.388240
32997,480_540_197,1.231299
32998,480_540_198,0.974671
