In [5]:
#@title Imports
import pandas as pd
import numpy as np
import pickle
import time
import os

In [6]:
#@title Setup
# Environment switches. If both are True the Colab branch runs last and its
# comp_path wins.
kaggle = False
colab = True

is_train = True

if not (kaggle or colab):
    # Without this guard comp_path is never bound and the path definitions
    # at the bottom of the cell fail with an opaque NameError.
    raise RuntimeError("No environment selected: set `kaggle` or `colab` to True.")

if kaggle:
    print("Running in Kaggle environment...")
    comp_path = "/kaggle/input/optiver-trading-at-the-close/"

if colab:
    # Imported here because the module is only needed on the Colab branch.
    from google.colab import drive

    mount_path = "/content/drive/"

    if not os.path.ismount(mount_path):
        print("Mounting Google Drive...")
        drive.mount(mount_path, force_remount=True)
    comp_path = f"{mount_path}MyDrive/optiver"

# Two-element ranges for the train / test split.
# NOTE(review): presumably [first, last] date_id bounds, with [-1, -1] meaning
# "no hold-out" -- the consumer is not visible in this part of the notebook.
if is_train:
  dates_train = [0,390]
  dates_test = [391,480]
else:
  dates_train = [0,480]
  dates_test = [-1,-1]

# All artefact locations derive from comp_path, whichever environment set it.
models_path = f"{comp_path}/models/"
train_path = f"{comp_path}/train.csv"
train_eng_path = f"{comp_path}/train_eng.pkl"

In [7]:
#@title Read train data

def read_train() -> pd.DataFrame:
  """Load the raw training CSV from the global ``train_path`` and clean it.

  Processing steps:
    1. Read the CSV and drop the ``row_id`` column.
    2. Print per-column counts, dtypes and NaN statistics.
    3. Drop rows whose ``ask_price`` is missing.
    4. Set ``near_price`` and ``far_price`` to 0 on rows where
       ``seconds_in_bucket <= 300``.
    5. Fill remaining ``far_price`` gaps with ``Series.interpolate()``
       (pandas defaults).

  Returns:
    The cleaned training DataFrame.
  """
  train_df = pd.read_csv(train_path).drop(['row_id'], axis=1)
  print(train_df.count())
  print(train_df.dtypes)
  print(f"Rows:\n{train_df.isnull().sum()}")
  print(f"Total NaN values: {train_df.isnull().sum().sum()}")

  train_df = train_df.dropna(subset=["ask_price"])
  early_rows = train_df['seconds_in_bucket'] <= 300
  train_df.loc[early_rows, ["near_price", "far_price"]] = 0
  train_df['far_price'] = train_df['far_price'].interpolate()

  print(f"Total NaN values after preprocessing: {train_df.isnull().sum().sum()}")

  # The original fell off the end here and returned None despite the
  # annotation, so the caller never received the cleaned frame.
  return train_df

In [8]:
#@title Util functions
def save_pickle(data, file_path):
  """Pickle ``data`` to ``file_path``, creating parent directories as needed.

  Args:
    data: Any picklable object.
    file_path: Destination path. A bare filename (no directory part) is
      written relative to the current working directory.
  """
  directory = os.path.dirname(file_path)
  # dirname() is '' for a bare filename and os.makedirs('') raises, so only
  # create a directory when there is one. exist_ok avoids a check-then-create
  # race.
  if directory:
    os.makedirs(directory, exist_ok=True)
  with open(file_path, 'wb') as file:
    pickle.dump(data, file)
  print(f"Data saved to {file_path}")

def load_pickle(file_path):
  """Return the object unpickled from ``file_path``.

  Raises:
    FileNotFoundError: if ``file_path`` does not exist.
  """
  # Guard first so the missing-file case carries the custom message.
  if not os.path.exists(file_path):
    raise FileNotFoundError(f"No such file: {file_path}")

  # NOTE: unpickling can execute arbitrary code; only load trusted files.
  with open(file_path, 'rb') as handle:
    return pickle.load(handle)

def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    The frame is modified in place and also returned.

    * Signed integer columns become the smallest of int8/16/32/64 that holds
      the column's min and max.
    * Unsigned integer columns become the smallest of uint8/16/32/64.
    * Float columns wider than 32 bits become float32 when their range fits
      (this reduces precision).
    * Every other column (bool, object/string, datetime, extension dtypes)
      is left untouched.

    Args:
        df: DataFrame to shrink.
        verbose: When True, print the resulting memory footprint.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    signed_targets = (np.int8, np.int16, np.int32, np.int64)
    unsigned_targets = (np.uint8, np.uint16, np.uint32, np.uint64)
    float32_info = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Dispatch on the NumPy kind code, not on a string prefix: the old
        # `str(dtype)[:3] == 'int'` check sent bool and uint columns into the
        # float branch and raised on datetime columns.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ("i", "u", "f"):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == "f":
            # Only ever narrow; all-NaN or infinite columns fail the range
            # comparisons and keep their dtype.
            if (col_type.itemsize > np.dtype(np.float32).itemsize
                    and c_min >= float32_info.min and c_max <= float32_info.max):
                df[col] = df[col].astype(np.float32)
            continue

        targets = signed_targets if col_type.kind == "i" else unsigned_targets
        for target in targets:
            bounds = np.iinfo(target)
            if c_min >= bounds.min and c_max <= bounds.max:
                if np.dtype(target) != col_type:
                    df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero baseline so reporting never divides by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f'Mem. usage decreased to {end_mem:.2f} Mb ({reduction:.1f}% reduction)')
    return df

# Feature Engineering

In [9]:
# https://www.kaggle.com/competitions/optiver-trading-at-the-close/discussion/442851
# Lookup table keyed by position in the literal below (0, 1, 2, ...);
# feature_engineering maps `stock_id` through it, so entry i is the weight
# used for stock_id i.
stock_weights = dict(enumerate([
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]))

In [10]:
def feature_engineering(df) -> pd.DataFrame:
  """Add order-book, index and per-stock aggregate features to ``df``.

  Reads the columns ``stock_id``, ``time_id``, ``bid_size``, ``ask_size``,
  ``bid_price``, ``ask_price`` and ``wap``, plus the module-level
  ``stock_weights`` mapping.

  The row-level columns created before the merge are written onto the
  passed-in frame; the merge then produces a new frame, so callers must use
  the return value.

  Args:
    df: Input frame with the columns listed above.

  Returns:
    A DataFrame with the original columns plus the engineered features.
  """
  # Row-level order-book features.
  df["volume"] = df["ask_size"] + df["bid_size"]
  df["mid_price"] = (df["ask_price"] + df["bid_price"]) / 2

  df['bid_ask_spread'] = df['ask_price'] - df['bid_price']
  df['bid_ask_ratio'] = df['bid_price'] / df['ask_price']
  df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
  df["market_urgency"] = df["bid_ask_spread"] * df["liquidity_imbalance"]

  # Mid price minus the size-weighted average of bid and ask prices.
  df["market_urgency_v2"] = (df["ask_price"]+df["bid_price"])/2 - (df["bid_price"]*df["bid_size"]+df["ask_price"]*df["ask_size"]) / (df["bid_size"]+df["ask_size"])
  df["stock_weights"] = df["stock_id"].map(stock_weights)
  df["weighted_wap"] = df["stock_weights"] * df["wap"]

  # Weighted-average wap per time_id, broadcast back onto every row.
  by_time = df.groupby('time_id')
  index_wap = (by_time['weighted_wap'].sum() / by_time['stock_weights'].sum()).reset_index()
  index_wap.columns = ['time_id','indexwap']

  df = pd.merge(df,index_wap,how='left',on='time_id')
  df['indexwapdiff'] = df['wap'] - df['indexwap']

  # Aggregates over all rows of each stock; one groupby object is reused.
  by_stock = df.groupby("stock_id")
  global_stock_id_feats = {
      "median_size": by_stock["bid_size"].median() + by_stock["ask_size"].median(),
      "std_size": by_stock["bid_size"].std() + by_stock["ask_size"].std(),
      "ptp_size": by_stock["bid_size"].max() - by_stock["bid_size"].min(),
      "median_price": by_stock["bid_price"].median() + by_stock["ask_price"].median(),
      "std_price": by_stock["bid_price"].std() + by_stock["ask_price"].std(),
      # NOTE(review): bid max minus *ask* min, whereas ptp_size uses bid on
      # both sides -- confirm intended before changing (alters the feature).
      "ptp_price": by_stock["bid_price"].max() - by_stock["ask_price"].min(),
  }

  for key, value in global_stock_id_feats.items():
      df[f"global_{key}"] = df["stock_id"].map(value.to_dict())

  # The original had no return, so callers received None and lost every
  # column added after the merge.
  return df

In [12]:
# Build the engineered training frame once and cache it as a pickle; later
# runs load the cached copy instead of recomputing.
if not os.path.exists(train_eng_path):
  # Cache miss: read, engineer features, shrink dtypes, then persist.
  train_df = read_train()
  train_df = feature_engineering(train_df)
  train_df = reduce_mem_usage(train_df, verbose=True)
  save_pickle(train_df, train_eng_path)
else:
  # Cache hit.
  print(f'Found existing data saved in {train_eng_path}, loading...')
  train_df = load_pickle(train_eng_path)

print(train_df.dtypes)

Found existing data saved in /content/drive/MyDrive/optiver/train_eng.pkl, loading...
stock_id                     int16
date_id                      int16
seconds_in_bucket            int16
imbalance_size             float32
imbalance_buy_sell_flag       int8
reference_price            float32
matched_size               float32
far_price                  float32
near_price                 float32
bid_price                  float32
bid_size                   float32
ask_price                  float32
ask_size                   float32
wap                        float32
target                     float32
time_id                      int16
volume                     float32
mid_price                  float32
bid_ask_spread             float32
bid_ask_ratio              float32
liquidity_imbalance        float32
market_urgency             float32
market_urgency_v2          float32
stock_weights              float32
weighted_wap               float32
indexwap                   float32
inde