In [None]:
import pandas as pd
import numpy as np
import numba
from sklearn.metrics import r2_score
from tqdm import tqdm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn.ensemble import ExtraTreesRegressor
from scipy.stats import kurtosis

In [None]:
class CFG:
    """Notebook-wide configuration constants."""
    # Prefix prepended to every data path read in this notebook
    # (train.csv, test.csv and the book/trade parquet partitions).
    data_dir = '../input/optiver-realized-volatility-prediction/'
    # Alternative: an empty prefix, so paths resolve against the working directory.
#     data_dir = ''

In [None]:
# One seed for NumPy's global RNG (it drives the stock shuffle below) and for the
# random_state arguments passed to the estimators and to DataFrame.sample later on.
SEED = 0
np.random.seed(SEED)

In [None]:
# book_train = pd.read_parquet(CFG.data_dir + 'book_train.parquet')
# trade_train = pd.read_parquet(CFG.data_dir + 'trade_train.parquet')
# train = pd.read_csv(CFG.data_dir + 'train.csv')

In [None]:
# Development throttles; the defaults below mean "use everything".
num_iter = -1         # number of stocks to load; -1 disables the limit
max_records = 999999  # cap on joined rows kept for modelling; 999999 keeps all of them

# Partition names for stock ids 1..126 in random order, with stock 0 pinned to the front.
shuffled_partitions = np.array([f'stock_id={i}' for i in range(1, 127)])
np.random.shuffle(shuffled_partitions)
list_train = np.concatenate((['stock_id=0'], shuffled_partitions))
list_train

In [None]:
def logreturns(list_stock_prices):
    """Return the first difference of the natural log of a price series.

    The argument must still expose ``.diff()`` after ``np.log`` (e.g. a pandas
    Series); the first element of the result is NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def downside_lr(lr):
    """Keep the non-positive part of each log return; positive returns become 0."""
    upper_cap = 0
    return np.minimum(lr, upper_cap)

def ewma_lr(list_stock_prices):
    """Exponentially weighted mean (``com=0.5``) of the log returns of a price series."""
    return logreturns(list_stock_prices).ewm(com=0.5).mean()

def l1(values):
    """Log of the sum of absolute values, or NaN when that sum is exactly zero."""
    abs_total = np.sum(np.abs(values))
    return np.nan if abs_total == 0 else np.log(abs_total)

def l2(values):
    """Half the log of the sum of squares (i.e. log of the Euclidean norm).

    Returns NaN when the sum of squares is exactly zero.
    """
    square_total = np.sum(np.square(values))
    return np.nan if square_total == 0 else np.log(square_total) / 2

def rmspe(y_true, y_pred):
    """Return the *negated* root-mean-square percentage error.

    The sign is flipped so that larger values mean a better fit; callers that
    want the usual positive error must negate the result.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_square = np.mean(np.square(relative_error))
    return -np.sqrt(mean_square)

def calc_book_data1(df):
    """Compute level-1 order-book features.

    Returns a 3-tuple:
      * the weighted average price, where each side's price is weighted by
        the opposite side's size,
      * the log bid/ask spread ``log(ask_price1 / bid_price1)``,
      * the row-to-row difference of that spread.
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    wap1 = (bid_px * ask_sz + ask_px * bid_sz) / (bid_sz + ask_sz)
    spread1 = np.log(ask_px / bid_px)
    return wap1, spread1, spread1.diff()

def calc_book_data2(df):
    """Compute order-book features from the first two levels combined.

    Each side's two levels are merged into a total size and a size-weighted
    price; the returned 3-tuple is the weighted average price built from those
    merged quotes, the log spread between them, and the spread's row-to-row
    difference.
    """
    def _merge_levels(side):
        # Total size and size-weighted price across levels 1 and 2 of one side.
        size1, size2 = df[f'{side}_size1'], df[f'{side}_size2']
        total_size = size1 + size2
        blended_price = (df[f'{side}_price1'] * size1 + df[f'{side}_price2'] * size2) / total_size
        return total_size, blended_price

    bid_size, bid_price = _merge_levels('bid')
    ask_size, ask_price = _merge_levels('ask')
    wap2 = (bid_price * ask_size + ask_price * bid_size) / (bid_size + ask_size)
    spread2 = np.log(ask_price / bid_price)
    return wap2, spread2, spread2.diff()

def mean_absolute_deviation(values):
    """Mean of the absolute deviations of ``values`` from their mean."""
    centred = values - np.mean(values)
    return np.mean(np.abs(centred))

def median_absolute_deviation(values):
    """Median of the absolute deviations of ``values`` from their median."""
    centred = values - np.median(values)
    return np.median(np.abs(centred))

def get_time_lapse(df, col):
    """Add column ``col`` holding the gap in ``seconds_in_bucket`` since the previous row of the same ``time_id``.

    The frame is modified in place and also returned; the first row of every
    ``time_id`` gets NaN.
    """
    df[col] = df.groupby(['time_id'])['seconds_in_bucket'].diff()
    return df

# Statistics computed per time_id for every listed column. pandas labels each result
# with the string or the function's __name__, which is where the "<column>_<statistic>"
# feature names come from once the columns are flattened.
metrics = ['median', kurtosis, mean_absolute_deviation, median_absolute_deviation]
book_agg = dict.fromkeys(
    ['time_lapse_book', 'spread1', 'spread2', 'spread1diff', 'spread2diff',
     'logreturns_wap1', 'logreturns_wap2'],
    metrics,
)
trade_agg = dict.fromkeys(
    ['time_lapse_trade', 'volumepo', 'volumepodiff', 'logreturns_actual'],
    metrics,
)

def _flatten_agg_columns(features):
    """Collapse the two-level columns of a dict ``agg`` result into ``<column>_<statistic>`` names."""
    features.columns = [' '.join(col).strip().replace(' ', '_')
                        for col in features.columns.values]
    return features


def _per_time_id(df, source, func, name):
    """Aggregate ``df[source]`` per time_id with ``func`` and label the result column ``name``."""
    aggregated = df.groupby(['time_id'])[source].agg(func).reset_index()
    return aggregated.rename(columns={source: name})


def _merge_on_time_id(frames):
    """Inner-join a sequence of frames on ``time_id``, left to right."""
    merged = frames[0]
    for frame in frames[1:]:
        merged = merged.merge(frame, on='time_id')
    return merged


def realized_volatility_per_time_id(df_book, df_trade, stock_id):
    """Build one feature row per ``time_id`` for a single stock.

    Parameters
    ----------
    df_book : DataFrame
        Order-book rows; must provide ``time_id``, ``seconds_in_bucket`` and the
        level-1/level-2 bid/ask price and size columns read by
        ``calc_book_data1`` / ``calc_book_data2``. Helper columns are added to
        this frame in place.
    df_trade : DataFrame
        Trade rows with ``time_id``, ``seconds_in_bucket``, ``price``, ``size``
        and ``order_count``. Helper columns are added in place as well.
    stock_id : str or int
        Used only to build ``row_id`` as ``"<stock_id>-<time_id>"``.

    Returns
    -------
    DataFrame
        Book features, then trade features, then ``row_id``. Only time_ids that
        have usable rows in both the book and the trade data are returned
        (all joins are inner joins).
    """
    # --- book side -----------------------------------------------------------
    df_book = get_time_lapse(df_book, 'time_lapse_book')
    df_book['wap1'], df_book['spread1'], df_book['spread1diff'] = calc_book_data1(df_book)
    df_book['wap2'], df_book['spread2'], df_book['spread2diff'] = calc_book_data2(df_book)
    # transform (not apply): the helpers return one value per input row, and
    # transform always hands the result back on the frame's own index.
    # groupby.apply prepends the group key to the index on pandas >= 2.0,
    # which makes the column assignment fail.
    df_book['logreturns_wap1'] = df_book.groupby(['time_id'])['wap1'].transform(logreturns)
    df_book['logreturns_wap2'] = df_book.groupby(['time_id'])['wap2'].transform(logreturns)
    df_book['logreturns_wap1_downside'] = downside_lr(df_book['logreturns_wap1'])
    df_book['logreturns_wap2_downside'] = downside_lr(df_book['logreturns_wap2'])
    df_book['logreturns_wap1_ewma'] = df_book.groupby(['time_id'])['wap1'].transform(ewma_lr)
    df_book['logreturns_wap2_ewma'] = df_book.groupby(['time_id'])['wap2'].transform(ewma_lr)
    # Drops (at least) the first row of every time_id, whose per-group diffs are NaN.
    df_book = df_book.dropna()
    book_features = _flatten_agg_columns(
        df_book.groupby(['time_id']).agg(book_agg).reset_index())

    # --- trade side ----------------------------------------------------------
    df_trade = get_time_lapse(df_trade, 'time_lapse_trade')
    df_trade['logreturns_actual'] = df_trade.groupby(['time_id'])['price'].transform(logreturns)
    df_trade['logreturns_actual_downside'] = downside_lr(df_trade['logreturns_actual'])
    df_trade['logreturns_actual_ewma'] = df_trade.groupby(['time_id'])['price'].transform(ewma_lr)
    df_trade['volumepo'] = df_trade['price']*df_trade['size']/df_trade['order_count']
    df_trade['volumepodiff'] = df_trade['volumepo'].diff()
    df_trade = df_trade.dropna()
    trade_features = _flatten_agg_columns(
        df_trade.groupby(['time_id']).agg(trade_agg).reset_index())

    # --- log-norm summaries; list order fixes the output column order ---------
    rvps_book = _merge_on_time_id([
        _per_time_id(df_book, 'logreturns_wap1', l1, 'l1_realized_volatility_wap1'),
        _per_time_id(df_book, 'logreturns_wap1', l2, 'l2_realized_volatility_wap1'),
        _per_time_id(df_book, 'logreturns_wap2', l1, 'l1_realized_volatility_wap2'),
        _per_time_id(df_book, 'logreturns_wap2', l2, 'l2_realized_volatility_wap2'),
        book_features,
    ])
    rvps_trade = _merge_on_time_id([
        _per_time_id(df_trade, 'logreturns_actual', l1, 'l1_realized_volatility_actual'),
        _per_time_id(df_trade, 'logreturns_actual', l2, 'l2_realized_volatility_actual'),
        _per_time_id(df_trade, 'order_count', 'sum', 'sum_order_count'),
        trade_features,
    ])
    rvps = rvps_book.merge(rvps_trade, on='time_id')

    rvps['row_id'] = rvps['time_id'].apply(lambda x:f'{stock_id}-{x}')
    return rvps.drop(['time_id'], axis=1)

In [None]:
# The tqdm bar only reaches 100% when every partition is visited, i.e. not when
# num_iter stops the loop early.
per_stock_features = []
stocks = []
for file in tqdm(list_train):
    if len(stocks) == num_iter:
        break
    try:
        df_book = pd.read_parquet(CFG.data_dir + 'book_train.parquet/' + file)
        df_trade = pd.read_parquet(CFG.data_dir + 'trade_train.parquet/' + file)
    except FileNotFoundError:
        # Best effort: list_train enumerates every id in 0..126, and ids without a
        # partition on disk are skipped. Only the reads are guarded, so errors from
        # the feature code still surface.
        continue
    stock_id = file.split('=')[1]
    per_stock_features.append(realized_volatility_per_time_id(df_book, df_trade, stock_id))
    stocks.append(int(stock_id))

# Concatenate once at the end; growing a frame with concat inside the loop copies
# all previously loaded rows on every iteration.
df_past_realized_train = pd.concat(per_stock_features) if per_stock_features else pd.DataFrame()

In [None]:
# Display the assembled feature table (one row per loaded stock/time_id pair).
df_past_realized_train

In [None]:
def normalise(df, mu=None, sigma=None):
    """Standardise every column except the last two.

    The last two columns of ``df`` are left untouched (callers pass frames that
    end in ``row_id`` and ``target``).

    Parameters
    ----------
    df : DataFrame
        Input frame; it is not modified.
    mu, sigma : Series, optional
        Per-column mean / standard deviation to apply. When omitted they are
        computed from ``df`` itself.

    Returns
    -------
    (DataFrame, [mu, sigma])
        The standardised copy, and the statistics that were applied so they can
        be reused on other data.
    """
    feature_cols = df.columns[:-2]
    X = df[feature_cols]
    if mu is None:
        mu = X.mean()
    if sigma is None:
        sigma = X.std()
    df2 = df.copy()
    if len(feature_cols):
        # Replace whole columns instead of writing through .loc: an in-place .loc
        # write of floats into an integer column is a deprecated implicit upcast
        # in recent pandas releases.
        df2[feature_cols] = (X - mu) / sigma
    return df2, [mu, sigma]

In [None]:
# NOTE(review): Lasso's `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this cell needs an older scikit-learn. Replacing it changes the
# fitted coefficients and should be done deliberately.
lasso = Lasso(alpha=1e-4, normalize=True, random_state=SEED)
extra = ExtraTreesRegressor(random_state=SEED)
poly = PolynomialFeatures(2)

train = pd.read_csv(CFG.data_dir + 'train.csv')
# .copy() so the row_id assignment below writes to an independent frame rather than
# a slice of the CSV frame (avoids SettingWithCopyWarning).
train = train[train['stock_id'].isin(stocks)].copy()
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train[['row_id','target']]
# Right join keeps every labelled row; rows without features are removed by dropna.
df_joined = df_past_realized_train.merge(train, on=['row_id'], how='right')
df_joined = df_joined.dropna().sample(frac=1, random_state=SEED).iloc[:max_records]

num_train = int(0.8*len(df_joined))
# Statistics over all labelled rows; mu/sigma are reused to scale the test features.
df_final, [mu, sigma] = normalise(df_joined)
# Hold-out split: the validation rows are scaled with the statistics of the training
# rows only, so both splits share one scale and nothing from validation leaks in.
df_joined_train, [mu_train, sigma_train] = normalise(df_joined.iloc[:num_train])
df_joined_val, _ = normalise(df_joined.iloc[num_train:], mu_train, sigma_train)

df_joined_val

In [None]:
# Inspect the per-feature mean and standard deviation returned by normalise(df_joined).
mu, sigma

In [None]:
def get_sample_weight(y, df):
    """Per-row training weights.

    Every row is weighted by ``1 / y**2``; rows whose ``row_id`` starts with
    ``'0-'`` get that weight multiplied by 1.5.
    """
    weights = 1 / np.square(y)
    stock_zero_boost = 1 + 0.5 * df['row_id'].str.startswith('0-')
    weights *= stock_zero_boost
    return weights

In [None]:
# Build the design matrix from the *normalised* frame. The model fitted on these
# features later predicts on test features standardised with the same mu/sigma, so
# training on the raw df_joined values would put fit and predict on different scales.
X_final = df_final.drop(['row_id', 'target'], axis=1)
X_final_poly = poly.fit_transform(X_final)
columns = poly.get_feature_names(X_final.columns)
X_final_poly = pd.DataFrame(X_final_poly, index=X_final.index, columns=columns)
# normalise() leaves the last two columns (row_id, target) untouched, so this is the raw target.
y_final = df_final['target']

sample_weight = get_sample_weight(y_final, df_final)

# Lasso acts as the feature selector: keep every polynomial term with a non-zero weight.
lasso.fit(X_final_poly, y_final, sample_weight)
coef = pd.Series(lasso.coef_, index=columns)
temp = coef.abs().sort_values()
# Despite the name, this holds the non-zero coefficients, ordered by ascending magnitude.
nonneg_coef = coef.loc[temp.index][temp > 0]
nonneg_coef.to_csv('relevant_features.csv')

len(coef), len(nonneg_coef), columns

In [None]:
R2 = {}
RMSPE = {}

y_train = df_joined_train['target']
y_val = df_joined_val['target']


def _base_features(poly_name):
    """Split a polynomial term name ('a', 'a^2' or 'a b') into the input columns it is built from."""
    return poly_name.replace('^2', '').split()


# Input columns that appear in at least one selected polynomial term. Sorted so the
# ablation order is the same on every run; names that are not real columns (e.g. the
# bias term '1') are skipped because they cannot be dropped.
feature_names = sorted({name
                        for poly_name in nonneg_coef.index
                        for name in _base_features(poly_name)
                        if name in df_joined_train.columns})

# Loop invariants: the response and the weights do not depend on the dropped column.
logtarget = np.log(y_train)
sample_weight_train = get_sample_weight(y_train, df_joined_train)

# Leave-one-feature-out ablation; the final 'no column' pass is the baseline.
for column in feature_names + ['no column']:

    dropped = ['row_id', 'target'] if column == 'no column' else [column, 'row_id', 'target']
    X_train = df_joined_train.drop(dropped, axis=1)
    X_val = df_joined_val.drop(dropped, axis=1)

    # Take the term names from the fitted transformer instead of filtering `columns`
    # by substring: 'x_median' is a substring of 'x_median_absolute_deviation', so a
    # substring filter removes too many names and the frame construction fails.
    X_train_poly = pd.DataFrame(poly.fit_transform(X_train), index=X_train.index,
                                columns=poly.get_feature_names(X_train.columns))
    X_val_poly = pd.DataFrame(poly.transform(X_val), index=X_val.index,
                              columns=X_train_poly.columns)

    # Keep the selected terms that do not involve the dropped column (exact name match).
    cols = [col for col in nonneg_coef.index if column not in _base_features(col)]
    extra.fit(X_train_poly[cols], logtarget, sample_weight_train)
    y_hat = np.exp(extra.predict(X_val_poly[cols]))

    R2[column] = r2_score(y_true=y_val, y_pred=y_hat)
    RMSPE[column] = rmspe(y_true=y_val, y_pred=y_hat)

    print(f'R2 score ({column} removed):{R2[column]}')
    print(f'RMSPE ({column} removed):{RMSPE[column]}')

In [None]:
# rmspe() returns the error negated; flip it back to a positive error per ablation.
r = -pd.Series(RMSPE)

# Blend the two rankings (10% R2, 90% RMSPE) and standardise the result. The R2 rank
# is negated so that, for both terms, a lower score means the model did better
# without that column.
R2_rank = -(pd.Series(R2).rank())
RMSPE_rank = r.rank()
blended_rank = 0.1 * R2_rank + 0.9 * RMSPE_rank
combo = (blended_rank - blended_rank.mean()) / blended_rank.std()

import matplotlib.pyplot as plt
plt.bar(range(len(r)), r)

combo.sort_values()

In [None]:
def run_with_filler(filler):
    """Predict targets for the global ``X_test`` after filling its NaNs with ``filler``.

    ``filler`` is passed straight to ``DataFrame.fillna``. The filled frame is
    expanded with the global ``poly`` transformer, reduced to the terms listed in
    ``nonneg_coef`` and scored with the global ``extra`` model; predictions are
    exponentiated before being returned.
    """
    filled = X_test.fillna(filler)
    expanded = pd.DataFrame(poly.fit_transform(filled), columns=columns)
    selected = expanded[nonneg_coef.index]
    return np.exp(extra.predict(selected))

def combine_dfs(df,i):
    """Blend three predictions that differ only in the row used to fill missing test features.

    ``df`` is split into rows of the global ``stock_id`` and rows of every other
    stock (decided by the prefix of ``row_id``). The ``i``-th row of each part,
    plus their average, is used as filler; the average-based prediction counts
    twice in the final mean.
    """
    owner = df['row_id'].apply(lambda x: x.split('-')[0])
    is_same_stock = owner == str(stock_id)
    fm = df[~is_same_stock]
    fs = df[is_same_stock]
    other_row = fm.drop('row_id', axis=1).iloc[i]
    same_row = fs.drop('row_id', axis=1).iloc[i]
    fc = (other_row + same_row) / 2
    pred_other = run_with_filler(fm.iloc[i])
    pred_mixed = run_with_filler(fc)
    pred_same = run_with_filler(fs.iloc[i])
    return (pred_other + 2*pred_mixed + pred_same) / 4

In [None]:
# Refit the extra-trees model on all labelled rows, restricted to the Lasso-selected
# polynomial terms. The response is log(target); predictions are exponentiated
# wherever this model is used.
extra.fit(X_final_poly[nonneg_coef.index], np.log(y_final), sample_weight)

In [None]:
# Load the submission template. Only the stock_id=0 book/trade partitions are read below.
test = pd.read_csv(CFG.data_dir + 'test.csv')
stock_id = 0

df_book = pd.read_parquet(CFG.data_dir + 'book_test.parquet/stock_id='+str(stock_id))
df_trade = pd.read_parquet(CFG.data_dir + 'trade_test.parquet/stock_id='+str(stock_id))
# Left join keeps every test row; rows without computed features are NaN here and
# get filled inside run_with_filler.
X_test = test.merge(realized_volatility_per_time_id(df_book, df_trade, stock_id),
                    on=['row_id'], how='left')
X_test.index = X_test['row_id']
X_test = X_test.drop(['stock_id', 'time_id', 'row_id'], axis=1)

# Standardise with the mu/sigma that normalise() returned for the labelled data.
X_test = (X_test - mu) / sigma

# swap for second submission
# Each combo averages combine_dfs over three different filler rows of df_final.
combo0 = (combine_dfs(df_final,0)+combine_dfs(df_final,1)+combine_dfs(df_final,2))/3
combo1 = (combine_dfs(df_final,3)+combine_dfs(df_final,4)+combine_dfs(df_final,5))/3

# NOTE(review): the first prediction is broadcast to every row, then rows 1 and 2 are
# overwritten by position. This assumes test.csv has at least three rows - confirm
# before running against a different test file.
test['target'] = combo0[0] # equal to combo1[0]
test = test[['row_id', 'target']]
test.iloc[1,1] = combo0[1] # equal to combo0[2]
test.iloc[2,1] = combo1[1] # equal to combo1[2]
test.to_csv('submission.csv', index=False)
test