In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import GroupKFold

import tensorflow as tf
from tensorflow import keras
from keras import backend as K

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Notebook-wide switches and hyper-parameters.
SEED, N_FOLDS = 2021, 5   # RNG seed / number of GroupKFold splits
batch_size = 1024         # mini-batch size handed to model.fit
local_work = False        # True: read data from '../', False: read from the Kaggle input directory

def seed_everything(seed=2021):
    """Seed every random number generator this notebook relies on.

    Sets ``PYTHONHASHSEED`` in the environment and seeds Python's ``random``
    module, NumPy's global generator and TensorFlow's global generator with
    the same value.

    Args:
        seed: integer seed (default 2021).
    """
    import random  # local import: the notebook's import cell does not bring it in

    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


seed_everything(SEED)

In [None]:
# Choose where the competition files live, then load train.csv and test.csv from there.
data_dir = '../' if local_work else '../input/optiver-realized-volatility-prediction/'

train, test = (pd.read_csv(data_dir + file_name) for file_name in ('train.csv', 'test.csv'))

In [None]:
# Drop every training row whose time_id is in a hand-picked list.
# NOTE(review): no dtype conversion happens in this cell; the float32 cast of the target is done later.
time_id_remove=[25504,27174,24034,20439,3668,4851,6274,19260,27876,11579,28319,23792,23030,1544,2139,14447]
rows_to_drop = train['time_id'].isin(time_id_remove)
train = train.drop(index=train.loc[rows_to_drop].index)

In [None]:
def calc_wap(df):
    """Weighted average price from the level-1 quotes.

    Each price is weighted by the size quoted on the opposite side:
    (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1).
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)

def calc_wap2(df):
    """Weighted average price from the level-2 quotes.

    Same formula as the level-1 version, using the ``*2`` columns:
    (bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2).
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    cross_weighted = bid_px * ask_sz + ask_px * bid_sz
    return cross_weighted / (bid_sz + ask_sz)


def log_return(list_stock_prices):
    """Return the difference between consecutive natural-log prices.

    The input must offer ``.diff()`` after ``np.log`` (e.g. a pandas Series);
    the first element of the result is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series):
    """Return the square root of the sum of squared values of ``series``."""
    squared = series ** 2
    total = np.sum(squared)
    return np.sqrt(total)

def count_unique(series):
    """Return how many distinct values ``series`` contains."""
    distinct_values = np.unique(series)
    return distinct_values.size

def rmspe(y_true, y_pred):
    """Root mean squared percentage error: sqrt(mean(((y_true - y_pred) / y_true) ** 2))."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def rmspe_exp(y_true, y_pred):
    """RMSPE computed after mapping both inputs through ``exp(x) - 1``."""
    actual = np.exp(y_true) - 1
    predicted = np.exp(y_pred) - 1
    relative_error = (actual - predicted) / actual
    return np.sqrt(np.mean(np.square(relative_error)))

def metric_rmspe(y_true, y_pred):
    """RMSPE written with Keras backend ops so it can be used as a loss or metric."""
    relative_error = (y_true - y_pred) / y_true
    return K.sqrt(K.mean(K.square(relative_error)))

def metric_rmspe_exp(y_true, y_pred):
    """Keras-backend RMSPE evaluated after mapping both tensors through ``exp(x) - 1``."""
    actual = K.exp(y_true) - 1
    predicted = K.exp(y_pred) - 1
    relative_error = (actual - predicted) / actual
    return K.sqrt(K.mean(K.square(relative_error)))

# Custom evaluation function (for early stopping) based on root mean squared percentage error.
def feval_rmspe(y_pred, lgb_train):
    """Score ``y_pred`` against the labels stored in ``lgb_train``.

    Returns the tuple ``('RMSPE', score, False)``.
    NOTE(review): presumably the (name, value, is_higher_better) triple LightGBM expects - confirm.
    """
    labels = lgb_train.get_label()
    score = rmspe(labels, y_pred)
    return 'RMSPE', score, False

# Same evaluation function, for labels and predictions stored in log space (uses rmspe_exp).
def feval_rmspe_exp(y_pred, lgb_train):
    """Score ``y_pred`` against the labels of ``lgb_train`` with ``rmspe_exp``.

    Returns the tuple ``('RMSPE', score, False)``.
    NOTE(review): presumably the (name, value, is_higher_better) triple LightGBM expects - confirm.
    """
    labels = lgb_train.get_label()
    score = rmspe_exp(labels, y_pred)
    return 'RMSPE', score, False

In [None]:
def preprocessor_book(file_path):
    """Build per-time_id order-book features for one stock.

    Reads the book parquet at ``file_path``, derives WAP log returns, volume
    and spread columns, and aggregates them per ``time_id``:

    * over the whole bucket (columns suffixed ``_T0``), and
    * over the rows with ``seconds_in_bucket`` >= 480, 300 and 120
      (columns suffixed ``_T1``, ``_T2`` and ``_T3`` respectively).

    Args:
        file_path: path of the form ``.../stock_id=<id>``; the text after the
            first ``=`` is used as the stock id when building ``row_id``.

    Returns:
        DataFrame with one row per time_id holding the aggregated features and
        a ``row_id`` column formatted as ``"<stock_id>-<time_id>"``. Window
        features are NaN for time_ids that have no rows in that window.
    """
    df = pd.read_parquet(file_path)

    # Log returns are computed inside each time_id.  ``transform`` returns a
    # result aligned with df's own index on every pandas version, whereas
    # ``groupby(...).apply`` prepends the group key to the index when
    # group_keys=True (the pandas >= 2.0 default), which breaks the assignment.
    df['wap1'] = calc_wap(df)
    df['log_return1'] = df.groupby('time_id')['wap1'].transform(log_return)

    df['wap2'] = calc_wap2(df)
    df['log_return2'] = df.groupby('time_id')['wap2'].transform(log_return)

    ask_size = df['ask_size1'] + df['ask_size2']
    bid_size = df['bid_size1'] + df['bid_size2']
    df['total_volume'] = ask_size + bid_size
    df['volume_imbalance'] = abs(ask_size - bid_size)

    df['price_spread1'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])

    # Column -> list of aggregation functions.
    create_feature_dict = {
        'log_return1':[realized_volatility],
        'log_return2':[realized_volatility],
        'total_volume':[np.mean],
        'volume_imbalance':[np.mean],
        'price_spread1':[np.max,np.sum],
        'price_spread2':[np.max,np.sum],
        'bid_ask_spread':[np.sum],}

    def _aggregate(frame, suffix):
        """Aggregate ``frame`` per time_id and flatten the column names."""
        feats = pd.DataFrame(frame.groupby(['time_id']).agg(create_feature_dict)).reset_index()
        # Joining the two column levels turns 'time_id' into 'time_id_'.
        feats.columns = ['_'.join(col) for col in feats.columns]
        return feats.add_suffix('_' + suffix)

    # Whole bucket.
    df_feature = _aggregate(df, 'T0')

    # Last 120 / 300 / 480 seconds of the 600-second bucket.
    last_seconds = [120,300,480]
    last_seconds_name = ['T1','T2','T3']

    for second,ls_name in zip(last_seconds,last_seconds_name):
        second = 600 - second  # convert "last N seconds" into a start threshold

        df_feature_sec = _aggregate(df.query(f'seconds_in_bucket >= {second}'), ls_name)

        df_feature = pd.merge(df_feature,df_feature_sec,how='left',left_on='time_id__T0',right_on=f'time_id__{ls_name}')
        df_feature = df_feature.drop([f'time_id__{ls_name}'],axis=1)

    # Replace the time_id key with the "<stock_id>-<time_id>" row identifier.
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id__T0'].apply(lambda x:f'{stock_id}-{x}')
    df_feature = df_feature.drop(['time_id__T0'],axis=1)

    return df_feature

In [None]:
def preprocessor_trade(file_path):
    """Build per-time_id trade features for one stock.

    Reads the trade parquet at ``file_path``, derives price log returns and
    the traded amount (price * size), and aggregates them per ``time_id``:

    * over the whole bucket (suffix ``_T0``), and
    * over the rows with ``seconds_in_bucket`` >= 480, 300 and 120
      (suffixes ``_T1``, ``_T2`` and ``_T3`` respectively).

    Every feature column is prefixed with ``trade_``.

    Args:
        file_path: path of the form ``.../stock_id=<id>``; the text after the
            first ``=`` is used as the stock id when building ``row_id``.

    Returns:
        DataFrame with one row per time_id holding the aggregated features and
        a ``row_id`` column formatted as ``"<stock_id>-<time_id>"``. Window
        features are NaN for time_ids that have no trades in that window.
    """
    df = pd.read_parquet(file_path)

    # ``transform`` keeps the per-time_id log returns aligned with df's index
    # on every pandas version; ``groupby(...).apply`` prepends the group key
    # to the index when group_keys=True (the pandas >= 2.0 default), which
    # breaks the column assignment.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)
    df['amount']=df['price']*df['size']

    # Column -> list of aggregation functions.
    create_feature_dict = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum],
        'order_count':[np.sum],
        'amount':[np.max]
    }

    def _aggregate(frame, suffix):
        """Aggregate ``frame`` per time_id and flatten the column names."""
        feats = pd.DataFrame(frame.groupby(['time_id']).agg(create_feature_dict)).reset_index()
        # Joining the two column levels turns 'time_id' into 'time_id_'.
        feats.columns = ['_'.join(col) for col in feats.columns]
        return feats.add_suffix('_' + suffix)

    # Whole bucket.
    df_feature = _aggregate(df, 'T0')

    # Last 120 / 300 / 480 seconds of the 600-second bucket.
    last_seconds = [120,300,480]
    last_seconds_name = ['T1','T2','T3']

    for second,ls_name in zip(last_seconds,last_seconds_name):
        second = 600 - second  # convert "last N seconds" into a start threshold

        df_feature_sec = _aggregate(df.query(f'seconds_in_bucket >= {second}'), ls_name)

        df_feature = pd.merge(df_feature,df_feature_sec,how='left',left_on='time_id__T0',right_on=f'time_id__{ls_name}')
        df_feature = df_feature.drop([f'time_id__{ls_name}'],axis=1)

    # Prefix first, then replace the (now prefixed) time_id key with row_id.
    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id__T0'].apply(lambda x:f'{stock_id}-{x}')
    df_feature = df_feature.drop(['trade_time_id__T0'],axis=1)

    return df_feature

In [None]:
def preprocessor(list_stock_ids, is_train = True):
    """Compute book and trade features for many stocks in parallel.

    For every stock id the book features are left-joined with the trade
    features on ``row_id``; the per-stock frames are then stacked.

    Args:
        list_stock_ids: iterable of stock ids to process.
        is_train: read the ``*_train.parquet`` partitions when True, the
            ``*_test.parquet`` partitions otherwise.

    Returns:
        One DataFrame holding the features of all requested stocks, with a
        fresh 0..n-1 index.
    """
    from joblib import Parallel, delayed # parallel computing to save time

    split = 'train' if is_train else 'test'

    def for_joblib(stock_id):
        """Build the merged book + trade feature frame of a single stock."""
        file_path_book = data_dir + f"book_{split}.parquet/stock_id=" + str(stock_id)
        file_path_trade = data_dir + f"trade_{split}.parquet/stock_id=" + str(stock_id)

        # Return the merged frame directly: concatenating it with an empty
        # DataFrame first (as was done before) adds nothing to the result.
        return pd.merge(preprocessor_book(file_path_book),preprocessor_trade(file_path_trade),on='row_id',how='left')

    per_stock_frames = Parallel(n_jobs=-1, verbose=1)(
        delayed(for_joblib)(stock_id) for stock_id in list_stock_ids
        )

    return pd.concat(per_stock_frames,ignore_index = True)

In [None]:
%%time
# create_train=True recomputes the training features from the parquet files;
# False loads a feature table that was saved by an earlier run.
create_train = False

if not create_train:
    df_train = pd.read_csv('../input/optiverricopue30/df_train_local.csv')
else:
    train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
    train_ids = train.stock_id.unique()
    df_train = train.merge(
        preprocessor(list_stock_ids=train_ids, is_train=True),
        on=['row_id'],
        how='left',
    )

df_train.head()

In [None]:
%%time
# Compute the test features and attach them to the test index table via row_id.
test_ids = test.stock_id.unique()
test['row_id'] = test['stock_id'].astype(str) + '-' + test['time_id'].astype(str)

df_test = test.merge(
    preprocessor(list_stock_ids=test_ids, is_train=False),
    on=['row_id'],
    how='left',
)


In [None]:
# Column bookkeeping, then cast the numeric data (and the train target) to float32.
not_num_cols = ['stock_id', 'target', 'time_id', 'row_id']
features_num = [name for name in df_train.columns if name not in not_num_cols]
# Columns whose name contains 'T0' at this point; used later to build the image matrices.
features_images = [name for name in df_train.columns if 'T0' in name]

for frame, cols_to_cast in ((df_train, features_num + ['target']), (df_test, features_num)):
    frame[cols_to_cast] = frame[cols_to_cast].astype('float32')

# Persist the freshly built training features so later runs can load them.
if create_train:
    df_train.to_csv(data_dir + 'df_train_local.csv', index=False)

In [None]:
# Fix NaN and infinite values, separately for train and test:
#   1. +inf becomes NaN, then every NaN is filled with the column's maximum for that stock;
#   2. -inf becomes NaN, then every NaN still present is filled with the column's minimum for that stock.
for frame in (df_train, df_test):
    for bad_value, fill_stat in ((np.inf, 'max'), (-np.inf, 'min')):
        frame.replace(bad_value, np.nan, inplace=True)
        for col in features_num:
            per_stock_fill = frame.groupby('stock_id')[col].transform(fill_stat)
            frame[col] = frame[col].fillna(per_stock_fill)

In [None]:
from scipy.special import boxcox1p

# Apply boxcox1p to every numeric feature after multiplying it by sk_f.
lam = 0.1     # lambda argument of boxcox1p
sk_f = 1000   # multiplier applied to the feature before the transform
for frame in (df_train, df_test):
    for fet in features_num:
        frame[fet] = boxcox1p(frame[fet] * sk_f, lam)

In [None]:
def preprocessor_stock_time(df, tw, df_all=None):
    """Add realized-volatility features normalised by stock and by time_id.

    For each of the three realized-volatility columns ``c`` in window ``tw``
    this adds ``rel_<c>_st_<tw>``: the value divided by the mean, over the
    rows of ``df`` sharing its time_id, of (value / mean of that column for
    the row's stock).

    Args:
        df: frame with ``stock_id``, ``time_id`` and the three
            ``*_realized_volatility_<tw>`` columns.
        tw: window suffix, e.g. ``'T0'``.
        df_all: frame used to compute the per-stock means.  When omitted, the
            module-level ``df_train`` and ``df_test`` are concatenated, as
            before.

    Returns:
        A copy of ``df`` with the three ``rel_*_st_<tw>`` columns appended.
    """
    if df_all is None:
        df_all = pd.concat([df_train, df_test], ignore_index=True)

    cols_names=['log_return1_realized_volatility','log_return2_realized_volatility','trade_log_return_realized_volatility']

    df = df.copy()

    for col in cols_names:
        source = f'{col}_{tw}'

        # Look the per-stock mean up through stock_id.  Assigning
        # df_all.groupby(...).transform('mean') straight into df aligns on the
        # row index instead, so a frame whose index overlaps other rows of
        # df_all (df_test vs. the first train rows) would receive the mean of
        # the wrong stock.
        stock_mean = df['stock_id'].map(df_all.groupby('stock_id')[source].mean())
        rel_to_stock = df[source] / stock_mean

        # Average of the stock-relative values over all rows of the same time_id.
        time_mean = rel_to_stock.groupby(df['time_id']).transform('mean')
        df[f'rel_{col}_st_{tw}'] = df[source] / time_mean

    return df

In [None]:
def preprocessor_diff(df,tw):
    """Turn every window-``tw`` feature into a ratio to its 'T0' counterpart.

    Each column (other than stock_id, target, time_id, row_id) whose name
    contains ``tw`` is divided by the column obtained by replacing its last
    two characters with ``'T0'``.  The frame is modified in place and returned.
    """
    id_columns = ['stock_id','target','time_id','row_id']
    stems = [name[:-2] for name in df.columns if name not in id_columns and tw in name]

    for stem in stems:
        windowed, baseline = stem + tw, stem + 'T0'
        df[windowed] = df[windowed] / df[baseline]

    return df

In [None]:
# Stock/time-normalised volatility features for every window (train first, then test, per window).
for window in ('T0', 'T1', 'T2', 'T3'):
    df_train = preprocessor_stock_time(df_train, window)
    df_test = preprocessor_stock_time(df_test, window)

# Express the T1..T3 features relative to their T0 counterparts.
for window in ('T1', 'T2', 'T3'):
    df_train = preprocessor_diff(df_train, window)
    df_test = preprocessor_diff(df_test, window)

In [None]:
# Refresh the numeric feature list (new columns were added), then repeat the clean-up:
#   1. +inf becomes NaN, then every NaN is filled with the column's maximum for that stock;
#   2. -inf becomes NaN, then every NaN still present is filled with the column's minimum for that stock.
not_num_cols = ['stock_id', 'target', 'time_id', 'row_id']
features_num = [name for name in df_train.columns if name not in not_num_cols]

for frame in (df_train, df_test):
    for bad_value, fill_stat in ((np.inf, 'max'), (-np.inf, 'min')):
        frame.replace(bad_value, np.nan, inplace=True)
        for col in features_num:
            per_stock_fill = frame.groupby('stock_id')[col].transform(fill_stat)
            frame[col] = frame[col].fillna(per_stock_fill)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit a single (-1, 1) min-max scaler on train and test together, then apply it to each frame.
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(pd.concat([df_train, df_test], ignore_index=True)[features_num])

for frame in (df_train, df_test):
    frame[features_num] = scaler.transform(frame[features_num])

# Store the scaled features as float32.
for frame in (df_train, df_test):
    frame[features_num] = frame[features_num].astype('float32')

In [None]:
#Process to generate images

def preprocessor_img_2(df_w):
    """Build an "image" matrix for every row of ``df_w``.

    For each time_id, the rows of that time_id are taken (sorted by stock_id),
    placeholder rows are inserted for stocks of the module-level
    ``list_of_stocks`` that are absent, and each row receives an
    ``image_matrix``: the values of the module-level ``features_images``
    columns for the rows of the same time_id whose stock_id appears in that
    row's ``corr_stock`` list.  The time_ids are processed in parallel with
    joblib.

    Args:
        df_w: frame with at least ``time_id``, ``stock_id``, ``row_id``,
            ``corr_stock`` and the ``features_images`` columns.

    Returns:
        DataFrame with columns ``row_id`` and ``image_matrix`` covering all
        time_ids, with a fresh 0..n-1 index.
    """
    from joblib import Parallel, delayed # parallel computing to save time       
    
    def Insert_row_pd(time_id,stock_id,df):
        """Insert a zero-filled placeholder row for ``stock_id`` into ``df``.

        The row is placed at the position ``stock_id`` has in
        ``list_of_stocks``; the result is re-indexed 0..n-1.
        """
    
        # Position of the stock in the global stock list = where the row goes.
        row_number=list_of_stocks.index(stock_id)
        # stock_id followed by zeros for all remaining columns.
        # NOTE(review): this assumes 'stock_id' is df's first column - confirm.
        listofzeros = [0] * (df.shape[1]-1)
        fake_row=[stock_id]+listofzeros
        # Split positionally around the insertion point.
        df1 = df[0:row_number]
        df2 = df[row_number:]
        # Write the placeholder at label ``row_number`` of the first part.
        df1.loc[row_number]=fake_row
        # corr_stock is set to four one-element lists, each holding the stock itself.
        df1.loc[row_number,'corr_stock']=[[stock_id],[stock_id],[stock_id],[stock_id]]
        df1.loc[row_number,'row_id']=[f'{stock_id}-{time_id}']
        df_result = pd.concat([df1, df2])  
        # Renumber the rows so the next insertion can rely on 0..n-1 labels.
        df_result.index = [*range(df_result.shape[0])]
        return df_result    
    
    list_time_ids=df_w.time_id.unique()
    
    # Empty frame that each worker prepends to its own result below.
    df = pd.DataFrame()
    
    def for_joblib(time_id):  
        """Return the row_id / image_matrix pairs of one time_id."""
            
        time_id_df=df_w[df_w.time_id == time_id].sort_values(by=['stock_id'])
        # Fewer than 112 rows: add placeholder rows for the missing stocks.
        # NOTE(review): 112 is hard-coded; presumably the number of stocks in list_of_stocks - confirm.
        if time_id_df.shape[0]<112:
            missing_stock_ids= [i for i in list_of_stocks if i not in time_id_df.stock_id.to_list()]
            for stock_id in missing_stock_ids:
                time_id_df=Insert_row_pd(time_id,stock_id,time_id_df)
                
        # One matrix per row: features_images values of the rows (same time_id)
        # whose stock_id is listed in this row's corr_stock.
        time_id_df['image_matrix']=time_id_df.apply(lambda x: time_id_df[time_id_df['stock_id'].isin(x['corr_stock'])][features_images].values,axis=1)
             
            
        return pd.concat([df,time_id_df[['row_id','image_matrix']]])
    
    df = Parallel(n_jobs=-1, verbose=1)(delayed(for_joblib)(time_id) for time_id in list_time_ids)

    # Stack the per-time_id results.
    df =  pd.concat(df,ignore_index = True)
    return df

In [None]:
# Get, for every stock, the list of its 30 most correlated stock_ids.
# Correlation is measured across time_ids on the relative gap between the target
# and the T0 realized volatility of log_return1.
relative_gap = (df_train.target-df_train.log_return1_realized_volatility_T0)/df_train.log_return1_realized_volatility_T0

train_p = df_train.assign(target_diff=relative_gap).pivot(index='time_id', columns='stock_id', values='target_diff')
corr = abs(train_p.corr())
ids = corr.index

# Rank by absolute correlation, skip the first entry (presumably the stock itself)
# and keep the next 30, stored sorted by stock_id.
corr_labels = [
    sorted(corr[[stock]].sort_values(stock, ascending=False)[1:31].index.values)
    for stock in ids
]
corr_dictionary = dict(zip(ids, corr_labels))

df_train['corr_stock'] = df_train['stock_id'].map(corr_dictionary)
df_test['corr_stock'] = df_test['stock_id'].map(corr_dictionary)

In [None]:
# Stock ids in order of first appearance in df_train; preprocessor_img_2 reads this global.
list_of_stocks = df_train.stock_id.unique().tolist()

# Test image matrices are always computed.
df_test = df_test.merge(preprocessor_img_2(df_test), on='row_id', how='left')

# Train image matrices: load the saved ones unless a rebuild is requested.
create_matrix_train = False

if not create_matrix_train:
    # NOTE(review): unpickling can execute code stored in the file; only load trusted pickles.
    df_train_images = pd.read_pickle('../input/optiverricopue30/df_train_images-30.pkl')
    df_train = df_train.merge(df_train_images, on='row_id', how='left')
    del df_train_images
else:
    df_train = df_train.merge(preprocessor_img_2(df_train), on='row_id', how='left')
    df_train[['row_id','image_matrix']].to_pickle('./df_train_images-30.pkl')

# Rows left without an image matrix by the merge get an all-zero 30 x 14 one.
for frame in (df_test, df_train):
    no_image = frame['image_matrix'].isnull()
    frame.loc[no_image, ['image_matrix']] = frame.loc[no_image, 'image_matrix'].apply(lambda x: np.zeros((30, 14)))

In [None]:
# Final column bookkeeping for the model inputs.
not_num_cols = ['stock_id', 'target', 'time_id', 'row_id', 'image_matrix', 'corr_stock']
features_num = [name for name in df_train.columns if name not in not_num_cols]
features_cat = ['stock_id']
feature_img = 'image_matrix'

# Target and the grouping key used for GroupKFold.
target = df_train['target']
groups = df_train['time_id']

# Shape of one image matrix: rows x columns x channels.
img_rows, img_cols, channnels = 30, 14, 1

In [None]:
# Stock embedding dimensions: vocabulary = largest training stock_id + 1, 16 outputs per stock.
embedding_size = 16
no_of_unique_cat = df_train['stock_id'].max() + 1

def build_model(len_cat,len_num,img_rows,img_cols,channnels):
    """Build and compile the three-input regression network.

    Inputs, in the order the model expects them:
      1. image tensor of shape (img_rows, img_cols, channnels),
      2. categorical input of width ``len_cat`` (the stock id),
      3. numeric feature vector of width ``len_num``.

    The stock id goes through an embedding sized by the module-level
    ``no_of_unique_cat`` and ``embedding_size``; the image goes through two
    Conv2D layers and global max pooling.  Embedding + numeric features and a
    dense projection of embedding + CNN features are concatenated and fed to
    a stack of swish Dense layers ending in a single linear output.

    Returns:
        A ``tf.keras.Model`` compiled with Adam (learning rate 0.001) and the
        ``metric_rmspe_exp`` loss.
    """
    layers = tf.keras.layers

    img_input = keras.Input(shape=(img_rows,img_cols,channnels))
    cat_input = keras.Input(shape=(len_cat,), name='stock_id')
    num_input = keras.Input(shape=(len_num,), name='num_data')

    # Stock embedding, reshaped from (1, embedding_size) to a flat vector.
    stock_embedded = layers.Embedding(no_of_unique_cat, embedding_size,
                                      input_length=1, name='stock_embedding')(cat_input)
    stock_flattened = layers.Reshape(target_shape=(embedding_size,))(stock_embedded)

    # CNN branch over the image matrix.
    cnn2d = layers.Conv2D(16, kernel_size=3,padding='same', activation='relu')(img_input)
    cnn2d = layers.Conv2D(32, kernel_size=4,padding='same', activation='relu')(cnn2d)
    cnn2d = layers.GlobalMaxPooling2D()(cnn2d)
    cnn2d = layers.BatchNormalization()(cnn2d)
    cnn2d = layers.Dropout(0.5)(cnn2d)
    cnn2d = layers.Flatten()(cnn2d)

    # Merge the branches.
    concat1 = layers.Concatenate(name='concatenate1')([stock_flattened,num_input])
    concat2 = layers.Concatenate(name='concatenate2')([stock_flattened,cnn2d])
    dense_c2 = layers.Dense(32, activation='swish',name='dense_c2')(concat2)
    concat3 = layers.Concatenate(name='concatenate3')([concat1,dense_c2])

    # Dense head.  The former 'concatenate_lgb' layer was created but never
    # connected to the model output, so it has been removed.
    hidden = concat3
    for units in (128, 64, 32, 16):
        hidden = layers.Dense(units, activation='swish')(hidden)
    output = layers.Dense(1)(hidden)

    model = tf.keras.Model(inputs=[img_input,cat_input,num_input], outputs=output)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = 0.001),  loss=metric_rmspe_exp)

    return model

In [None]:
# Per-fold scores, out-of-fold predictions and fold-averaged test predictions.
fold_score_k = []
oof_k = np.zeros(df_train.shape[0])
preds_k = np.zeros(df_test.shape[0])

# The network is fitted on log1p(1000 * target).
target = np.log1p(df_train['target']*1000)

test_img = np.stack(df_test[feature_img].values, axis=0).reshape(len(df_test),img_rows,img_cols,channnels)


def _fold_inputs(rows):
    """Return [image, categorical, numeric] model inputs for the given df_train row positions."""
    images = np.stack(df_train[feature_img].iloc[rows].values, axis=0).reshape(len(rows),img_rows,img_cols,channnels)
    return [images, df_train[features_cat].iloc[rows], df_train[features_num].iloc[rows]]


gkf = GroupKFold(N_FOLDS)

for fold, (train_idx, valid_idx) in enumerate(gkf.split(df_train,target,groups=groups)):

    y_train, y_valid = target.iloc[train_idx], target.iloc[valid_idx]
    train_inputs = _fold_inputs(train_idx)
    valid_inputs = _fold_inputs(valid_idx)

    model = build_model(len(features_cat),len(features_num),img_rows,img_cols,channnels)

    # Halve the learning rate after 6 stagnant epochs; stop after 16 and restore the best weights.
    rlr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor = 'val_loss', factor = 0.5, patience = 6,
        verbose = 0, min_delta = 1e-10, mode = 'min')
    es = tf.keras.callbacks.EarlyStopping(
        monitor = 'val_loss', min_delta = 1e-10, patience = 16,
        mode = 'min', baseline = None,
        restore_best_weights = True, verbose = 0)

    model.fit(
        train_inputs, y_train,
        sample_weight = 1/np.square(y_train),
        validation_data = (valid_inputs, y_valid),
        epochs = 300,
        batch_size = batch_size,
        callbacks = [rlr, es],
        verbose = 1,
    )

    # Predictions are clipped in log space before being stored / averaged.
    oof_k[valid_idx] = model.predict(valid_inputs).ravel().clip(1.0e-5,1e10)
    test_inputs = [test_img,df_test[features_cat], df_test[features_num]]
    preds_k += model.predict(test_inputs).ravel().clip(1.0e-5,1e10)/N_FOLDS

    print('***********************')
    rmspe_score = round(rmspe(np.exp(y_valid.values)-1, np.exp(oof_k[valid_idx])-1),6)
    print(f"#Keras_fold_{fold}_Original rmse: {rmspe_score}")
    fold_score_k.append(rmspe_score)

    del model

In [None]:
# Undo the log1p(1000 * x) transform on the target and on the out-of-fold predictions.
target = (np.exp(target) - 1) / 1000
oof_k = ((np.exp(oof_k) - 1) / 1000).clip(1.0e-5, 1e10)

# Overall out-of-fold RMSPE against the untransformed target, then the per-fold scores.
overall_rmspe = round(rmspe(df_train['target'].values, oof_k), 6)
print('#', overall_rmspe)
print('# Fold Scores_keras: ', fold_score_k)

In [None]:
# Map the averaged test predictions back to the target scale and store them.
preds_k = ((np.exp(preds_k) - 1) / 1000).clip(1.0e-5, 1e10)
df_test['target'] = preds_k

# Quick look at the first predictions, then write the submission file.
preview_columns = ['row_id', 'log_return1_realized_volatility_T0', 'target']
print(df_test[preview_columns].head(3))
df_test[['row_id', 'target']].to_csv('submission.csv', index=False)