In [1]:
import glob 

BASE_DIR = 'D:/'

# Paths to book and trade data
TRAIN_BOOK_PATHS  = glob.glob(f'{BASE_DIR}book_train.parquet/*')
TEST_BOOK_PATHS   = glob.glob(f'{BASE_DIR}book_test.parquet/*')
TRAIN_TRADE_PATHS = glob.glob(f'{BASE_DIR}trade_train.parquet/*')
TEST_TRADE_PATHS  = glob.glob(f'{BASE_DIR}trade_test.parquet/*')

# Plotting
import matplotlib.pyplot as plt 

# Working with dataframes and sequences
import numpy as np
import pandas as pd 
import tensorflow as tf
from tensorflow import keras 
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


In [2]:
def log_return(list_stock_prices):
    return np.log(list_stock_prices).diff() 

def realized_volatility(series_log_return):
    return np.sqrt(np.sum(series_log_return**2))

def rmspe(y_true, y_pred):
    return  (np.sqrt(np.mean(np.square((y_true - y_pred) / y_true))))

    

In [3]:
class DataManager:
    """ Used for processing the input data so the model can be fitted on it. """
    def __init__(self, train=True):
        self._train = train
        self._book_file_list = TRAIN_BOOK_PATHS if train else TEST_BOOK_PATHS
        self._trade_file_list = TRAIN_TRADE_PATHS if train else TEST_TRADE_PATHS
        self.measures_list = []
    
    def _traverse_book(self):
        """ Goes through each of the training files. """
        for book_file_path, trade_file_path in zip(self._book_file_list, self._trade_file_list):
            stock_id = book_file_path.split("=")[1] # Getting the stock_id
            print(stock_id)
            # Reading the book info and preparing it for aggregation
            book = pd.read_parquet(book_file_path)
            
            book.sort_values(by=['time_id', 'seconds_in_bucket'])
            
            book['wap1'] = (book['bid_price1'] * book['ask_size1'] + book['ask_price1'] * book['bid_size1']) / (book['bid_size1']+ book['ask_size1'])
            book['log_return1'] = book.groupby(['time_id'])['wap1'].apply(log_return) 
            book = book[~book['log_return1'].isnull()]
            
            book['wap2'] = (book['bid_price2'] * book['ask_size2'] + book['ask_price2'] * book['bid_size2']) / (book['bid_size2']+ book['ask_size2'])
            book['log_return2'] = book.groupby(['time_id'])['wap2'].apply(log_return)
            book = book[~book['log_return2'].isnull()]
            
            # Different spreads: Get the max of these for each time_id
            book['h_spread_l1'] = book['ask_price1'] - book['bid_price1']
            book['h_spread_l2'] = book['ask_price2'] - book['bid_price2']
            book['v_spread_b']  = book['bid_price1'] - book['bid_price2']
            book['v_spread_a']  = book['ask_price1'] - book['bid_price2']
            
            book.loc[:, 'bas'] = (book.loc[:, ('ask_price1', 'ask_price2')].min(axis = 1) / book.loc[:, ('bid_price1', 'bid_price2')].max(axis = 1) - 1) 
            
            trade = pd.read_parquet(trade_file_path)
            
            # Slicing the train data based on stock_id
            book_stock_slice = train[train['stock_id'] == int(stock_id)]
            
            for time_id in book['time_id'].unique():
                book_slice = book[book['time_id'] == time_id] # Slicing based on time_id
                trade_slice = trade[trade['time_id'] == time_id]
                p3 = p4 = k10 = k5 = k = q5 = q10 = 1
                if  not trade_slice.empty:
                    trade_slice_1st = trade_slice['seconds_in_bucket'].iloc[0]
                    trade_slice_last = trade_slice['seconds_in_bucket'].iloc[-1]
            
                    # Feature constrution
                    mid_index = int(len(trade_slice['seconds_in_bucket'])/2)
                    for i in book_slice['seconds_in_bucket']:
                        if i <=trade_slice_1st:
                            k = k+1
                        if i <= 300:
                            k5 = k5+1
                        if i <= trade_slice_last:
                            k10 = k10+1
                    q10 = np.sum(np.multiply(trade_slice['order_count'],trade_slice['size']))
                    for j in range(trade_slice.shape[0]):
                        if trade_slice['seconds_in_bucket'].iloc[j] <= 300:
                            q5 += trade_slice['order_count'].iloc[j]*trade_slice['size'].iloc[j]
                    no_of_orders_in_book1 = k
                    no_of_orders_in_book5 = k5
                    no_of_orders_in_book10 = k10
                    os_just_be_5 = q5
                    os_10 = q10
                    p1 = k5/k
                    p2 = k10/k
                    p3 = np.sum(book_slice['bid_size1'])/np.sum(trade_slice['order_count'])
                    p4 = np.sum(book_slice['ask_size1'])/np.sum(trade_slice['order_count'])
                    p5 = q10/q5
                
                # features
                dic = {
                    'stock_id' : stock_id,
                    'time_id'  : time_id,
                    'row_id': f"{stock_id}-{time_id}", # Fixing row-id from here
                    
                    'wap1_mean': book_slice['wap1'].mean(),
                    'wap1_std':book_slice['wap1'].std(),
                    'wap1_max':book_slice['wap1'].max(),
                    
                    'wap2_mean': book_slice['wap2'].mean(),
                    'wap2_std':book_slice['wap2'].std(),
                    'wap2_max':book_slice['wap2'].max(),

                    'h_spread_l1_mean': book_slice['h_spread_l1'].mean(),
                    'h_spread_l1_std': book_slice['h_spread_l1'].std(),
                    'h_spread_l1_std': book_slice['h_spread_l1'].max(),
                    
                    'h_spread_l2_mean': book_slice['h_spread_l2'].mean(),
                    'h_spread_l2_std': book_slice['h_spread_l2'].std(),
                    'h_spread_l2_max': book_slice['h_spread_l2'].max(),
                    
                    'v_spread_b_mean': book_slice['v_spread_b'].mean(),
                    'v_spread_b_std': book_slice['v_spread_b'].std(),
                    'v_spread_b_max': book_slice['v_spread_b'].max(),
                    
                    'v_spread_a_mean': book_slice['v_spread_a'].mean(),
                    'v_spread_a_std': book_slice['v_spread_a'].std(),
                    'v_spread_a_max': book_slice['v_spread_a'].max(),
                    
                    'log_return1_mean': book_slice['log_return1'].mean(),
                    'log_return1_std':book_slice['log_return1'].std(),
                    'log_return1_max':book_slice['log_return1'].max(),
                    
                    'log_return2_mean': book_slice['log_return2'].mean(),
                    'log_return2_std':book_slice['log_return2'].std(),
                    'log_return2_max':book_slice['log_return2'].max(),
                    
                    'bas_mean': book_slice['bas'].mean(),
                    'bas_std': book_slice['bas'].std(),
                    'bas_max': book_slice['bas'].max(),
                    
                    'ask_size_mean': book_slice['ask_size1'].mean(),
                    'ask_size_std': book_slice['ask_size1'].std(),
                    
                    'ask_price_mean': book_slice['ask_price1'].mean(),
                    'ask_price_std': book_slice['ask_price1'].std(),
                    
                    'bid_size_mean': book_slice['bid_size1'].mean(),
                    'bid_size_std': book_slice['bid_size1'].std(),
                    
                    'bid_price_mean': book_slice['bid_price1'].mean(),
                    'bid_price_std': book_slice['bid_price1'].std(),
                    
                    'order_count_mean': trade_slice['order_count'].mean(),
                    'order_count_std': trade_slice['order_count'].std(),
                    
                    'orders_time_book5' : p1,
                    'orders_time_book10' : p2,
                    
                    'ratio_bid_count' :p3,
                    'ratio_ask_count' :p4,
                    
                    'order_size_time_trade' :p5
                }
                
                # Note: When getting the test_data ready, there is no target column.
                if self._train: dic['target'] = book_stock_slice[book_stock_slice['time_id'] == time_id]['target'].values[0]
                
                self.measures_list.append(dic)
            
            
    def get_processed(self):
        """ Returns the processed the data. """
        self._traverse_book() 
        
        return pd.DataFrame(self.measures_list)

<a href="./train_sp2.csv"> Download File </a>

In [4]:
#feature preposs
#bookma = DataManager().get_processed()
#bookma.to_csv('/kaggle/working/train_sp2.csv', index=False)

In [5]:
data = pd.read_csv('../input/fea-preposs2/train_sp2.csv')
test_data = DataManager(train=False).get_processed()

data1 = data.drop(['row_id'], axis=1)
data2 = test_data.drop(['row_id'],axis= 1)

# Min-Max Scaling the data for better models
for col_name in data1.columns[1:-1]:
    data2[col_name] = (data2[col_name] - data1[col_name].min()) / (data1[col_name].max() - data1[col_name].min())
    data1[col_name] = (data1[col_name] - data1[col_name].min()) / (data1[col_name].max() - data1[col_name].min())


0


In [6]:

#Training Data
X, y =  data1.iloc[:,:-4], data1['target']

#Test Data
X_test = data2.iloc[:,:-3]

#Getting training and validations plits to check for overfitting
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [7]:
data1.fillna(method = 'pad', inplace = True)

In [8]:
def Build_model():
    fea_input = keras.Input(shape = (39,), name = 'one_timeid')
    out = keras.layers.Dense(128,activation='tanh')(fea_input)
    out = keras.layers.Dense(64,activation='tanh')(out)
    out = keras.layers.Dense(32,activation='tanh')(out)
    out = keras.layers.Dense(16,activation='tanh')(out)
    out = keras.layers.Dense(8,activation='tanh')(out)
    out = keras.layers.Dense(2,activation='tanh')(out)
    out = keras.layers.Dense(1)(out)
     
    model = keras.Model(
    inputs = [fea_input],
    outputs = out
    )
    
    return model


In [9]:
X_val.fillna(method ='pad',inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [10]:

X_test.fillna(0.0012,inplace = True)

In [11]:
X_train.fillna(method ='pad',inplace = True)


In [12]:
model = Build_model()
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
one_timeid (InputLayer)      [(None, 39)]              0         
_________________________________________________________________
dense (Dense)                (None, 128)               5120      
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_3 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_4 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 18    

In [13]:
Earl_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                             patience=10, 
                                             verbose=1,
                                             mode='min')

lrc = reduce_lr =  tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', 
                                                        factor=0.5,
                                                        patience=5, 
                                                        min_lr=0.000001, 
                                                        verbose=1,
                                                        mode='min')

In [14]:
    model = Build_model()
    
    model.compile(
        keras.optimizers.Adam(learning_rate=0.01),
        loss=tf.keras.metrics.mean_squared_error,
        metrics=['MSE']
    )


    fea_data = X_train.iloc[:,1:]
    stock_data1 = X_train['stock_id']
    target =  y_train
    stock_data2 = X_val['stock_id']
    fea_data2 = X_val.iloc[:,1:]
    
    fea_data_test = X_test.iloc[:,1:]
    stock_data_test = test_data['stock_id']

    model.fit(fea_data, 
              target, 
              sample_weight = 1/np.square(target),
              batch_size=1024,
              epochs=1000,
              validation_data=(fea_data2, y_val, 1/np.square(y_val)),
              callbacks=[Earl_stop, lrc],
              shuffle=True,
             verbose = 1)


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000

Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.004999999888241291.
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000

Epoch 00022: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000

Epoch 00032: ReduceLROnPlateau reducing learning rate to 0.0012499999720603228.
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000

Epoch 00039: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000


<tensorflow.python.keras.callbacks.History at 0x7ffaaecb7590>

In [15]:
fea_data_test

Unnamed: 0,time_id,wap1_mean,wap1_std,wap1_max,wap2_mean,wap2_std,wap2_max,h_spread_l1_mean,h_spread_l1_std,h_spread_l2_mean,...,ask_price_mean,ask_price_std,bid_size_mean,bid_size_std,bid_price_mean,bid_price_std,order_count_mean,order_count_std,orders_time_book5,orders_time_book10
0,-3.1e-05,0.514342,0.0012,0.294189,0.514913,0.0012,0.296383,0.029246,0.011454,0.045824,...,0.507889,0.0012,0.000385,0.0012,0.525247,0.0012,0.02963,0.015816,0.002151,0.0


In [16]:
prediction= model.predict(fea_data_test)

In [17]:
prediction

array([[0.00048494]], dtype=float32)