# Feature Engineering for High-Frequency Data

## Imports & Settings

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd

from scipy.stats import spearmanr
import talib

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
sns.set_style('whitegrid')
idx = pd.IndexSlice

## Data prep

We use the 'Trade and Quote' dataset.

In [5]:
as_path = Path('../data/nasdaq100')

In [6]:
# Columns whose names end in 'time'. Together with `drop_cols` they are
# dropped every time a parquet file is read in this notebook.
tcols = ['openbartime', 'firsttradetime',
         'highbidtime', 'highasktime', 'hightradetime',
         'lowbidtime', 'lowasktime', 'lowtradetime',
         'closebartime', 'lasttradetime']

In [7]:
# Additional columns removed (via `tcols + drop_cols`) right after each
# parquet file is read.
drop_cols = ['unknowntickvolume',
             'cancelsize',
             'tradeatcrossorlocked']

In [8]:
# Subset of the raw column names.
# NOTE(review): `keep` is not referenced by any other cell visible in this
# notebook - confirm whether it is still needed.
keep = ['firsttradeprice', 'hightradeprice', 'lowtradeprice', 'lasttradeprice', 
        'minspread', 'maxspread',
        'volumeweightprice', 'nbboquotecount', 
        'tradeatbid', 'tradeatbidmid', 'tradeatmid', 'tradeatmidask', 'tradeatask', 
        'volume', 'totaltrades', 'finravolume', 
        'finravolumeweightprice', 
        'uptickvolume', 'downtickvolume', 
        'repeatuptickvolume', 'repeatdowntickvolume', 
        'tradetomidvolweight', 'tradetomidvolweightrelative']

In [9]:
# Maps the raw column names to the short aliases used in the rest of the
# notebook; applied with DataFrame.rename inside data_to_hdf(). Columns
# containing 'tradeat' are not listed here: data_to_hdf() shortens them with
# a second rename ('tradeat' -> 'at').
columns = {'volumeweightprice'          : 'price',
           'finravolume'                : 'fvolume',
           'finravolumeweightprice'     : 'fprice',
           'uptickvolume'               : 'up',
           'downtickvolume'             : 'down',
           'repeatuptickvolume'         : 'rup',
           'repeatdowntickvolume'       : 'rdown',
           'firsttradeprice'            : 'first',
           'hightradeprice'             : 'high',
           'lowtradeprice'              : 'low',
           'lasttradeprice'             : 'last',
           'nbboquotecount'             : 'nbbo',
           'totaltrades'                : 'ntrades',
           'openbidprice'               : 'obprice',
           'openbidsize'                : 'obsize',
           'openaskprice'               : 'oaprice',
           'openasksize'                : 'oasize',
           'highbidprice'               : 'hbprice',
           'highbidsize'                : 'hbsize',
           'highaskprice'               : 'haprice',
           'highasksize'                : 'hasize',
           'lowbidprice'                : 'lbprice',
           'lowbidsize'                 : 'lbsize',
           'lowaskprice'                : 'laprice',
           'lowasksize'                 : 'lasize',
           'closebidprice'              : 'cbprice',
           'closebidsize'               : 'cbsize',
           'closeaskprice'              : 'caprice',
           'closeasksize'               : 'casize',
           'firsttradesize'             : 'firstsize',
           'hightradesize'              : 'highsize',
           'lowtradesize'               : 'lowsize',
           'lasttradesize'              : 'lastsize',
           'tradetomidvolweight'        : 'volweight',
           'tradetomidvolweightrelative': 'volweightrel'}

In [10]:
parquet_path = as_path / '1min_taq' / 'parquet'

In [11]:
files = list(parquet_path.glob('*.parquet'))

In [12]:
files[:5]

[PosixPath('../data/nasdaq100/1min_taq/parquet/20140421.parquet'),
 PosixPath('../data/nasdaq100/1min_taq/parquet/20130905.parquet'),
 PosixPath('../data/nasdaq100/1min_taq/parquet/20151117.parquet'),
 PosixPath('../data/nasdaq100/1min_taq/parquet/20150629.parquet'),
 PosixPath('../data/nasdaq100/1min_taq/parquet/20170817.parquet')]

In [13]:
# Inspect a single file, minus the 'time' columns and the unused columns.
df = pd.read_parquet(files[0]).drop(columns=tcols + drop_cols)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 91735 entries, ('PAYX', Timestamp('2014-04-21 04:00:00')) to ('GOOG', Timestamp('2014-04-21 20:00:00'))
Data columns (total 43 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   openbidprice                 89735 non-null  float64
 1   openbidsize                  89735 non-null  float64
 2   openaskprice                 88141 non-null  float64
 3   openasksize                  88141 non-null  float64
 4   firsttradeprice              41715 non-null  float64
 5   firsttradesize               41715 non-null  float64
 6   highbidprice                 89836 non-null  float64
 7   highbidsize                  89836 non-null  float64
 8   highaskprice                 88242 non-null  float64
 9   highasksize                  88242 non-null  float64
 10  hightradeprice               41715 non-null  float64
 11  hightradesize                41715 non-null  float64
 

In [9]:
def data_to_hdf():
    """Consolidate the per-day parquet files into one HDF5 table per year.

    For every year from 2013 through 2017 this reads each parquet file whose
    stem starts with that year, drops ``tcols + drop_cols``, concatenates and
    sorts the frames, keeps only rows between 9:30 and 16:00, restores the
    (ticker, timestamp) index order, renames the columns to their short
    aliases and writes the result to ``algoseek.h5`` under ``data/<year>``.

    Raises:
        ValueError: if no parquet file is found for one of the years.
    """
    parquet_path = as_path / '1min_taq' / 'parquet'
    files = list(parquet_path.glob('*.parquet'))
    for year in range(2013, 2018):
        print(year)
        daily = [pd.read_parquet(f).drop(tcols + drop_cols, axis=1)
                 for f in files if f.stem.startswith(str(year))]
        if not daily:
            # pd.concat([]) would raise a ValueError as well, but without
            # saying which year or directory is affected.
            raise ValueError(f'no parquet files for {year} found in {parquet_path}')
        data = (pd.concat(daily).sort_index()
                .reset_index('ticker')          # between_time needs a DatetimeIndex
                .between_time('9:30', '16:00')
                .set_index('ticker', append=True)
                .swaplevel()
                .rename(columns=columns)
                .rename(columns=lambda x: x.replace('tradeat', 'at')))
        # info() prints its report itself and returns None, so it is not
        # wrapped in print(). `show_counts` replaces the `null_counts`
        # keyword, which was deprecated in pandas 1.2 and removed in 2.0.
        data.info(show_counts=True)
        data.to_hdf('algoseek.h5', key=f'data/{year}')

In [10]:
data_to_hdf()

In [16]:
# Read one year back from the store and show its layout.
with pd.HDFStore('algoseek.h5') as store:
    df = store.get('data/2013')
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 9853200 entries, ('AAPL', Timestamp('2013-01-02 09:30:00')) to ('YHOO', Timestamp('2013-12-31 16:00:00'))
Data columns (total 43 columns):
 #   Column        Dtype  
---  ------        -----  
 0   obprice       float64
 1   obsize        float64
 2   oaprice       float64
 3   oasize        float64
 4   first         float64
 5   firstsize     float64
 6   hbprice       float64
 7   hbsize        float64
 8   haprice       float64
 9   hasize        float64
 10  high          float64
 11  highsize      float64
 12  lbprice       float64
 13  lbsize        float64
 14  laprice       float64
 15  lasize        float64
 16  low           float64
 17  lowsize       float64
 18  cbprice       float64
 19  cbsize        float64
 20  caprice       float64
 21  casize        float64
 22  last          float64
 23  lastsize      float64
 24  minspread     float64
 25  maxspread     float64
 26  price         float64
 27  nbbo          int64  
 

## Loading Algoseek Data

In [11]:
ohlcv_cols = ['first', 'high', 'low', 'last', 'price', 'volume']

In [12]:
data_cols = ohlcv_cols + ['up', 'down', 'rup', 'rdown', 'atask', 'atbid']

In [13]:
# Stack the selected columns of all yearly tables into one sorted frame.
years = range(2013, 2018)
with pd.HDFStore('algoseek.h5') as store:
    df = pd.concat([store.get(f'data/{year}')[data_cols]
                    for year in years]).sort_index()

KeyError: 'No object named data/2013 in the file'

In [None]:
# Forward-fill the first/high/low/last columns within each ticker. The
# grouping is by ticker only, so a gap at the start of a day is filled from
# the previous day's last available value.
# groupby(...).ffill() replaces the deprecated fillna(method='ffill') form.
df.loc[:, ohlcv_cols[:4]] = df.loc[:, ohlcv_cols[:4]].groupby('ticker').ffill()

In [None]:
# `show_counts` replaces `null_counts` (deprecated in pandas 1.2, removed in 2.0).
df.info(show_counts=True)

In [None]:
df.to_hdf('hf_data.h5', 'data')

In [None]:
df = pd.read_hdf('hf_data.h5', 'data')
# .loc[idx['AAPL', '2013'], :]

## Feature Engineering

All of the features above were normalized in a standard fashion by subtracting their means,
dividing by their standard deviations, and time-averaging over a recent interval.
In order to obtain a finite state space, features were discretized into bins in multiples of
standard deviation units.

In [None]:
df = df.sort_index()
# Calendar day of each bar as a midnight timestamp. normalize() stays
# vectorized; the earlier `.date` round trip created one Python date object
# per row only to parse it back into a timestamp.
df['date'] = df.index.get_level_values('date_time').normalize()

In [None]:
# `show_counts` replaces `null_counts` (deprecated in pandas 1.2, removed in 2.0).
df.info(show_counts=True)

In [None]:
# Per-ticker groups used by the talib indicator cells below;
# group_keys=False keeps apply() from prepending the ticker to the result index.
by_ticker = df.groupby('ticker', group_keys=False)
# NOTE(review): by_ticker_date is not used by any cell visible in this
# notebook - confirm it is still needed.
by_ticker_date = df.groupby(['ticker', 'date'])

In [None]:
data = pd.DataFrame(index=df.index)

In [None]:
data['date'] = pd.factorize(df['date'], sort=True)[0]

In [None]:
# Whole minutes elapsed since the earliest time of day found in the index.
# Integer arithmetic on the datetime components gives the same result as the
# former time -> str -> timedelta conversion without creating a Python
# object per row.
bar_times = data.index.get_level_values('date_time')
seconds_of_day = bar_times.hour * 3600 + bar_times.minute * 60 + bar_times.second
data['minute'] = ((seconds_of_day - seconds_of_day.min()) // 60).astype(int)

### Lagged Returns

In [None]:
# t-minute returns of `price`, lagged by one bar, computed strictly inside
# each (ticker, date) group:
#     ret_t[i] = price[i-1] / price[i-1-t] - 1
# This equals pct_change(periods=t, fill_method=None) shifted by one bar, but
# the shift is now grouped as well. Previously the trailing .shift() ran on
# the ungrouped result, so the first bar of every day received the last
# return of the preceding day or ticker. The other lagged features in this
# notebook already shift per (ticker, date).
prices_by_day = df.sort_index().groupby(['ticker', 'date']).price
lagged_price = prices_by_day.shift(1)  # loop-invariant numerator
for t in range(1, 11):
    print(t, end=' ', flush=True)
    data[f'ret{t}min'] = lagged_price.div(prices_by_day.shift(t + 1)).sub(1)
del lagged_price  # do not keep a full-length temporary in the kernel

### Forward Returns

In [None]:
# Target: the next bar's `ret1min` within the same (ticker, date) group.
ret1min_by_day = data.sort_index().groupby(['ticker', 'date'])['ret1min']
data['fwd1min'] = ret1min_by_day.shift(-1)

In [None]:
# `show_counts` replaces `null_counts` (deprecated in pandas 1.2, removed in 2.0).
data.info(show_counts=True)

### Normalized up/downtick volume

In [None]:
# Each tick-volume column divided by the bar's total volume ...
for tick_col in ['rup', 'up', 'down', 'rdown']:
    data[tick_col] = df[tick_col].div(df.volume)
# ... then lagged by one bar inside every (ticker, date) group.
for tick_col in ['up', 'down', 'rup', 'rdown']:
    data[tick_col] = data.groupby(['ticker', 'date'])[tick_col].shift()

### Balance of Power

In [None]:
def _lagged_bop(bars):
    """talib.BOP on one ticker's first/high/low/last columns, shifted by one bar."""
    bop = talib.BOP(bars['first'], bars.high, bars.low, bars['last'])
    return bop.shift()


data['BOP'] = by_ticker.apply(_lagged_bop)

### Commodity Channel Index

In [None]:
def _lagged_cci(bars):
    """talib.CCI (timeperiod=14) on one ticker's high/low/last, shifted by one bar."""
    cci = talib.CCI(bars.high, bars.low, bars['last'], timeperiod=14)
    return cci.shift()


data['CCI'] = by_ticker.apply(_lagged_cci)

### Money Flow Index

In [None]:
def _lagged_mfi(bars):
    """talib.MFI (timeperiod=14) on one ticker's high/low/last/volume, shifted by one bar."""
    mfi = talib.MFI(bars.high, bars.low, bars['last'], bars.volume,
                    timeperiod=14)
    return mfi.shift()


data['MFI'] = by_ticker.apply(_lagged_mfi)

### Stochastic RSI

In [None]:
def _lagged_stochrsi(bars):
    """First output of talib.STOCHRSI on one ticker's `last` column, shifted by one bar."""
    outputs = talib.STOCHRSI(bars['last'],
                             timeperiod=14,
                             fastk_period=14,
                             fastd_period=3,
                             fastd_matype=0)
    return outputs[0].shift()


data['STOCHRSI'] = by_ticker.apply(_lagged_stochrsi)

### Stochastic

In [None]:
def compute_stoch(x, fastk_period=14, slowk_period=3,
                  slowk_matype=0, slowd_period=3, slowd_matype=0):
    """Relative gap between the two talib.STOCH outputs, lagged by one bar.

    Args:
        x: frame with `high`, `low` and `last` columns.
        fastk_period, slowk_period, slowk_matype, slowd_period, slowd_matype:
            passed unchanged to talib.STOCH.

    Returns:
        ``slowd / slowk - 1`` shifted by one row.
    """
    stoch_params = dict(fastk_period=fastk_period,
                        slowk_period=slowk_period,
                        slowk_matype=slowk_matype,
                        slowd_period=slowd_period,
                        slowd_matype=slowd_matype)
    slowk, slowd = talib.STOCH(x.high, x.low, x['last'], **stoch_params)
    relative_gap = slowd / slowk - 1
    return relative_gap.shift()

In [None]:
# Apply compute_stoch per ticker and turn +/-inf into NaN; compute_stoch
# divides slowd by slowk, so infinities presumably come from slowk == 0.
data['STOCH'] = by_ticker.apply(compute_stoch).replace((np.inf, -np.inf), np.nan)

### Transaction Volume by price point

In [None]:
# (atask - atbid) / volume per bar, with +/-inf mapped to NaN, then lagged by
# one bar inside every (ticker, date) group.
ask_minus_bid = df.atask.sub(df.atbid)
data['trades_bid_ask'] = ask_minus_bid.div(df.volume).replace((np.inf, -np.inf), np.nan)
data['trades_bid_ask'] = data.groupby(['ticker', 'date'])['trades_bid_ask'].shift()

### Evaluate features

In [None]:
# Columns of `data` whose rank correlation with `fwd1min` is evaluated below.
features = ['ret1min', 'ret2min', 'ret3min', 'ret4min', 'ret5min', 
            'ret6min', 'ret7min', 'ret8min', 'ret9min', 'ret10min',
            'rup', 'up', 'down', 'rdown', 
            'BOP', 'CCI', 'MFI', 'STOCHRSI', 'STOCH', 
            'trades_bid_ask']

In [None]:
# Spearman rank correlation (and its p-value) between each feature and the
# forward return, using only rows where both are available.
ic_rows = []
for feature in features:
    print(feature)
    pair = data[['fwd1min', feature]].dropna()
    rho, p_value = spearmanr(pair['fwd1min'], pair[feature])
    ic_rows.append((rho, p_value))
ic = pd.DataFrame(ic_rows, index=features, columns=['IC', 'p-value'])

In [None]:
ic.sort_values('IC')

In [None]:
# Bar chart of the information coefficients, largest first.
# The index is upper-cased in place, so the CSV written afterwards uses the
# upper-case feature names as well.
ic.index = ic.index.map(str.upper)
ax = (ic['IC']
      .sort_values(ascending=False)
      .mul(100)
      .plot.bar(figsize=(14, 4),
                title='Information Coefficient for HF Features (1-min forward returns)',
                rot=0))
# The plotted values are multiplied by 100, so the label says so.
ax.set_ylabel('Information Coefficient (x 100)')
plt.tight_layout()
# savefig does not create missing directories.
Path('figures').mkdir(parents=True, exist_ok=True)
plt.savefig('figures/hft_ic', dpi=300);

In [None]:
ic.sort_values('IC').to_csv('hf_ic.csv')

### Store results

In [None]:
# Persist the feature frame under the key 'model_data' in hf_data.h5.
# NOTE(review): 'up' and 'down' are dropped here although they are part of
# `features` and were evaluated above - confirm this is intended.
data.drop(['date', 'up', 'down'], axis=1).to_hdf('hf_data.h5', 'model_data')

In [None]:
# `show_counts` replaces `null_counts` (deprecated in pandas 1.2, removed in 2.0).
data.info(show_counts=True)