# WQU Capstone project - Short-term trading strategy on G10 currencies
## Notebook five - Putting previous steps together and Feature Engineering

* Sergey Chigrinov - chigrinov.s.88@gmail.com
* Dhruv Agraval -  dhruva1@stanfordalumni.org
* Man Sing Ho - mshoalbert@gmail.com

### Jun-Aug-2020

After these steps, the data should be ready for classification.

In [7]:
import sys
import os
# Project root: by default the parent of the current working directory.
# Replace with your own absolute path (e.g. r'C:\WQU\Capstone\Working_files')
# if the notebook is started from a different location.
path_to_project = os.path.realpath(os.pardir)
# Extend the module search path so packages located in the project root
# (WQUcapstoneCode) can be imported.
sys.path.append(path_to_project)

In [8]:
import pandas as pd
import numpy as np
import datetime as dt
from pandas.tseries.offsets import BMonthEnd
from multiprocessing import cpu_count
#from statsmodels.tsa.stattools import adfuller
from tqdm import tqdm
import warnings
# Globally suppress every warning category for the rest of the session
# (presumably to keep the notebook output readable - note that this also hides
# deprecation warnings from pandas/numpy).
warnings.filterwarnings('ignore')

In [9]:
from WQUcapstoneCode.sampling import sampling
from WQUcapstoneCode.labeling import labeling
from WQUcapstoneCode.technical import technical
from WQUcapstoneCode.fracdif.fracdif import frac_diff_ffd

In [10]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever spelling this
# installation actually provides instead of failing on a missing style.
for _style in ('seaborn-talk', 'seaborn-v0_8-talk'):
    if _style in plt.style.available:
        plt.style.use(_style)
        break
# 'bmh' is applied last, so its settings take precedence where the two overlap.
plt.style.use('bmh')

Settings

In [11]:
# Currency pairs to process; each one is read from input_data/<PAIR>.csv
tickers = ['AUD/USD', 'AUD/CAD', 'AUD/JPY', 'EUR/USD', 'GBP/USD', 'NZD/USD', 'USD/CAD', 'USD/JPY']
max_holding_period = 999  # days, passed as numDays to the vertical barrier
ticks_multiplyer = 1.  # arbitrary scaling of the average tick count per row used as the bar threshold
min_ret_target_vol_multiplier = 0.7  # minimum return target as a fraction of mean daily volatility
# Leave one core free, but never request fewer than one worker
# (cpu_count() - 1 would be 0 on a single-core machine).
cpus = max(1, cpu_count() - 1)
d = 0.35  # fracdiff parameter. 1 = simple first-order differencing
# We could use ADF to find non-stationary features; however,
# fractionally differentiated features may hold useful information for further analysis.
# Therefore we use a default list of features to apply fracDiff to.
non_stationary_feat = {'price', 'fast', 'slow', 'average', 'upper_band', 'lower_band',
                       'tenka_sen', 'kijun_sen', 'senkou_span_a', 'senkou_span_b'}

Data preparation

In [13]:
# Per-ticker pipeline: read raw bars, resample on tick count, label events,
# compute technical/lagged/calendar features, add fractionally differentiated
# versions of the non-stationary ones and write everything to
# preprocessed_data/<PAIR>_feat.csv.
offset = BMonthEnd()

# Create the output folder up front so the final to_csv cannot fail on a
# missing directory.
preprocessed_dir = os.path.join(path_to_project, 'preprocessed_data')
os.makedirs(preprocessed_dir, exist_ok=True)


def feat(x):
    """Return every column name of *x* except 'price'.

    Used as a callable column selector (``df[feat]``) so that the price column,
    already contributed by the EMA frame, is not repeated for each indicator.
    """
    return [col for col in x.columns if col != 'price']


for ticker in tickers:
    file_stem = ticker.replace('/', '')  # 'AUD/USD' -> 'AUDUSD'
    input_path = os.path.join(path_to_project, 'input_data', file_stem + '.csv')
    pair = pd.read_csv(input_path)
    # Vectorised parse with the same strict format as before; passing .values
    # keeps the resulting index unnamed so the CSV layout does not change.
    pair.index = pd.to_datetime(pair['date'].values, format='%Y-%m-%d %H:%M:%S')
    pair = pair.drop(columns=['date'])
    # Bar threshold: average tick quantity per input row, scaled by the multiplier.
    m_ticks = ticks_multiplyer * pair.tickqty.sum() / pair.shape[0]

    # --- Sampling ---
    tick_df = sampling.sampled_bar_df(pair, 'tickqty', m_ticks)

    # --- Labeling ---
    dailyVol = labeling.getDailyVol(tick_df.bidclose)
    dailyVol.name = 'volatility'
    close = tick_df[['bidclose', 'askclose']].rename(
        columns={'bidclose': 'bid', 'askclose': 'ask'})
    tEvents = labeling.getTEvents(close, h=dailyVol.mean())
    t1 = labeling.addVerticalBarrier(tEvents, close, numDays=max_holding_period)
    ptsl = [1, 1]  # symmetric take-profit and stop-loss
    target = dailyVol
    # select minRet
    minRet = dailyVol.mean() * min_ret_target_vol_multiplier
    close = (close['bid'] + close['ask']) / 2  # to simplify we'll work with the mid price
    events = labeling.getEvents(close, tEvents, ptsl, target, minRet, cpus, t1=t1)
    labels = labeling.getBins(events, close)
    labels = labeling.dropLabels(labels)

    # --- Technical features ---
    ema = technical.EMA(close)
    bb = technical.BollingerBands(close)
    cci = technical.CCI(close)
    so = technical.Stochastic(close)
    wr = technical.wr(close)
    ic = technical.Ichimoku(close)
    rsi = technical.RSI(close)

    # Prices lagged by 1, 2 and 12 bars, and the one-bar log return of each lagged series.
    lagged_px = pd.concat([close.shift(1), close.shift(2), close.shift(12)], axis=1)
    lagged_px.columns = ['T-1', 'T-2', 'T-12']
    lagged_returns = pd.DataFrame(np.diff(np.log(lagged_px), axis=0),
                                  index=lagged_px.index[1:],
                                  columns=[c + '_1per_rtn' for c in lagged_px])
    # Log return over the last 1, 2 and 12 bars. The horizon must match the
    # column label (previously all three columns were the 1-bar return).
    period_returns = pd.concat([np.log(close / close.shift(n)) for n in (1, 2, 12)], axis=1)
    period_returns.columns = ['T-1_rtn', 'T-2_rtn', 'T-12_rtn']
    lags = pd.concat([technical.rolling_autocorr(close, lag=1),
                      technical.rolling_autocorr(close, lag=2),
                      technical.rolling_autocorr(close, lag=4),
                      technical.rolling_autocorr(close, lag=6)], axis=1)  # 2,4,6 were found to be correlated earlier

    # Give every indicator's 'side' column a unique name before concatenation.
    ema.data = ema.data.rename(columns={'side': 'ema_side'})
    bb.data = bb.data.rename(columns={'side': 'bb_side'})
    so.data = so.data.rename(columns={'side': 'so_side'})
    cci.data = cci.data.rename(columns={'side': 'cci_side'})
    # Rename wr's OWN data (it used to be overwritten with a copy of cci.data,
    # which dropped the Williams %R features and duplicated the CCI ones).
    wr.data = wr.data.rename(columns={'side': 'wr_side'})
    ic.data = ic.data.rename(columns={'side': 'ic_side'})
    rsi.data = rsi.data.rename(columns={'side': 'rsi_side'})
    features = pd.concat([ema(), bb()[feat], so()[feat], cci()[feat], wr()[feat],
                          ic()[feat], rsi()[feat], lagged_px, lags,
                          lagged_returns, period_returns], axis=1)
    # Day of the week and month may be useful features as well because of rebalancing flows;
    # we one-hot-encode them.
    features = pd.concat([features,
                          pd.get_dummies(features.index.day_name()).set_index(features.index),
                          pd.get_dummies(features.index.month_name()).set_index(features.index)], axis=1)
    # End-of-month flag: a timestamp lies on a business month end exactly when
    # rolling it forward leaves it unchanged. Comparing only the day-of-month
    # mis-flags e.g. Sun 2016-07-31, whose next business month end is 2016-08-31.
    features['EOM'] = features.index.map(lambda x: 1 if offset.rollforward(x) == x else 0)

    # --- Fractional differentiation ---
    # sorted() fixes the column order; iterating the set directly made the CSV
    # column order vary between runs.
    df = pd.concat([frac_diff_ffd(pd.DataFrame(features[c].dropna()), diff_amt=d, thresh=1e-5)
                    for c in tqdm(sorted(non_stationary_feat))], axis=1)
    df.columns = [f'{c}_frdif' for c in df]

    # Keep only rows where every feature, the volatility and the label are available.
    result = pd.concat([features, df, dailyVol, labels], axis=1).dropna()
    preprocessed_path = os.path.join(preprocessed_dir, file_stem + '_feat.csv')
    result.to_csv(preprocessed_path)

100%|████████████████████████████████████████████████████████████████████████| 19050/19050 [00:00<00:00, 489619.47it/s]
100%|██████████████████████████████████████████████████████████████████████████| 11722/11722 [00:03<00:00, 3618.58it/s]
2020-06-28 18:59:53.229695 100.0% applyPtSlOnT1 done after 0.06 minutes. Remaining 0.0 minutes..
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:21<00:00,  2.17s/it]
100%|████████████████████████████████████████████████████████████████████████| 19027/19027 [00:00<00:00, 681342.98it/s]
100%|██████████████████████████████████████████████████████████████████████████| 11500/11500 [00:02<00:00, 4656.20it/s]
2020-06-28 19:00:58.918743 100.0% applyPtSlOnT1 done after 0.05 minutes. Remaining 0.0 minutes..
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:23<00:00,  2.33s/it]
100%|███████████████████████████████████████████████████████████████████████| 19056/19

### Conclusion
Now that we have all the features ready, we can start experimenting with machine learning models.