# WQU Capstone project - Short-term trading strategy on G10 currencies
## Notebook five - Putting previous steps together and Feature Engineering

* Sergey Chigrinov - chigrinov.s.88@gmail.com
* Dhruv Agraval -  dhruva1@stanfordalumni.org
* Man Sing Ho - mshoalbert@gmail.com

### Jun-Aug-2020

After these steps, the data should be ready for classification.

In [7]:
import sys
import os
# Project root: defaults to the parent of the current working directory.
# Replace with your own absolute path if needed, e.g. r'C:\WQU\Capstone\Working_files'.
path_to_project = os.path.realpath(os.pardir)
# Make the project-local WQUcapstoneCode package importable from this notebook.
sys.path.append(path_to_project)

In [8]:
import pandas as pd
import numpy as np
import datetime as dt
from pandas.tseries.offsets import BMonthEnd
from multiprocessing import cpu_count
#from statsmodels.tsa.stattools import adfuller
from tqdm import tqdm
import warnings
# Silence every warning category for the rest of the session to keep the
# notebook output readable (this also hides library deprecation warnings).
warnings.filterwarnings(action='ignore')

In [9]:
from WQUcapstoneCode.sampling import sampling
from WQUcapstoneCode.labeling import labeling
from WQUcapstoneCode.technical import technical
from WQUcapstoneCode.fracdif.fracdif import frac_diff_ffd

In [10]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Apply the plotting styles in order; settings from a later style override
# overlapping settings from an earlier one.
for style_name in ('seaborn-talk', 'bmh'):
    plt.style.use(style_name)

Settings

In [11]:
# Currency pairs to process; each one is read from input_data/<CCY1><CCY2>.csv.
tickers = ['AUD/USD','AUD/CAD','AUD/JPY','EUR/USD','GBP/USD','NZD/USD','USD/CAD', 'USD/JPY']
max_holding_period = 999 # days; passed as numDays when adding the vertical barrier
ticks_multiplyer = 1. # arbitrary scale applied to the average tick count per row (bar sampling threshold)
min_ret_target_vol_multiplier = 0.7 # minimum target return, as a fraction of the mean daily volatility
# Leave one core free for the system, but never request fewer than one worker
# (cpu_count() - 1 would be 0 on a single-core machine).
cpus = max(1, cpu_count() - 1)
d = 0.35 # fracdiff parameter. 1 = simple first-order differencing
# We could use ADF to find the non-stationary features; however,
# fractionally differentiated features may hold useful information for further analysis.
# Therefore, we use a default list of features to which fracDiff is applied.
non_stationary_feat = {'price','fast','slow','average','upper_band','lower_band','tenka_sen','kijun_sen','senkou_span_a','senkou_span_b'}

Data preparation

In [15]:
# Build one feature/label table per currency pair and write it to
# preprocessed_data/<PAIR>_feat.csv.
#
# For each ticker the loop:
#   1. loads the spot data and samples bars on 'tickqty',
#   2. computes volatility, events and labels through the `labeling` helpers,
#   3. computes the technical indicators, lagged prices/returns and rolling
#      autocorrelations,
#   4. adds the interest-rate differential and calendar dummies,
#   5. appends fractionally differentiated versions of the features listed in
#      `non_stationary_feat`,
#   6. drops every row containing a NaN and saves the result as CSV.
offset = BMonthEnd()
date_format = '%Y-%m-%d %H:%M:%S'


def feat(x):
    """Return the column names of *x* except 'price' (used as a callable indexer)."""
    return [col for col in x.columns if col != 'price']


# Import interbank interest rate data
# --------------------------------------------------------------------------------
# The workbook is identical for every pair, so it is read once here instead of
# once per loop iteration.
input_path_ir_data = os.path.join(path_to_project, 'input_data', 'interbank_IR_3m.xlsx')
ir = pd.read_excel(input_path_ir_data, sheet_name='consolidatedData')
ir.index = pd.to_datetime(ir['date'].values, format=date_format)

# Make sure the output directory exists before the first CSV is written.
preprocessed_dir = os.path.join(path_to_project, 'preprocessed_data')
os.makedirs(preprocessed_dir, exist_ok=True)

for ticker in tickers:

    # Print the ticker the loop is currently processing
    print(ticker + ": Data preparating in progres...")
    symbol = ticker.replace('/', '')

    # Import FX spot data
    # --------------------------------------------------------------------------------
    input_path = os.path.join(path_to_project, 'input_data', symbol + '.csv')
    pair = pd.read_csv(input_path)
    pair.index = pd.to_datetime(pair['date'].values, format=date_format)
    pair = pair.drop(columns=['date'])

    # Assign CCY1 and CCY2 from ticker
    # --------------------------------------------------------------------------------
    CCY1 = ticker[:3]
    CCY2 = ticker[-3:]

    # Calculate interest rate differentials
    # --------------------------------------------------------------------------------
    # NOTE(review): ir_d1 keeps the index of the interest-rate sheet. Any feature
    # row whose timestamp is missing from that index gets NaN here and is removed
    # by the final dropna() - confirm this alignment is intended.
    ir_d1 = (ir[CCY1] - ir[CCY2]).to_frame('ir_d1')

    # Sampling threshold: average tick quantity per row, scaled by the multiplier
    m_ticks = ticks_multiplyer * pair.tickqty.sum() / pair.shape[0]

    # Sampling
    # --------------------------------------------------------------------------------
    tick_df = sampling.sampled_bar_df(pair, 'tickqty', m_ticks)

    # Labeling
    # --------------------------------------------------------------------------------
    dailyVol = labeling.getDailyVol(tick_df.bidclose)
    dailyVol.name = 'volatility'
    close = tick_df[['bidclose', 'askclose']]
    close.columns = ['bid', 'ask']
    tEvents = labeling.getTEvents(close, h=dailyVol.mean())
    t1 = labeling.addVerticalBarrier(tEvents, close, numDays=max_holding_period)
    ptsl = [1, 1]  # symmetric take-profit and stop-loss
    target = dailyVol
    # select minRet
    minRet = dailyVol.mean() * min_ret_target_vol_multiplier
    close = (close['bid'] + close['ask']) / 2  # to simplify we'll work with the mid price
    events = labeling.getEvents(close, tEvents, ptsl, target, minRet, cpus, t1=t1)
    labels = labeling.getBins(events, close)
    labels = labeling.dropLabels(labels)

    # Calculating technical features
    # --------------------------------------------------------------------------------
    ema = technical.EMA(close)
    bb = technical.BollingerBands(close)
    cci = technical.CCI(close)
    so = technical.Stochastic(close)
    wr = technical.wr(close)
    ic = technical.Ichimoku(close)
    rsi = technical.RSI(close)

    # Prices lagged by 1, 2 and 12 bars, and the one-bar log return of each lag
    lagged_px = pd.concat([close.shift(1), close.shift(2), close.shift(12)], axis=1)
    lagged_px.columns = ['T-1', 'T-2', 'T-12']
    lagged_returns = pd.DataFrame(np.diff(np.log(lagged_px), axis=0),
                                  index=lagged_px.index[1:],
                                  columns=[c + '_1per_rtn' for c in lagged_px])
    # Log returns over 1, 2 and 12 bars. The shift must match the column label;
    # previously all three columns used shift(1) and were identical.
    period_returns = pd.concat([np.log(close / close.shift(n_bars)) for n_bars in (1, 2, 12)],
                               axis=1)
    period_returns.columns = ['T-1_rtn', 'T-2_rtn', 'T-12_rtn']
    # 2, 4, 6 were found to be correlated earlier
    lags = pd.concat([technical.rolling_autocorr(close, lag=n_lag) for n_lag in (1, 2, 4, 6)],
                     axis=1)

    # Every indicator exposes a 'side' column; prefix it with the indicator name
    # so the columns stay distinguishable after concatenation.
    ema.data = ema.data.rename(columns={'side': 'ema_side'})
    bb.data = bb.data.rename(columns={'side': 'bb_side'})
    so.data = so.data.rename(columns={'side': 'so_side'})
    cci.data = cci.data.rename(columns={'side': 'cci_side'})
    wr.data = wr.data.rename(columns={'side': 'wr_side'})
    ic.data = ic.data.rename(columns={'side': 'ic_side'})
    rsi.data = rsi.data.rename(columns={'side': 'rsi_side'})

    # 'price' is taken from the EMA frame only; it is filtered out of the others.
    features = pd.concat([ema(),
                          bb()[feat],
                          so()[feat],
                          cci()[feat],
                          wr()[feat],
                          ic()[feat],
                          rsi()[feat],
                          lagged_px,
                          lags,
                          lagged_returns,
                          period_returns,
                          ir_d1], axis=1)

    # Day of the week and month may be useful features as well because of
    # rebalancing flows, so we one-hot-encode them.
    features = pd.concat([features,
                          pd.get_dummies(features.index.day_name()).set_index(features.index),
                          pd.get_dummies(features.index.month_name()).set_index(features.index)],
                         axis=1)
    # End-of-month flag: 1 when the timestamp falls on the business month-end.
    # Whole dates are compared (not only the day of the month) so that a date
    # after the month-end cannot match the following month's end by coincidence.
    features['EOM'] = features.index.map(
        lambda x: 1 if offset.rollforward(x).date() == x.date() else 0)

    # Applying fracDiff
    # --------------------------------------------------------------------------------
    # Iterate in sorted order so the output column order is the same on every run
    # (iteration order of a set of strings is not stable between interpreter runs).
    df = pd.concat([frac_diff_ffd(pd.DataFrame(features[c].dropna()), diff_amt=d, thresh=1e-5)
                    for c in tqdm(sorted(non_stationary_feat))], axis=1)
    df.columns = [f'{c}_frdif' for c in df]

    # Keep only rows where every feature, the volatility and the label are available.
    result = pd.concat([features, df, dailyVol, labels], axis=1).dropna()
    preprocessed_path = os.path.join(preprocessed_dir, symbol + '_feat.csv')
    result.to_csv(preprocessed_path)

AUD/USD: Data preparating in progres...



100%|████████████████████████████████████████████████████████████████████████| 19050/19050 [00:00<00:00, 313134.11it/s][A

  0%|                                                                                        | 0/11722 [00:00<?, ?it/s][A
  2%|█▌                                                                          | 250/11722 [00:00<00:04, 2481.90it/s][A
  5%|███▌                                                                        | 555/11722 [00:00<00:04, 2623.83it/s][A
  8%|█████▉                                                                      | 920/11722 [00:00<00:03, 2860.59it/s][A
 10%|███████▌                                                                   | 1183/11722 [00:00<00:03, 2780.82it/s][A
 13%|█████████▋                                                                 | 1509/11722 [00:00<00:03, 2903.42it/s][A
 16%|███████████▊                                                               | 1849/11722 [00:00<00:03, 3030.54it/s][A
 19%|█████████

AUD/CAD: Data preparating in progres...



100%|████████████████████████████████████████████████████████████████████████| 19027/19027 [00:00<00:00, 276498.81it/s][A

  0%|                                                                                        | 0/11500 [00:00<?, ?it/s][A
  3%|█▉                                                                          | 293/11500 [00:00<00:03, 2908.79it/s][A
  6%|████▏                                                                       | 635/11500 [00:00<00:03, 3039.42it/s][A
  9%|██████▍                                                                     | 983/11500 [00:00<00:03, 3153.14it/s][A
 11%|████████▌                                                                  | 1314/11500 [00:00<00:03, 3191.78it/s][A
 14%|██████████▌                                                                | 1623/11500 [00:00<00:03, 3153.56it/s][A
 17%|████████████▊                                                              | 1970/11500 [00:00<00:02, 3235.63it/s][A
 20%|█████████

AUD/JPY: Data preparating in progres...



100%|████████████████████████████████████████████████████████████████████████| 19056/19056 [00:00<00:00, 796137.75it/s][A

  0%|                                                                                        | 0/11570 [00:00<?, ?it/s][A
  3%|██                                                                          | 319/11570 [00:00<00:03, 3166.91it/s][A
  6%|████▌                                                                       | 700/11570 [00:00<00:03, 3329.43it/s][A
  8%|██████▍                                                                     | 983/11570 [00:00<00:03, 3154.29it/s][A
 10%|███████▋                                                                   | 1194/11570 [00:00<00:04, 2394.61it/s][A
 12%|█████████▎                                                                 | 1430/11570 [00:00<00:04, 2378.87it/s][A
 15%|███████████▌                                                               | 1793/11570 [00:00<00:03, 2648.95it/s][A
 18%|█████████

EUR/USD: Data preparating in progres...



100%|████████████████████████████████████████████████████████████████████████| 19066/19066 [00:00<00:00, 331488.43it/s][A

  0%|                                                                                        | 0/11090 [00:00<?, ?it/s][A
  1%|▊                                                                           | 123/11090 [00:00<00:08, 1229.70it/s][A
  2%|█▏                                                                           | 177/11090 [00:00<00:12, 889.01it/s][A
  3%|██▏                                                                          | 313/11090 [00:00<00:10, 989.97it/s][A
  4%|███                                                                         | 444/11090 [00:00<00:09, 1065.67it/s][A
  5%|███▋                                                                        | 547/11090 [00:00<00:10, 1052.98it/s][A
  6%|████▍                                                                        | 640/11090 [00:00<00:14, 701.37it/s][A
  6%|████▉    


  0%|                                                                                           | 0/10 [00:00<?, ?it/s][A
 10%|████████▎                                                                          | 1/10 [00:04<00:42,  4.77s/it][A
 20%|████████████████▌                                                                  | 2/10 [00:08<00:36,  4.56s/it][A
 30%|████████████████████████▉                                                          | 3/10 [00:12<00:31,  4.43s/it][A
 40%|█████████████████████████████████▏                                                 | 4/10 [00:18<00:28,  4.78s/it][A
 50%|█████████████████████████████████████████▌                                         | 5/10 [00:22<00:23,  4.65s/it][A
 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:27<00:18,  4.52s/it][A
 70%|██████████████████████████████████████████████████████████                         | 7/10 [00:31<00:13,  4.34s/it][A
 80%|██████████

GBP/USD: Data preparating in progres...



100%|████████████████████████████████████████████████████████████████████████| 19031/19031 [00:00<00:00, 643682.66it/s][A

  0%|                                                                                        | 0/11346 [00:00<?, ?it/s][A
  3%|██                                                                          | 300/11346 [00:00<00:03, 2997.46it/s][A
  6%|████▏                                                                       | 628/11346 [00:00<00:03, 3074.73it/s][A
  8%|█████▉                                                                      | 893/11346 [00:00<00:03, 2931.28it/s][A
 11%|████████                                                                   | 1228/11346 [00:00<00:03, 3043.43it/s][A
 14%|██████████▎                                                                | 1563/11346 [00:00<00:03, 3122.94it/s][A
 17%|████████████▍                                                              | 1887/11346 [00:00<00:03, 3150.43it/s][A
 20%|█████████

NZD/USD: Data preparating in progres...



100%|███████████████████████████████████████████████████████████████████████| 19026/19026 [00:00<00:00, 1059884.55it/s][A

  0%|                                                                                        | 0/11630 [00:00<?, ?it/s][A
  3%|██                                                                          | 320/11630 [00:00<00:03, 3176.83it/s][A
  4%|██▊                                                                         | 433/11630 [00:00<00:05, 2050.13it/s][A
  6%|████▍                                                                       | 686/11630 [00:00<00:05, 2169.74it/s][A
  9%|███████                                                                    | 1096/11630 [00:00<00:04, 2523.19it/s][A
 12%|█████████▎                                                                 | 1451/11630 [00:00<00:03, 2758.23it/s][A
 16%|████████████▏                                                              | 1883/11630 [00:00<00:03, 3088.91it/s][A
 20%|█████████

USD/CAD: Data preparating in progres...



100%|████████████████████████████████████████████████████████████████████████| 19040/19040 [00:00<00:00, 763650.82it/s][A

  0%|                                                                                        | 0/11020 [00:00<?, ?it/s][A
  4%|██▉                                                                         | 432/11020 [00:00<00:02, 4288.71it/s][A
  6%|████▊                                                                       | 696/11020 [00:00<00:02, 3601.19it/s][A
  9%|██████▊                                                                    | 1000/11020 [00:00<00:02, 3403.85it/s][A
 13%|█████████▊                                                                 | 1435/11020 [00:00<00:02, 3634.80it/s][A
 17%|████████████▉                                                              | 1900/11020 [00:00<00:02, 3882.44it/s][A
 21%|███████████████▍                                                           | 2275/11020 [00:00<00:02, 3833.12it/s][A
 25%|█████████

USD/JPY: Data preparating in progres...



100%|████████████████████████████████████████████████████████████████████████| 19058/19058 [00:00<00:00, 406562.40it/s][A

  0%|                                                                                        | 0/11329 [00:00<?, ?it/s][A
  3%|██▍                                                                         | 366/11329 [00:00<00:03, 3633.48it/s][A
  7%|████▉                                                                       | 741/11329 [00:00<00:02, 3659.83it/s][A
 10%|███████▌                                                                   | 1145/11329 [00:00<00:02, 3758.47it/s][A
 13%|██████████                                                                 | 1526/11329 [00:00<00:02, 3765.61it/s][A
 17%|████████████▊                                                              | 1929/11329 [00:00<00:02, 3833.20it/s][A
 21%|███████████████▍                                                           | 2328/11329 [00:00<00:02, 3870.67it/s][A
 24%|█████████

### Conclusion
Now that all the features are ready, we can start experimenting with machine learning models.