In [26]:
import numpy as np
import pandas as pd
import matplotlib as plt
import statsmodels

import warnings
warnings.filterwarnings("ignore")


In [2]:
# Load the price table from the CSV file sitting next to the notebook.
df = pd.read_csv('cleaned_data.csv')

In [3]:
df.describe()

Unnamed: 0,Open,High,Low,Close
count,7098.0,7098.0,7098.0,7098.0
mean,110.649592,111.10139,110.155538,110.644679
std,14.894489,14.980412,14.806835,14.894081
min,75.75,75.973999,75.57,75.739998
25%,104.18,104.61425,103.71225,104.176752
50%,110.698498,111.100502,110.308498,110.7085
75%,119.260752,119.782247,118.720001,119.262251
max,151.645996,151.936996,151.240005,151.645996


In [4]:
df

Unnamed: 0,Date,Open,High,Low,Close
0,1996-10-29 19:00:00-05:00,114.370003,114.480003,113.610001,114.180000
1,1996-10-30 19:00:00-05:00,114.180000,114.180000,114.180000,114.180000
2,1996-10-31 19:00:00-05:00,113.500000,113.500000,113.500000,113.500000
3,1996-11-03 19:00:00-05:00,113.279999,113.980003,112.949997,113.879997
4,1996-11-04 19:00:00-05:00,113.709999,114.330002,113.449997,114.250000
...,...,...,...,...,...
7093,2024-01-07 19:00:00-05:00,144.716003,144.792999,143.669998,144.716003
7094,2024-01-08 19:00:00-05:00,144.229996,144.432007,143.453003,144.229996
7095,2024-01-09 19:00:00-05:00,144.427994,145.809998,144.438995,144.427994
7096,2024-01-10 19:00:00-05:00,145.733994,146.358994,145.279007,145.733994


In [5]:
# Use the 'Date' column as the row index.
# NOTE: this mutates df in place and is not idempotent: once 'Date' has been
# moved into the index it is no longer a column, so re-running this cell
# without reloading df raises KeyError.
df.set_index('Date', inplace=True)

# Split into train, validation, and test sets

In [6]:
# Positional 60% / 20% / 20% split (no shuffling): the first rows go to train,
# the next to validation and the remainder to test.
l = len(df)
split_point, split_point2 = int(0.6 * l), int(0.8 * l)
train = df.iloc[:split_point]
validation = df.iloc[split_point:split_point2]
test = df.iloc[split_point2:]

In [7]:
len(train), len(validation), len(test)

(4258, 1420, 1420)

# The target: 3-day forward return

In [8]:
# Target: percentage change of Close over the NEXT 3 rows.
# pct_change(3) is the trailing 3-row change; shift(-3) moves it back 3 rows
# so each row holds the change that follows it. The last 3 rows of the
# training set therefore have no target (NaN).
y_train = train['Close'].pct_change(3).shift(-3)
y_train

Date
1996-10-29 19:00:00-05:00   -0.002627
1996-10-30 19:00:00-05:00    0.000613
1996-10-31 19:00:00-05:00    0.003965
1996-11-03 19:00:00-05:00    0.000615
1996-11-04 19:00:00-05:00   -0.021969
                               ...   
2013-02-17 19:00:00-05:00   -0.000661
2013-02-18 19:00:00-05:00   -0.005339
2013-02-19 19:00:00-05:00         NaN
2013-02-20 19:00:00-05:00         NaN
2013-02-21 19:00:00-05:00         NaN
Name: Close, Length: 4258, dtype: float64

# Characteristics of FX daily OHLC data

In [9]:


class OHLCAnalyzer:
    """Derive return and rolling-volatility columns from an OHLC price table.

    Only the 'Close' column of the supplied data is read.
    """

    def __init__(self, ohlc_data):
        """Keep a reference to the price table.

        Args:
            ohlc_data: DataFrame with a 'Close' column. Its index is reused
                for the table built by ``prepare_features``.
        """
        self.ohlc_data = ohlc_data

    def calculate_return_feature(self, periods, forward=False):
        """Return the percentage change of Close over ``periods`` rows.

        Args:
            periods: Number of rows the change is measured over.
            forward: If False (default), each row holds the change from
                ``periods`` rows earlier up to that row. If True, the series
                is shifted back by ``periods`` rows so each row holds the
                change from that row to ``periods`` rows later; the last
                ``periods`` rows are then NaN.

        Returns:
            Series aligned with the index of the input data.
        """
        close_returns = self.ohlc_data['Close'].pct_change(periods)
        if forward:
            return close_returns.shift(-periods)
        return close_returns

    def calculate_volatility_feature(self, window, periods_per_year=252):
        """Return the rolling standard deviation of 1-row Close returns, annualised.

        Args:
            window: Number of rows in the rolling window.
            periods_per_year: Annualisation basis; the rolling standard
                deviation is multiplied by its square root. Defaults to 252.

        Returns:
            Series aligned with the index of the input data; rows without a
            full window of returns are NaN.
        """
        daily_returns = self.ohlc_data['Close'].pct_change()
        rolling_std = daily_returns.rolling(window=window).std()
        return rolling_std * (periods_per_year ** 0.5)

    def prepare_features(self, periods=(1, 3, 5, 10), windows=(5, 10, 20)):
        """Build a table of forward returns, trailing returns and volatilities.

        Args:
            periods: Horizons (in rows) for which ``forward_return_<p>`` and
                ``return_<p>`` columns are produced, interleaved per horizon.
            windows: Rolling windows (in rows) for which ``volatility_<w>``
                columns are produced.

        Returns:
            DataFrame indexed like the input data.

        Note:
            The ``forward_return_*`` columns look ahead in time. They are
            targets, not predictors, and must not be fed to a model as inputs.
        """
        feature_data = pd.DataFrame(index=self.ohlc_data.index)

        for period in periods:
            feature_data[f'forward_return_{period}'] = self.calculate_return_feature(period, forward=True)
            feature_data[f'return_{period}'] = self.calculate_return_feature(period)

        for window in windows:
            feature_data[f'volatility_{window}'] = self.calculate_volatility_feature(window)

        return feature_data

In [10]:
# Build the return / volatility table from the training split only.
analyzer = OHLCAnalyzer(train)
features_characteristics = analyzer.prepare_features()
features_characteristics 


Unnamed: 0_level_0,forward_return_1,return_1,forward_return_3,return_3,forward_return_5,return_5,forward_return_10,return_10,volatility_5,volatility_10,volatility_20
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1996-10-29 19:00:00-05:00,0.000000,,-0.002627,,-0.002014,,-0.024785,,,,
1996-10-30 19:00:00-05:00,-0.005956,0.000000,0.000613,,-0.002014,,-0.024785,,,,
1996-10-31 19:00:00-05:00,0.003348,-0.005956,0.003965,,-0.015507,,-0.021586,,,,
1996-11-03 19:00:00-05:00,0.003249,0.003348,0.000615,-0.002627,-0.024236,,-0.021777,,,,
1996-11-04 19:00:00-05:00,-0.002626,0.003249,-0.021969,0.000613,-0.025558,,-0.023370,,,,
...,...,...,...,...,...,...,...,...,...,...,...
2013-02-17 19:00:00-05:00,-0.000768,0.008230,-0.000661,0.004469,,0.012828,,0.010262,0.168989,0.158782,0.156108
2013-02-18 19:00:00-05:00,0.000619,-0.000768,-0.005339,0.004225,,-0.005744,,0.016245,0.100684,0.153432,0.153326
2013-02-19 19:00:00-05:00,-0.000512,0.000619,,0.008079,,0.004319,,0.001475,0.068955,0.133002,0.144088
2013-02-20 19:00:00-05:00,-0.005445,-0.000512,,-0.000661,,0.004332,,0.002408,0.068938,0.132766,0.144371


In [11]:
features_characteristics.columns

Index(['forward_return_1', 'return_1', 'forward_return_3', 'return_3',
       'forward_return_5', 'return_5', 'forward_return_10', 'return_10',
       'volatility_5', 'volatility_10', 'volatility_20'],
      dtype='object')

# Using technical indicators to build features

In [12]:
!pip install ta-lib



In [22]:
import talib
from technical_indicators import TAFeatures

In [34]:
class TAFeatures:
    """Compute TA-Lib technical indicators from an OHLC price table.

    Each indicator method forwards the stored price columns and its own
    arguments to the ``talib`` function of the same name and returns that
    function's result (``MACD`` returns only the first of the three outputs).
    """

    def __init__(self, ohlc_data):
        """Store the price table and references to its four price columns.

        Args:
            ohlc_data: Table with 'Open', 'High', 'Low' and 'Close' columns.
                Its ``index`` is reused by ``get_all_indicators``.
        """
        self.ohlc_data = ohlc_data
        self.open = ohlc_data['Open']
        self.high = ohlc_data['High']
        self.low = ohlc_data['Low']
        self.close = ohlc_data['Close']

    def ADX(self, timeperiod=14):
        """Return ``talib.ADX`` of high/low/close for ``timeperiod``."""
        return talib.ADX(self.high, self.low, self.close, timeperiod)

    def ADXR(self, timeperiod=14):
        """Return ``talib.ADXR`` of high/low/close for ``timeperiod``."""
        return talib.ADXR(self.high, self.low, self.close, timeperiod)

    def APO(self, fastperiod=12, slowperiod=26, matype=0):
        """Return ``talib.APO`` of close for the given periods and MA type."""
        return talib.APO(self.close, fastperiod, slowperiod, matype)

    def AROONOSC(self, timeperiod=14):
        """Return ``talib.AROONOSC`` of high/low for ``timeperiod``."""
        return talib.AROONOSC(self.high, self.low, timeperiod)

    def BOP(self):
        """Return ``talib.BOP`` of open/high/low/close."""
        return talib.BOP(self.open, self.high, self.low, self.close)

    def CCI(self, timeperiod=14):
        """Return ``talib.CCI`` of high/low/close for ``timeperiod``."""
        return talib.CCI(self.high, self.low, self.close, timeperiod)

    def MACD(self, fastperiod=12, slowperiod=26, signalperiod=9):
        """Return the MACD line only.

        ``talib.MACD`` yields three outputs (macd, signal, histogram); the
        signal and histogram outputs are discarded here.
        """
        macd, _macdsignal, _macdhist = talib.MACD(self.close, fastperiod, slowperiod, signalperiod)
        return macd

    def MOM(self, timeperiod=10):
        """Return ``talib.MOM`` of close for ``timeperiod``."""
        return talib.MOM(self.close, timeperiod)

    def RSI(self, timeperiod=14):
        """Return ``talib.RSI`` of close for ``timeperiod``."""
        # Forward the caller's period; it was previously hard-coded to 14,
        # so any non-default timeperiod was silently ignored.
        return talib.RSI(self.close, timeperiod=timeperiod)

    def ULTOSC(self, timeperiod1=7, timeperiod2=14, timeperiod3=28):
        """Return ``talib.ULTOSC`` of high/low/close for the three periods."""
        return talib.ULTOSC(self.high, self.low, self.close, timeperiod1, timeperiod2, timeperiod3)

    def BBANDS(self, timeperiod=5, nbdevup=2, nbdevdn=2, matype=0):
        """Return the result of ``talib.BBANDS`` on close, unmodified.

        Not included in ``get_all_indicators``.
        """
        return talib.BBANDS(self.close, timeperiod, nbdevup, nbdevdn, matype)

    def DEMA(self, timeperiod=30):
        """Return ``talib.DEMA`` of close for ``timeperiod``."""
        return talib.DEMA(self.close, timeperiod)

    def EMA(self, timeperiod=30):
        """Return ``talib.EMA`` of close for ``timeperiod``."""
        return talib.EMA(self.close, timeperiod)

    def MA(self, timeperiod=30, matype=0):
        """Return ``talib.MA`` of close for ``timeperiod`` and ``matype``."""
        return talib.MA(self.close, timeperiod, matype)

    def get_all_indicators(self):
        """Return a DataFrame with one column per indicator, at default settings.

        Columns, in order: ADX, ADXR, APO, AROONOSC, BOP, CCI, MACD, MOM, RSI,
        ULTOSC, DEMA, EMA, MA. The frame is indexed like the input data.
        """
        indicators_df = pd.DataFrame(index=self.ohlc_data.index)

        # Column name == method name; every method is called with its defaults.
        indicator_names = (
            'ADX', 'ADXR', 'APO', 'AROONOSC', 'BOP', 'CCI', 'MACD',
            'MOM', 'RSI', 'ULTOSC', 'DEMA', 'EMA', 'MA',
        )
        for name in indicator_names:
            indicators_df[name] = getattr(self, name)()
        return indicators_df

In [35]:
# Compute the technical indicators on the training split only.
ta_indicators = TAFeatures(train)
ta_features = ta_indicators.get_all_indicators()
ta_features


Unnamed: 0_level_0,ADX,ADXR,APO,AROONOSC,BOP,CCI,MACD,MOM,RSI,ULTOSC,DEMA,EMA,MA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1996-10-29 19:00:00-05:00,,,,,-0.218394,,,,,,,,
1996-10-30 19:00:00-05:00,,,,,0.000000,,,,,,,,
1996-10-31 19:00:00-05:00,,,,,0.000000,,,,,,,,
1996-11-03 19:00:00-05:00,,,,,0.582519,,,,,,,,
1996-11-04 19:00:00-05:00,,,,,0.613634,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-02-17 19:00:00-05:00,38.938878,42.493046,1.782987,71.428571,-0.058334,98.338755,1.484784,0.952003,64.476351,56.615309,93.911771,90.916215,90.860533
2013-02-18 19:00:00-05:00,37.348868,41.424805,1.786173,57.142857,0.073262,70.691302,1.440997,1.497002,63.941581,52.788137,94.059877,91.092588,91.073767
2013-02-19 19:00:00-05:00,35.580728,40.685470,1.700956,57.142857,0.000000,51.605218,1.394897,0.138000,64.199175,58.649570,94.195025,91.261324,91.301767
2013-02-20 19:00:00-05:00,33.504768,39.931169,1.637693,50.000000,0.009636,25.561699,1.339053,0.225006,63.793031,56.914405,94.305277,91.416078,91.486000
