In [10]:
import warnings

import matplotlib.pyplot as plt  # was `import matplotlib as plt`, which binds the package, not the pyplot API
import numpy as np
import pandas as pd
import statsmodels

# NOTE(review): this silences every warning for the whole session, including
# pandas' SettingWithCopyWarning and deprecation notices. Consider narrowing it
# to specific categories once the notebook runs cleanly.
warnings.filterwarnings("ignore")


In [2]:
# Load the cleaned price table. The path is relative, so the CSV must sit in
# the notebook's working directory.
df = pd.read_csv('cleaned_data.csv')

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Open,High,Low,Close
count,7098.0,7098.0,7098.0,7098.0
mean,110.649592,111.10139,110.155538,110.644679
std,14.894489,14.980412,14.806835,14.894081
min,75.75,75.973999,75.57,75.739998
25%,104.18,104.61425,103.71225,104.176752
50%,110.698498,111.100502,110.308498,110.7085
75%,119.260752,119.782247,118.720001,119.262251
max,151.645996,151.936996,151.240005,151.645996


In [4]:
# Display the loaded frame; pandas elides the middle rows of a long frame.
df

Unnamed: 0,Date,Open,High,Low,Close
0,1996-10-29 19:00:00-05:00,114.370003,114.480003,113.610001,114.180000
1,1996-10-30 19:00:00-05:00,114.180000,114.180000,114.180000,114.180000
2,1996-10-31 19:00:00-05:00,113.500000,113.500000,113.500000,113.500000
3,1996-11-03 19:00:00-05:00,113.279999,113.980003,112.949997,113.879997
4,1996-11-04 19:00:00-05:00,113.709999,114.330002,113.449997,114.250000
...,...,...,...,...,...
7093,2024-01-07 19:00:00-05:00,144.716003,144.792999,143.669998,144.716003
7094,2024-01-08 19:00:00-05:00,144.229996,144.432007,143.453003,144.229996
7095,2024-01-09 19:00:00-05:00,144.427994,145.809998,144.438995,144.427994
7096,2024-01-10 19:00:00-05:00,145.733994,146.358994,145.279007,145.733994


In [5]:
# Use the 'Date' column as the index. Guarded reassignment instead of
# `inplace=True`: after the first run 'Date' is no longer a column, so the
# unguarded call would raise KeyError when the cell is re-executed.
if 'Date' in df.columns:
    df = df.set_index('Date')

# Split into train, validation, and test sets

In [6]:
# Positional 60% / 20% / 20% split into train, validation and test sets.
# Rows are taken in their existing order (no shuffling).
TRAIN_END_FRACTION = 0.6       # train = rows [0, 60%)
VALIDATION_END_FRACTION = 0.8  # validation = rows [60%, 80%), test = the rest

l = len(df)
split_point = int(TRAIN_END_FRACTION * l)
split_point2 = int(VALIDATION_END_FRACTION * l)

# .copy() gives each split its own data: later cells add columns to `train`,
# and doing that on a bare slice of `df` triggers SettingWithCopyWarning.
train = df.iloc[:split_point].copy()
validation = df.iloc[split_point:split_point2].copy()
test = df.iloc[split_point2:].copy()

# The three splits must partition the frame exactly.
assert len(train) + len(validation) + len(test) == l

In [7]:
# Row counts of the three splits; together they should add up to len(df).
len(train), len(validation), len(test)

(4258, 1420, 1420)

# The target: 3-day forward return

In [38]:
# Forecast horizon, in rows of the frame.
FORECAST_HORIZON = 3

# Target: relative change from the close at row t to the close FORECAST_HORIZON
# rows later. The last FORECAST_HORIZON rows have no future close inside
# `train`, so they come out as NaN and must be dropped or masked before fitting.
y_train = train['Close'].pct_change(FORECAST_HORIZON).shift(-FORECAST_HORIZON)
y_train

Date
1996-10-29 19:00:00-05:00   -0.002627
1996-10-30 19:00:00-05:00    0.000613
1996-10-31 19:00:00-05:00    0.003965
1996-11-03 19:00:00-05:00    0.000615
1996-11-04 19:00:00-05:00   -0.021969
                               ...   
2013-02-17 19:00:00-05:00   -0.000661
2013-02-18 19:00:00-05:00   -0.005339
2013-02-19 19:00:00-05:00         NaN
2013-02-20 19:00:00-05:00         NaN
2013-02-21 19:00:00-05:00         NaN
Name: Close, Length: 4258, dtype: float64

# Characteristics of FX daily OHLC data

In [47]:
class OHLCAnalyzer:
    """Derive return and volatility columns from the ``Close`` column of an OHLC frame.

    New columns are written onto the wrapped DataFrame. By default that is the
    very object passed to the constructor, so the caller's frame is modified in
    place; pass ``copy=True`` to work on a private copy instead.

    WARNING: the ``forward_return_*`` columns look ahead in time (they are built
    with a negative shift). They are prediction targets and must not be used as
    model inputs, even though ``prepare_features`` creates them next to the
    backward-looking columns.
    """

    # Annualisation factor; its square root scales the rolling std of 1-row returns.
    TRADING_DAYS_PER_YEAR = 252
    # Horizons and windows used by prepare_features() when none are supplied.
    DEFAULT_RETURN_PERIODS = (1, 3, 5, 10)
    DEFAULT_VOLATILITY_WINDOWS = (5, 10, 20)

    def __init__(self, ohlc_data, copy=False):
        """Wrap ``ohlc_data``.

        Args:
            ohlc_data: DataFrame with a ``Close`` column (the only column read here).
            copy: if True, operate on a copy and leave ``ohlc_data`` untouched.
                Defaults to False, which keeps the original in-place behaviour.
        """
        self.ohlc_data = ohlc_data.copy() if copy else ohlc_data

    @staticmethod
    def _require_positive(value, label):
        """Raise ValueError unless ``value`` is at least 1."""
        if value < 1:
            raise ValueError(f'{label} must be >= 1, got {value!r}')

    def add_forward_return_feature(self, periods):
        """Add ``forward_return_{periods}``: change in Close from row t to row t+periods.

        This is a look-ahead column (target, not an input). The last ``periods``
        rows are NaN because no future close exists for them.

        Raises:
            ValueError: if ``periods`` is smaller than 1.
        """
        self._require_positive(periods, 'periods')
        return_name = f'forward_return_{periods}'
        # pct_change looks back `periods` rows; shifting by -periods re-aligns
        # that value onto the row where the holding period starts.
        self.ohlc_data[return_name] = (
            self.ohlc_data['Close'].pct_change(periods).shift(-periods)
        )

    def add_return_feature(self, periods):
        """Add ``return_{periods}``: change in Close from row t-periods to row t.

        The first ``periods`` rows are NaN.

        Raises:
            ValueError: if ``periods`` is smaller than 1.
        """
        self._require_positive(periods, 'periods')
        return_name = f'return_{periods}'
        self.ohlc_data[return_name] = self.ohlc_data['Close'].pct_change(periods)

    def add_volatility_feature(self, window):
        """Add ``volatility_{window}``: annualised rolling std of 1-row returns.

        Raises:
            ValueError: if ``window`` is smaller than 1.
        """
        self._require_positive(window, 'window')
        vol_name = f'volatility_{window}'
        daily_returns = self.ohlc_data['Close'].pct_change()
        rolling_std = daily_returns.rolling(window=window).std()
        self.ohlc_data[vol_name] = rolling_std * (self.TRADING_DAYS_PER_YEAR ** 0.5)

    def prepare_features(self, return_periods=None, volatility_windows=None):
        """Add the full set of return and volatility columns.

        Args:
            return_periods: iterable of horizons; for each one both the forward
                and the backward return column are added. Defaults to
                ``DEFAULT_RETURN_PERIODS``.
            volatility_windows: iterable of rolling-window lengths. Defaults to
                ``DEFAULT_VOLATILITY_WINDOWS``.
        """
        if return_periods is None:
            return_periods = self.DEFAULT_RETURN_PERIODS
        if volatility_windows is None:
            volatility_windows = self.DEFAULT_VOLATILITY_WINDOWS

        for period in return_periods:
            self.add_forward_return_feature(period)
            self.add_return_feature(period)

        for window in volatility_windows:
            self.add_volatility_feature(window)

    def get_data(self):
        """Return the wrapped frame itself (not a copy) with all added columns."""
        return self.ohlc_data
    
        

In [48]:
# OHLCAnalyzer keeps a reference to the frame it is given and adds its columns
# to that same object, so `train` is modified in place here; get_data() hands
# back that same frame and the reassignment only rebinds the name.
# The forward_return_* columns are built with a negative shift (future closes):
# treat them as targets and exclude them from any model inputs.
analyzer = OHLCAnalyzer(train)
analyzer.prepare_features()
train = analyzer.get_data()

In [49]:
# Inspect the training frame with the derived columns.
# NOTE(review): the recorded output includes a 'return' column that none of the
# visible cells create. It presumably came from a cell that was later removed;
# confirm, or the notebook will not reproduce under Restart & Run All.
train

Unnamed: 0_level_0,Open,High,Low,Close,return,forward_return_1,return_1,forward_return_3,return_3,forward_return_5,return_5,forward_return_10,return_10,volatility_5,volatility_10,volatility_20
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1996-10-29 19:00:00-05:00,114.370003,114.480003,113.610001,114.180000,,0.000000,,-0.002627,,-0.002014,,-0.024785,,,,
1996-10-30 19:00:00-05:00,114.180000,114.180000,114.180000,114.180000,0.000000,-0.005956,0.000000,0.000613,,-0.002014,,-0.024785,,,,
1996-10-31 19:00:00-05:00,113.500000,113.500000,113.500000,113.500000,-0.005956,0.003348,-0.005956,0.003965,,-0.015507,,-0.021586,,,,
1996-11-03 19:00:00-05:00,113.279999,113.980003,112.949997,113.879997,0.003348,0.003249,0.003348,0.000615,-0.002627,-0.024236,,-0.021777,,,,
1996-11-04 19:00:00-05:00,113.709999,114.330002,113.449997,114.250000,0.003249,-0.002626,0.003249,-0.021969,0.000613,-0.025558,,-0.023370,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-02-17 19:00:00-05:00,93.750000,94.181999,93.702003,93.722000,0.008230,-0.000768,0.008230,-0.000661,0.004469,,0.012828,,0.010262,0.168989,0.158782,0.156108
2013-02-18 19:00:00-05:00,93.610001,93.903999,93.358002,93.650002,-0.000768,0.000619,-0.000768,-0.005339,0.004225,,-0.005744,,0.016245,0.100684,0.153432,0.153326
2013-02-19 19:00:00-05:00,93.708000,93.751999,93.139999,93.708000,0.000619,-0.000512,0.000619,,0.008079,,0.004319,,0.001475,0.068955,0.133002,0.144088
2013-02-20 19:00:00-05:00,93.650002,93.848000,92.809998,93.660004,-0.000512,-0.005445,-0.000512,,-0.000661,,0.004332,,0.002408,0.068938,0.132766,0.144371


In [51]:
# List the column names currently on the training frame.
train.columns

Index(['Open', 'High', 'Low', 'Close', 'return', 'forward_return_1',
       'return_1', 'forward_return_3', 'return_3', 'forward_return_5',
       'return_5', 'forward_return_10', 'return_10', 'volatility_5',
       'volatility_10', 'volatility_20'],
      dtype='object')