In [2]:
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import time
import json

from datetime import datetime, timedelta

In [3]:
# Load the Bloomberg USD/JPY export. header=1 takes the column labels from the
# file's second line; the unnamed first column is given the name 'Datetime'.
raw_bloomberg_df = (
    pd.read_csv('usd_jpy_data.csv', header=1)
    .rename(columns={'Unnamed: 0': 'Datetime'})
)

raw_bloomberg_df

Unnamed: 0,Datetime,PX_OPEN,PX_HIGH,PX_LOW,PX_LAST,BID,ASK,PX_MID,high,low
0,2011-01-03,81.34,81.75,80.93,81.74,81.73,81.74,81.735,81.75,80.93
1,2011-01-04,81.74,82.28,81.61,82.04,82.03,82.05,82.038,82.28,81.61
2,2011-01-05,82.04,83.38,81.89,83.25,83.24,83.26,83.248,83.38,81.89
3,2011-01-06,83.25,83.40,82.88,83.33,83.32,83.34,83.327,83.40,82.88
4,2011-01-07,83.33,83.68,82.86,83.15,83.13,83.17,83.149,83.68,82.86
...,...,...,...,...,...,...,...,...,...,...
3601,2024-10-22,150.84,151.20,150.50,151.08,151.06,151.09,151.079,151.20,150.50
3602,2024-10-23,151.08,153.19,151.03,152.76,152.75,152.77,152.760,153.19,151.03
3603,2024-10-24,152.76,152.83,151.55,151.83,151.82,151.84,151.829,152.83,151.55
3604,2024-10-25,151.83,152.38,151.46,152.31,152.23,152.39,152.313,152.38,151.46


In [4]:
# # Find rows where PX_OPEN, PX_LAST, PX_HIGH, and PX_LOW have the same value
# same_value_rows = raw_bloomberg_df[
#     (raw_bloomberg_df['PX_OPEN'] == raw_bloomberg_df['PX_LAST']) &
#     (raw_bloomberg_df['PX_OPEN'] == raw_bloomberg_df['PX_HIGH']) &
#     (raw_bloomberg_df['PX_OPEN'] == raw_bloomberg_df['PX_LOW'])
# ]

# # Display the rows with the same values
# same_value_rows

Unnamed: 0,Datetime,PX_OPEN,PX_HIGH,PX_LOW,PX_LAST,BID,ASK,PX_MID,high,low


In [5]:
# Make sure that 'Datetime' is stored as a datetime dtype
raw_bloomberg_df['Datetime'] = pd.to_datetime(raw_bloomberg_df['Datetime'])

# Keep rows dated 1994 or later.
# NOTE(review): the previous comment said "from year 1995" (earlier years were
# described as having identical OHLC values), but the filter has always been
# `>= 1994`. The code's behaviour is kept as-is here -- confirm the intended cutoff.
from_cutoff_year = raw_bloomberg_df['Datetime'].dt.year >= 1994

# Select the OHLC columns and rename them by name rather than by position, so a
# change in column order cannot silently mislabel prices. The explicit .copy()
# makes `df` an independent frame: the column assignments done on `df` later in
# the notebook then never act on a slice of `raw_bloomberg_df`
# (avoids SettingWithCopyWarning).
df = (
    raw_bloomberg_df
    .loc[from_cutoff_year, ['Datetime', 'PX_OPEN', 'PX_HIGH', 'PX_LOW', 'PX_LAST']]
    .rename(columns={
        'Datetime': 'date',
        'PX_OPEN': 'open',
        'PX_HIGH': 'high',
        'PX_LOW': 'low',
        'PX_LAST': 'close',
    })
    .copy()
)

df

Unnamed: 0,date,open,high,low,close
0,2011-01-03,81.34,81.75,80.93,81.74
1,2011-01-04,81.74,82.28,81.61,82.04
2,2011-01-05,82.04,83.38,81.89,83.25
3,2011-01-06,83.25,83.40,82.88,83.33
4,2011-01-07,83.33,83.68,82.86,83.15
...,...,...,...,...,...
3601,2024-10-22,150.84,151.20,150.50,151.08
3602,2024-10-23,151.08,153.19,151.03,152.76
3603,2024-10-24,152.76,152.83,151.55,151.83
3604,2024-10-25,151.83,152.38,151.46,152.31


In [6]:
# Rows in which at least one of the four OHLC prices is null
ohlc_cols = ['open', 'high', 'low', 'close']
row_has_missing_price = df[ohlc_cols].isna().any(axis='columns')
missing_values_df = df[row_has_missing_price]
missing_values_df

Unnamed: 0,date,open,high,low,close


In [145]:
# # only the data from 2008-2011 has this issue

# # Drop rows where both 'open' and 'high' are null
# df = df.dropna(subset=['open', 'high'], how='all')

# # Step 2: Fill null values in 'low' and 'close' with values from 'open'
# df['low'] = df['low'].fillna(df['open']) # open == high
# df['close'] = df['close'].fillna(df['open']) # open == high

# # Display the modified DataFrame
# df

In [7]:
# Per-column count of missing values across the whole frame
null_counts_by_column = df.isna().sum()
print(null_counts_by_column)

date     0
open     0
high     0
low      0
close    0
dtype: int64


In [None]:
# Index(['date', 'open', 'high', 'low', 'close', 'gap_day', 'next_close',
#        'open_change', 'high_change', 'low_change', 'close_change',
#        'ma_close_5', 'ma_close_10', 'prev_open', 'prev_high', 'prev_low',
#        'prev_close', 'daily_range', 'gap_change', 'close_ma5_ratio',
#        'close_ma10_ratio'],
#       dtype='object')

In [8]:
# Normalise 'date' to a datetime dtype before doing any date arithmetic
df['date'] = pd.to_datetime(df['date'])

# Features Engineering

# Target variable to be predicted: the close of the following row
df['next_close'] = df['close'].shift(-1)

# Whole days between each row's date and the following row's date.
# The final row has no following row; its missing gap becomes 0 so the
# column can be cast to int.
days_to_next_row = df['date'].shift(-1) - df['date']
df['gap_day'] = days_to_next_row.dt.days.fillna(0).astype(int)

# Row-over-row change of each OHLC price (first row is NaN)
change_columns = {
    'open_change': 'open',
    'high_change': 'high',
    'low_change': 'low',
    'close_change': 'close',
}
for change_col, price_col in change_columns.items():
    df[change_col] = df[price_col].diff()

# Function to calculate moving average over a custom date range
def custom_moving_average(df, close_col, date_col, days):
    """Trailing calendar-day mean of ``close_col`` for every row of ``df``.

    For each row, averages ``close_col`` over all rows whose ``date_col`` lies
    in the closed interval ``[row_date - days, row_date]``. The window is
    measured in calendar days, not in rows, so gaps between dates (for example
    weekends) leave fewer observations inside the window. Missing values are
    skipped by the mean; a window with no usable value yields NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    close_col : str
        Name of the column to average.
    date_col : str
        Name of the datetime column that defines the window.
    days : int
        Window length in days before the row's own date (``days=4`` covers the
        row's date plus the 4 preceding calendar days).

    Returns
    -------
    list
        One mean per row, in the row order of ``df``.
    """
    if len(df) == 0:
        return []

    dates = df[date_col]
    closes = df[close_col]
    window = pd.Timedelta(days=days)

    # Fast path: when the dates are sorted, unique and complete, pandas'
    # time-based rolling window selects exactly the same rows as the explicit
    # date-range scan below, but in a single pass instead of building one
    # full-length boolean mask per row (which is quadratic in the row count).
    can_use_rolling = (
        days > 0
        and pd.api.types.is_datetime64_any_dtype(dates)
        and pd.api.types.is_numeric_dtype(closes)
        and not dates.isna().any()
        and dates.is_monotonic_increasing
        and dates.is_unique
    )
    if can_use_rolling:
        closes_by_date = pd.Series(
            closes.to_numpy(dtype=float, na_value=float('nan')),
            index=pd.DatetimeIndex(dates),
        )
        # closed='both' keeps both window edges, i.e. [date - days, date]
        return closes_by_date.rolling(window, closed='both').mean().tolist()

    # General fallback (unsorted, duplicated or missing dates, non-positive
    # `days`): scan the whole column for every row's date range.
    return [
        closes[(dates >= current_date - window) & (dates <= current_date)].mean()
        for current_date in dates
    ]

# Derived features, added in a single pass. The lambda entries receive the
# frame with the columns listed above them already attached.
df = df.assign(
    # Mean close over a trailing calendar-day window ending on the row's date
    ma_close_5=custom_moving_average(df, 'close', 'date', days=4),   # 4 previous days + today
    ma_close_10=custom_moving_average(df, 'close', 'date', days=9),  # 9 previous days + today
    # OHLC of the previous row
    prev_open=df['open'].shift(1),
    prev_high=df['high'].shift(1),
    prev_low=df['low'].shift(1),
    prev_close=df['close'].shift(1),
    # Daily range (high - low)
    daily_range=df['high'] - df['low'],
    # Gap change (this row's open - previous row's close)
    gap_change=lambda frame: frame['open'] - frame['prev_close'],
    # Close relative to each moving average
    close_ma5_ratio=lambda frame: frame['close'] / frame['ma_close_5'],
    close_ma10_ratio=lambda frame: frame['close'] / frame['ma_close_10'],
)

# Columns kept for prediction, in output order
model_columns = [
    'date', 'open', 'high', 'low', 'close', 'gap_day',
    'next_close', 'open_change', 'high_change',
    'low_change', 'close_change', 'ma_close_5',
    'ma_close_10', 'prev_open', 'prev_high',
    'prev_low', 'prev_close', 'daily_range', 'gap_change',
    'close_ma5_ratio', 'close_ma10_ratio',
]

# Keep only rows dated 2012-01-01 or later, restricted to the model columns
df = df.loc[df['date'] >= '2012-01-01', model_columns]

# Remove the final row: it has no following row, so its 'next_close' is missing
df = df.drop(index=df.index[-1])

df

Unnamed: 0,date,open,high,low,close,gap_day,next_close,open_change,high_change,low_change,...,ma_close_5,ma_close_10,prev_open,prev_high,prev_low,prev_close,daily_range,gap_change,close_ma5_ratio,close_ma10_ratio
260,2012-01-02,76.84,77.17,76.33,76.90,1,76.74,-0.80,-0.57,-0.56,...,77.150000,77.541667,77.64,77.74,76.89,76.91,0.84,-0.07,0.996760,0.991725
261,2012-01-03,76.90,76.97,76.63,76.74,1,76.72,0.06,-0.20,0.30,...,76.850000,77.427143,76.84,77.17,76.33,76.90,0.34,0.00,0.998569,0.991125
262,2012-01-04,76.74,76.83,76.61,76.72,1,77.12,-0.16,-0.14,-0.02,...,76.786667,77.338750,76.90,76.97,76.63,76.74,0.22,0.00,0.999132,0.991999
263,2012-01-05,76.72,77.25,76.66,77.12,1,76.97,-0.02,0.42,0.05,...,76.870000,77.231250,76.74,76.83,76.61,76.72,0.59,0.00,1.003252,0.998560
264,2012-01-06,77.12,77.34,76.96,76.97,3,76.86,0.40,0.09,0.30,...,76.890000,77.117500,76.72,77.25,76.66,77.12,0.38,0.00,1.001040,0.998087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3600,2024-10-21,149.56,150.89,149.09,150.84,1,151.08,-0.65,0.61,-0.28,...,150.193333,149.863333,150.21,150.28,149.37,149.53,1.80,0.03,1.004306,1.006517
3601,2024-10-22,150.84,151.20,150.50,151.08,1,152.76,1.28,0.31,1.41,...,150.483333,150.037143,149.56,150.89,149.09,150.84,0.70,0.00,1.003965,1.006951
3602,2024-10-23,151.08,153.19,151.03,152.76,1,151.83,0.24,1.99,0.53,...,151.560000,150.377500,150.84,151.20,150.50,151.08,2.16,0.00,1.007918,1.015843
3603,2024-10-24,152.76,152.83,151.55,151.83,1,152.31,1.68,-0.36,0.52,...,151.627500,150.636250,151.08,153.19,151.03,152.76,1.28,0.00,1.001336,1.007925


In [9]:
# Tally how many rows carry each distinct 'gap_day' value (most frequent first)
gap_days = df['gap_day']
gap_day_frequency = gap_days.value_counts()

# Show the tally
print(gap_day_frequency)

gap_day
1    2676
3     669
Name: count, dtype: int64


In [10]:
# Destination file for the processed feature table
filename = 'bloomberg_usd_jpy_2012_2024_processed.csv'

# Persist the frame as CSV, leaving the row index out of the file
df.to_csv(path_or_buf=filename, index=False)

print(f'DataFrame written to {filename}')

DataFrame written to bloomberg_usd_jpy_2012_2024_processed.csv
