In this notebook, we'll start constructing our dataset and explore a few modeling options, so that we can later create an optimized version of this code for production.

# Dataset and feature construction

## Step 1: Get the high-level structure of our dataset in place

In [18]:
# Spacing, in minutes, between the records kept when sub-sampling.
MinuteFrequencyToSubsample = 15

# Unit conversion and the length of one trading session in hours.
MinutesInHour = 60
HoursInTradingDay = 6.5

# Number of sub-sampled records per session: the count of intervals in the day plus one
# (the extra record presumably accounts for both the first and last timestamp -- confirm).
RecordsPerDay = 1 + (HoursInTradingDay * MinutesInHour) / MinuteFrequencyToSubsample

In [11]:
import os
from price_modeling.src.main.code.file_based_historical_data_fetcher import FileBasedHistoricalDataFetcher
# Path to the source price file; `~` is expanded to the current user's home directory.
# NOTE(review): judging by the file name this is 1-minute SPX data -- confirm.
file_path = os.path.expanduser('~/Documents/projects/spyndicator-data/SPX_1min.txt')
# Project-level fetcher configured to read from that path.
data_fetcher = FileBasedHistoricalDataFetcher(file_path = file_path)

In [4]:
# Load the full history from the fetcher. Later cells filter this object through `.index`
# and `.index.minute`, so it is presumably a pandas object with a datetime index -- TODO confirm.
source_data = data_fetcher.get_historical_data()

In [17]:
# Reset both evaluation placeholders to None.
evaluation_data_candidates = evaluation_data = None

In [6]:
from datetime import datetime
import pandas as pd
from dateutil.relativedelta import relativedelta
# Cutoff between training and evaluation data: exactly one year before the moment this cell runs.
one_year_ago = datetime.today() - relativedelta(years = 1)

# Rows strictly before the cutoff become training candidates;
# rows at or after the cutoff become evaluation candidates.
training_data_candidates = source_data.loc[source_data.index < one_year_ago]
evaluation_data_candidates = source_data.loc[source_data.index >= one_year_ago]

In [9]:
# Keep only the records whose minute-of-hour is a multiple of the configured sub-sampling
# interval. The interval comes from MinuteFrequencyToSubsample (instead of a repeated literal)
# so this filter stays in step with RecordsPerDay.
# NOTE(review): a minute-of-hour modulo only yields evenly spaced records when the interval
# divides 60 -- confirm before changing the interval.
training_data_candidates = training_data_candidates[
    training_data_candidates.index.minute % MinuteFrequencyToSubsample == 0
]

In [19]:
def _get_time_for_prediction_given_timerange_in_future(trading_days_in_future: float) -> str:
    """
    This method calculates the time at which we are making a prediction given the parameter for trading days in the future

    :param trading_days_in_future (float): The number of trading days in the future that we need to predict for
    :return (str): The string representation of the time for which we are predicting
    """

    if trading_days_in_future < 1.0:
        number_of_minutes_after_open = trading_days_in_future * MinutesInHour * HoursInTradingDay
        hours_to_add = int(number_of_minutes_after_open / MinutesInHour)
        minutes_to_add = number_of_minutes_after_open - (hours_to_add * MinutesInHour)
        hours = 9 + hours_to_add
        minutes = minutes_to_add
        times_for_training_data_to_keep = str(hours) + ":" + str(minutes)
        return times_for_training_data_to_keep

    return "09:30"

In [20]:
def _get_forward_period_for_percentage_return(trading_days_in_future: float) -> int:
    """
    This method calculates the number of future periods in a dataframe that must be looked at, given a sub-sampled dataframe,
    in order to calculate the percentage return after `trading_days_in_future` in the future with respect to any given record

    :param trading_days_in_future (float): The number of trading days in the future that we need to predict for
    :return (int): The number of time periods in the future to look at given a sub-sampled dataframe
    """

    if trading_days_in_future >= 1.0:
        return trading_days_in_future * RecordsPerDay

    minutes_ahead = trading_days_in_future * MinutesInHour * HoursInTradingDay
    return minutes_ahead / MinuteFrequencyToSubsample

In [23]:
# Half an hour expressed as a fraction of one trading day, using the shared
# HoursInTradingDay constant rather than a repeated literal.
time_at_prediction = _get_time_for_prediction_given_timerange_in_future(
    trading_days_in_future = 0.5 / HoursInTradingDay
)

In [24]:
# Show the computed time string as this cell's output.
time_at_prediction

'9:30.000000000000004'