In this notebook, we'll start to construct our dataset and explore a few modeling options so that later, we can create an optimized version of our code for production

# Dataset and feature construction

## Step 1: Get the high-level structure of our dataset in place

In [91]:
MinuteFrequencyToSubsample = 15
HoursInTradingDay = 6.5
MinutesInHour = 60
RecordsPerDay = (MinutesInHour * HoursInTradingDay / MinuteFrequencyToSubsample) + 1
NameOfColumnForTradingDaysInFuture = "Trading days in future"
LabelName = "returns"


TimeframesInTheFutureInTradingDays = [
    0.5 / HoursInTradingDay, # 3:30 PM
    1.0 / HoursInTradingDay, # 3:00 PM
    2.0 / HoursInTradingDay, # 2:00 PM
    3.0 / HoursInTradingDay, # 1:00 PM
    4.0 / HoursInTradingDay, # 12:00 PM
    5.0 / HoursInTradingDay, # 11:00 AM
    6.0 / HoursInTradingDay, # 10:00 AM
    1.0, # 9:30 AM
    2.0,
    3.0,
    4.0,
    5.0
]

In [92]:
import os
from price_modeling.src.main.code.file_based_historical_data_fetcher import FileBasedHistoricalDataFetcher
file_path = os.path.expanduser('~/Documents/projects/spyndicator-data/SPX_1min.txt')
data_fetcher = FileBasedHistoricalDataFetcher(file_path = file_path)

In [93]:
source_data = data_fetcher.get_historical_data()

In [94]:
evaluation_data_candidates = None
evaluation_data = None

In [95]:
from datetime import datetime
import pandas as pd
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
one_year_ago = datetime.today() - relativedelta(years = 1)
training_data_candidates = source_data[source_data.index < one_year_ago]
evaluation_data_candidates = source_data[source_data.index >= one_year_ago]

In [96]:
candidates = training_data_candidates[training_data_candidates.index.minute % 15 == 0]

In [97]:
def _get_forward_period_for_percentage_return(trading_days_in_future: float) -> int:
    """
    This method calculates the number of future periods in a dataframe that must be looked at, given a sub-sampled dataframe,
    in order to calculate the percentage return after `trading_days_in_future` in the future with respect to any given record

    :param trading_days_in_future (float): The number of trading days in the future that we need to predict for
    :return (int): The number of time periods in the future to look at given a sub-sampled dataframe
    """

    if trading_days_in_future >= 1.0:
        return int(round(trading_days_in_future * RecordsPerDay, 0))

    minutes_ahead = trading_days_in_future * MinutesInHour * HoursInTradingDay
    return int(round(minutes_ahead / MinuteFrequencyToSubsample, 0))

In [98]:
def _get_time_for_prediction_given_timerange_in_future(trading_days_in_future: float) -> str:
    """
    This method calculates the time at which we are making a prediction given the parameter for trading days in the future

    :param trading_days_in_future (float): The number of trading days in the future that we need to predict for
    :return (str): The string representation of the time for which we are predicting
    """

    if trading_days_in_future < 1.0:
        number_of_minutes_until_close = round(trading_days_in_future * MinutesInHour * HoursInTradingDay, 1)
        timestamp = datetime.now()
        timestamp = timestamp.replace(hour = 16, minute = 00)
        timestamp = timestamp - timedelta(minutes = number_of_minutes_until_close)
        return datetime.strftime(timestamp, "%H:%M")

    return "09:30"

In [99]:
def _create_dataset_with_future_percentage_return(candidates: pd.DataFrame, trading_days_in_future: float) -> pd.DataFrame:
    """
    This method is responsible for taking a source list of candidates and creating a new dataset from them, one which contains the
    percentage return of the stock given the timeframe in the future

    :param candidates (obj:`pd.DataFrame`): A dataframe that contains a list of OHLC data
    :param trading_days_in_future (float): The number of trading days in the future that we need to create labels for
    :return (obj:`pd.DataFrame`): A dataframe that contains a datetime column, the stock price, the days in the future, and the percentage return
    """

    time_at_prediction = _get_time_for_prediction_given_timerange_in_future(trading_days_in_future = trading_days_in_future)
    forward_periods_for_percentage_return = _get_forward_period_for_percentage_return(trading_days_in_future = trading_days_in_future)
    percentage_changes = candidates.pct_change(-forward_periods_for_percentage_return).between_time(time_at_prediction, time_at_prediction, inclusive = "both")
    percentage_changes[NameOfColumnForTradingDaysInFuture] = trading_days_in_future
    percentage_changes.rename(columns = {"Close": "returns"}, inplace = True)
    return percentage_changes[["returns", NameOfColumnForTradingDaysInFuture]]

In [100]:
def _create_dataset_with_future_percentage_return(candidates: pd.DataFrame, trading_days_in_future: float) -> pd.DataFrame:
    """
    This method is responsible for taking a source list of candidates and creating a new dataset from them, one which contains the
    percentage return of the stock given the timeframe in the future

    :param candidates (obj:`pd.DataFrame`): A dataframe that contains a list of OHLC data
    :param trading_days_in_future (float): The number of trading days in the future that we need to create labels for
    :return (obj:`pd.DataFrame`): A dataframe that contains a datetime column, the stock price, the days in the future, and the percentage return
    """

    time_at_prediction = _get_time_for_prediction_given_timerange_in_future(trading_days_in_future = trading_days_in_future)
    forward_periods_for_percentage_return = _get_forward_period_for_percentage_return(trading_days_in_future = trading_days_in_future)
    percentage_changes = candidates.pct_change(-forward_periods_for_percentage_return).between_time(time_at_prediction, time_at_prediction, inclusive = "both")
    percentage_changes[NameOfColumnForTradingDaysInFuture] = trading_days_in_future
    percentage_changes.rename(columns = {"Close": LabelName}, inplace = True)
    percentage_changes = percentage_changes[[LabelName, NameOfColumnForTradingDaysInFuture]]
    percentage_changes.dropna(subset = [LabelName])
    return percentage_changes

In [101]:
import numpy as np

In [102]:
def _construct_dataset_with_labels_from_candidates(candidates: pd.DataFrame) -> pd.DataFrame:
    """
    This method constructs a new dataset that uses the data from the candidates dataframe to act as the basis for creating training
    instances. Multiple new instances, one for each future time period that we would like to predict for, are created based on each
    record within the candidates dataframe.

    :param candidates (obj:`pd.DataFrame`): A dataframe that contains OHLC data
    :return (obj:`pd.DataFrame`): A dataframe in which each record represents a point in time and contains labels for the future return
    of the stock price at a specific point in the future (respective to the time of the instance)
    """

    dataset = None
    for prediction_timeframe in TimeframesInTheFutureInTradingDays:
        dataset_with_labels_for_future = _create_dataset_with_future_percentage_return(candidates = candidates, trading_days_in_future = prediction_timeframe)
        if dataset is None:
            dataset = dataset_with_labels_for_future
        else:
            dataset = pd.concat([dataset, dataset_with_labels_for_future])
    dataset.replace([np.inf, -np.inf], np.nan)
    dataset.dropna(subset = [LabelName], inplace = True)
    return dataset

In [103]:
# Here we subsample by minute to reduce the dataset size and all of the duplicative data that tends to exist from one minute to the next
training_data_candidates = training_data_candidates[training_data_candidates.index.minute % MinuteFrequencyToSubsample == 0]
training_data_candidates = _construct_dataset_with_labels_from_candidates(candidates = training_data_candidates)
# training_data = _add_features_to_base_instances_and_return_data(base_instances = training_data_candidates)

In [104]:
training_data_candidates

Unnamed: 0_level_0,returns,Trading days in future
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2007-04-30 15:30:00,0.003798,0.076923
2007-05-01 15:30:00,-0.001440,0.076923
2007-05-02 15:30:00,0.000628,0.076923
2007-05-03 15:30:00,0.000479,0.076923
2007-05-04 15:30:00,0.000306,0.076923
...,...,...
2022-02-17 09:30:00,0.033967,5.000000
2022-02-18 09:30:00,0.010634,5.000000
2022-02-22 09:30:00,-0.008057,5.000000
2022-02-23 09:30:00,0.000870,5.000000
