In [2]:
import os
import sys
import yaml

# Add the directories named by the CODE_PATH and FIN_DATABASE_PATH environment
# variables to the import path. os.getenv returns None for an unset variable,
# so unset entries are skipped instead of being appended as None, and a path
# already present is not appended again (keeps the cell safe to re-run).
for _env_var in ("CODE_PATH", "FIN_DATABASE_PATH"):
    _extra_path = os.getenv(_env_var)
    if _extra_path and _extra_path not in sys.path:
        sys.path.append(_extra_path)

import plotly.graph_objects as go
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import logging
from dotenv import load_dotenv
from log_config import setup_logging
from Data.connect import engine, DailyStockData, HourlyStockData, OneMinuteStockData, FiveMinuteStockData,FifteenMinuteStockData, StockSplits, StockNews, CompanyFinancials
from data_fetcher import DataFetcher
import yfinance as yf
import matplotlib.pyplot as plt
from pandas.tseries.offsets import DateOffset
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression



In [None]:
# Fetch the data from SQL for every ticker used in this notebook.
# Each ticker is listed once ('DIS' was previously duplicated).
tickers = ['AAPL', 'MSFT', 'DIS', 'JPM', 'SPY']
data = DataFetcher(tickers)

In [None]:
# Download the full available daily history of the ^VIX index via yfinance.
# The result is passed to PreProcessing as iv_data further down.
vix = yf.download('^VIX',period = 'max', interval= '1d')

In [None]:
# Load intraday bars for all tickers and keep the AAPL frame.
# NOTE(review): the variable names say "minute" but timespan='hour' is requested.
# The PreProcessing pipeline below floors bars to 5-minute buckets and keeps only
# days with ~85% of the expected 5-minute bars, so hourly bars would presumably
# leave no valid days. Confirm the intended timespan value against
# DataFetcher.get_stock_data.
minute_data = data.get_stock_data(timespan = 'hour')
apple_min = minute_data['AAPL']

In [None]:
# Preview the first rows only instead of rendering the whole frame.
apple_min.head()

In [None]:
# daily_data = data.get_stock_data() #default timespan is daily
# apple_daily = daily_data['AAPL']

In [None]:
class BaseProcessing:
    """Common state shared by the processing classes in this notebook.

    An untouched copy of the input frame is kept next to a working copy, so
    the working copy can be restored at any time with ``reset_data``.

    Args:
        data: Input frame; it is copied, so the caller's object is not modified
            through ``self.data``.
        iv_data: Optional second frame, stored by reference (not copied).
        period: Aggregation period string used by later processing steps.
    """

    def __init__(self, data: pd.DataFrame, iv_data: pd.DataFrame = None, period: str = '5T'):
        self.period = period
        self.iv_data = iv_data
        # Two independent copies: one pristine, one to be transformed.
        self.original_data = data.copy()
        self.data = data.copy()

    def set_period(self, period: str = '5T'):
        """Replace the aggregation period used by later steps."""
        self.period = period

    def reset_data(self):
        """Throw away changes to ``data`` and start again from the original copy."""
        self.data = self.original_data.copy()


In [None]:
class PreProcessing(BaseProcessing):
    """Clean raw intraday bars and aggregate them to a fixed intraday period.

    Args:
        data: Raw bars with a 'date' column plus 'open', 'high', 'low' and
            'close' columns.
        iv_data: Date-indexed frame (the notebook passes daily VIX data); it is
            only used by ``preprocess_iv``.
        period: Pandas offset alias for the aggregation bucket. The default
            '5T' is the legacy spelling of '5min'.
    """

    TRADING_MINUTES = 395  # 09:30 to 16:05

    def __init__(self, data: pd.DataFrame, iv_data: pd.DataFrame, period: str = '5T'):
        super().__init__(data, iv_data,period)

    def _intervals_per_day(self):
        """Number of ``period``-sized buckets in one 09:30-16:05 session."""
        # total_seconds() is the whole duration; the .seconds attribute only
        # holds the sub-day component and is 0 for periods such as '1D'.
        minutes = int(pd.Timedelta(self.period).total_seconds() // 60)
        if not 1 <= minutes <= self.TRADING_MINUTES:
            raise ValueError(
                f"period must be between 1 and {self.TRADING_MINUTES} minutes, got {self.period!r}"
            )
        return self.TRADING_MINUTES // minutes

    def filter_market_hours(self):
        """Keep only rows whose time of day lies in 09:30-16:05 (inclusive).

        The window follows the recommendation of Brownlees and Gallo (2006).
        The 'date' column is parsed to datetimes and a 'day' column holding
        the calendar date is added.

        Returns:
            self, to allow chaining.
        """
        self.data['date'] = pd.to_datetime(self.data['date'])
        self.data['day'] = self.data['date'].dt.date
        self.data['time'] = self.data['date'].dt.time
        start_time = datetime.strptime('09:30', '%H:%M').time()
        end_time = datetime.strptime('16:05', '%H:%M').time()
        in_session = (self.data['time'] >= start_time) & (self.data['time'] <= end_time)
        # 'time' is only a helper for the filter, so it is dropped again.
        self.data = self.data[in_session].drop(columns=['time'])
        return self

    def setting_index(self):
        """Set the 'date' column as the index of the dataframe.

        Returns:
            self, to allow chaining.
        """
        self.data = self.data.set_index('date')
        return self

    def aggregate_min_data(self):
        """Aggregate the bars into ``period`` buckets (OHLC only).

        Rows are grouped by their timestamp floored to ``period``. Only
        'open' (first), 'high' (max), 'low' (min) and 'close' (last) survive;
        every other column is dropped by the aggregation.

        Returns:
            self, to allow chaining.
        """
        buckets = self.data.index.floor(self.period)
        self.data = self.data.groupby(buckets).aggregate({
            'open': 'first',
            'high': 'max',
            'low' : 'min',
            'close': 'last'
        })
        return self

    def preprocess_iv(self):
        """Restrict ``iv_data`` to the calendar dates present in ``data``.

        Must be called after ``setting_index`` so that ``data`` has a
        DatetimeIndex. Matching is done on calendar dates. A copy of
        ``iv_data`` is filtered, so the frame passed in by the caller is left
        untouched.

        Returns:
            The filtered ``iv_data`` frame (note: not ``self``).

        Raises:
            TypeError: if ``data`` is not indexed by a DatetimeIndex.
        """
        if not isinstance(self.data.index, pd.DatetimeIndex):
            raise TypeError("preprocess_iv requires data indexed by datetime; call setting_index() first")

        iv = self.iv_data.copy()
        iv.index = pd.to_datetime(iv.index)
        # DatetimeIndex.date yields a plain ndarray of date objects, which has
        # no .unique(); wrapping it in an Index provides unique() and isin().
        ticker_dates = pd.Index(self.data.index.date).unique()
        iv_dates = pd.Index(iv.index.date)
        self.iv_data = iv[iv_dates.isin(ticker_dates)]
        return self.iv_data

    def threshold_filter(self, threshold_ratio = 0.85):
        """Keep only the days that have enough intraday observations.

        (85% used as suggested in Hensen and Huang's paper).

        Args:
            threshold_ratio (float): Minimum share of the expected number of
                intervals per day that a day needs in order to be kept.

        Returns:
            self, to allow chaining.

        Raises:
            ValueError: if ``period`` is shorter than one minute or longer
                than the trading session.
        """
        threshold = int(self._intervals_per_day() * threshold_ratio)

        days = self.data.index.normalize()
        daily_counts = self.data.groupby(days).size()
        valid_days = daily_counts[daily_counts >= threshold].index
        self.data = self.data[days.isin(valid_days)]

        return self

    def pre_processor(self):
        """Run the full preprocessing pipeline and return the resulting frame.

        Order: market-hours filter, index on 'date', aggregation to
        ``period``, then the per-day completeness filter. ``preprocess_iv`` is
        not part of this pipeline.
        """
        return self.filter_market_hours().setting_index().aggregate_min_data().threshold_filter().data

In [None]:
class ModelInputs(BaseProcessing):
    """Compute the inputs (target and features) for the HAR model.

    Args:
        data: Pre-processed data: a datetime-indexed frame with 'open',
            'high', 'low' and 'close' columns.
        period: Pandas offset alias of the bar size. The default '5T' is the
            legacy spelling of '5min'.
    """

    TRADING_MINUTES = 395  # 09:30 to 16:05

    def __init__(self, data: pd.DataFrame, period: str = '5T'):
        # period must go by keyword: BaseProcessing's second positional
        # parameter is iv_data, so passing it positionally would leave
        # self.period at its default and store the period string as iv_data.
        super().__init__(data, period=period)

    def _intervals_per_day(self):
        """Number of ``period``-sized buckets in one 09:30-16:05 session."""
        # total_seconds() is the whole duration; the .seconds attribute only
        # holds the sub-day component and is 0 for periods such as '1D'.
        minutes = int(pd.Timedelta(self.period).total_seconds() // 60)
        if not 1 <= minutes <= self.TRADING_MINUTES:
            raise ValueError(
                f"period must be between 1 and {self.TRADING_MINUTES} minutes, got {self.period!r}"
            )
        return self.TRADING_MINUTES // minutes

    def calculate_return(self):
        """Add 'return' (close-to-close pct change) and 'squared_return'.

        The first row, which has no previous close, is dropped.
        NOTE(review): pct_change runs across consecutive rows, so the first
        bar of each day measures the move from the previous day's last bar -
        confirm this is intended.

        Returns:
            self, to allow chaining.
        """
        self.data['return'] = self.data['close'].pct_change()
        self.data = self.data.dropna(subset=['return'])
        self.data['squared_return'] = self.data['return']**2

        return self

    def set_target(self):
        """Add 'target': the log squared return of the NEXT bar.

        Bars whose squared return is exactly zero get 0.0 instead of log(0).
        The column is then shifted up by one row and the last row (which has
        no next bar) is dropped.
        NOTE(review): 0.0 equals log(1), far above typical log squared
        returns - confirm this placeholder is what the model should see.

        Returns:
            self, to allow chaining.
        """
        self.data['target'] = 0.0  # Initialize as float

        non_zero_mask = self.data['squared_return'] != 0
        self.data.loc[non_zero_mask, 'target'] = np.log(self.data.loc[non_zero_mask, 'squared_return'])

        self.data['target'] = self.data['target'].shift(-1)
        self.data = self.data.dropna(subset=['target'])

        return self

    def realised_vol(self):
        """Add the lagged and rolling realised-variance features.

        Columns added:
            LRV_lag1: previous bar's squared return.
            LRV_daily: log of the rolling sum over one day of bars.
            LRV_weekly: log of the rolling mean over 4 days of bars, lagged 1 bar.
            LRV_monthly: log of the rolling mean over 17 days of bars, lagged 5 bars.

        Returns:
            self, to allow chaining.

        Raises:
            ValueError: if ``period`` is shorter than one minute or longer
                than the trading session.
        """
        intervals_per_day = self._intervals_per_day()
        squared = self.data['squared_return']

        # NOTE(review): unlike the other LRV_* columns and the target, this
        # feature is not log-transformed. The original code followed it with
        # an np.where that replaced zeros by zero (no effect on the values),
        # which suggests a log transform may have been intended - confirm.
        self.data['LRV_lag1'] = squared.shift(1)

        self.data['LRV_daily'] = np.log(squared.rolling(window = intervals_per_day).sum())
        self.data['LRV_weekly'] = np.log(squared.shift(1).rolling(window = intervals_per_day * 4).mean())
        self.data['LRV_monthly'] = np.log(squared.shift(5).rolling(window = intervals_per_day * 17).mean())

        return self

    def diurnal_profile(self, window = 21):
        """Add a standardised diurnal profile based on Garman-Klass volatility.

        For every time-of-day bucket, the 25%, 50% and 75% rolling quantiles
        of the Garman-Klass volatility over the last ``window`` observations
        of that bucket are averaged (quantiles are more robust to outliers
        than a mean). The result is divided by its overall mean.

        Columns added: 'garma_klass_vol', 'time_of_day', 'diurnal_profile'.
        Rows without a profile value are dropped, i.e. the first
        ``window - 1`` observations of every bucket.

        Args:
            window: Number of observations per time bucket in each rolling quantile.

        Returns:
            self, to allow chaining.
        """
        log_high_low = np.log(self.data['high']) - np.log(self.data['low'])
        log_close_open = np.log(self.data['close']) - np.log(self.data['open'])
        self.data['garma_klass_vol'] = np.sqrt(
            0.5 * log_high_low**2 - (2 * np.log(2) - 1) * log_close_open**2
        )

        by_time = self.data.groupby(self.data.index.time)['garma_klass_vol']

        def rolling_quantile(q):
            # transform returns a Series aligned to the original index, so no
            # multi-index clean-up or merge is needed afterwards.
            return by_time.transform(lambda bucket: bucket.rolling(window=window).quantile(q))

        profile = (rolling_quantile(0.25) + rolling_quantile(0.5) + rolling_quantile(0.75)) / 3

        self.data['time_of_day'] = self.data.index.time
        # The mean is taken over the whole sample, not per training window.
        self.data['diurnal_profile'] = profile / profile.mean()
        self.data = self.data.dropna(subset=['diurnal_profile'])

        return self

    def preprocess(self):
        """Run all steps (returns, target, RV features, diurnal profile) and return the frame."""
        return self.calculate_return().set_target().realised_vol().diurnal_profile().data
    

In [None]:
# Wrap the AAPL bars (and the VIX frame as iv_data) for cleaning; the default 5-minute period is used.
preprocess = PreProcessing(apple_min,vix)

In [None]:
# Run the cleaning pipeline: market-hours filter, date index, aggregation, day-completeness filter.
apple_prepocessed = preprocess.pre_processor()

In [None]:
# Feed the cleaned bars into the feature builder (default 5-minute period).
inputs = ModelInputs(apple_prepocessed)

In [None]:
# Build returns, target and HAR/diurnal features; preview the first rows only.
apple_data = inputs.preprocess()
apple_data.head()


In [None]:
# 'time_of_day' holds datetime.time objects, so comparing it with the string
# '09:35:00' never matches and yields an empty frame. Select the 09:35 bucket
# through the DatetimeIndex instead.
apple_data.at_time('09:35')

In [None]:
# Plot the standardised diurnal profile over time with a title and axis labels.
fig, ax = plt.subplots()
ax.plot(apple_data['diurnal_profile'])
ax.set(title='AAPL standardised diurnal profile', xlabel='Date', ylabel='Diurnal profile')
plt.show()

In [None]:
# Regressors (columns created by ModelInputs) and the column to predict.
features = ['LRV_lag1', 'LRV_daily', 'LRV_weekly', 'LRV_monthly', 'diurnal_profile']
target = 'target'


Training starts at the beginning of our dataset: we train on one year and test on the following year.
We then move the training window forward by one year and test on the year that follows it, repeating until the data runs out.

In [None]:
class VolPrediction:
    """Walk-forward linear regression on a datetime-indexed feature frame.

    Args:
        data: Frame indexed by timestamps that holds the feature and target
            columns; it is copied on construction.
        train_size: Length of each training window, in years.
        val_size: Stored but not used by ``rolling_predict``.
        test_size: Length of each test window, in years. The windows also
            move forward by this amount after every fit.
    """

    def __init__(self, data: pd.DataFrame, train_size:int = 1, val_size:int = None, test_size:int = 1):
        self.data = data.copy()
        self.train_size = train_size
        self.val_size = val_size
        self.test_size = test_size

        self.predictions = []
        self.true_values = []

    def rolling_predict(self, features, target):
        """Fit and predict window by window, moving forward through the data.

        Each step trains a LinearRegression on ``train_size`` years and
        predicts the following ``test_size`` years. Features are standardised
        with the mean and standard deviation of the training window only.
        Windows are half-open, [start, end), so every timestamp between the
        start of a training window and the end of its test window falls in
        exactly one of the two, also for intraday timestamps.

        Results are reset at the start of every call, so calling the method
        again does not append to earlier results.

        Args:
            features: List of feature column names.
            target: Name of the target column.

        Returns:
            Tuple ``(data, predictions, true_values)``: the frame with a
            'prediction' column (NaN outside the test windows), and the flat
            lists of predictions and matching true values.
        """
        self.predictions = []
        self.true_values = []
        self.data['prediction'] = np.nan

        timestamps = self.data.index
        end_date = timestamps.max()
        train_span = DateOffset(years = self.train_size)
        test_span = DateOffset(years = self.test_size)

        current_train_start = timestamps.min()

        while current_train_start + DateOffset(years = self.train_size + self.test_size) <= end_date:
            test_start = current_train_start + train_span
            test_end = test_start + test_span

            train_mask = (timestamps >= current_train_start) & (timestamps < test_start)
            test_mask = (timestamps >= test_start) & (timestamps < test_end)
            train_data = self.data.loc[train_mask]
            test_data = self.data.loc[test_mask]

            if train_data.empty or test_data.empty:
                # Nothing to fit or score (e.g. a long gap in the data): skip this window.
                current_train_start = current_train_start + test_span
                continue

            # Scaling statistics come from the training window only.
            means = train_data[features].mean()
            stds = train_data[features].std()

            X_train = (train_data[features] - means) / stds
            y_train = train_data[target]

            X_test = (test_data[features] - means) / stds
            y_test = test_data[target]

            model = LinearRegression()
            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)

            self.predictions.extend(y_pred)
            self.true_values.extend(y_test.values)

            self.data.loc[test_mask, 'prediction'] = y_pred

            # Report the first/last timestamps actually used in each window.
            print(f'Training on data from {current_train_start} to {train_data.index.max()}')
            print(f'Testing on data from {test_start} to {test_data.index.max()}')

            current_train_start = current_train_start + test_span

        return self.data, self.predictions, self.true_values



In [None]:
# `HAR` is not defined in this notebook; the walk-forward model class defined above is VolPrediction.
predicting = VolPrediction(apple_data)

In [None]:
# Extended HAR - prediction
# RV(i, t+h) = alpha + B * diurnal component(t+h)
#
# Here t is a time of day (10:30, 10:35, ...) and h is the forecast window, so t+h is
# the next prediction bucket. For example, if it is 11:00 and h is 5 minutes, the
# prediction is for 11:05, and the diurnal component of 11:05 is one of the features
# to incorporate into the model.