In [None]:
# # Import CBOE volatility indices (VIX, VXAZN, VXAPL) -- currently disabled
# # Source: https://www.cboe.com/tradable_products/vix/vix_historical_data/
# vix = pd.read_csv('https://cdn.cboe.com/api/global/us_indices/daily_prices/VIX_History.csv')
# amz_vix = pd.read_csv('https://cdn.cboe.com/api/global/us_indices/daily_prices/VXAZN_History.csv')
# appl_vix = pd.read_csv('https://cdn.cboe.com/api/global/us_indices/daily_prices/VXAPL_History.csv')


In [3]:
import pandas as pd
import numpy as np
import yfinance as yf
from polygon import RESTClient
import datetime as dt
import os
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import os
import shutil
import requests
import aiohttp
import asyncio
from datetime import datetime, timedelta

In [4]:
# Load variables from a local .env file into the process environment, then
# build the Polygon REST client from the API_KEY entry.
load_dotenv()
api_key = os.environ.get("API_KEY")
client = RESTClient(api_key=api_key)


In [5]:
# Wikipedia pages whose HTML tables list the index constituents.
wiki = 'http://en.wikipedia.org/wiki'
djia_ticker_list = wiki + '/Dow_Jones_Industrial_Average'
sp500_tickers_list = wiki + '/List_of_S%26P_500_companies'

# read_html returns every table found on the page; the table is picked by
# position and its `Symbol` column is turned into a plain list.
# NOTE(review): positional indices break if the page layout changes -- confirm
# the selected tables are still the constituent lists before relying on them.
sp500_table = pd.read_html(sp500_tickers_list)[0]
tickersSP500 = sp500_table['Symbol'].tolist()
djia_table = pd.read_html(djia_ticker_list)[1]
djia_tickers = djia_table['Symbol'].tolist()

In [22]:
class DataPipeline:
    """Download intraday aggregate bars and derive daily realized variance.

    For every ticker the pipeline fetches bars through ``client.list_aggs``,
    keeps only the ``close`` column, computes bar-to-bar returns and sums the
    squared log returns per calendar day -- once over all bars and once over
    the bars that fall inside the session window.

    Parameters
    ----------
    client
        Object exposing ``list_aggs(ticker, timespan=, multiplier=, from_=,
        to=, limit=)``; each returned record must carry a ``timestamp`` in
        epoch milliseconds plus the fields listed in ``_UNUSED_COLUMNS`` and
        a ``close`` field.
    tickers
        Iterable of ticker symbols to process.
    start_date, end_date
        Forwarded to ``list_aggs`` as ``from_`` and ``to``.
    timespan, multiplier
        Bar size forwarded to ``list_aggs`` (default: 5-minute bars).
    limit
        Forwarded to ``list_aggs`` as ``limit``.
    session_start, session_end
        Bounds (both inclusive) passed to ``DataFrame.between_time`` to build
        the filtered frame.
    output_dir
        Directory that receives the CSV files; created if missing. The
        default ``'.'`` writes into the current working directory.
    """

    # Aggregate fields discarded after download; only ``close`` is used downstream.
    _UNUSED_COLUMNS = ('open', 'vwap', 'high', 'low', 'volume', 'transactions', 'otc')

    def __init__(self, client, tickers, start_date='2004-01-01', end_date='2024-04-02',
                 timespan='minute', multiplier=5, limit=50000,
                 session_start='09:30', session_end='16:05', output_dir='.'):
        self.client = client
        self.tickers = tickers
        self.start_date = start_date
        self.end_date = end_date
        self.timespan = timespan
        self.multiplier = multiplier
        self.limit = limit
        self.session_start = session_start
        self.session_end = session_end
        self.output_dir = output_dir

    def fetch_data(self, ticker):
        """Fetch bars for ``ticker`` and return ``(data, filtered_data)``.

        Both frames are indexed by ``Date`` (timestamps converted from UTC to
        US/Eastern and made timezone-naive) and contain the columns left
        after dropping ``_UNUSED_COLUMNS``. ``filtered_data`` holds only the
        rows whose time of day lies within the session window.

        Raises
        ------
        ValueError
            If the client returns no bars for the requested range.
        """
        bars = pd.DataFrame(self.client.list_aggs(
            ticker,
            timespan=self.timespan,
            multiplier=self.multiplier,
            from_=self.start_date,
            to=self.end_date,
            limit=self.limit,
        ))
        if bars.empty:
            # Without this guard the failure surfaces as KeyError: 'timestamp'.
            raise ValueError(
                f'No aggregate bars returned for {ticker!r} '
                f'between {self.start_date} and {self.end_date}'
            )

        bars['timestamp'] = (
            pd.to_datetime(bars['timestamp'], unit='ms')
            .dt.tz_localize('UTC')
            .dt.tz_convert('US/Eastern')
            .dt.tz_localize(None)
        )
        bars = bars.rename(columns={'timestamp': 'Date'}).set_index('Date')

        data = bars.drop(columns=list(self._UNUSED_COLUMNS))
        # Explicit copy so later column assignments never touch a view of `data`.
        filtered_data = data.between_time(self.session_start, self.session_end).copy()
        return data, filtered_data

    def process_data(self, df):
        """Return a copy of ``df`` with day and return columns added.

        Added columns: ``Day`` (index normalized to midnight), ``log_ret``,
        ``squared_log_ret``, ``returns`` and ``squared_ret``. The input frame
        is left unmodified.

        NOTE(review): returns are differenced over consecutive rows without
        regard to ``Day``, so the first bar of each day carries the return
        from the previous day's last bar -- confirm this is intended.
        """
        df = df.copy()
        df['Day'] = pd.to_datetime(df.index).normalize()
        df['log_ret'] = np.log(df['close']).diff()
        df['squared_log_ret'] = df['log_ret']**2
        df['returns'] = df['close'].pct_change()
        df['squared_ret'] = df['returns']**2
        return df

    def calculate_realized_var(self, df):
        """Sum ``squared_log_ret`` per ``Day``.

        Returns a frame indexed by ``Day`` with the single column
        ``daily_realised_log_var``.
        """
        realized_var = df.groupby('Day').agg({'squared_log_ret': 'sum'})
        return realized_var.rename(columns={'squared_log_ret': 'daily_realised_log_var'})

    def processing_pipeline(self):
        """Run fetch -> process -> aggregate for every ticker.

        For each ticker three CSV files are written into ``output_dir``:
        ``{ticker}_realized_combined.csv``, ``{ticker}_data.csv`` and
        ``{ticker}_filtered_data.csv``.

        Returns
        -------
        dict
            Maps each ticker to its combined frame with columns
            ``daily_realised_log_var`` (all bars) and
            ``daily_realised_log_var_filtered`` (session-window bars).
        """
        os.makedirs(self.output_dir, exist_ok=True)

        results = {}
        for ticker in self.tickers:
            data, filtered_data = self.fetch_data(ticker)
            processed_data = self.process_data(data)
            processed_filtered_data = self.process_data(filtered_data)

            realized_var = self.calculate_realized_var(processed_data)
            realized_var_filtered = self.calculate_realized_var(processed_filtered_data)

            combined_data = realized_var.join(realized_var_filtered, lsuffix='', rsuffix='_filtered')

            combined_data.to_csv(os.path.join(self.output_dir, f'{ticker}_realized_combined.csv'))
            processed_data.to_csv(os.path.join(self.output_dir, f'{ticker}_data.csv'))
            processed_filtered_data.to_csv(os.path.join(self.output_dir, f'{ticker}_filtered_data.csv'))

            results[ticker] = combined_data

        return results
        


In [23]:
# Build a pipeline for SPY only; all other settings use the DataPipeline
# constructor defaults (5-minute bars, 2004-01-01 to 2024-04-02).
update_data = DataPipeline(client=client, tickers= ['SPY'])

In [24]:
# Run fetch -> process -> aggregate for every configured ticker. This writes
# the per-ticker CSV files and returns a dict keyed by ticker.
result = update_data.processing_pipeline()

In [19]:
# # Move the generated CSV files into one folder per ticker (currently disabled)
# directory = '/Users/raphaelravinet/Code/BSE/Thesis'

# # Target directory 
# directory2 = '/Users/raphaelravinet/Code/BSE/Thesis/data'

# files = [f for f in os.listdir(directory) if f.endswith('.csv')]


# tickers = set(f.split('_')[0] for f in files)


# for ticker in tickers:

#     ticker_folder = os.path.join(directory2, ticker)
    
#     # Create folder if it doesn't exist
#     if not os.path.exists(ticker_folder):
#         os.makedirs(ticker_folder)
    
#     for file in files:
#         if file.startswith(ticker):
#             shutil.move(os.path.join(directory, file), os.path.join(ticker_folder, file))

# print("Folders created and files moved successfully.")


Folders created and files moved successfully.


In [15]:
# def calculate_returns(df):
#     df['returns'] = df['close'].pct_change()
#     return df


# def process_folder(base_dir):
#     for root, dirs, files in os.walk(base_dir):
#         for file in files:
#             if file.endswith("_filtered_data.csv") or file.endswith("_daily_data.csv"):
#                 file_path = os.path.join(root, file)
#                 df = pd.read_csv(file_path)
#                 df = calculate_returns(df)
#                 df.to_csv(file_path, index=False)


# # Example usage:
# base_dir = "/Users/raphaelravinet/Code/BSE/Thesis/data"
# process_folder(base_dir)
