In [10]:
import pandas as pd
import numpy as np
import os

In [12]:
class Preprocessing_data:
    """Build one modelling dataset per ticker from intraday returns, daily data and VIX.

    Expected layout (as read by the methods below):

    * ``path/<ticker>/`` contains a CSV whose name ends in ``daily_data.csv``
      (columns used: ``Day``, ``log_ret``, ``returns``) and one whose name ends in
      ``filtered_data.csv`` (columns used: ``Day``, ``returns``).
    * The VIX CSV has the columns ``Date`` and ``log_VIX``.

    For every ticker the pipeline computes a daily realized measure ``RM`` (sum of
    squared ``returns`` per ``Day``), its log ``log_x``, lagged day/week/month
    regressors, a rescaled ``RM_adj``, and joins the lagged VIX regressors.

    All transformation methods return new DataFrames and leave their inputs untouched.
    """

    # Lag design shared by the RM and VIX regressors. Each rolling window ends at the
    # current row; only its first `*_KEEP` values are averaged and the result is then
    # shifted by one row. Net effect at row t:
    #   day   = value at lag 1
    #   week  = mean of lags 2..5   (window 5, keep 4)
    #   month = mean of lags 6..22  (window 22, keep 17)
    # so the three regressors use non-overlapping observations.
    _WEEK_WINDOW = 5
    _WEEK_KEEP = 4
    _MONTH_WINDOW = 22
    _MONTH_KEEP = 17

    _RV_COLUMNS = ['Day', 'RM', 'log_x', 'RM_day', 'RM_week', 'RM_month', 'RM_adj', 'log_x_adj']
    _DAILY_COLUMNS = ['Day', 'log_ret', 'returns']
    _IV_COLUMNS = ['IV_day', 'IV_week', 'IV_month']

    def __init__(self, path, output_path):
        """Store the input root folder (one sub-folder per ticker) and the output folder."""
        self.path = path
        self.output_path = output_path

    def _read_ticker_csv(self, ticker, suffix, label):
        """Read the first CSV (in sorted name order) in the ticker folder ending with `suffix`.

        Prints a message and returns an empty DataFrame when no file matches.
        """
        ticker_path = os.path.join(self.path, ticker)
        # Sorted so the chosen file is deterministic if several files match.
        for file in sorted(os.listdir(ticker_path)):
            if file.endswith(suffix):
                return pd.read_csv(os.path.join(ticker_path, file))
        print(f"No {label} data found for {ticker}.")
        return pd.DataFrame()

    def open_daily_data(self, ticker):
        """Load ``*daily_data.csv`` for `ticker`; empty DataFrame if it is missing."""
        return self._read_ticker_csv(ticker, 'daily_data.csv', 'daily')

    def open_filtered_data(self, ticker):
        """Load ``*filtered_data.csv`` for `ticker`; empty DataFrame if it is missing."""
        return self._read_ticker_csv(ticker, 'filtered_data.csv', 'filtered')

    def applying_threshold_filter(self, data, threshold=66):
        """Keep only the rows of days that have strictly more than `threshold` rows.

        Returns an independent copy so later column assignments never write into a
        view of the caller's frame.
        """
        daily_counts = data.groupby('Day').size()
        valid_days = daily_counts[daily_counts > threshold].index
        return data[data['Day'].isin(valid_days)].copy()

    def calculating_RV(self, data):
        """Aggregate intraday rows to one row per ``Day``.

        Adds ``RM`` (sum of squared ``returns`` within the day) and
        ``log_x = log(RM)``. Every other numeric column is summed per day as well
        (so ``returns`` becomes the per-day sum); non-numeric columns are dropped.
        The result is indexed by ``Day``.
        """
        with_squares = data.assign(RM=data['returns'] ** 2)
        # numeric_only=True: summing string/timestamp columns is meaningless here and
        # either concatenates text or raises on recent pandas versions.
        rv_data = with_squares.groupby('Day').sum(numeric_only=True)
        rv_data['log_x'] = np.log(rv_data['RM'])
        return rv_data

    def adjusting_RV(self, data):
        """Rescale ``RM`` so that its total equals the total of squared ``returns``.

        Adds ``RM_adj = (sum(returns**2) / sum(RM)) * RM`` and
        ``log_x_adj = log(RM_adj)`` to a copy of `data`.
        """
        scale = (data['returns'] ** 2).sum() / data['RM'].sum()
        adjusted = data.copy()
        adjusted['RM_adj'] = scale * data['RM']
        adjusted['log_x_adj'] = np.log(adjusted['RM_adj'])
        return adjusted

    @classmethod
    def _lagged_components(cls, series):
        """Return the (day, week, month) lagged regressors of `series`.

        See the class-level constants for the exact lag ranges.
        """
        week_keep, month_keep = cls._WEEK_KEEP, cls._MONTH_KEEP
        day = series.shift(1)
        # raw=True hands each window over as a NumPy array: positional slicing is
        # unambiguous and it avoids building a Series per window.
        week = (series.rolling(window=cls._WEEK_WINDOW)
                .apply(lambda window: window[:week_keep].mean(), raw=True)
                .shift(1))
        month = (series.rolling(window=cls._MONTH_WINDOW)
                 .apply(lambda window: window[:month_keep].mean(), raw=True)
                 .shift(1))
        return day, week, month

    def calculating_lags(self, data):
        """Add ``RM_day``, ``RM_week`` and ``RM_month`` (lags of ``log_x``) to a copy of `data`."""
        day, week, month = self._lagged_components(data['log_x'])
        return data.assign(RM_day=day, RM_week=week, RM_month=month)

    def preprocess_vix_data(self, vix_data):
        """Add ``IV_day``, ``IV_week`` and ``IV_month`` (lags of ``log_VIX``) to a copy of `vix_data`."""
        day, week, month = self._lagged_components(vix_data['log_VIX'])
        return vix_data.assign(IV_day=day, IV_week=week, IV_month=month)

    def merge_with_vix(self, data, vix_data):
        """Left-join `vix_data` onto `data` using the shared ``Date`` column."""
        return data.merge(vix_data, on='Date', how='left')

    def process_ticker(self, ticker, vix_data):
        """Build the final dataset for one ticker.

        Parameters
        ----------
        ticker : str
            Name of the ticker sub-folder under ``self.path``.
        vix_data : pandas.DataFrame
            VIX frame with a ``Date`` column. If the ``IV_day``/``IV_week``/``IV_month``
            columns are absent they are derived from ``log_VIX`` first.

        Returns
        -------
        pandas.DataFrame
            Columns ``Day, log_ret, returns, RM, log_x, RM_day, RM_week, RM_month,
            RM_adj, log_x_adj, IV_day, IV_week, IV_month``. An empty DataFrame is
            returned when either input CSV of the ticker is missing.
        """
        daily_data = self.open_daily_data(ticker)
        filtered_data = self.open_filtered_data(ticker)
        # The loaders signal "file not found" with an empty frame; bail out here
        # instead of failing later with a KeyError on 'Day'.
        if daily_data.empty or filtered_data.empty:
            print(f"Skipping {ticker}: missing input data.")
            return pd.DataFrame()

        if not set(self._IV_COLUMNS).issubset(vix_data.columns):
            vix_data = self.preprocess_vix_data(vix_data)

        filtered_data = self.applying_threshold_filter(filtered_data)
        rv_data = self.calculating_RV(filtered_data)
        rv_data = self.calculating_lags(rv_data)
        rv_data = self.adjusting_RV(rv_data)
        rv_data = rv_data.reset_index()  # Day is the index, so bring it back as a column

        final_data = daily_data[self._DAILY_COLUMNS].merge(
            rv_data[self._RV_COLUMNS], on='Day', how='inner'
        )
        # Sanity check: NaNs are only expected in the lagged RM columns.
        print(final_data.isnull().sum())
        final_data = final_data.dropna()

        final_data = final_data.merge(
            vix_data[['Date'] + self._IV_COLUMNS],
            left_on='Day', right_on='Date', how='inner',
        )
        return final_data.drop(columns=['Date'])

    def save_data(self, data, ticker):
        """Write `data` to ``<output_path>/<ticker>.csv``, creating the folder if needed."""
        os.makedirs(self.output_path, exist_ok=True)
        data.to_csv(os.path.join(self.output_path, f'{ticker}.csv'), index=False)

    def final_dataset(self, vix_path):
        """Process every ticker folder under ``self.path`` and save one CSV per ticker.

        Parameters
        ----------
        vix_path : str
            Path of the VIX CSV (columns ``Date`` and ``log_VIX``).

        Returns
        -------
        dict
            Maps each saved ticker name to its final DataFrame. Tickers whose
            result is empty (e.g. missing input files) are skipped and not saved.
        """
        vix_data = self.preprocess_vix_data(pd.read_csv(vix_path))
        results = {}
        for ticker in sorted(os.listdir(self.path)):
            if not os.path.isdir(os.path.join(self.path, ticker)):
                continue
            final_data = self.process_ticker(ticker, vix_data)
            if final_data.empty:
                continue
            self.save_data(final_data, ticker)
            results[ticker] = final_data
        return results

In [21]:
# Paths for the `qqq` input folder, its output folder and the VIX series,
# followed by the pipeline object built from them.
vix_path = '/Users/raphaelravinet/Code/BSE/Thesis/Vol_indices/vix.csv'
data_path = '/Users/raphaelravinet/Code/BSE/Thesis/qqq'
output_path = '/Users/raphaelravinet/Code/BSE/Thesis/qqqfinal'
preprocess = Preprocessing_data(path=data_path, output_path=output_path)

In [14]:
# NOTE(review): this cell and the qqq cell above both assign data_path / output_path;
# whichever cell ran last decides which folders the pipeline reads and writes.
output_path = '/Users/raphaelravinet/Code/BSE/Thesis/final_datasets'
data_path = '/Users/raphaelravinet/Code/BSE/Thesis/data'
vix_path = '/Users/raphaelravinet/Code/BSE/Thesis/Vol_indices/vix.csv'

preprocess = Preprocessing_data(data_path, output_path)
# process_ticker reads the IV_day / IV_week / IV_month columns, which only exist once
# preprocess_vix_data has run, so derive them here rather than keeping the raw CSV.
vix_data = preprocess.preprocess_vix_data(pd.read_csv(vix_path))

# Example: process a single ticker, e.g. 'AAPL'
# final_data = preprocess.process_ticker('AAPL', vix_data)



In [22]:
# Rebuild the pipeline object from the data_path / output_path currently in scope.
preprocess = Preprocessing_data(path=data_path, output_path=output_path)


In [23]:
# Run the pipeline over the ticker folders under data_path; each processed ticker
# is written to output_path as <ticker>.csv.
final_data = preprocess.final_dataset(vix_path=vix_path)