In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import os
from datetime import time
import warnings
# Silences every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas SettingWithCopy / FutureWarning messages
# raised by the in-place column assignments further down -- consider narrowing
# the filter to specific categories.
warnings.filterwarnings("ignore")

from new_regression_model import *

In [None]:
# Reference tables, read from the current working directory.
# The 'Symbol' column of `constituents` is used below to restrict both the
# sentiment data and the price data to the same ticker universe.
constituents = pd.read_csv('constituents.csv')
# Holdings workbook, read after skipping its first 4 and last 10 rows
# (presumably a banner and a disclaimer around the table -- TODO confirm against the file).
weights_df = pd.read_excel('holdings-daily-us-en-spy.xlsx', skiprows=4, skipfooter=10)

In [3]:
# Build the per-symbol sentiment factor table `st_factors`.
#
# Each raw series (sentiment, message volume) and its per-symbol relative change
# is z-scored across all symbols that share the same timestamp.


def _cross_sectional_zscore(values):
    """Z-score one timestamp's cross-section; a zero-variance group maps to 0."""
    spread = values.std()
    # A NaN spread (e.g. a group with a single row) compares != 0, so it falls
    # through and yields NaN here; those NaNs are zero-filled at the end of the cell.
    if spread != 0:
        return (values - values.mean()) / spread
    return 0


def _next_step_pct_change(values):
    """Relative change from each row to the NEXT row of the same symbol (0 where the base is 0)."""
    # NOTE(review): shift(-1) looks one observation ahead, so the value stored at
    # time t uses data from t+1. Confirm this is intended and not a look-ahead leak.
    # NOTE(review): this relies on rows being in chronological order within each
    # symbol in the CSV -- the frame is not sorted here.
    return np.where(values != 0, values.shift(-1) / values - 1, 0)


st = pd.read_csv('sentiment_all_symbols.csv')
st = st.rename(columns={'created_at': 'time'})
cons = constituents['Symbol']
# .copy() so the column assignments below write to an independent frame rather
# than to a slice of the raw data.
st = st[st['symbol'].isin(cons)].copy()

st['normalized_sentiment'] = st.groupby('time')['sentiment'].transform(_cross_sectional_zscore)
st['normalized_volume'] = st.groupby('time')['message_volume'].transform(_cross_sectional_zscore)

st['sentiment_change'] = st.groupby('symbol')['sentiment'].transform(_next_step_pct_change)
st['normalized_sentiment_change'] = st.groupby('time')['sentiment_change'].transform(_cross_sectional_zscore)

st['volume_change'] = st.groupby('symbol')['message_volume'].transform(_next_step_pct_change)
st['normalized_volume_change'] = st.groupby('time')['volume_change'].transform(_cross_sectional_zscore)

# Index the rows by observation time. The previous code converted the positional
# RangeIndex (0, 1, 2, ...) to datetimes, which placed every row a few nanoseconds
# after 1970-01-01 and made any date-range filter on this index come back empty.
# Numeric timestamps are interpreted as epoch nanoseconds (the unit the original
# call used); anything else is parsed as a datetime string.
raw_times = st['time'].to_numpy()
if pd.api.types.is_numeric_dtype(st['time']):
    st.index = pd.to_datetime(raw_times, unit='ns', utc=True)
else:
    st.index = pd.to_datetime(raw_times, utc=True)
# NOTE(review): downstream code compares this index with day-floored return dates;
# if `created_at` is intraday, flooring to the day may be needed -- confirm.

st = st.fillna(0)
st_factors = st[['symbol','normalized_sentiment','normalized_volume','normalized_sentiment_change','normalized_volume_change']]

In [None]:
# extract data for 2023
base_dir = "validityBase/stock_data/2023/"
# Zero-padded month sub-directories: '01/', '02/', ..., '12/'.
months = [str(i).zfill(2) + '/' for i in range(1, 13)]

symbols = constituents['Symbol']

# Collect the per-file frames in a list and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop copies all accumulated rows
# on every iteration, and seeding it with an empty pd.DataFrame() can degrade the
# resulting index dtype on recent pandas versions.
frames_2023 = []
for month in months:
    directory = base_dir + month
    csv_files = sorted(file for file in os.listdir(directory) if file.endswith('.csv'))
    for file in csv_files:
        file_path = os.path.join(directory, file)
        df = pd.read_csv(file_path)
        # Keep only the tracked tickers; .copy() so the new column below is
        # written to an independent frame.
        df = df[df['ticker'].isin(symbols)].copy()
        df['time'] = pd.to_datetime(df['window_start'], unit='ns', utc=True)
        df = df.set_index('time')
        # Keep only the bar whose window starts at 16:00.
        # NOTE(review): the index is UTC, so this selects 16:00 UTC, not
        # 16:00 US/Eastern -- confirm which bar is intended.
        df = df[df.index.time == time(16, 0)]
        frames_2023.append(df)

all_stocks = pd.concat(frames_2023) if frames_2023 else pd.DataFrame()

In [None]:
# extract data for 2024
# NOTE: this cell appends to `all_stocks` and reuses `symbols`, both defined by
# the 2023 cell; re-running it without re-running that cell appends the 2024
# rows a second time.
base_dir = "validityBase/stock_data/2024/"
# Zero-padded month sub-directories: '01/', '02/', ..., '12/'.
months = [str(i).zfill(2) + '/' for i in range(1, 13)]

# Collect the per-file frames in a list and concatenate once, instead of
# re-copying the whole accumulated frame on every file.
frames_2024 = []
for month in months:
    directory = base_dir + month
    csv_files = sorted(file for file in os.listdir(directory) if file.endswith('.csv'))
    for file in csv_files:
        file_path = os.path.join(directory, file)
        df = pd.read_csv(file_path)
        # Keep only the tracked tickers; .copy() so the new column below is
        # written to an independent frame.
        df = df[df['ticker'].isin(symbols)].copy()
        df['time'] = pd.to_datetime(df['window_start'], unit='ns', utc=True)
        df = df.set_index('time')
        # Keep only the bar whose window starts at 16:00.
        # NOTE(review): the index is UTC, so this selects 16:00 UTC, not
        # 16:00 US/Eastern -- confirm which bar is intended.
        df = df[df.index.time == time(16, 0)]
        frames_2024.append(df)

all_stocks = pd.concat([all_stocks, *frames_2024])

In [None]:
# Order rows chronologically, then carry each ticker's last known close forward
# over gaps.
all_stocks = all_stocks.sort_index()
all_stocks['close'] = all_stocks.groupby('ticker')['close'].ffill()
# Assign the filled column back explicitly: calling fillna(..., inplace=True) on
# `all_stocks['close']` mutates a temporary and is not guaranteed to update the
# frame (it is a no-op under pandas Copy-on-Write).
# NOTE(review): a close of 0 makes the forward return computed from it infinite
# or undefined -- consider dropping these rows instead of zero-filling.
all_stocks['close'] = all_stocks['close'].fillna(0)

In [None]:
# Compute one-step-ahead returns per ticker and pivot them into a
# (date x symbol) matrix `all_returns`.
all_stocks['time'] = all_stocks.index

# Forward return: next close / current close - 1, computed within each ticker.
# transform() returns values aligned to the original rows. The previous version
# flattened the output of groupby.apply (ordered ticker by ticker) into a list and
# assigned it positionally onto the time-sorted frame, attaching returns to the
# wrong rows.
forward_returns = all_stocks.groupby('ticker')['close'].transform(
    lambda close: close.shift(-1) / close - 1
)
# A zero close produces +/-inf; treat those as missing so they are dropped below
# instead of contaminating the pivoted averages.
all_stocks['returns'] = forward_returns.replace([np.inf, -np.inf], np.nan)

returns = (
    all_stocks[['ticker', 'returns', 'time']]
    .rename(columns={'ticker': 'symbol'})
    # Name the index 'date' so it does not clash with the 'time' column when
    # pivoting; rename_axis avoids mutating the index object shared with all_stocks.
    .rename_axis('date')
    .assign(time=lambda frame: frame['time'].dt.floor('D'))
    .dropna()
)

# Rows are days, columns are symbols; days with no observation for a symbol get 0.
all_returns = (
    returns
    .pivot_table(values='returns', index='time', columns='symbol')
    .fillna(0)
)

In [None]:
# Rolling regressions of each symbol's returns on its sentiment factors.
# For every window, `all_betas` / `all_residuals` map the window's first date to
# a frame with one column per symbol that had factor data in that window.
symbols = all_returns.columns

factor_columns = ['normalized_sentiment','normalized_volume','normalized_sentiment_change','normalized_volume_change']
# Slice the factor table per symbol once, instead of re-filtering the full table
# for every (window, symbol) pair.
factors_by_symbol = {
    symbol: group[factor_columns]
    for symbol, group in st_factors.groupby('symbol')
}

window_length = 252  # rows per regression window
window_step = 21     # rows between consecutive window starts

all_betas = {}
all_residuals = {}
# "+ 1" so that the last full window (starting at len - window_length) is included.
for i in range(0, len(all_returns) - window_length + 1, window_step):
    returns_cur = all_returns[i:i + window_length]
    window_start = returns_cur.index[0]
    window_end = returns_cur.index[-1]

    betas = []
    residuals = []
    real_symbols = []
    for s in symbols:
        symbol_factors = factors_by_symbol.get(s)
        if symbol_factors is None:
            continue
        # Restrict the factors to the dates spanned by this return window.
        in_window = (symbol_factors.index >= window_start) & (symbol_factors.index <= window_end)
        df_fact_rets = symbol_factors[in_window]
        if len(df_fact_rets) == 0:
            continue

        # The three trailing positional arguments are passed through unchanged;
        # their meaning is defined by robust_matrix_regression in
        # new_regression_model (the second one is labelled a half-life).
        betas_final, residuals_final = robust_matrix_regression(
            returns_cur[s],
            df_fact_rets,
            0.5,
            90, # half-life
            10
        )
        betas.append(pd.Series(betas_final))
        residuals.append(pd.Series(residuals_final))
        real_symbols.append(s)

    # pd.concat raises on an empty list, so skip windows with no usable symbol.
    if not real_symbols:
        continue

    betas = pd.concat(betas,axis=1)
    betas.columns = real_symbols
    residuals = pd.concat(residuals,axis=1)
    residuals.columns = real_symbols
    # Key by the window's first date, taken from the window itself rather than
    # from a variable left over from the inner loop.
    all_betas[window_start] = betas
    all_residuals[window_start] = residuals