In [5]:
import pandas as pd
df = pd.read_csv('final_merged_data.csv')

In [3]:
!pip install linearmodels



In [6]:
# If linearmodels is not installed, run first: !pip install linearmodels
from linearmodels.panel import PanelOLS
import pandas as pd

# ==========================================
# Step 1: Prepare panel data index
# ==========================================
# PanelOLS requires index to be in [entity, time] format
# df_final is the final table generated from the previous data cleaning step
df['date'] = pd.to_datetime(df['date'])
df_reg = df.set_index(['stock_name', 'date'])

# ==========================================
# Step 2: Define regression formula
# ==========================================
# Explanation:
# Log_Volume      : Dependent variable (Y)
# 1               : Intercept
# Sentiment_Lag1  : Core variable 1 (retail sentiment)
# Log_Buzz_Lag1   : Core variable 2 (discussion buzz/news attention)
# Return_Abs_Lag1 : Control variable (previous day's absolute return - attracts attention)
# Volatility_Lag1 : Control variable (previous day's volatility - opinion divergence)
# RiskAppetite    : Macro control (market risk appetite)
# Fed_Rate        : Macro control (interest rate environment)
# unemployment    : Macro control (economic fundamentals)
# FOMC            : Event control (Fed meeting days)
# EntityEffects   : Entity fixed effects (controls for inherent trading volume differences between AAPL and AMZN)

formula = """
Log_Volume ~ 1 + 
             Sentiment_Lag1 + 
             Log_Buzz_Lag1 + 
             Return_Abs_Lag1 + 
             Volatility_Lag1 + 
             RiskAppetite + 
             Fed_Rate + 
             unemployment + 
             FOMC + 
             EntityEffects
"""

# ==========================================
# Step 3: Run the model
# ==========================================
# drop_absorbed=True: Automatically drop macro variables that are fully absorbed by fixed effects (avoids errors)
mod = PanelOLS.from_formula(formula, data=df_reg, drop_absorbed=True)

# Use 'clustered' standard errors (clustered by Entity)
# This prevents serial correlation within the same stock from inflating P-values, making results more robust
res = mod.fit(cov_type='clustered', cluster_entity=True)

# ==========================================
# Step 4: View results
# ==========================================
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:             Log_Volume   R-squared:                        0.4091
Estimator:                   PanelOLS   R-squared (Between):             -0.1720
No. Observations:                2705   R-squared (Within):               0.4091
Date:                Tue, Dec 02 2025   R-squared (Overall):             -0.0528
Time:                        22:22:44   Log-likelihood                   -1168.0
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      232.97
Entities:                           5   P-value                           0.0000
Avg Obs:                       541.00   Distribution:                  F(8,2692)
Min Obs:                       541.00                                           
Max Obs:                       541.00   F-statistic (robust):         -6.571e+16
                            