<a href="https://colab.research.google.com/github/rpjena/random_matrix/blob/main/stock_residuals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd


class StockResiduals:
    """
    Compute OLS residuals from regressing each stock's return series against
    a market (or any single-factor) return series.

    For each stock i the model is:
        R_i(t) = alpha_i + beta_i * R_m(t) + epsilon_i(t)

    The class stores alpha, beta, R-squared for every stock and the full
    residual matrix as a DataFrame that mirrors the input shape.

    Parameters
    ----------
    stocks : pd.DataFrame
        (T x N) DataFrame of stock return series. Index is the time axis
        (dates or integers), columns are stock identifiers.
    market : pd.Series
        (T,) Series of market returns aligned to the same index as `stocks`.

    Attributes
    ----------
    residuals : pd.DataFrame
        (T x N) residual returns, same index/columns as `stocks`.
    betas : pd.Series
        Beta coefficient for each stock (indexed by stock name).
    alphas : pd.Series
        Intercept for each stock.
    r_squared : pd.Series
        R-squared for each stock regression.
    """

    def __init__(self, stocks, market):
        if stocks.shape[0] != market.shape[0]:
            raise ValueError(
                f"Length mismatch: stocks has {stocks.shape[0]} rows, "
                f"market has {market.shape[0]} rows."
            )

        self.stocks = stocks
        self.market = market

        # Run the regression
        self.alphas, self.betas, self.r_squared, self.residuals = self._fit()

    def _fit(self):
        """
        Vectorised OLS: regress every column of `stocks` on `market` in one
        shot using the normal equations.

        Returns
        -------
        alphas : pd.Series
        betas : pd.Series
        r_squared : pd.Series
        residuals : pd.DataFrame
        """
        Y = self.stocks.values                       # (T, N)
        x = self.market.values.astype(float)         # (T,)
        T, N = Y.shape

        # Design matrix [1, x]
        X = np.column_stack([np.ones(T), x])         # (T, 2)

        # Normal equations: (X'X)^{-1} X'Y  -> (2, N)
        XtX_inv = np.linalg.inv(X.T @ X)            # (2, 2)
        coeffs = XtX_inv @ (X.T @ Y)                # (2, N)

        alphas_arr = coeffs[0]                       # (N,)
        betas_arr  = coeffs[1]                       # (N,)

        # Fitted values and residuals
        Y_hat = X @ coeffs                           # (T, N)
        resid = Y - Y_hat                            # (T, N)

        # R-squared = 1 - SS_res / SS_tot
        ss_res = np.sum(resid ** 2, axis=0)          # (N,)
        ss_tot = np.sum((Y - Y.mean(axis=0)) ** 2, axis=0)  # (N,)
        r2 = 1.0 - ss_res / ss_tot                  # (N,)

        cols = self.stocks.columns
        idx  = self.stocks.index

        return (
            pd.Series(alphas_arr, index=cols, name='alpha'),
            pd.Series(betas_arr,  index=cols, name='beta'),
            pd.Series(r2,         index=cols, name='r_squared'),
            pd.DataFrame(resid,   index=idx,  columns=cols),
        )

    def summary(self):
        """
        Return a compact DataFrame with alpha, beta, and R-squared per stock.

        Returns
        -------
        pd.DataFrame
            (N x 3) summary table.
        """
        return pd.DataFrame({
            'alpha':     self.alphas,
            'beta':      self.betas,
            'r_squared': self.r_squared,
        })

## Example usage with synthetic data

Generate a synthetic panel of T = 2000 time periods and N = 5000 stocks with known alphas and betas, then verify that the regression recovers the true betas.

In [None]:
# Seed the legacy global RNG so every run draws the same synthetic panel.
np.random.seed(42)

T = 2000  # number of time periods (rows)
N = 5000  # number of stocks (columns)

# Ground-truth regression parameters, one value per stock.
true_betas = np.random.uniform(low=0.5, high=1.8, size=N)
true_alphas = np.random.normal(loc=0.0001, scale=0.0005, size=N)

# Single-factor (market) return path.
market_returns = np.random.normal(loc=0.0005, scale=0.01, size=T)

# Idiosyncratic noise, then assemble R_i = alpha_i + beta_i * R_m + noise.
# The broadcasted product below is the outer product of market returns and betas.
noise = np.random.normal(loc=0, scale=0.02, size=(T, N))
stock_returns = (
    true_alphas
    + market_returns[:, np.newaxis] * true_betas[np.newaxis, :]
    + noise
)

# Label rows with business days and columns with zero-padded tickers.
dates = pd.date_range(start='2017-01-01', periods=T, freq='B')
tickers = ['S{:04d}'.format(k) for k in range(N)]

stocks_df = pd.DataFrame(data=stock_returns, index=dates, columns=tickers)
market_sr = pd.Series(data=market_returns, index=dates, name='MKT')

print(
    f'stocks shape: {stocks_df.shape}',
    f'market shape: {market_sr.shape}',
    sep='\n',
)

In [None]:
# Fit one market regression per stock on the synthetic panel.
model = StockResiduals(stocks=stocks_df, market=market_sr)

# Preview the per-stock coefficients, followed by a blank separator line.
print(
    '--- Summary (first 10 stocks) ---',
    model.summary().head(10),
    '',
    sep='\n',
)

# Preview the top-left corner of the residual matrix.
print(
    '--- Residuals (first 5 rows x first 5 stocks) ---',
    model.residuals.iloc[:5, :5],
    sep='\n',
)

In [None]:
# Verify beta recovery: compare the estimated betas against the true betas
# used to generate the panel. The mean error should be close to zero.
beta_error = model.betas.values - true_betas
# The em dash is written as the escape \u2014 so the message cannot be
# corrupted again if the source file is re-encoded (it had been mangled
# into the mojibake sequence that results from reading UTF-8 as cp1252).
print(f'Beta estimation error  \u2014 mean: {beta_error.mean():.6f}, std: {beta_error.std():.4f}')
print(f'Mean R-squared: {model.r_squared.mean():.4f}')
print(f'Residual DataFrame shape: {model.residuals.shape}')