<a href="https://colab.research.google.com/github/rpjena/random_matrix/blob/main/stock_residuals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd


class StockResiduals:
    """
    Compute OLS residuals from regressing each stock's return series against
    a market (or any single-factor) return series.

    For each stock i the model is:
        R_i(t) = alpha_i + beta_i * R_m(t) + epsilon_i(t)

    NaN / null handling: each stock is regressed using only the rows where
    *both* that stock and the market have non-null values.  Rows that were
    NaN in the input remain NaN in the residuals output.  If a stock has
    fewer than ``min_obs`` valid observations the regression is skipped and
    its alpha, beta, R-squared are set to NaN.

    Degenerate regressions: a regression is also skipped (alpha, beta,
    R-squared and residuals left as NaN) when fewer than two observations
    are available or when the market return is constant over the rows used,
    because the slope is not identified in either case.

    Parameters
    ----------
    stocks : pd.DataFrame
        (T x N) DataFrame of stock return series. Index is the time axis
        (dates or integers), columns are stock identifiers.
    market : pd.Series
        (T,) Series of market returns aligned to the same index as `stocks`.
        Rows are matched by position; only the lengths are checked.
    min_obs : int, optional
        Minimum number of non-NaN observations required to run a regression
        for a given stock.  Default is 30.

    Attributes
    ----------
    residuals : pd.DataFrame
        (T x N) residual returns, same index/columns as `stocks`.
        Rows that were NaN in the input stay NaN.
    betas : pd.Series
        Beta coefficient for each stock (indexed by stock name).
    alphas : pd.Series
        Intercept for each stock.
    r_squared : pd.Series
        R-squared for each stock regression.
    obs_count : pd.Series
        Number of valid (non-NaN) observations used per stock.

    Raises
    ------
    ValueError
        If `stocks` and `market` do not have the same number of rows.
    """

    def __init__(self, stocks, market, min_obs=30):
        if stocks.shape[0] != market.shape[0]:
            raise ValueError(
                f"Length mismatch: stocks has {stocks.shape[0]} rows, "
                f"market has {market.shape[0]} rows."
            )

        self.stocks = stocks
        self.market = market
        self.min_obs = min_obs

        # Run the regression
        self.alphas, self.betas, self.r_squared, self.residuals, self.obs_count = (
            self._fit()
        )

    def _regress(self, x, Y):
        """
        Closed-form single-factor OLS of every column of `Y` on `x`.

        Parameters
        ----------
        x : np.ndarray
            (t,) regressor values, free of NaN.
        Y : np.ndarray
            (t, k) responses, free of NaN; all columns share the rows of `x`.

        Returns
        -------
        tuple or None
            ``(alphas, betas, r_squared, residuals)`` with shapes
            (k,), (k,), (k,), (t, k), or ``None`` when the regression must
            be skipped (too few observations or a constant regressor).
        """
        n_obs = x.shape[0]
        # An intercept-plus-slope fit needs at least two points, whatever
        # value the caller chose for min_obs.
        if n_obs < max(self.min_obs, 2):
            return None

        x_mean = x.mean()
        x_dev = x - x_mean
        sxx = x_dev @ x_dev
        # A constant regressor makes X'X singular; the exact max == min test
        # catches it even when rounding in the mean leaves x_dev slightly
        # non-zero.
        if x.max() == x.min() or not sxx > 0:
            return None

        y_mean = Y.mean(axis=0)
        Y_dev = Y - y_mean
        betas = (x_dev @ Y_dev) / sxx
        alphas = y_mean - betas * x_mean

        # y - alpha - beta * x is identical to the centred form below.
        res = Y_dev - np.outer(x_dev, betas)
        ss_res = np.sum(res ** 2, axis=0)
        ss_tot = np.sum(Y_dev ** 2, axis=0)

        # R-squared is undefined for a constant response; divide only where
        # it is defined so no spurious divide warning is emitted.
        r_squared = np.full(betas.shape, np.nan)
        varies = ss_tot > 0
        r_squared[varies] = 1.0 - ss_res[varies] / ss_tot[varies]

        return alphas, betas, r_squared, res

    def _fit(self):
        """
        Vectorised OLS with NaN awareness.

        Strategy
        --------
        1.  Identify rows where the market is valid (``mkt_valid``).
        2.  For stocks that have *no* NaNs on those rows, run a single
            batched regression (fast path — covers the common case).
        3.  For the remaining stocks, regress one-by-one using each stock's
            own valid-row mask (slow path — only for stocks with NaNs).

        Both paths share ``_regress``; a skipped regression leaves that
        stock's alpha, beta, R-squared and residuals as NaN while its
        observation count is still recorded.

        Returns
        -------
        alphas    : pd.Series
        betas     : pd.Series
        r_squared : pd.Series
        residuals : pd.DataFrame
        obs_count : pd.Series
        """
        Y_full = self.stocks.values.astype(float)       # (T, N)
        x_full = self.market.values.astype(float)        # (T,)
        N = Y_full.shape[1]
        cols = self.stocks.columns
        idx  = self.stocks.index

        # Output arrays — default to NaN
        alphas_arr = np.full(N, np.nan)
        betas_arr  = np.full(N, np.nan)
        r2_arr     = np.full(N, np.nan)
        resid      = np.full_like(Y_full, np.nan)
        obs_arr    = np.zeros(N, dtype=int)

        mkt_valid = ~np.isnan(x_full)                    # (T,)
        stock_nan = np.isnan(Y_full)                     # (T, N)

        # ---- fast path: stocks with no NaNs where market is valid ----
        has_nan = stock_nan[mkt_valid].any(axis=0)       # (N,) bool
        fast_mask = ~has_nan                             # columns for batch OLS

        if fast_mask.any():
            obs_arr[fast_mask] = mkt_valid.sum()
            block = np.ix_(mkt_valid, fast_mask)
            fit = self._regress(x_full[mkt_valid], Y_full[block])
            if fit is not None:
                alphas_arr[fast_mask], betas_arr[fast_mask], r2_arr[fast_mask], res = fit
                # Place residuals back; rows where market was NaN stay NaN
                resid[block] = res

        # ---- slow path: per-stock regression for columns with NaNs ----
        for j in np.flatnonzero(has_nan):
            valid = mkt_valid & ~stock_nan[:, j]
            obs_arr[j] = valid.sum()
            # Present the single column as (t, 1) so the helper sees one shape.
            fit = self._regress(x_full[valid], Y_full[valid, j][:, np.newaxis])
            if fit is None:
                continue

            alpha, beta, r2, res = fit
            alphas_arr[j] = alpha[0]
            betas_arr[j]  = beta[0]
            r2_arr[j]     = r2[0]
            resid[valid, j] = res[:, 0]

        return (
            pd.Series(alphas_arr, index=cols, name='alpha'),
            pd.Series(betas_arr,  index=cols, name='beta'),
            pd.Series(r2_arr,     index=cols, name='r_squared'),
            pd.DataFrame(resid,   index=idx,  columns=cols),
            pd.Series(obs_arr,    index=cols, name='obs_count'),
        )

    def summary(self):
        """
        Return a compact DataFrame with alpha, beta, R-squared, and
        observation count per stock.

        Returns
        -------
        pd.DataFrame
            (N x 4) summary table.
        """
        return pd.DataFrame({
            'alpha':     self.alphas,
            'beta':      self.betas,
            'r_squared': self.r_squared,
            'obs_count': self.obs_count,
        })

## Example usage with synthetic data (including NaNs)

Generate a synthetic panel of T = 2000 dates by N = 5000 stocks with known
betas, sprinkle in NaNs, then verify that the fitted betas recover the true ones.

In [None]:
# Seed the legacy global RNG so the synthetic panel is reproducible.
np.random.seed(42)

T, N = 2000, 5000

# Axis labels: business-day dates and zero-padded ticker names.
dates = pd.date_range(start='2017-01-01', periods=T, freq='B')
tickers = [f'S{i:04d}' for i in range(N)]

# Ground-truth regression parameters, one per stock.
true_betas = np.random.uniform(low=0.5, high=1.8, size=N)
true_alphas = np.random.normal(loc=0.0001, scale=0.0005, size=N)

# Single market factor.
market_returns = np.random.normal(loc=0.0005, scale=0.01, size=T)

# Stock returns follow R_i = alpha_i + beta_i * R_m + noise.
noise = np.random.normal(loc=0, scale=0.02, size=(T, N))
systematic = market_returns[:, np.newaxis] * true_betas[np.newaxis, :]
stock_returns = true_alphas + systematic + noise

market_sr = pd.Series(market_returns, index=dates, name='MKT')
stocks_df = pd.DataFrame(stock_returns, index=dates, columns=tickers)

# --- Inject NaNs ---
# Roughly 5% of the panel, chosen uniformly at random.
nan_mask = np.random.rand(T, N) < 0.05
stocks_df[nan_mask] = np.nan

# Five consecutive missing market observations.
market_sr.iloc[10:15] = np.nan

# One stock that is entirely NaN, so it falls below the min_obs threshold.
stocks_df['S0000'] = np.nan

print(f'stocks shape: {stocks_df.shape}')
print(f'market shape: {market_sr.shape}')
print(f'stock NaN %:  {stocks_df.isna().mean().mean() * 100:.1f}%')
print(f'market NaN count: {market_sr.isna().sum()}')

In [None]:
# Fit every stock against the market with the default min_obs.
model = StockResiduals(stocks_df, market_sr)

# Per-stock coefficients for the first ten tickers, followed by a blank line.
print('--- Summary (first 10 stocks) ---', model.summary().head(10), '', sep='\n')

# Top-left 5 x 5 corner of the residual panel.
print(
    '--- Residuals (first 5 rows x first 5 stocks) ---',
    model.residuals.iloc[:5, :5],
    sep='\n',
)

In [None]:
# Verify beta recovery (only for stocks that had enough obs).
# The threshold is read from the fitted model instead of repeating the
# default of 30, so the check stays correct if min_obs is changed.
valid = model.obs_count >= model.min_obs
beta_error = model.betas[valid].values - true_betas[valid.values]
print(f'Stocks with enough obs: {valid.sum()} / {N}')
print(f'Beta estimation error  — mean: {beta_error.mean():.6f}, std: {beta_error.std():.4f}')
print(f'Mean R-squared: {model.r_squared[valid].mean():.4f}')
print(f'Residual DataFrame shape: {model.residuals.shape}')
print()
# Show that NaN rows stay NaN in residuals
print(f'NaN count in residuals: {model.residuals.isna().sum().sum()}')
print(f'S0000 beta (all-NaN stock): {model.betas["S0000"]}')
print(f'S0000 obs_count: {model.obs_count["S0000"]}')