<a href="https://colab.research.google.com/github/rpjena/random_matrix/blob/main/stock_residuals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd


class StockResiduals:
    """
    Compute OLS residuals from regressing each stock's return series against
    a market (or any single-factor) return series.

    For each stock i the model is:
        R_i(t) = alpha_i + beta_i * R_m(t) + epsilon_i(t)

    NaN / null handling: each stock is regressed using only the rows where
    *both* that stock and the market have non-null values.  Rows that were
    NaN in the input remain NaN in the residuals output.  If a stock has
    fewer than ``min_obs`` valid observations the regression is skipped and
    its alpha, beta, R-squared are set to NaN.

    Degenerate regressions: if the market takes a single value on a stock's
    valid rows (or fewer than two rows are usable) the slope is not
    identified.  Such a stock is treated exactly like one that failed the
    ``min_obs`` test — alpha, beta, R-squared and residuals stay NaN —
    instead of raising a singular-matrix error for the whole panel.

    Parameters
    ----------
    stocks : pd.DataFrame
        (T x N) DataFrame of stock return series. Index is the time axis
        (dates or integers), columns are stock identifiers.
    market : pd.Series
        (T,) Series of market returns.  It is paired with `stocks` row by
        row *by position* (the underlying arrays are used), so it must
        already be in the same order as `stocks`.
    min_obs : int, optional
        Minimum number of non-NaN observations required to run a regression
        for a given stock.  Default is 30.

    Attributes
    ----------
    residuals : pd.DataFrame
        (T x N) residual returns, same index/columns as `stocks`.
        Rows that were NaN in the input stay NaN.
    betas : pd.Series
        Beta coefficient for each stock (indexed by stock name).
    alphas : pd.Series
        Intercept for each stock.
    r_squared : pd.Series
        R-squared for each stock regression (NaN when the stock itself is
        constant on its valid rows).
    obs_count : pd.Series
        Number of valid (non-NaN) observations available per stock.

    Raises
    ------
    ValueError
        If `stocks` and `market` do not have the same number of rows.
    """

    def __init__(self, stocks, market, min_obs=30):
        if stocks.shape[0] != market.shape[0]:
            raise ValueError(
                f"Length mismatch: stocks has {stocks.shape[0]} rows, "
                f"market has {market.shape[0]} rows."
            )

        self.stocks = stocks
        self.market = market
        self.min_obs = min_obs

        # Run the regression once at construction time
        self.alphas, self.betas, self.r_squared, self.residuals, self.obs_count = (
            self._fit()
        )

    @staticmethod
    def _ols(x, Y):
        """
        Closed-form simple OLS of every column of ``Y`` on ``x``.

        Parameters
        ----------
        x : np.ndarray, shape (t,)
            Regressor values, NaN-free.
        Y : np.ndarray, shape (t, k)
            Response columns, NaN-free.

        Returns
        -------
        tuple or None
            ``(alpha, beta, r2, res)`` with shapes (k,), (k,), (k,), (t, k),
            or ``None`` when the slope is not identified (fewer than two
            rows, or ``x`` has no variation).
        """
        # A constant regressor is collinear with the intercept; report it
        # to the caller rather than inverting a singular matrix.
        if x.size < 2 or x.min() == x.max():
            return None

        x_mean = x.mean()
        xc = x - x_mean
        sxx = xc @ xc
        if not sxx > 0:          # also catches underflow to 0 and NaN
            return None

        y_mean = Y.mean(axis=0)
        Yc = Y - y_mean
        beta = (xc @ Yc) / sxx
        alpha = y_mean - beta * x_mean

        # y - alpha - beta*x  ==  (y - y_mean) - beta*(x - x_mean)
        res = Yc - np.outer(xc, beta)
        ss_res = np.sum(res ** 2, axis=0)
        ss_tot = np.sum(Yc ** 2, axis=0)

        # R-squared is undefined for a constant response; divide only
        # where it is defined so no divide-by-zero warning is emitted.
        r2 = np.full(beta.shape, np.nan)
        defined = ss_tot > 0
        r2[defined] = 1.0 - ss_res[defined] / ss_tot[defined]
        return alpha, beta, r2, res

    def _fit(self):
        """
        Vectorised OLS with NaN awareness.

        Strategy
        --------
        1.  Identify rows where the market is valid (``mkt_valid``).
        2.  For stocks that have *no* NaNs on those rows, run a single
            batched regression (fast path — covers the common case).
        3.  For the remaining stocks, regress one-by-one using each stock's
            own valid-row mask (slow path — only for stocks with NaNs).

        Regressions that fail the ``min_obs`` test or whose regressor has
        no variation leave that stock's outputs at NaN.

        Returns
        -------
        alphas    : pd.Series
        betas     : pd.Series
        r_squared : pd.Series
        residuals : pd.DataFrame
        obs_count : pd.Series
        """
        Y_full = self.stocks.values.astype(float)       # (T, N)
        x_full = self.market.values.astype(float)        # (T,)
        N = Y_full.shape[1]
        cols = self.stocks.columns
        idx  = self.stocks.index

        # Output arrays — default to NaN
        alphas_arr = np.full(N, np.nan)
        betas_arr  = np.full(N, np.nan)
        r2_arr     = np.full(N, np.nan)
        resid      = np.full_like(Y_full, np.nan)
        obs_arr    = np.zeros(N, dtype=int)

        mkt_valid = ~np.isnan(x_full)                    # (T,)
        stock_nan = np.isnan(Y_full)                     # (T, N)

        # ---- fast path: stocks with no NaNs where market is valid ----
        has_nan = stock_nan[mkt_valid].any(axis=0)       # (N,) bool
        fast_mask = ~has_nan                             # columns for batch OLS

        if fast_mask.any():
            t = int(mkt_valid.sum())
            obs_arr[fast_mask] = t

            if t >= self.min_obs:
                block = np.ix_(mkt_valid, fast_mask)
                fit = self._ols(x_full[mkt_valid], Y_full[block])
                if fit is not None:
                    alpha, beta, r2, res = fit
                    alphas_arr[fast_mask] = alpha
                    betas_arr[fast_mask]  = beta
                    r2_arr[fast_mask]     = r2
                    # Rows where the market was NaN stay NaN
                    resid[block] = res

        # ---- slow path: per-stock regression for columns with NaNs ----
        for j in np.where(has_nan)[0]:
            valid = mkt_valid & ~stock_nan[:, j]
            t = int(valid.sum())
            obs_arr[j] = t
            if t < self.min_obs:
                continue

            fit = self._ols(x_full[valid], Y_full[valid, j][:, None])
            if fit is None:
                continue

            alpha, beta, r2, res = fit
            alphas_arr[j] = alpha[0]
            betas_arr[j]  = beta[0]
            r2_arr[j]     = r2[0]
            resid[valid, j] = res[:, 0]

        return (
            pd.Series(alphas_arr, index=cols, name='alpha'),
            pd.Series(betas_arr,  index=cols, name='beta'),
            pd.Series(r2_arr,     index=cols, name='r_squared'),
            pd.DataFrame(resid,   index=idx,  columns=cols),
            pd.Series(obs_arr,    index=cols, name='obs_count'),
        )

    def summary(self):
        """
        Return a compact DataFrame with alpha, beta, R-squared, and
        observation count per stock.

        Returns
        -------
        pd.DataFrame
            (N x 4) summary table.
        """
        return pd.DataFrame({
            'alpha':     self.alphas,
            'beta':      self.betas,
            'r_squared': self.r_squared,
            'obs_count': self.obs_count,
        })

## Example usage with synthetic data (including NaNs)

Generate a panel of T = 2000 dates and N = 5000 stocks with known betas, sprinkle in NaNs,
then verify that the estimated betas recover the true ones.

In [None]:
# Seed the global RNG so the synthetic panel is reproducible
np.random.seed(42)

T = 2000
N = 5000

# Ground-truth loadings and intercepts, one per stock
true_betas  = np.random.uniform(0.5, 1.8, size=N)
true_alphas = np.random.normal(0.0001, 0.0005, size=N)

# Single market factor, one draw per date
market_returns = np.random.normal(0.0005, 0.01, size=T)

# R_i = alpha_i + beta_i * R_m + noise, built by broadcasting the market
# column against the row of betas
noise = np.random.normal(0, 0.02, size=(T, N))
stock_returns = true_alphas + market_returns[:, None] * true_betas + noise

# Label the panel: business-day index, zero-padded ticker names
dates = pd.date_range(start='2017-01-01', periods=T, freq='B')
tickers = [f'S{i:04d}' for i in range(N)]

market_sr = pd.Series(market_returns, index=dates, name='MKT')
stocks_df = pd.DataFrame(stock_returns, index=dates, columns=tickers)

# --- Inject NaNs ---
# Each cell of the panel is blanked independently with probability 5%
nan_mask = np.random.rand(T, N) < 0.05
stocks_df[nan_mask] = np.nan

# Five consecutive missing market observations
market_sr.iloc[10:15] = np.nan

# Blank the first stock entirely so it has no usable observations
stocks_df[tickers[0]] = np.nan

print(f'stocks shape: {stocks_df.shape}')
print(f'market shape: {market_sr.shape}')
print(f'stock NaN %:  {stocks_df.isna().mean().mean() * 100:.1f}%')
print(f'market NaN count: {market_sr.isna().sum()}')

In [None]:
# Fit every stock against the market using the default observation threshold
model = StockResiduals(stocks=stocks_df, market=market_sr)

# Emit both previews in one call; sep='\n' reproduces one item per line
print(
    '--- Summary (first 10 stocks) ---',
    model.summary().head(10),
    '',
    '--- Residuals (first 5 rows x first 5 stocks) ---',
    model.residuals.iloc[:5, :5],
    sep='\n',
)

In [None]:
# Verify beta recovery (only for stocks that had enough obs).
# Use the threshold the model was actually fitted with rather than a
# hard-coded copy, so this check stays correct if min_obs is changed.
valid = model.obs_count >= model.min_obs
beta_error = model.betas[valid].values - true_betas[valid.values]
print(f'Stocks with enough obs: {valid.sum()} / {N}')
print(f'Beta estimation error  — mean: {beta_error.mean():.6f}, std: {beta_error.std():.4f}')
print(f'Mean R-squared: {model.r_squared[valid].mean():.4f}')
print(f'Residual DataFrame shape: {model.residuals.shape}')
print()
# Show that NaN rows stay NaN in residuals
print(f'NaN count in residuals: {model.residuals.isna().sum().sum()}')
print(f'S0000 beta (all-NaN stock): {model.betas["S0000"]}')
print(f'S0000 obs_count: {model.obs_count["S0000"]}')

---

## PnL Beta: cross-sectional sensitivity of residuals to portfolio PnL

**Layout**:
- `residuals`: (N x T) — rows are stocks, columns are dates
- `pnl`: (T x T) — columns are dates (DatetimeIndex); for each date `r`, `pnl[r]` is a T-length PnL vector
- `betas` output: (N x T) — for each date `r`, regress all N stock residual series against `pnl[r]`, then forward-fill NaNs along the time axis

For each date $r$ and stock $i$:

$$\varepsilon_i(s) = \alpha_i^{(r)} + \beta_i^{(r)} \cdot \text{PnL}^{(r)}(s) + u_i(s), \qquad s = 1, \ldots, T$$

NaN positions are filled with the last valid beta for that stock (forward-fill along time); entries before a stock's first valid beta have nothing to carry forward and remain NaN.

In [None]:
class PnLBeta:
    """
    For each date r, regress every stock's residual time series against the
    PnL column for that date, producing an (N x T) matrix of betas.

    Model (for date r, stock i):
        residual_i(s) = alpha_i^(r) + beta_i^(r) * pnl^(r)(s) + u_i(s)
        for s = 1 ... T  (using only non-NaN rows for both)

    Observation ``s`` pairs row ``s`` of ``pnl[r]`` with column ``s`` of
    `residuals` *by position*; if the two lengths differ, only the first
    ``min(len)`` positions are used.

    After computing raw betas, NaN entries are forward-filled along the
    time axis so every stock carries its last valid beta forward.

    A regression is skipped (beta and R-squared left NaN) when fewer than
    ``min_obs`` jointly valid observations exist, or when the PnL column
    has no variation on the valid rows — the slope is not identified in
    that case, and skipping avoids a singular-matrix error that would
    otherwise abort the whole fit.

    Parameters
    ----------
    residuals : pd.DataFrame
        (N x T) residual returns — rows are stocks, columns are dates.
    pnl : pd.DataFrame
        (T x T) portfolio PnL — columns are a DatetimeIndex matching the
        columns of `residuals`.  For date r, ``pnl[r]`` is a T-length
        PnL vector.
    min_obs : int, optional
        Minimum number of jointly non-NaN observations required to run
        OLS for a given date column.  Default is 30.

    Attributes
    ----------
    betas : pd.DataFrame
        (N x T) betas, forward-filled along the time axis (axis=1).
    betas_raw : pd.DataFrame
        (N x T) betas before forward-fill (NaN where regression was
        skipped or insufficient data).
    r_squared : pd.DataFrame
        (N x T) R-squared per stock per date (not forward-filled).

    Raises
    ------
    ValueError
        If `residuals` and `pnl` share no column labels.
    """

    def __init__(self, residuals, pnl, min_obs=30):
        # columns of residuals (dates) must appear in pnl columns
        common_dates = residuals.columns.intersection(pnl.columns)
        if common_dates.empty:
            raise ValueError("No common dates between residuals columns and pnl columns.")

        self.residuals = residuals
        self.pnl = pnl
        self.min_obs = min_obs
        self._common_dates = common_dates

        self.betas_raw, self.r_squared = self._fit()
        # Forward-fill along time axis: each stock keeps its last valid beta
        self.betas = self.betas_raw.ffill(axis=1)

    @staticmethod
    def _slope_and_r2(x, Y):
        """
        Closed-form simple OLS (with intercept) of each column of ``Y``
        on ``x``.

        Parameters
        ----------
        x : np.ndarray, shape (t,)
            Regressor values, NaN-free.
        Y : np.ndarray, shape (t, k)
            Response columns, NaN-free.

        Returns
        -------
        tuple or None
            ``(beta, r2)``, each of shape (k,), or ``None`` when the slope
            is not identified (fewer than two rows, or ``x`` has no
            variation).
        """
        # A constant regressor is collinear with the intercept; report it
        # to the caller rather than inverting a singular matrix.
        if x.size < 2 or x.min() == x.max():
            return None

        xc = x - x.mean()
        sxx = xc @ xc
        if not sxx > 0:          # also catches underflow to 0 and NaN
            return None

        Yc = Y - Y.mean(axis=0)
        beta = (xc @ Yc) / sxx

        # Residuals of the fitted line, expressed in centred variables
        res = Yc - np.outer(xc, beta)
        ss_res = np.sum(res ** 2, axis=0)
        ss_tot = np.sum(Yc ** 2, axis=0)

        # R-squared is undefined for a constant response; divide only
        # where it is defined so no divide-by-zero warning is emitted.
        r2 = np.full(beta.shape, np.nan)
        defined = ss_tot > 0
        r2[defined] = 1.0 - ss_res[defined] / ss_tot[defined]
        return beta, r2

    def _fit(self):
        """
        For each date r in the residuals' columns that is also a column
        of pnl, run a vectorised OLS of residuals (N series of length T)
        on pnl[r] (length T).

        Returns
        -------
        betas_raw : pd.DataFrame  (N x T)
        r_squared : pd.DataFrame  (N x T)
        """
        R = self.residuals                              # (N x T_cols) DataFrame
        stock_names = R.index
        date_cols   = R.columns                         # (T_cols)
        n_stocks = len(stock_names)
        n_dates  = len(date_cols)

        betas_arr = np.full((n_stocks, n_dates), np.nan)
        r2_arr    = np.full((n_stocks, n_dates), np.nan)

        # Each stock's time series becomes a column: this is "Y" for
        # every regression.  The NaN mask does not depend on the date
        # being processed, so it is computed once here.
        Y = R.values.astype(float).T                    # (T_cols, N)
        Y_nan = np.isnan(Y)                             # (T_cols, N)
        pnl_cols = self.pnl.columns

        for c_idx, date_r in enumerate(date_cols):
            if date_r not in pnl_cols:
                continue

            x = self.pnl[date_r].values.astype(float)   # (T_rows,)

            # pnl rows and residuals columns may differ in length;
            # use the shorter of the two
            L = min(len(x), Y.shape[0])
            x_slice = x[:L]
            Y_slice = Y[:L]                              # (L, N)
            y_nan_slice = Y_nan[:L]                      # (L, N)

            x_valid = ~np.isnan(x_slice)                 # (L,)
            t_fast = int(x_valid.sum())

            # Every stock's usable rows are a subset of the valid PnL
            # rows, so if those alone are too few nothing can be fitted.
            if t_fast < self.min_obs:
                continue

            # --- fast path: stocks with no NaN in the valid-x rows ---
            stock_has_nan = y_nan_slice[x_valid].any(axis=0)   # (N,)
            fast = ~stock_has_nan

            if fast.any():
                fit = self._slope_and_r2(
                    x_slice[x_valid], Y_slice[np.ix_(x_valid, fast)]
                )
                if fit is not None:
                    betas_arr[fast, c_idx] = fit[0]
                    r2_arr[fast, c_idx]    = fit[1]

            # --- slow path: stocks that have NaNs ---
            for j in np.where(stock_has_nan)[0]:
                valid = x_valid & ~y_nan_slice[:, j]
                if valid.sum() < self.min_obs:
                    continue

                fit = self._slope_and_r2(
                    x_slice[valid], Y_slice[valid, j][:, None]
                )
                if fit is not None:
                    betas_arr[j, c_idx] = fit[0][0]
                    r2_arr[j, c_idx]    = fit[1][0]

        return (
            pd.DataFrame(betas_arr, index=stock_names, columns=date_cols),
            pd.DataFrame(r2_arr,    index=stock_names, columns=date_cols),
        )

    def summary_at(self, date):
        """
        Return beta (forward-filled) and R-squared for every stock at
        a single date.

        Parameters
        ----------
        date : column label
            A date present in the columns.

        Returns
        -------
        pd.DataFrame  (N x 2)
        """
        return pd.DataFrame({
            'beta':      self.betas[date],
            'r_squared': self.r_squared[date],
        })

### Example: PnL betas from transposed residuals

Transpose the `StockResiduals` output to (N x T) and build a synthetic
(T x T) PnL DataFrame with DatetimeIndex columns.

In [None]:
# PnLBeta expects stocks on the rows and dates on the columns, so flip
# the (T x N) residual frame to (N x T)
residuals_NxT = model.residuals.transpose()              # (N x T)
print(f'residuals (N x T): {residuals_NxT.shape}')

# Square synthetic PnL: one T-length Gaussian column per date, labelled
# with the same DatetimeIndex on both axes
np.random.seed(99)
T_dates = len(dates)
pnl_data = np.random.normal(loc=0, scale=0.01, size=(T_dates, T_dates))

# Blank each PnL cell independently with probability 3%
pnl_nan_mask = np.random.rand(*pnl_data.shape) < 0.03
np.putmask(pnl_data, pnl_nan_mask, np.nan)

pnl_TxT = pd.DataFrame(data=pnl_data, index=dates, columns=dates)
print(f'PnL (T x T):       {pnl_TxT.shape}')
print(f'PnL NaN %:          {pnl_TxT.isna().mean().mean() * 100:.1f}%')

In [None]:
# Regress every stock's residual series on each date's PnL column
pnl_model = PnLBeta(residuals=residuals_NxT, pnl=pnl_TxT, min_obs=30)

# Shapes of the three output panels, one per line
print(
    f'betas_raw shape (N x T): {pnl_model.betas_raw.shape}',
    f'betas shape (N x T):     {pnl_model.betas.shape}',
    f'R² shape (N x T):        {pnl_model.r_squared.shape}',
    sep='\n',
)

# Bottom-right corner of each panel, each preceded by a blank line
print(
    '',
    '--- Raw betas (first 5 stocks x last 5 dates) ---',
    pnl_model.betas_raw.iloc[:5, -5:],
    '',
    '--- Forward-filled betas (first 5 stocks x last 5 dates) ---',
    pnl_model.betas.iloc[:5, -5:],
    '',
    '--- R² (first 5 stocks x last 5 dates) ---',
    pnl_model.r_squared.iloc[:5, -5:],
    sep='\n',
)

In [None]:
# Cross-section of betas and R² on the final date of the sample
snapshot_date = dates[-1]
snap = pnl_model.summary_at(snapshot_date)

# Total missing betas before and after the forward-fill
raw_nan = pnl_model.betas_raw.isna().sum().sum()
filled_nan = pnl_model.betas.isna().sum().sum()

print(
    f'--- Summary at {snapshot_date.date()} (first 10 stocks) ---',
    snap.head(10),
    '',
    # S0000 had all-NaN residuals, so forward-fill has nothing to carry
    f'S0000 beta (all-NaN stock): {pnl_model.betas.loc["S0000"].iloc[-1]}',
    '',
    f'NaN count — raw betas: {raw_nan}, after ffill: {filled_nan}',
    sep='\n',
)