In [162]:
!pip install yfinance statsmodels --quiet


In [163]:
import numpy as np
import pandas as pd
import yfinance as yf
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
plt.style.use("seaborn-v0_8")


In [164]:
nifty100 = [
"ADANIENT.NS","ADANIPORTS.NS","ASIANPAINT.NS","AXISBANK.NS","BAJAJ-AUTO.NS",
"BAJFINANCE.NS","BAJAJFINSV.NS","BPCL.NS","BHARTIARTL.NS","BOSCHLTD.NS",
"BRITANNIA.NS","CIPLA.NS","COALINDIA.NS","DIVISLAB.NS","DRREDDY.NS",
"EICHERMOT.NS","GRASIM.NS","HCLTECH.NS","HDFC.NS","HDFCBANK.NS",
"HDFCLIFE.NS","HEROMOTOCO.NS","HINDALCO.NS","HINDUNILVR.NS","ICICIBANK.NS",
"ITC.NS","INDUSINDBK.NS","INFY.NS","JSWSTEEL.NS","KOTAKBANK.NS",
"LT.NS","M&M.NS","MARUTI.NS","NESTLEIND.NS","NTPC.NS",
"OIL.NS","ONGC.NS","POWERGRID.NS","RELIANCE.NS","SHREECEM.NS",
"SBIN.NS","SUNPHARMA.NS","TCS.NS","TATACONSUM.NS","TATAMOTORS.NS",
"TATASTEEL.NS","TECHM.NS","TITAN.NS","ULTRACEMCO.NS","UPL.NS",
"WIPRO.NS","COFORGE.NS","BANDHANBNK.NS","MOTHERSUMI.NS","SBILIFE.NS",
"PEL.NS","NTPC.NS","HINDPETRO.NS","VINATIORGA.NS","ADANIGREEN.NS",
"AUROPHARMA.NS","BHEL.NS","CADILAHC.NS","CANBK.NS","CHOLAFIN.NS",
"DLF.NS","GAIL.NS","HAVELLS.NS","IBULHSGFIN.NS","IDEA.NS",
"INDIGO.NS","LTIM.NS","MFSL.NS","MPI.NS","NAUKRI.NS",
"NMDC.NS","PAGEIND.NS","PEL.NS","PNB.NS","RECLTD.NS",
"SUNTV.NS","TATAPOWER.NS","TATACHEM.NS","TVSMOTOR.NS","WELCORP.NS",
"ZOMATO.NS","BALKRISIND.NS","BERGEPAINT.NS","BIOCON.NS","CROMPTON.NS",
"EXIDEIND.NS","GLENMARK.NS","HDFCLIFE.NS","IDFCFIRSTB.NS","INDHOTEL.NS",
"JUBLFOOD.NS","LTI.NS","MCDOWELL-N.NS","NIACL.NS","PNBHOUSING.NS"
]
# Remove duplicates & ensure unique
nifty100 = list(dict.fromkeys(nifty100))
print("Tickers loaded:", len(nifty100))


Tickers loaded: 97


In [165]:
# ===============================
# 1️⃣ Download prices 2019–2025
# ===============================
start_date = "2019-01-01"
end_date = "2025-12-31"   # extended to allow proper OOS + rolling warmup

print("Downloading price data ...")
raw = yf.download(nifty100, start=start_date, end=end_date,
                  group_by="ticker", auto_adjust=True, threads=True)

# Extract Close prices robustly
if ("Close" in raw.columns) or (
    isinstance(raw.columns, pd.MultiIndex) and "Close" in raw.columns.levels[1]
):
    prices = pd.DataFrame()
    for t in nifty100:
        try:
            prices[t] = raw[t]["Close"]
        except Exception:
            pass
else:
    prices = raw.copy()

# Clean data
prices.dropna(axis=1, how='all', inplace=True)
prices.dropna(axis=0, how='any', inplace=True)
prices = prices.loc[:, prices.std() > 0]

print("Final stocks with complete data:", prices.shape[1])
prices.tail()

# ===============================
# 2️⃣ Define selection vs backtest windows
# ===============================
selection_prices = prices.loc["2019-01-01":"2024-12-31"]   # for pair selection
backtest_prices  = prices.loc["2025-01-01":"2025-12-31"]   # OOS backtest


Downloading price data ...


[*********************100%***********************]  97 of 97 completed

10 Failed downloads:
['IBULHSGFIN.NS', 'MPI.NS', 'MOTHERSUMI.NS', 'TATAMOTORS.NS', 'CADILAHC.NS', 'HDFC.NS', 'LTI.NS', 'ZOMATO.NS', 'MCDOWELL-N.NS', 'PEL.NS']: YFTzMissingError('possibly delisted; no timezone found')


Final stocks with complete data: 87


In [166]:
# ===============================
# 3️⃣ Candidate pairs selection (2019–2024)
# ===============================

# Use only selection window
returns = selection_prices.pct_change().dropna()
corr = returns.corr()

corr_threshold = 0.55       # tuneable
max_candidate_pairs = 400   # safety cap

candidate_pairs = []
cols = selection_prices.columns.tolist()
for i in range(len(cols)):
    for j in range(i+1, len(cols)):
        c = corr.iloc[i, j]
        if c >= corr_threshold:
            candidate_pairs.append((cols[i], cols[j], c))

# Keep top by correlation
candidate_pairs = sorted(candidate_pairs, key=lambda x: x[2], reverse=True)
if len(candidate_pairs) > max_candidate_pairs:
    candidate_pairs = candidate_pairs[:max_candidate_pairs]

print("Candidate pairs after correlation filter (2019-2024):", len(candidate_pairs))


Candidate pairs after correlation filter (2019-2024): 46


In [167]:
# ===============================
# 4️⃣ Cointegration / half-life / beta (2019–2024 selection)
# ===============================

cointegrated = []

# Looser but practical filters
ADF_P_THRESHOLD = 0.08       # allows some pairs that are reasonably stationary
MIN_HALF_LIFE = 2            # min half-life
MAX_HALF_LIFE = 80          # max half-life

for a, b, corr_val in candidate_pairs:
    try:
        # Use only in-sample period for selection
        df = selection_prices[[a, b]].dropna()
        y = df[a]
        x = df[b]

        # Regression for beta
        X = sm.add_constant(x)
        model = sm.OLS(y, X).fit()
        beta = model.params.iloc[1]  # fixed

        # Spread & stationarity
        spread = y - beta * x
        spread = spread.dropna()
        pvalue = adfuller(spread)[1]

        # Half-life calculation
        spread_lag = spread.shift(1)
        spread_ret = spread - spread_lag
        spread_lag = spread_lag.dropna()
        spread_ret = spread_ret.dropna()

        X_hl = sm.add_constant(spread_lag)
        model_hl = sm.OLS(spread_ret, X_hl).fit()
        half_life = -np.log(2) / model_hl.params.iloc[1]  # fixed

        # Final filter
        if (pvalue < ADF_P_THRESHOLD) and (MIN_HALF_LIFE < half_life < MAX_HALF_LIFE):
            cointegrated.append((a, b, beta, pvalue, corr_val, half_life))

    except Exception:
        continue

# Create dataframe
coin_df = pd.DataFrame(
    cointegrated,
    columns=["A", "B", "beta", "adf_pval", "corr", "half_life"]
)

print("✅ FINAL PAIRS FOUND (2019-2024):", len(coin_df))
coin_df.sort_values("adf_pval").head(10)


✅ FINAL PAIRS FOUND (2019-2024): 18


Unnamed: 0,A,B,beta,adf_pval,corr,half_life
2,HINDALCO.NS,TATASTEEL.NS,3.854485,0.003657,0.725739,45.232995
0,BAJFINANCE.NS,BAJAJFINSV.NS,0.389847,0.012449,0.824249,48.388058
5,ICICIBANK.NS,SBIN.NS,1.429482,0.014812,0.658016,36.242164
8,HDFCBANK.NS,ICICIBANK.NS,0.363026,0.017065,0.621813,49.456118
14,HINDUNILVR.NS,NESTLEIND.NS,1.256678,0.020371,0.575176,52.290355
12,NTPC.NS,POWERGRID.NS,1.245945,0.026253,0.583776,50.987796
4,SHREECEM.NS,ULTRACEMCO.NS,1.025005,0.028538,0.663979,57.219833
6,GRASIM.NS,ULTRACEMCO.NS,0.260412,0.033466,0.654953,43.053617
3,HINDALCO.NS,JSWSTEEL.NS,0.638922,0.036439,0.680723,58.022435
15,COALINDIA.NS,ONGC.NS,1.723302,0.038745,0.564858,47.568484


In [168]:
for a, b, corr_val in candidate_pairs[:20]:
    df = prices[[a, b]].dropna()
    y = df[a]
    x = df[b]
    
    beta = sm.OLS(y, sm.add_constant(x)).fit().params.iloc[1]   # fixed
    spread = y - beta*x
    pval = adfuller(spread)[1]
    
    spread_lag = spread.shift(1).dropna()
    spread_ret = (spread - spread.shift(1)).dropna()
    
    hl = -np.log(2) / sm.OLS(spread_ret, sm.add_constant(spread_lag)).fit().params.iloc[1]  # fixed
    print(a, b, round(pval,3), round(hl,1))


BAJFINANCE.NS BAJAJFINSV.NS 0.048 65.3
BPCL.NS HINDPETRO.NS 0.008 47.9
JSWSTEEL.NS TATASTEEL.NS 0.02 67.4
CANBK.NS PNB.NS 0.809 359.1
SBIN.NS CANBK.NS 0.164 141.8
AXISBANK.NS ICICIBANK.NS 0.112 91.0
HINDALCO.NS TATASTEEL.NS 0.109 66.8
HINDALCO.NS JSWSTEEL.NS 0.074 63.7
SHREECEM.NS ULTRACEMCO.NS 0.01 54.7
ADANIENT.NS ADANIPORTS.NS 0.663 535.9
ICICIBANK.NS SBIN.NS 0.064 54.2
SBIN.NS PNB.NS 0.42 303.4
ASIANPAINT.NS BERGEPAINT.NS 0.363 152.9
GRASIM.NS ULTRACEMCO.NS 0.005 35.4
HCLTECH.NS INFY.NS 0.607 254.3
INFY.NS TCS.NS 0.068 71.8
HCLTECH.NS TCS.NS 0.674 245.3
HDFCBANK.NS ICICIBANK.NS 0.012 51.9
AXISBANK.NS SBIN.NS 0.015 53.9
HDFCBANK.NS KOTAKBANK.NS 0.002 40.4


In [169]:
# ===============================
# 5️⃣ Spread volatility, score, inverse-vol weights
# ===============================
vols = []
scores = []

for _, row in coin_df.iterrows():
    a, b, beta = row["A"], row["B"], row["beta"]
    df = selection_prices[[a, b]].dropna()
    y = df[a]
    x = df[b]

    spread = y - beta * x
    norm_spread = spread / y.mean()

    spread_vol = norm_spread.rolling(60).std().mean()
    vols.append(spread_vol)

    score = spread_vol / row["half_life"]
    scores.append(score)

coin_df["spread_vol"] = vols
coin_df["score"] = scores
coin_df["inv_vol"] = 1 / coin_df["spread_vol"]
coin_df["weight"] = coin_df["inv_vol"] / coin_df["inv_vol"].sum()

# Select top-K pairs
top_k = 15
coin_df = coin_df.sort_values("score", ascending=False).reset_index(drop=True)
selected = coin_df.head(top_k).copy()

# ===============================
# 5️⃣b Ensure both tickers exist in 2025
# ===============================
valid_pairs = []
for _, row in selected.iterrows():
    if row["A"] in backtest_prices.columns and row["B"] in backtest_prices.columns:
        valid_pairs.append(row)
selected = pd.DataFrame(valid_pairs)

print("Pairs available for 2025 OOS backtest:", len(selected))
used = set()
filtered = []

for _, row in selected.iterrows():
    if row["A"] not in used and row["B"] not in used:
        filtered.append(row)
        used.update([row["A"], row["B"]])

selected = pd.DataFrame(filtered)



Pairs available for 2025 OOS backtest: 15


In [170]:
# ===============================
# 6️⃣ Backtest function
# ===============================
def backtest_pair(backtest_prices, A, B, beta, window=60, entry_z=2, exit_z=0.5, tc=0.001):
    s = backtest_prices[A] - beta * backtest_prices[B]
    norm_s = s / backtest_prices[A].mean()

    mean = norm_s.rolling(window).mean()
    std = norm_s.rolling(window).std()
    z = (norm_s - mean) / std
    z = z.dropna()

    if z.empty:
        return None

    pnl = []
    pos_log = []
    last_pos = 0

    for i in range(1, len(z)):
        zi = z.iloc[i]
        date = z.index[i]
        if zi > entry_z and last_pos == 0:
            last_pos = -1
            cost = tc * (abs(backtest_prices[A].loc[date]) + abs(backtest_prices[B].loc[date]))
        elif zi < -entry_z and last_pos == 0:
            last_pos = 1
            cost = tc * (abs(backtest_prices[A].loc[date]) + abs(backtest_prices[B].loc[date]))
        elif abs(zi) < exit_z and last_pos != 0:
            last_pos = 0
            cost = tc * (abs(backtest_prices[A].loc[date]) + abs(backtest_prices[B].loc[date]))
        else:
            cost = 0
        # Skip low-signal days completely
        if abs(zi) < 0.75:
            pnl.append(0)
            pos_log.append(last_pos)
            continue

        spread_change = s.iloc[i] - s.iloc[i-1]
        daily_pnl = last_pos * spread_change - cost

        pnl.append(daily_pnl)
        pos_log.append(last_pos)

    pnl = pd.Series(pnl, index=z.index[1:])
    equity = pnl.cumsum()

    sharpe = np.sqrt(252) * pnl.mean() / pnl.std() if pnl.std() > 0 else 0
    maxdd = (equity - equity.cummax()).min()
    trades = sum(1 for x in pos_log if x != 0)

    return {"equity": equity, "pnl": pnl, "sharpe": sharpe, "maxdd": maxdd, "trades": trades}


In [171]:
results = []
portfolio_equities = []

for idx, row in selected.iterrows():
    A = row["A"]
    B = row["B"]
    beta = row["beta"]

    res = backtest_pair(prices, A, B, beta, window=60, entry_z=1.8, exit_z=0.75, tc=0.001)

    if res is None:
        continue

    results.append({
        "A": A,
        "B": B,
        "beta": beta,
        "adf_pval": row["adf_pval"],
        "sharpe": res["sharpe"],
        "max_dd": res["maxdd"],
        "trades": res["trades"]
    })

    portfolio_equities.append(res["equity"].rename(f"{A}-{B}"))

print("Backtested pairs:", len(results))

Backtested pairs: 9


In [172]:
# ===============================
# PORTFOLIO AGGREGATION (FIXED)
# ===============================

# 1️⃣ Build per-pair PnL correctly from equity curves
portfolio_pnls = pd.concat(
    [
        eq.diff().fillna(0).rename(f"{r['A']}-{r['B']}")
        for eq, r in zip(portfolio_equities, results)
    ],
    axis=1
)

# Guard: if nothing traded
if portfolio_pnls.empty:
    raise ValueError("No portfolio PnLs to aggregate")

# 2️⃣ Raw portfolio PnL & equity
portfolio_pnl_raw = portfolio_pnls.sum(axis=1)
portfolio_equity_raw = portfolio_pnl_raw.cumsum()

# ===============================
# VOL TARGET SCALING (ROBUST)
# ===============================

target_vol = 0.008

realized_vol = portfolio_pnl_raw.rolling(30, min_periods=10).std()
scaling = target_vol / realized_vol

scaling = (
    scaling.replace([np.inf, -np.inf], np.nan)
           .ffill()
           .fillna(0)
           .clip(0, 1.5)
)

portfolio_pnl = portfolio_pnl_raw * scaling
portfolio_equity = portfolio_pnl.cumsum()

# ===============================
# METRICS
# ===============================

portfolio_sharpe = (
    np.sqrt(252) * portfolio_pnl.mean() / portfolio_pnl.std()
    if portfolio_pnl.std() > 0 else 0
)

portfolio_maxdd = (portfolio_equity - portfolio_equity.cummax()).min()

# ===============================
# REPORT
# ===============================

print("\n=== Portfolio Summary (VOL TARGETED, 2025 OOS) ===")
print(f"Portfolio Sharpe: {portfolio_sharpe:.3f}")
print(f"Portfolio Max Drawdown: {portfolio_maxdd:.2f}")
print(f"Total pairs traded: {len(results)}")
print(f"Total trades (sum rough): {sum(r['trades'] for r in results)}")
print(f"Average pair Sharpe: {np.mean([r['sharpe'] for r in results]):.3f}")

print("\nNon-zero PnL days:", (portfolio_pnl_raw != 0).sum())
print("Total days:", len(portfolio_pnl_raw))



=== Portfolio Summary (VOL TARGETED, 2025 OOS) ===
Portfolio Sharpe: 0.511
Portfolio Max Drawdown: -0.19
Total pairs traded: 9
Total trades (sum rough): 6289
Average pair Sharpe: 0.297

Non-zero PnL days: 1647
Total days: 1667
