 The null $H_0$ in strategy performance

In [1]:
import numpy as np
from scipy import stats

# Example: daily excess returns of a strategy (after costs, in %)
excess_returns = np.array([0.12, -0.08, 0.05, 0.03, -0.02, 0.07, 0.10])

# Null hypothesis H0: true mean excess return μ = 0
# One-sample t-test
t_stat, p_value = stats.ttest_1samp(excess_returns, popmean=0.0)

print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.4f}")

# Decision rule at 5% significance
alpha = 0.05
if p_value < alpha:
    print("Reject H0: Evidence of positive or negative alpha.")
else:
    print("Fail to reject H0: No statistical evidence of alpha.")

T-statistic: 1.466
P-value: 0.1931
Fail to reject H0: No statistical evidence of alpha.


The  alternative hypothesis $H_a$ or $H_1$

In [2]:
import numpy as np
from scipy import stats

# Example: daily excess returns of a strategy (after costs, in %)
excess_returns = np.array([0.12, -0.08, 0.05, 0.03, -0.02, 0.07, 0.10])

# Null hypothesis H0: μ = 0
# Alternative hypothesis Ha: μ > 0 (positive alpha exists)
t_stat, p_two_sided = stats.ttest_1samp(excess_returns, popmean=0.0)

# Convert two-sided p-value to one-sided
if t_stat > 0:
    p_one_sided = p_two_sided / 2
else:
    p_one_sided = 1.0  # if t_stat <= 0, evidence does not support Ha

print(f"T-statistic: {t_stat:.3f}")
print(f"One-sided p-value: {p_one_sided:.4f}")

# Decision rule at 5% significance
alpha = 0.05
if p_one_sided < alpha:
    print("Reject H0: Evidence supports Ha (positive alpha).")
else:
    print("Fail to reject H0: No evidence for positive alpha.")

T-statistic: 1.466
One-sided p-value: 0.0965
Fail to reject H0: No evidence for positive alpha.


Robust test statistic (mean return with HAC standard error)

In [3]:
import numpy as np
import statsmodels.api as sm

# Example: daily excess returns of a strategy (after costs, in %)
excess_returns = np.array([0.12, -0.08, 0.05, 0.03, -0.02, 0.07, 0.10])

# Null hypothesis: H0: μ = 0
# Sample estimate (mean return)
mean_ret = np.mean(excess_returns)

# Set up regression: returns = μ + ε
X = np.ones((len(excess_returns), 1))  # intercept only
ols_model = sm.OLS(excess_returns, X).fit(cov_type='HAC', cov_kwds={'maxlags':5})

# Extract test statistic and p-value for intercept (mean return)
t_stat = ols_model.tvalues[0]
p_value = ols_model.pvalues[0]

print(f"Sample mean: {mean_ret:.4f}")
print(f"T-statistic (Newey-West): {t_stat:.3f}")
print(f"P-value: {p_value:.4f}")

Sample mean: 0.0386
T-statistic (Newey-West): 3.136
P-value: 0.0017


t-statistic on regression alpha

In [4]:
import numpy as np
import statsmodels.api as sm

# Example: strategy and market daily excess returns (after risk-free rate)
strategy_excess = np.array([0.12, -0.08, 0.05, 0.03, -0.02, 0.07, 0.10])
market_excess   = np.array([0.10, -0.05, 0.02, 0.04, -0.01, 0.06, 0.09])

# Regression: strategy_excess ~ alpha + beta * market_excess
X = sm.add_constant(market_excess)  # adds intercept term
ols_model = sm.OLS(strategy_excess, X).fit(cov_type='HAC', cov_kwds={'maxlags':5})

alpha_hat = ols_model.params[0]
t_alpha   = ols_model.tvalues[0]
p_alpha   = ols_model.pvalues[0]

print(f"Estimated alpha: {alpha_hat:.4f}")
print(f"T-statistic (alpha): {t_alpha:.3f}")
print(f"P-value (alpha): {p_alpha:.4f}")

Estimated alpha: -0.0066
T-statistic (alpha): -2.867
P-value (alpha): 0.0041


F-statistic on joint hypotheses

In [5]:
import numpy as np
import statsmodels.api as sm

# Example: strategy returns and five candidate predictive factors
n_obs = 50
np.random.seed(42)
strategy_returns = np.random.normal(0, 0.01, n_obs)
factors = np.random.normal(0, 0.01, (n_obs, 5))

# Regression: strategy_returns ~ intercept + factors
X = sm.add_constant(factors)
ols_model = sm.OLS(strategy_returns, X).fit()

# Joint null hypothesis: all factor coefficients = 0
hypotheses = "x1 = x2 = x3 = x4 = x5 = 0"
f_test = ols_model.f_test(hypotheses)

print(f"F-statistic: {float(f_test.fvalue):.3f}")
print(f"P-value: {float(f_test.pvalue):.4f}")

F-statistic: 0.738
P-value: 0.5990


Decision via hypotheses test

In [6]:
import numpy as np
from scipy.stats import norm

# Example: daily returns from a strategy
np.random.seed(42)
returns = np.random.normal(0.0005, 0.01, size=1250)  # ~5 years of daily data

# Compute Sharpe ratio
mean_ret = np.mean(returns)
std_ret = np.std(returns, ddof=1)
sharpe = mean_ret / std_ret * np.sqrt(252)

# Standard error of Sharpe (Lo 2002 approx, IID assumption here)
n = len(returns)
se_sharpe = 1 / np.sqrt(n)

# One-sided z-test: H0: Sharpe <= 0 vs Ha: Sharpe > 0
z_stat = sharpe / se_sharpe
p_value = 1 - norm.cdf(z_stat)

print(f"Sharpe ratio: {sharpe:.2f}")
print(f"Z-statistic: {z_stat:.2f}")
print(f"One-sided p-value: {p_value:.4f}")

alpha = 0.05
if p_value < alpha:
    print("Reject H0: Evidence Sharpe > 0.")
else:
    print("Fail to reject H0.")

Sharpe ratio: 1.41
Z-statistic: 49.78
One-sided p-value: 0.0000
Reject H0: Evidence Sharpe > 0.


Decision via confidence interval

In [7]:
import numpy as np

# Parameters
n_boot = 5000
block_size = 20  # days per block to preserve autocorrelation
n = len(returns)

def sharpe_ratio(x):
    return np.mean(x) / np.std(x, ddof=1) * np.sqrt(252)

boot_sharpes = []
rng = np.random.default_rng(123)

# Stationary block bootstrap
for _ in range(n_boot):
    start_indices = rng.integers(0, n - block_size, size=n // block_size + 1)
    sampled = np.concatenate([returns[i:i+block_size] for i in start_indices])
    sampled = sampled[:n]  # trim to length
    boot_sharpes.append(sharpe_ratio(sampled))

ci_lower, ci_upper = np.percentile(boot_sharpes, [5, 95])

print(f"Bootstrap Sharpe CI (90%): [{ci_lower:.2f}, {ci_upper:.2f}]")
if ci_lower > 0:
    print("Reject H0: Interval excludes 0.")
else:
    print("Fail to reject H0.")

Bootstrap Sharpe CI (90%): [0.68, 2.19]
Reject H0: Interval excludes 0.


Type I error

In [8]:
import numpy as np
from scipy import stats

np.random.seed(42)

# Parameters
n_trials = 10000       # number of simulated "backtests"
n_obs = 252            # daily returns in one year
alpha = 0.05           # significance level

false_positives = 0

for _ in range(n_trials):
    # Simulate strategy returns with true mean = 0 (no edge)
    returns = np.random.normal(0, 0.01, size=n_obs)

    # One-sample t-test: H0: mean = 0
    t_stat, p_value = stats.ttest_1samp(returns, popmean=0.0)

    if p_value < alpha:
        false_positives += 1

rate = false_positives / n_trials
print(f"Empirical Type I error rate: {rate:.3f} (expected ~{alpha})")

Empirical Type I error rate: 0.049 (expected ~0.05)


Type II error

In [9]:
import numpy as np
from scipy import stats

np.random.seed(123)

# Parameters
n_trials = 10000        # number of simulated "backtests"
n_obs = 252             # daily returns in one year
true_mean = 0.0005      # 0.05% daily mean (~12.5% annualized return)
vol = 0.01              # daily volatility = 1%
alpha = 0.05            # significance level

false_negatives = 0

for _ in range(n_trials):
    # Simulate strategy returns with small positive edge
    returns = np.random.normal(true_mean, vol, size=n_obs)

    # One-sample t-test: H0: mean = 0
    t_stat, p_value = stats.ttest_1samp(returns, popmean=0.0)

    # If p-value > alpha, we fail to reject H0 → Type II error
    if p_value > alpha:
        false_negatives += 1

rate = false_negatives / n_trials
print(f"Empirical Type II error rate: {rate:.3f}")
print(f"Power of the test (1 - beta): {1-rate:.3f}")

Empirical Type II error rate: 0.869
Power of the test (1 - beta): 0.131


Liklihood ratio

In [10]:
import numpy as np
import statsmodels.api as sm
from scipy.stats import chi2

# --- Simulated data ---
np.random.seed(42)
n = 200
x1 = np.random.normal(size=n)
x2 = np.random.normal(size=n)
eps = np.random.normal(scale=1.0, size=n)

# True model: includes x1 and x2
y = 1.0 + 2.0 * x1 + 0.5 * x2 + eps

# --- Null model (restricted): y ~ 1 + x1 ---
X_null = sm.add_constant(x1)
model_null = sm.OLS(y, X_null).fit()

# --- Alternative model (unrestricted): y ~ 1 + x1 + x2 ---
X_alt = sm.add_constant(np.column_stack([x1, x2]))
model_alt = sm.OLS(y, X_alt).fit()

# --- Likelihood Ratio statistic ---
ll_null = model_null.llf   # log-likelihood of null model
ll_alt  = model_alt.llf    # log-likelihood of alternative model
LR_stat = -2 * (ll_null - ll_alt)

# Degrees of freedom = number of constraints = 1 (beta2 = 0)
df = model_alt.df_model - model_null.df_model
p_value = 1 - chi2.cdf(LR_stat, df)

print(f"LR statistic: {LR_stat:.3f}")
print(f"Degrees of freedom: {df}")
print(f"P-value: {p_value:.4f}")

LR statistic: 41.263
Degrees of freedom: 1.0
P-value: 0.0000


Detecting factor redundancy

In [11]:
import numpy as np
import statsmodels.api as sm
from scipy.stats import chi2

np.random.seed(42)
n, k_base, k_new = 300, 10, 3

# Simulate baseline factors and returns
X_base = np.random.normal(size=(n, k_base))
X_new  = np.random.normal(size=(n, k_new))
beta_base = np.random.uniform(-0.2, 0.2, size=k_base)
beta_new  = np.array([0.0, 0.0, 0.0])  # true redundancy (no edge)
y = 0.5 + X_base @ beta_base + X_new @ beta_new + np.random.normal(size=n)

# Fit null (baseline only) and alt (baseline + new) models
X0 = sm.add_constant(X_base)
X1 = sm.add_constant(np.column_stack([X_base, X_new]))
model_null = sm.OLS(y, X0).fit()
model_alt  = sm.OLS(y, X1).fit()

# Likelihood ratio test
ll_null, ll_alt = model_null.llf, model_alt.llf
LR_stat = -2 * (ll_null - ll_alt)
df = k_new
p_value = 1 - chi2.cdf(LR_stat, df)

print(f"LR statistic: {LR_stat:.3f}, df={df}, p-value={p_value:.4f}")

LR statistic: 2.811, df=3, p-value=0.4217


Detecting structural breaks

In [12]:
np.random.seed(123)
n = 200
break_point = 100

x = np.random.normal(size=n)
# Different regimes before and after break
y = np.empty(n)
y[:break_point]  = 0.5 + 1.0*x[:break_point]  + np.random.normal(size=break_point)
y[break_point:]  = 1.0 + 2.0*x[break_point:]  + np.random.normal(size=n-break_point)

# Null: single regression for all data
X = sm.add_constant(x)
model_null = sm.OLS(y, X).fit()

# Alt: separate regressions before/after break → equivalent to allowing interaction
d = (np.arange(n) >= break_point).astype(int)  # break dummy
X_alt = sm.add_constant(np.column_stack([x, d, x*d]))
model_alt = sm.OLS(y, X_alt).fit()

# Likelihood ratio test
ll_null, ll_alt = model_null.llf, model_alt.llf
LR_stat = -2 * (ll_null - ll_alt)
df = model_alt.df_model - model_null.df_model
p_value = 1 - chi2.cdf(LR_stat, df)

print(f"LR statistic: {LR_stat:.3f}, df={df}, p-value={p_value:.4f}")

LR statistic: 77.217, df=2.0, p-value=0.0000


Assessing model complexity

In [13]:
np.random.seed(321)
n = 250
momentum = np.random.normal(size=n)
# True relation has quadratic component
y = 0.5 + 1.5*momentum - 0.7*momentum**2 + np.random.normal(scale=1.0, size=n)

# Null: linear model y ~ momentum
X_lin = sm.add_constant(momentum)
model_null = sm.OLS(y, X_lin).fit()

# Alt: quadratic model y ~ momentum + momentum^2
X_quad = sm.add_constant(np.column_stack([momentum, momentum**2]))
model_alt = sm.OLS(y, X_quad).fit()

# Likelihood ratio test
ll_null, ll_alt = model_null.llf, model_alt.llf
LR_stat = -2 * (ll_null - ll_alt)
df = model_alt.df_model - model_null.df_model
p_value = 1 - chi2.cdf(LR_stat, df)

print(f"LR statistic: {LR_stat:.3f}, df={df}, p-value={p_value:.4f}")

LR statistic: 139.205, df=1.0, p-value=0.0000
