In [1]:
import numpy as np
import statsmodels.api as sm

num = 1000 
 
event_time = int(num / 2) 
 
R_market = np.random.normal(0, 1, num) + np.arange(num) / num 
 
R_target = 2 + R_market + np.random.normal(0, 1, num) + (np.arange(num) == int(num / 2) + 1) * 2 
 
results = sm.OLS(R_target[:event_time], sm.add_constant(R_market[:event_time])).fit() 
 
alpha, beta = results.params 
 
resid = R_target - results.predict(sm.add_constant(R_market)) 
 
print(resid[event_time + 1] / resid[:event_time].std(ddof = 2)) 

3.8886377816731037


In [2]:
import numpy as np
import statsmodels.api as sm

def gen_data(num=1000, seed=None):
    """
    Generate one realization of the data:
     - event_time is at num//2
     - R_market has a linear trend + noise
     - R_target has baseline 2 + R_market + noise + a jump of size 2 at event_time+1
    """
    if seed is not None:
        np.random.seed(seed)
    event_time = int(num / 2)
    R_market = np.random.normal(0, 1, num) + np.arange(num) / num
    R_target = (
        2
        + R_market
        + np.random.normal(0, 1, num)
        + (np.arange(num) == event_time + 1) * 2
    )
    return R_market, R_target, event_time

def standardized_residual_at_event(R_market, R_target, event_time, test_time_offset=1):
    """
    Fit OLS on data up to `event_time` (exclusive) and compute
    the standardized residual at event_time + test_time_offset.
    """
    train_end = event_time
    test_idx = event_time + test_time_offset
    if test_idx >= len(R_target) or train_end < 2:
        return np.nan
    model = sm.OLS(R_target[:train_end], sm.add_constant(R_market[:train_end])).fit()
    resid = R_target - model.predict(sm.add_constant(R_market))
    sd_pre = resid[:train_end].std(ddof=2)
    if sd_pre == 0 or np.isnan(sd_pre):
        return np.nan
    return resid[test_idx] / sd_pre  # this is the "t-value" analogue

def estimate_power(num_sims=2000, threshold=1.96):
    """
    Question 1: Simulate many datasets and compute the fraction where
    |standardized residual at the real event| exceeds threshold.
    """
    count_detect = 0
    valid = 0
    for _ in range(num_sims):
        R_market, R_target, event_time = gen_data()
        tval = standardized_residual_at_event(R_market, R_target, event_time)
        if np.isnan(tval):
            continue
        valid += 1
        if np.abs(tval) > threshold:
            count_detect += 1
    return count_detect / valid if valid > 0 else np.nan

def placebo_full_false_positive_rate(num=1000, threshold=1.96):
    """
    Question 2: On one fixed dataset (seed=0), run placebo tests
    for every possible fictitious event_time and record how often
    the test at that fictional event_time+1 exceeds threshold.
    """
    R_market, R_target, _ = gen_data(seed=0, num=num)
    false_positives = 0
    total = 0
    for t0 in range(1, num - 1):  # so that test point t0+1 is valid
        tval = standardized_residual_at_event(R_market, R_target, t0)
        if np.isnan(tval):
            continue
        total += 1
        if np.abs(tval) > threshold:
            false_positives += 1
    return false_positives / total if total > 0 else np.nan

def local_placebo_higher_fraction(num_sims=500):
    """
    Question 3: For many random datasets, compare the actual event's
    |t| to the 40 nearby placebo |t|s (20 before and 20 after), and
    record the fraction of those placebos with higher |t|.
    """
    fractions = []
    for i in range(num_sims):
        seed = 1000 + i
        R_market, R_target, event_time = gen_data(seed=seed)
        t_actual = standardized_residual_at_event(R_market, R_target, event_time)
        if np.isnan(t_actual):
            continue
        # define placebo event_times so that their test point is within ±20 of actual event point
        # actual test point is event_time+1; so placebo test points are event_time+1 + d for d in [-20..-1,1..20]
        d_list = [d for d in range(-20, 21) if d != 0]
        placebo_event_times = [event_time + d - 1 for d in d_list]  # because standardized_residual uses event_time and tests at +1
        tvals = []
        num = len(R_target)
        for pet in placebo_event_times:
            if pet < 1 or pet >= num - 1:
                continue
            t_placebo = standardized_residual_at_event(R_market, R_target, pet)
            if not np.isnan(t_placebo):
                tvals.append(t_placebo)
        if len(tvals) == 0:
            continue
        frac = np.mean([np.abs(tv) > np.abs(t_actual) for tv in tvals])
        fractions.append(frac)
    return np.mean(fractions), np.std(fractions), len(fractions)

# Run evaluation
power_estimate = estimate_power(num_sims=2000)
placebo_fp_rate = placebo_full_false_positive_rate()
local_frac_mean, local_frac_std, n_used = local_placebo_higher_fraction(num_sims=500)

print(f"Question 1 estimated power (|t| > 1.96): {power_estimate:.3f}")
print(f"Question 2 placebo false positive rate over all times: {placebo_fp_rate:.3f}")
print(f"Question 3 average fraction of 40 placebos with higher |t| than actual: {local_frac_mean:.3f} (std {local_frac_std:.3f}, n={n_used})")

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


Question 1 estimated power (|t| > 1.96): 0.527
Question 2 placebo false positive rate over all times: 0.047
Question 3 average fraction of 40 placebos with higher |t| than actual: 0.141 (std 0.224, n=500)


In [3]:
import numpy as np
import statsmodels.api as sm

def make_error(corr_const, num=1000):
    # stationary AR(1)-like error with variance 25
    sigma = 5 / np.sqrt(((1 - corr_const) ** 2) / (1 - corr_const ** 2))
    err = []
    prev = np.random.normal(0, sigma)
    for _ in range(num):
        prev = corr_const * prev + (1 - corr_const) * np.random.normal(0, sigma)
        err.append(prev)
    return np.array(err)

def standardized_residual_at_event(R_market, R_target, event_time, test_time_offset=1):
    train_end = event_time
    test_idx = event_time + test_time_offset
    if test_idx >= len(R_target) or train_end < 2:
        return np.nan
    model = sm.OLS(R_target[:train_end], sm.add_constant(R_market[:train_end])).fit()
    resid = R_target - model.predict(sm.add_constant(R_market))
    sd_pre = resid[:train_end].std(ddof=2)
    if sd_pre == 0 or np.isnan(sd_pre):
        return np.nan
    return resid[test_idx] / sd_pre

def placebo_full_false_positive_rate_corr_error(num=1000, corr_const=0.9, threshold=1.96):
    np.random.seed(0)  # fixed dataset
    event_time = int(num / 2)
    R_market = np.random.normal(0, 1, num) + np.arange(num) / num
    err = make_error(corr_const, num=num)
    # add jump at event_time+1
    R_target = 2 + R_market + err + (np.arange(num) == event_time + 1) * 2

    false_positives = 0
    total = 0
    for t0 in range(1, num - 1):
        tval = standardized_residual_at_event(R_market, R_target, t0)
        if np.isnan(tval):
            continue
        total += 1
        if np.abs(tval) > threshold:
            false_positives += 1
    return false_positives / total if total > 0 else np.nan

# Run it
fp_rate = placebo_full_false_positive_rate_corr_error()
print("Question 4 placebo false positive rate with corr_const=0.9:", fp_rate)

Question 4 placebo false positive rate with corr_const=0.9: 0.04714142427281846
