In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
from scipy.stats import ks_1samp, uniform, norm, gaussian_kde
import torch
from torchvision.datasets import MNIST
from torchvision import transforms
from torchvision.models import ResNet18_Weights
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from joblib import Parallel, delayed

from tqdm import tqdm

import timm

# Seed every random number generator this notebook draws from so that a fresh
# "Restart & Run All" is reproducible: NumPy (simulated series, randomized
# tie-breaking), the stdlib `random` module (used to shuffle the SST-2 texts)
# and PyTorch (layer initialisation and dropout during MNIST training).
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

# Render inline figures as SVG and apply the project style sheet.
matplotlib_inline.backend_inline.set_matplotlib_formats("svg")
plt.style.use("math.mplstyle")

# Font sizes shared by every figure below.
plt.rcParams.update(
    {
        "axes.labelsize": 14,
        "xtick.labelsize": 14,
        "ytick.labelsize": 14,
        "legend.fontsize": 14,
        "axes.titlesize": 16,
    }
)

In [None]:
# n: total number of observations; xi: number of pre-change observations
# (the true changepoint) used by the single-run experiments below.
n, xi = 1000, 400

# Gaussian mean change

In [None]:
# Mean-shift series: the first xi draws are N(-1, 1), the remaining n - xi are N(1, 1).
timeseries = np.concatenate(
    [
        np.random.normal(loc=-1, scale=1, size=xi),
        np.random.normal(loc=1, scale=1, size=n - xi),
    ]
)

# 1-based time axis for plotting.
time_indices = np.arange(n) + 1

plt.plot(time_indices, timeseries)
plt.show()

## Oracle likelihood ratio score

In [None]:
def compute_p_value_for_t(t, n, timeseries, left_scores, right_scores):
    """Return the combined p-value for splitting the series after 0-based index ``t``.

    Each point on the left of the split gets a randomized rank p-value among
    the scores seen so far (indices ``0..r``); each point on the right gets one
    among the scores from itself to the end (indices ``r..n-1``). The two sets
    of p-values are tested for uniformity with an exact one-sample KS test and
    the smaller KS p-value ``m`` is combined as ``1 - (1 - m) ** 2``.

    Parameters
    ----------
    t : int
        Last index of the left segment (``0 <= t <= n - 2``).
    n : int
        Length of the score arrays.
    timeseries : array-like
        Unused; kept so existing callers keep working.
    left_scores, right_scores : np.ndarray
        Per-observation scores of length ``n``.

    Returns
    -------
    float
        Combined p-value in ``[0, 1]``.
    """
    p_values_curr = np.zeros(n)

    # Left segment: rank score r among the r + 1 scores at indices 0..r.
    # Ties (always including the point itself) are broken with a uniform draw.
    for r in range(t + 1):
        left_segment_scores = left_scores[: r + 1]
        rank = np.sum(left_scores[r] < left_segment_scores) + np.random.uniform(0, 1) * np.sum(
            left_scores[r] == left_segment_scores
        )
        p_values_curr[r] = rank / (r + 1)

    # Right segment, walked from the end: rank score r among indices r..n-1.
    for r in range(n - 1, t, -1):
        right_segment_scores = right_scores[r:]
        rank = np.sum(right_scores[r] < right_segment_scores) + np.random.uniform(0, 1) * np.sum(
            right_scores[r] == right_segment_scores
        )
        # The segment right_scores[r:] holds n - r scores (r is 0-based), so
        # that is the normaliser; n - r + 1 would cap the p-value below 1.
        p_values_curr[r] = rank / (n - r)

    p_left = ks_1samp(p_values_curr[: t + 1], uniform.cdf, method="exact")[1]
    p_right = ks_1samp(p_values_curr[t + 1 :], uniform.cdf, method="exact")[1]

    return 1 - (1 - min(p_left, p_right)) ** 2

def run_single_simulation(n, xi):
    """Compute oracle-score p-values for every candidate split of the global ``timeseries``.

    The scores are the exact likelihood ratios between N(1, 1) and N(-1, 1).
    The series itself is not generated here: the function reads the
    module-level ``timeseries``. ``xi`` is accepted but not used.

    Returns
    -------
    np.ndarray
        ``n - 1`` p-values, one per split ``t = 0, ..., n - 2``, computed in
        parallel with joblib.
    """
    density_pre = norm(-1, 1).pdf(timeseries)
    density_post = norm(1, 1).pdf(timeseries)

    # Left scores favour the post-change density, right scores the pre-change one.
    left_scores = density_post / density_pre
    right_scores = density_pre / density_post

    jobs = (
        delayed(compute_p_value_for_t)(t, n, timeseries, left_scores, right_scores)
        for t in range(n - 1)
    )
    results = Parallel(n_jobs=-1, verbose=1)(jobs)

    return np.array(results)

# Oracle p-values for the series simulated above (the function reads the global `timeseries`).
p_values = run_single_simulation(n, xi)

In [None]:
# p-value curve over the candidate changepoints t = 1, ..., n - 1.
time_indices_p = np.arange(1, n)
plt.plot(time_indices_p, p_values)
# The legend text is built from `xi` so it cannot disagree with the line drawn at x=xi.
plt.axvline(x=xi, color="red", linestyle="--", label=f"Changepoint ($\\xi = {xi}$)")
plt.axhline(0.05, color="green", linestyle=":", label="Threshold ($\\alpha = 0.05$)")
plt.xlabel("$t$")
plt.title("p-values for Gaussian mean change (oracle score)")
plt.legend()
plt.savefig("images/oracle.pdf")

plt.show()

# Point estimate: the t with the largest p-value (+1 turns the 0-based index into t).
detected_changepoint = np.argmax(p_values) + 1
print(f"\nTrue change point: {xi}")
print(f"Detected change point: {detected_changepoint}")
print(f"Detection error: {abs(detected_changepoint - xi)}")
# Confidence set: every t whose p-value exceeds the 0.05 threshold.
print(f"Size of confidence set: {np.sum(p_values > 0.05)}")
print(f"Changepoint in confidence set: {p_values[xi-1] > 0.05}")
print(f"CI: {np.where(p_values > 0.05)[0] + 1}")

## Parametric learned score

In [None]:
def parametric_density(data):
    """Fit a unit-variance Gaussian to ``data`` and return its density function.

    Parameters
    ----------
    data : array-like
        Sample used to estimate the mean; flattened to 1-D.

    Returns
    -------
    callable
        ``f(x)`` evaluating the N(mean(data), 1) density at scalar or array ``x``.
    """
    data = np.asarray(data).reshape(-1)
    # Estimate the mean once, here, rather than on every evaluation of the
    # returned density (the callers evaluate it point by point).
    mean = np.mean(data)
    return lambda x: norm.pdf(x, loc=mean, scale=1)

def _parametric_p_value_for_t(t, timeseries):
    """Return the combined p-value for splitting ``timeseries`` after 0-based index ``t``.

    Scores are ratios of unit-variance Gaussian densities fitted with
    ``parametric_density``:

    * left score of point ``i <= t``: density fitted on ``timeseries[t+1:]``
      divided by the density fitted on ``timeseries[:t+1]``;
    * right score of point ``i > t``: density fitted on ``timeseries[:t+1]``
      divided by the density fitted on the suffix ``timeseries[i:]``.

    These are exactly the values the earlier nested-loop version ended up
    with; that version recomputed and overwrote them for every intermediate
    ``r``, which only cost time.

    NOTE(review): the right-hand scores use a per-point suffix fit while the
    left-hand scores share a single fit. This asymmetry is inherited from the
    original loop order; confirm it is intended.

    Parameters
    ----------
    t : int
        Last index of the left segment.
    timeseries : np.ndarray
        1-D series.

    Returns
    -------
    float
        ``1 - (1 - min(p_left, p_right)) ** 2`` where the two p-values come
        from exact KS tests of uniformity on each side.
    """
    n = len(timeseries)
    left_scores = np.zeros(n)
    right_scores = np.zeros(n)

    if t + 1 >= n:
        # No post-change data to fit a density on: use constant left scores.
        left_scores[: t + 1] = 1.0
    else:
        left_data = timeseries[: t + 1]
        density_pre = parametric_density(left_data)
        density_post = parametric_density(timeseries[t + 1 :])
        # 1e-10 guards against division by an underflowed density.
        left_scores[: t + 1] = density_post(left_data) / (density_pre(left_data) + 1e-10)

        for r in range(t + 1, n):
            density_suffix = parametric_density(timeseries[r:])
            right_scores[r] = density_pre(timeseries[r]) / (density_suffix(timeseries[r]) + 1e-10)

    # Randomized rank p-values: ties (always including the point itself) are
    # broken with a uniform draw.
    p_values_curr = np.zeros(n)
    for r in range(t + 1):
        left_segment_scores = left_scores[: r + 1]
        rank = np.sum(left_scores[r] < left_segment_scores) + np.random.uniform(0, 1) * np.sum(
            left_scores[r] == left_segment_scores
        )
        p_values_curr[r] = rank / (r + 1)
    for r in range(t + 1, n):
        right_segment_scores = right_scores[r:]
        rank = np.sum(right_scores[r] < right_segment_scores) + np.random.uniform(0, 1) * np.sum(
            right_scores[r] == right_segment_scores
        )
        p_values_curr[r] = rank / (n - r)

    try:
        # A single left-hand p-value (t == 0) is not tested.
        if t > 0:
            p_left = ks_1samp(p_values_curr[: t + 1], uniform.cdf, method="exact")[1]
        else:
            p_left = 1.0
        if t + 1 < n:
            p_right = ks_1samp(p_values_curr[t + 1 :], uniform.cdf, method="exact")[1]
        else:
            p_right = 1.0
    except Exception:
        # Best effort: a failing KS test makes this split uninformative
        # (p-value 1) instead of aborting the run. `Exception` rather than a
        # bare `except` so that KeyboardInterrupt still propagates.
        p_left = 1.0
        p_right = 1.0
    return 1 - (1 - min(p_left, p_right)) ** 2

def run_single_simulation_parametric(n, xi):
    """Simulate one Gaussian mean-change series and score every split with the parametric method.

    The series has ``xi`` draws from N(-1, 1) followed by ``n - xi`` draws
    from N(1, 1). Returns an array of ``n - 1`` p-values, one per split
    ``t = 0, ..., n - 2``, computed in parallel with joblib.
    """
    pre_change = np.random.normal(-1, 1, xi)
    post_change = np.random.normal(1, 1, n - xi)
    timeseries = np.concatenate([pre_change, post_change])

    jobs = (delayed(_parametric_p_value_for_t)(t, timeseries) for t in range(n - 1))
    results = Parallel(n_jobs=-1, verbose=1)(jobs)
    return np.array(results)

# p-values from the parametric learned score; this call simulates its own fresh series.
p_values = run_single_simulation_parametric(n, xi)

In [None]:
# p-value curve over the candidate changepoints t = 1, ..., n - 1.
time_indices_p = np.arange(1, n)
plt.plot(time_indices_p, p_values)
# The legend text is built from `xi` so it cannot disagree with the line drawn at x=xi.
plt.axvline(x=xi, color="red", linestyle="--", label=f"Changepoint ($\\xi = {xi}$)")
plt.axhline(0.05, color="green", linestyle=":", label="Threshold ($\\alpha = 0.05$)")
plt.xlabel("$t$")
plt.title("p-values for Gaussian mean change (parametric learned score)")
plt.legend()
plt.savefig("images/parametric.pdf")

plt.show()

# Point estimate: the t with the largest p-value (+1 turns the 0-based index into t).
detected_changepoint = np.argmax(p_values) + 1
print(f"\nTrue change point: {xi}")
print(f"Detected change point: {detected_changepoint}")
print(f"Detection error: {abs(detected_changepoint - xi)}")
# Confidence set: every t whose p-value exceeds the 0.05 threshold.
print(f"Size of confidence set: {np.sum(p_values > 0.05)}")
print(f"Changepoint in confidence set: {p_values[xi-1] > 0.05}")
print(f"CI: {np.where(p_values > 0.05)[0] + 1}")

## Kernel density estimator (KDE) learned score

In [None]:
def compute_kde(data):
    """Return a density estimate for ``data`` as a callable.

    With two or more points this is a ``scipy.stats.gaussian_kde``. With a
    single point (or none) a KDE cannot be fitted, so a narrow Gaussian with
    standard deviation 0.1 centred on that point (or on 0 when empty) is
    returned instead.
    """
    samples = np.asarray(data).reshape(-1)
    if len(samples) > 1:
        return gaussian_kde(samples)

    center = samples[0] if len(samples) == 1 else 0
    return lambda x: norm.pdf(x, loc=center, scale=0.1)

def _kde_p_value_for_t(t, timeseries):
    """Return the combined p-value for splitting ``timeseries`` after 0-based index ``t``.

    Scores are ratios of densities estimated with ``compute_kde``:

    * left score of point ``i <= t``: density fitted on ``timeseries[t+1:]``
      divided by the density fitted on ``timeseries[:t+1]``;
    * right score of point ``i > t``: density fitted on ``timeseries[:t+1]``
      divided by the density fitted on the suffix ``timeseries[i:]``.

    These are the values the earlier loop-and-overwrite version ended up
    with. Computing them directly avoids refitting and re-evaluating a KDE
    for every intermediate prefix, and evaluates each suffix KDE at one point
    instead of the whole tail.

    NOTE(review): the right-hand scores use a per-point suffix fit while the
    left-hand scores share a single fit. This asymmetry is inherited from the
    original loop order; confirm it is intended.

    Parameters
    ----------
    t : int
        Last index of the left segment.
    timeseries : np.ndarray
        1-D series.

    Returns
    -------
    float
        ``1 - (1 - min(p_left, p_right)) ** 2`` where the two p-values come
        from exact KS tests of uniformity on each side.
    """
    n = len(timeseries)
    left_scores = np.zeros(n)
    right_scores = np.zeros(n)

    if t + 1 >= n:
        # No post-change data to fit a density on: use constant left scores.
        left_scores[: t + 1] = 1.0
    else:
        left_data = timeseries[: t + 1]
        right_data = timeseries[t + 1 :]
        kde_pre = compute_kde(left_data)
        kde_post = compute_kde(right_data)

        # 1e-10 guards against division by an underflowed density.
        left_scores[: t + 1] = kde_post(left_data) / (kde_pre(left_data) + 1e-10)

        # The pre-change density does not depend on r: evaluate it once.
        pre_density_on_right = kde_pre(right_data)
        for r in range(t + 1, n):
            kde_suffix = compute_kde(timeseries[r:])
            # A length-1 slice keeps the result an array for both the KDE and
            # the small-sample Gaussian fallback.
            suffix_density = kde_suffix(timeseries[r : r + 1])[0]
            right_scores[r] = pre_density_on_right[r - t - 1] / (suffix_density + 1e-10)

    # Randomized rank p-values: ties (always including the point itself) are
    # broken with a uniform draw.
    p_values_curr = np.zeros(n)

    for r in range(t + 1):
        left_segment_scores = left_scores[: r + 1]
        rank = np.sum(left_scores[r] < left_segment_scores) + np.random.uniform(0, 1) * np.sum(
            left_scores[r] == left_segment_scores
        )
        p_values_curr[r] = rank / (r + 1)

    for r in range(t + 1, n):
        right_segment_scores = right_scores[r:]
        rank = np.sum(right_scores[r] < right_segment_scores) + np.random.uniform(0, 1) * np.sum(
            right_scores[r] == right_segment_scores
        )
        p_values_curr[r] = rank / (n - r)

    try:
        # A single left-hand p-value (t == 0) is not tested.
        if t > 0:
            p_left = ks_1samp(p_values_curr[: t + 1], uniform.cdf, method="exact")[1]
        else:
            p_left = 1.0
        if t + 1 < n:
            p_right = ks_1samp(p_values_curr[t + 1 :], uniform.cdf, method="exact")[1]
        else:
            p_right = 1.0
    except Exception:
        # Best effort: a failing KS test makes this split uninformative
        # (p-value 1) instead of aborting the run.
        p_left = 1.0
        p_right = 1.0

    return 1 - (1 - min(p_left, p_right)) ** 2

def run_single_simulation_kde(n, xi, n_jobs=-1, verbose=1):
    """Simulate one Gaussian mean-change series and score every split with the KDE method.

    The series has ``xi`` draws from N(-1, 1) followed by ``n - xi`` draws
    from N(1, 1). ``n_jobs`` and ``verbose`` are forwarded to
    ``joblib.Parallel``. Returns an array of ``n - 1`` p-values, one per
    split ``t = 0, ..., n - 2``.
    """
    pre_change = np.random.normal(-1, 1, xi)
    post_change = np.random.normal(1, 1, n - xi)
    timeseries = np.concatenate([pre_change, post_change])

    runner = Parallel(n_jobs=n_jobs, verbose=verbose)
    results = runner(delayed(_kde_p_value_for_t)(t, timeseries) for t in range(n - 1))
    return np.array(results)

# p-values from the KDE learned score; this call simulates its own fresh series.
p_values = run_single_simulation_kde(n, xi)

In [None]:
# p-value curve over the candidate changepoints t = 1, ..., n - 1.
time_indices_p = np.arange(1, n)
plt.plot(time_indices_p, p_values)
# The legend text is built from `xi` so it cannot disagree with the line drawn at x=xi.
plt.axvline(x=xi, color="red", linestyle="--", label=f"Changepoint ($\\xi = {xi}$)")
plt.axhline(0.05, color="green", linestyle=":", label="Threshold ($\\alpha = 0.05$)")
plt.xlabel("$t$")
plt.title("p-values for Gaussian mean change (KDE learned score)")
plt.legend()
# plt.savefig("images/kde.pdf")

plt.show()

# Point estimate: the t with the largest p-value (+1 turns the 0-based index into t).
detected_changepoint = np.argmax(p_values) + 1
print(f"\nTrue change point: {xi}")
print(f"Detected change point: {detected_changepoint}")
print(f"Detection error: {abs(detected_changepoint - xi)}")
# Confidence set: every t whose p-value exceeds the 0.05 threshold.
print(f"Size of confidence set: {np.sum(p_values > 0.05)}")
print(f"Changepoint in confidence set: {p_values[xi-1] > 0.05}")
print(f"CI: {np.where(p_values > 0.05)[0] + 1}")

## Coverage and width simulations

In [None]:
def run_single_simulation(n, xi):
    """Simulate one Gaussian mean-change series and return oracle-score p-values for every split.

    NOTE: this rebinds the name ``run_single_simulation`` defined earlier in
    the notebook (that version scores the global ``timeseries`` in parallel).
    Once this cell has run, only this serial, self-contained version is
    reachable.

    Parameters
    ----------
    n : int
        Series length.
    xi : int
        Number of pre-change observations, drawn from N(-1, 1); the remaining
        ``n - xi`` are drawn from N(1, 1).

    Returns
    -------
    np.ndarray
        ``n - 1`` p-values, one per split ``t = 0, ..., n - 2``.
    """
    timeseries = np.concatenate([np.random.normal(-1, 1, xi), np.random.normal(1, 1, n - xi)])

    # Oracle likelihood ratios between the true pre- and post-change densities.
    pre_density = norm(-1, 1).pdf(timeseries)
    post_density = norm(1, 1).pdf(timeseries)
    left_scores = post_density / pre_density
    right_scores = pre_density / post_density

    p_values = np.zeros(n - 1)
    for t in range(n - 1):
        p_values_curr = np.zeros(n)

        # Left segment: rank score r among the r + 1 scores at indices 0..r.
        for r in range(t + 1):
            left_segment_scores = left_scores[: r + 1]
            rank = np.sum(left_scores[r] < left_segment_scores) + np.random.uniform(0, 1) * np.sum(
                left_scores[r] == left_segment_scores
            )
            p_values_curr[r] = rank / (r + 1)

        # Right segment, walked from the end: rank score r among indices r..n-1.
        for r in range(n - 1, t, -1):
            right_segment_scores = right_scores[r:]
            rank = np.sum(right_scores[r] < right_segment_scores) + np.random.uniform(0, 1) * np.sum(
                right_scores[r] == right_segment_scores
            )
            # right_scores[r:] holds n - r scores (r is 0-based), so that is
            # the normaliser; n - r + 1 would cap the p-value below 1.
            p_values_curr[r] = rank / (n - r)

        p_left = ks_1samp(p_values_curr[: t + 1], uniform.cdf, method="exact")[1]
        p_right = ks_1samp(p_values_curr[t + 1 :], uniform.cdf, method="exact")[1]

        p_values[t] = 1 - (1 - min(p_left, p_right)) ** 2

    return p_values

def run_simulation_study(n_simulations=1000, n=500, xi=200):
    """Repeat ``run_single_simulation`` and collect the p-value curves.

    While running, the progress bar shows running averages of the 95% and
    50% coverage of the true changepoint, the 95% confidence-set size and the
    absolute error of the argmax estimate.

    Returns
    -------
    np.ndarray
        Array of shape ``(n_simulations, n - 1)``; row ``i`` holds the
        p-values of simulation ``i``.
    """
    all_p_values = np.zeros((n_simulations, n - 1))

    coverages_95, coverages_50 = [], []
    widths_95, widths_50 = [], []
    detected_cps = []

    pbar = tqdm(range(n_simulations))

    for sim_idx in pbar:
        p_values = run_single_simulation(n, xi)
        all_p_values[sim_idx] = p_values

        # p-value at the true split (0-based index xi - 1).
        p_at_truth = p_values[xi - 1]
        coverages_95.append(p_at_truth > 0.05)
        coverages_50.append(p_at_truth > 0.50)
        widths_95.append(np.sum(p_values > 0.05))
        widths_50.append(np.sum(p_values > 0.50))
        detected_cps.append(np.argmax(p_values) + 1)

        # The description is only refreshed from the second simulation on.
        if sim_idx == 0:
            continue

        running_cov_95 = np.mean(coverages_95)
        running_cov_50 = np.mean(coverages_50)
        running_width_95 = np.mean(widths_95)
        running_error = np.mean([abs(cp - xi) for cp in detected_cps])

        pbar.set_description(
            f"Cov95:{running_cov_95:.3f} Cov50:{running_cov_50:.3f} W95:{running_width_95:.1f} Err:{running_error:.1f}"
        )

    return all_p_values


In [None]:
# Long-running: 1000 independent simulations, each scoring all 499 candidate splits.
all_p_values = run_simulation_study(n_simulations=1000, n=500, xi=200)

In [None]:
# True changepoint used by run_simulation_study above. It gets its own name:
# rebinding the global `xi` here would silently change the changepoint passed
# by every later cell that calls `..._simulation(n, xi)`.
xi_study = 200
n_simulations, n_minus_1 = all_p_values.shape

# Coverage: does the p-value at the true split exceed the threshold?
coverage_95 = all_p_values[:, xi_study - 1] > 0.05
coverage_50 = all_p_values[:, xi_study - 1] > 0.50

# Width: number of candidate splits retained in each confidence set.
widths_95 = np.sum(all_p_values > 0.05, axis=1)
widths_50 = np.sum(all_p_values > 0.50, axis=1)

# Point estimate per simulation: argmax of the p-values (+1 for 1-based t).
detected_cps = np.argmax(all_p_values, axis=1) + 1
detection_errors = np.abs(detected_cps - xi_study)

print(f"95% Coverage: {np.mean(coverage_95):.3f}")
print(f"50% Coverage: {np.mean(coverage_50):.3f}")
print(f"Average Width (95%): {np.mean(widths_95):.1f}")
print(f"Average Width (50%): {np.mean(widths_50):.1f}")
print(f"Average Detection Error: {np.mean(detection_errors):.1f}")

In [None]:
# Distribution of the 95% confidence-set sizes across simulations.
plt.hist(widths_95)
plt.xlabel("Size of 95% confidence set")
plt.ylabel("Number of simulations")
plt.show()

In [None]:
# Distribution of the absolute error of the argmax changepoint estimate.
plt.hist(detection_errors)
plt.xlabel("Absolute detection error")
plt.ylabel("Number of simulations")
plt.show()

In [None]:
# Distribution of the detected changepoints; the dashed line marks the true one (200).
plt.hist(detected_cps)
plt.axvline(x=200, color='red', linestyle='--')
plt.xlabel("Detected changepoint")
plt.ylabel("Number of simulations")
plt.show()

In [None]:
# Persist the (n_simulations, n - 1) p-value matrix so the study need not be re-run.
np.save('all_p_values.npy', all_p_values)

# MNIST digit change

In [None]:
def get_mnist_trained_model(device="cpu"):
    """Train a small CNN on the MNIST training split for one epoch and return it.

    Inputs are normalised with ``Normalize((0.1307,), (0.3081,))`` during
    training. The dataset is downloaded to ``./data`` if it is not already
    there.

    Parameters
    ----------
    device : str
        Torch device the model and the batches are moved to.

    Returns
    -------
    torch.nn.Module
        The trained model, switched to eval mode (dropout disabled).
    """

    class MNISTModel(torch.nn.Module):
        """Two convolutions, max-pooling and two linear layers; outputs 10 logits."""

        def __init__(self):
            super().__init__()
            self.conv1 = torch.nn.Conv2d(1, 32, 3, 1)
            self.conv2 = torch.nn.Conv2d(32, 64, 3, 1)
            self.dropout1 = torch.nn.Dropout(0.25)
            self.dropout2 = torch.nn.Dropout(0.5)
            # 9216 = 64 channels * 12 * 12 after two 3x3 convs and 2x2 pooling on 28x28 inputs.
            self.fc1 = torch.nn.Linear(9216, 128)
            self.fc2 = torch.nn.Linear(128, 10)

        def forward(self, x):
            x = self.conv1(x)
            x = torch.nn.functional.relu(x)
            x = self.conv2(x)
            x = torch.nn.functional.relu(x)
            x = torch.nn.functional.max_pool2d(x, 2)
            x = self.dropout1(x)
            x = torch.flatten(x, 1)
            x = self.fc1(x)
            x = torch.nn.functional.relu(x)
            x = self.dropout2(x)
            x = self.fc2(x)
            return x

    model = MNISTModel().to(device)

    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
    )

    train_dataset = MNIST(root="./data", train=True, download=True, transform=transform)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64)

    optimizer = torch.optim.Adam(model.parameters())
    criterion = torch.nn.CrossEntropyLoss()

    print("Training MNIST model...")
    model.train()
    for epoch in range(1):
        for batch_idx, (data, target) in enumerate(tqdm(train_loader)):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            if batch_idx % 100 == 0:
                # "\t" is a real tab; the previous "\\t" printed a literal backslash-t.
                print(
                    f"Epoch: {epoch} [{batch_idx*len(data)}/{len(train_loader.dataset)} "
                    f"({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}"
                )

    model.eval()
    return model


def predict_digit(model, image, device="cpu"):
    """Classify a single 28x28 image and return the prediction with its class probabilities.

    Parameters
    ----------
    model : callable
        Maps a ``(1, 1, 28, 28)`` float tensor to ``(1, n_classes)`` logits.
    image : array-like
        784 pixel values in any shape that reshapes to ``(1, 1, 28, 28)``.
        Assumed to be scaled to [0, 1], as produced by
        ``generate_mnist_dataset``. TODO confirm for any other caller.
    device : str
        Torch device used for the forward pass.

    Returns
    -------
    tuple
        ``(predicted, outputs)``: the argmax class as an ``int`` and the
        softmax probabilities as a CPU tensor of shape ``(1, n_classes)``.
    """
    image_tensor = torch.tensor(image, dtype=torch.float32, device=device).reshape(1, 1, 28, 28)
    # Apply the normalisation the classifier was trained with
    # (Normalize((0.1307,), (0.3081,)) in get_mnist_trained_model); feeding
    # raw [0, 1] pixels would evaluate the network off its training distribution.
    image_tensor = (image_tensor - 0.1307) / 0.3081

    with torch.no_grad():
        outputs = torch.softmax(model(image_tensor), dim=1).cpu()
        predicted = outputs.argmax(dim=1).item()
    return (predicted, outputs)


def generate_mnist_dataset(length, changepoint, digit1=3, digit2=7):
    """Build a sequence of MNIST images that switches from ``digit1`` to ``digit2``.

    The first ``changepoint + 1`` images are randomly chosen training images
    of ``digit1``; the remaining ``length - changepoint - 1`` are images of
    ``digit2``.

    Returns
    -------
    np.ndarray
        ``float32`` array of shape ``(length, 784)`` with pixels scaled to [0, 1].

    Raises
    ------
    ValueError
        If MNIST does not contain enough images of either digit.
    """
    mnist_data = MNIST(root="./data", train=True, download=True, transform=transforms.ToTensor())
    pixels = mnist_data.data.numpy()
    labels = mnist_data.targets.numpy()

    # Boolean indexing copies, so shuffling does not touch the dataset itself.
    images_digit1 = pixels[labels == digit1]
    images_digit2 = pixels[labels == digit2]
    for pool in (images_digit1, images_digit2):
        np.random.shuffle(pool)

    n1 = changepoint + 1
    n2 = length - n1
    if n1 > len(images_digit1) or n2 > len(images_digit2):
        raise ValueError("Insufficient images for the specified digits and length.")

    x = np.concatenate([images_digit1[:n1], images_digit2[:n2]], axis=0)

    # Flatten each image and rescale the uint8 pixels to [0, 1].
    return x.reshape(length, -1).astype(np.float32) / 255.0

In [None]:
def compute_sequential_scores(x, model, device="cpu"):
    """Classify every image and build the left/right score matrices for all splits.

    For a split after index ``t`` and a point ``r``:

    * ``left_scores[t, r]`` (``r <= t``) is the probability of the most
      frequent predicted digit among ``predictions[:r+1]`` divided by the
      probability of the most frequent predicted digit among
      ``predictions[t+1:]``, both evaluated on image ``r``;
    * ``right_scores[t, r]`` (``r > t``) is the probability of the most
      frequent predicted digit among ``predictions[r:]`` divided by the
      probability of the most frequent predicted digit among
      ``predictions[:t+1]``.

    Ties between equally frequent digits go to the digit that appears first
    in the segment. All prefix and suffix modes are computed once up front
    instead of being recounted for every ``(t, r)`` pair.

    Returns
    -------
    tuple
        ``(left_scores, right_scores, predictions, probabilities)`` where the
        score matrices have shape ``(length - 1, length)`` (entries outside
        the ranges above stay 0), ``predictions`` is a list of ints and
        ``probabilities`` is a ``(length, n_classes)`` tensor.
    """
    length = len(x)

    print("Getting model predictions...")
    predictions = []
    probabilities = []

    for i in tqdm(range(length)):
        pred, prob = predict_digit(model, x[i], device)
        predictions.append(pred)
        probabilities.append(prob.squeeze())

    probabilities = torch.stack(probabilities)

    # prefix_mode[r]: most frequent digit in predictions[:r+1]. Dict insertion
    # order equals first-appearance order, and max() returns the first
    # maximum, so ties go to the digit seen first.
    prefix_mode = []
    counts = {}
    for digit in predictions:
        counts[digit] = counts.get(digit, 0) + 1
        prefix_mode.append(max(counts, key=counts.get))

    # suffix_mode[r]: most frequent digit in predictions[r:], ties going to
    # the digit whose first appearance inside the suffix is earliest.
    suffix_mode = [None] * length
    counts = {}
    first_seen = {}
    for r in range(length - 1, -1, -1):
        digit = predictions[r]
        counts[digit] = counts.get(digit, 0) + 1
        first_seen[digit] = r
        suffix_mode[r] = max(counts, key=lambda d: (counts[d], -first_seen[d]))

    probs = probabilities.detach().cpu().numpy()
    rows = np.arange(length)
    left_baseline = probs[rows, prefix_mode]
    right_baseline = probs[rows, suffix_mode]

    left_scores = np.zeros((length - 1, length))
    right_scores = np.zeros((length - 1, length))

    for t in range(length - 1):
        # Reference digit for the left side: the mode of everything after t.
        left_reference = suffix_mode[t + 1]
        left_scores[t, : t + 1] = left_baseline[: t + 1] / (probs[: t + 1, left_reference] + 1e-10)

        # Reference digit for the right side: the mode of everything up to t.
        right_reference = prefix_mode[t]
        right_scores[t, t + 1 :] = right_baseline[t + 1 :] / (probs[t + 1 :, right_reference] + 1e-10)

    return left_scores, right_scores, predictions, probabilities

In [None]:
# Trains on the CPU (the function's default device); `model` is read as a global by run_mnist_simulation.
model = get_mnist_trained_model()

In [None]:
def run_mnist_simulation(n, xi):
    """Generate an MNIST digit-change sequence and return a p-value for every split.

    Uses the module-level ``model`` to score the images. For each split
    ``t = 0, ..., n - 2`` the randomized rank p-values on each side are
    tested for uniformity with an exact KS test and the smaller KS p-value
    ``m`` is combined as ``1 - (1 - m) ** 2``.

    Returns
    -------
    np.ndarray
        Array of ``n - 1`` p-values.
    """
    x = generate_mnist_dataset(n, xi)
    left_scores, right_scores, _predictions, _probabilities = compute_sequential_scores(x, model)

    def randomized_rank(score, segment):
        # Count strictly larger scores; ties (including the point itself)
        # are broken with a uniform draw.
        larger = np.sum(score < segment)
        return larger + np.random.uniform(0, 1) * np.sum(score == segment)

    p_values = np.zeros(n - 1)
    for t in tqdm(range(n - 1)):
        left_row = left_scores[t]
        right_row = right_scores[t]
        p_values_curr = np.zeros(n)

        for r in range(t + 1):
            p_values_curr[r] = randomized_rank(left_row[r], left_row[: r + 1]) / (r + 1)

        # Right side is walked from the last observation back to t + 1.
        for r in range(n - 1, t, -1):
            p_values_curr[r] = randomized_rank(right_row[r], right_row[r:]) / (n - r)

        p_left = ks_1samp(p_values_curr[: t + 1], uniform.cdf, method="exact")[1]
        p_right = ks_1samp(p_values_curr[t + 1 :], uniform.cdf, method="exact")[1]

        p_values[t] = 1 - (1 - min(p_left, p_right)) ** 2

    return p_values

# p-values for the MNIST digit-change sequence, using the current global n and xi.
p_values = run_mnist_simulation(n, xi)

In [None]:
# p-value curve over the candidate changepoints t = 1, ..., n - 1.
time_indices_p = np.arange(1, n)
plt.plot(time_indices_p, p_values)
# The legend text is built from `xi` so it cannot disagree with the line drawn at x=xi.
plt.axvline(x=xi, color="red", linestyle="--", label=f"Changepoint ($\\xi = {xi}$)")
plt.axhline(
    0.05, color="green", linestyle=":", label="Threshold ($\\alpha = 0.05$)"
)
plt.xlabel("$t$")
plt.title("p-values for MNIST digit change (digit classifier)")
plt.legend()
plt.savefig("images/mnist-pvalues.pdf")

plt.show()

# Point estimate: the t with the largest p-value (+1 turns the 0-based index into t).
detected_changepoint = np.argmax(p_values) + 1
print(f"\nTrue change point: {xi}")
print(f"Detected change point: {detected_changepoint}")
print(f"Detection error: {abs(detected_changepoint - xi)}")
print(f"Size of confidence set: {np.sum(p_values > 0.05)}")
print(f"Changepoint in confidence set: {p_values[xi-1] > 0.05}")
# np.where returns a tuple of index arrays; take element 0 before shifting to
# 1-based t (adding 1 to the tuple itself raises TypeError).
print(f"CI: {np.where(p_values > 0.05)[0] + 1}")

# SST-2 sentiment change (LLM)

In [None]:
def get_pretrained_sentiment_model(device="cpu"):
    """Load the DistilBERT SST-2 sentiment classifier and its tokenizer.

    The model is moved to ``device`` and put in eval mode.

    Returns
    -------
    tuple
        ``(model, tokenizer)``.
    """
    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

    model.to(device)
    model.eval()
    return model, tokenizer


def generate_sentiment_dataset(length, changepoint, dataset_name="sst2"):
    """Build a text sequence that switches from positive to negative sentiment.

    The first ``changepoint + 1`` texts are randomly chosen positive
    (label 1) training sentences; the remaining ``length - changepoint - 1``
    are negative (label 0).

    Returns
    -------
    tuple
        ``(texts, true_labels)``: two lists of length ``length``.

    Raises
    ------
    ValueError
        If the training split does not contain enough sentences of either label.
    """
    train_data = load_dataset(dataset_name)["train"]

    # One pass over the split, bucketing sentences by label.
    positive_texts = []
    negative_texts = []
    for item in train_data:
        if item["label"] == 1:
            positive_texts.append(item["sentence"])
        elif item["label"] == 0:
            negative_texts.append(item["sentence"])

    random.shuffle(positive_texts)
    random.shuffle(negative_texts)

    n1 = changepoint + 1
    n2 = length - n1

    if n1 > len(positive_texts) or n2 > len(negative_texts):
        raise ValueError("Insufficient texts for the specified length and changepoint.")

    texts = positive_texts[:n1] + negative_texts[:n2]
    true_labels = [1] * n1 + [0] * n2

    return texts, true_labels


def generate_mixed_sentiment_dataset(
    length, changepoint, dataset_name="sst2", pos_frac_pre=0.6, pos_frac_post=0.4
):
    """Build a text sequence whose share of positive sentences changes at the changepoint.

    The first ``changepoint + 1`` texts contain a fraction ``pos_frac_pre``
    of positive sentences (rounded down); the remaining texts contain a
    fraction ``pos_frac_post``. Within each segment the order is shuffled,
    and no sentence is used in both segments.

    Parameters
    ----------
    length : int
        Total number of texts.
    changepoint : int
        0-based index of the last pre-change text.
    dataset_name : str
        Dataset passed to ``load_dataset``; its ``train`` split is used.
    pos_frac_pre, pos_frac_post : float
        Share of positive texts before and after the change. The defaults
        reproduce the 60% -> 40% mix used in this notebook.

    Returns
    -------
    tuple
        ``(texts, true_labels)``: two lists of length ``length``.

    Raises
    ------
    ValueError
        If the training split does not contain enough sentences of either label.
    """
    dataset = load_dataset(dataset_name)

    train_data = dataset["train"]
    positive_texts = [item["sentence"] for item in train_data if item["label"] == 1]
    negative_texts = [item["sentence"] for item in train_data if item["label"] == 0]

    random.shuffle(positive_texts)
    random.shuffle(negative_texts)

    n_pre = changepoint + 1
    n_post = length - n_pre

    n_pos_pre = int(n_pre * pos_frac_pre)
    n_neg_pre = n_pre - n_pos_pre

    n_pos_post = int(n_post * pos_frac_post)
    n_neg_post = n_post - n_pos_post

    if n_pos_pre + n_pos_post > len(positive_texts) or n_neg_pre + n_neg_post > len(
        negative_texts
    ):
        raise ValueError(
            "Insufficient texts for the specified distribution and length."
        )

    def _shuffled_segment(pos_part, neg_part):
        # Pair every text with its label, shuffle the pairs together, then
        # split them again. Unlike `zip(*pairs)`, this also works when the
        # segment is empty.
        pairs = [(text, 1) for text in pos_part] + [(text, 0) for text in neg_part]
        random.shuffle(pairs)
        return [text for text, _ in pairs], [label for _, label in pairs]

    pre_texts, pre_labels = _shuffled_segment(
        positive_texts[:n_pos_pre], negative_texts[:n_neg_pre]
    )
    # The post-change segment continues where the pre-change slices stopped,
    # so the two segments never share a sentence.
    post_texts, post_labels = _shuffled_segment(
        positive_texts[n_pos_pre : n_pos_pre + n_pos_post],
        negative_texts[n_neg_pre : n_neg_pre + n_neg_post],
    )

    texts = pre_texts + post_texts
    true_labels = pre_labels + post_labels

    return texts, true_labels


def predict_sentiment(model, tokenizer, text, device="cpu"):
    """Classify one text and return the predicted label with its class probabilities.

    The text is tokenized (truncated to 512 tokens), moved to ``device`` and
    passed through ``model`` without gradient tracking.

    Returns
    -------
    tuple
        ``(predicted, probs)``: the argmax class index as an ``int`` and the
        softmax probabilities as a squeezed CPU tensor.
    """
    encoded = tokenizer(
        text, return_tensors="pt", padding=True, truncation=True, max_length=512
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        logits = model(**encoded).logits
        probs = torch.softmax(logits, dim=1).cpu()

    predicted = probs.argmax(dim=1).item()
    return (predicted, probs.squeeze())

## Plot examples

In [None]:
# Sequence length and changepoint for the illustrative examples.
length, changepoint = 1000, 400
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE: from here on the global `model` is the sentiment classifier.
model, tokenizer = get_pretrained_sentiment_model(device)

print("Generating sentiment dataset...")
texts, true_labels = generate_sentiment_dataset(length, changepoint)

print("Getting predictions...")
predictions = []
probabilities = []

for text in tqdm(texts):
    pred, prob = predict_sentiment(model, tokenizer, text, device)
    predictions.append(pred)
    probabilities.append(prob)

probabilities = torch.stack(probabilities)

# Print three random examples from each side of the changepoint:
# (position, sentiment in the header, true label, sampling range [low, high)).
example_groups = [
    ("before", "positive", "Positive", 0, changepoint),
    ("after", "negative", "Negative", changepoint + 1, length),
]
for position, sentiment, true_name, low, high in example_groups:
    print(f"\nExamples {position} changepoint ({sentiment}):")
    for i in range(3):
        idx = np.random.randint(low, high)
        predicted_name = "Positive" if predictions[idx] == 1 else "Negative"
        print(f'Text {i+1}: "{texts[idx]}"')
        print(f"True label: {true_name}, Predicted: {predicted_name}")
        print(f"Confidence: {probabilities[idx][predictions[idx]]:.4f}\n")

In [None]:
def compute_sequential_sentiment_scores(texts, model, tokenizer, device="cpu"):
    """Classify every text and build the left/right score matrices for all splits.

    The score of point ``r`` is the odds ``p / (1 - p)`` of the majority
    predicted sentiment of its segment, evaluated on text ``r``:

    * left (``r <= t``): majority among ``predictions[:r+1]``;
    * right (``r > t``): majority among ``predictions[r:]``.

    Ties go to sentiment 0. The scores do not depend on the split ``t``, so
    they are computed once and copied into each row, rather than being
    recounted for every ``(t, r)`` pair.

    Returns
    -------
    tuple
        ``(left_scores, right_scores, predictions, probabilities)`` where the
        score matrices have shape ``(length - 1, length)``; row ``t`` of
        ``left_scores`` is filled for columns ``0..t`` and row ``t`` of
        ``right_scores`` for columns ``t+1..length-1`` (other entries stay 0).
    """
    length = len(texts)

    print("Getting model predictions...")
    predictions = []
    probabilities = []

    for text in tqdm(texts):
        pred, prob = predict_sentiment(model, tokenizer, text, device)
        predictions.append(pred)
        probabilities.append(prob)

    probabilities = torch.stack(probabilities)

    # Running majority of predictions[:r+1]. max() returns the first maximal
    # key, so a tie resolves to sentiment 0.
    prefix_majority = []
    counts = {0: 0, 1: 0}
    for sentiment in predictions:
        counts[sentiment] += 1
        prefix_majority.append(max(counts, key=counts.get))

    # Running majority of predictions[r:], built from the end backwards.
    suffix_majority = [0] * length
    counts = {0: 0, 1: 0}
    for r in range(length - 1, -1, -1):
        counts[predictions[r]] += 1
        suffix_majority[r] = max(counts, key=counts.get)

    probs = probabilities.detach().cpu().numpy()
    rows = np.arange(length)

    # Odds of the majority sentiment; 1e-10 guards against division by zero.
    left_baseline = probs[rows, prefix_majority]
    right_baseline = probs[rows, suffix_majority]
    left_odds = left_baseline / (1 - left_baseline + 1e-10)
    right_odds = right_baseline / (1 - right_baseline + 1e-10)

    left_scores = np.zeros((length - 1, length))
    right_scores = np.zeros((length - 1, length))
    for t in range(length - 1):
        left_scores[t, : t + 1] = left_odds[: t + 1]
        right_scores[t, t + 1 :] = right_odds[t + 1 :]

    return left_scores, right_scores, predictions, probabilities

## Full sentiment change (pos. to neg.)

In [None]:
def run_sentiment_simulation(n, xi):
    """Generate a positive-to-negative SST-2 sequence and return a p-value for every split.

    Uses the module-level ``model``, ``tokenizer`` and ``device``. For each
    split ``t = 0, ..., n - 2`` the randomized rank p-values on each side are
    tested for uniformity with an exact KS test and the smaller KS p-value
    ``m`` is combined as ``1 - (1 - m) ** 2``.

    Returns
    -------
    np.ndarray
        Array of ``n - 1`` p-values.
    """
    texts, _labels = generate_sentiment_dataset(n, xi)
    left_scores, right_scores, _predictions, _probabilities = compute_sequential_sentiment_scores(
        texts, model, tokenizer, device
    )

    def randomized_rank(score, segment):
        # Count strictly larger scores; ties (including the point itself)
        # are broken with a uniform draw.
        larger = np.sum(score < segment)
        return larger + np.random.uniform(0, 1) * np.sum(score == segment)

    p_values = np.zeros(n - 1)
    for t in tqdm(range(n - 1)):
        left_row = left_scores[t]
        right_row = right_scores[t]
        p_values_curr = np.zeros(n)

        for r in range(t + 1):
            p_values_curr[r] = randomized_rank(left_row[r], left_row[: r + 1]) / (r + 1)

        for r in range(t + 1, n):
            p_values_curr[r] = randomized_rank(right_row[r], right_row[r:]) / (n - r)

        p_left = ks_1samp(p_values_curr[: t + 1], uniform.cdf, method="exact")[1]
        p_right = ks_1samp(p_values_curr[t + 1 :], uniform.cdf, method="exact")[1]

        p_values[t] = 1 - (1 - min(p_left, p_right)) ** 2

    return p_values


# p-values for the full positive-to-negative sentiment change, using the current global n and xi.
p_values = run_sentiment_simulation(n, xi)

In [None]:
# p-value curve over the candidate changepoints t = 1, ..., n - 1.
time_indices_p = np.arange(1, n)
plt.plot(time_indices_p, p_values)
# The legend text is built from `xi` so it cannot disagree with the line drawn at x=xi.
plt.axvline(x=xi, color="red", linestyle="--", label=f"Changepoint ($\\xi = {xi}$)")
plt.axhline(0.05, color="green", linestyle=":", label="Threshold ($\\alpha = 0.05$)")
plt.xlabel("$t$")
plt.title("p-values for SST-2 sentiment change")
plt.legend()
plt.savefig("images/sentiment-pvalues.pdf")

plt.show()

# Point estimate: the t with the largest p-value (+1 turns the 0-based index into t).
detected_changepoint = np.argmax(p_values) + 1
print(f"\nTrue change point: {xi}")
print(f"Detected change point: {detected_changepoint}")
print(f"Detection error: {abs(detected_changepoint - xi)}")
# Confidence set: every t whose p-value exceeds the 0.05 threshold.
print(f"Size of confidence set: {np.sum(p_values > 0.05)}")
print(f"Changepoint in confidence set: {p_values[xi-1] > 0.05}")
print(f"CI: {np.where(p_values > 0.05)[0] + 1}")

## Mixed sentiment change (60% pos. to 60% neg.)

In [None]:
def run_mixed_sentiment_simulation(n, xi):
    """Generate a mixed-sentiment SST-2 sequence and return a p-value for every split.

    Same procedure as the full sentiment change, but the data come from
    ``generate_mixed_sentiment_dataset``. Uses the module-level ``model``,
    ``tokenizer`` and ``device``.

    Returns
    -------
    np.ndarray
        Array of ``n - 1`` p-values, one per split ``t = 0, ..., n - 2``.
    """
    texts, _labels = generate_mixed_sentiment_dataset(n, xi)
    scores = compute_sequential_sentiment_scores(texts, model, tokenizer, device)
    left_scores, right_scores = scores[0], scores[1]

    def randomized_rank(score, segment):
        # Count strictly larger scores; ties (including the point itself)
        # are broken with a uniform draw.
        larger = np.sum(score < segment)
        return larger + np.random.uniform(0, 1) * np.sum(score == segment)

    p_values = np.zeros(n - 1)
    for t in tqdm(range(n - 1)):
        left_row = left_scores[t]
        right_row = right_scores[t]
        p_values_curr = np.zeros(n)

        for r in range(t + 1):
            p_values_curr[r] = randomized_rank(left_row[r], left_row[: r + 1]) / (r + 1)

        for r in range(t + 1, n):
            p_values_curr[r] = randomized_rank(right_row[r], right_row[r:]) / (n - r)

        # Exact KS tests of uniformity on each side of the split.
        p_left = ks_1samp(p_values_curr[: t + 1], uniform.cdf, method="exact")[1]
        p_right = ks_1samp(p_values_curr[t + 1 :], uniform.cdf, method="exact")[1]

        p_values[t] = 1 - (1 - min(p_left, p_right)) ** 2

    return p_values


# p-values for the mixed sentiment change, using the current global n and xi.
p_values = run_mixed_sentiment_simulation(n, xi)

In [None]:
# p-value curve over the candidate changepoints t = 1, ..., n - 1.
time_indices_p = np.arange(1, n)
plt.plot(time_indices_p, p_values)
# The legend text is built from `xi` so it cannot disagree with the line drawn at x=xi.
plt.axvline(x=xi, color="red", linestyle="--", label=f"Changepoint ($\\xi = {xi}$)")
plt.axhline(0.05, color="green", linestyle=":", label="Threshold ($\\alpha = 0.05$)")
plt.xlabel("$t$")
plt.title("p-values for SST-2 mixed sentiment change")
plt.legend()
plt.savefig("images/sentiment-pvalues-mixed.pdf")

plt.show()

# Point estimate: the t with the largest p-value (+1 turns the 0-based index into t).
detected_changepoint = np.argmax(p_values) + 1
print(f"\nTrue change point: {xi}")
print(f"Detected change point: {detected_changepoint}")
print(f"Detection error: {abs(detected_changepoint - xi)}")
# Confidence set: every t whose p-value exceeds the 0.05 threshold.
print(f"Size of confidence set: {np.sum(p_values > 0.05)}")
print(f"Changepoint in confidence set: {p_values[xi-1] > 0.05}")
print(f"CI: {np.where(p_values > 0.05)[0] + 1}")

# Human Activity Dataset (HAD) change

In [None]:
def load_had_dataset(max_walking=400, max_sitting=600):
    """Download (if needed) the UCI HAR dataset and build a walking-to-sitting series.

    The series is the first feature column of ``X_train.txt`` for up to
    ``max_walking`` samples with label 1, followed by up to ``max_sitting``
    samples with label 4.

    Parameters
    ----------
    max_walking, max_sitting : int
        Upper bounds on the number of samples taken from each activity. The
        defaults reproduce the 400 + 600 series used in this notebook.

    Returns
    -------
    tuple
        ``(timeseries, true_changepoint)`` where ``true_changepoint`` is the
        number of walking samples at the start of the series.

    Raises
    ------
    FileNotFoundError
        If the training files are missing after download and extraction.
    ValueError
        If no samples are available for one of the two activities.
    """
    import urllib.request
    import zipfile
    import os

    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip"

    os.makedirs("./data", exist_ok=True)

    if not os.path.exists("./data/UCI_HAR_Dataset.zip"):
        print("Downloading Human Activity Dataset...")
        urllib.request.urlretrieve(url, "./data/UCI_HAR_Dataset.zip")

    if not os.path.exists("./data/UCI HAR Dataset"):
        print("Extracting dataset...")
        with zipfile.ZipFile("./data/UCI_HAR_Dataset.zip", 'r') as zip_ref:
            zip_ref.extractall("./data")

    train_file = "./data/UCI HAR Dataset/train/X_train.txt"
    train_labels_file = "./data/UCI HAR Dataset/train/y_train.txt"

    # Fail loudly instead of falling through and returning None, which the
    # caller would only notice as an opaque tuple-unpacking error.
    for required in (train_file, train_labels_file):
        if not os.path.exists(required):
            raise FileNotFoundError(f"Expected HAR training file not found: {required}")

    X_train = np.loadtxt(train_file)
    y_train = np.loadtxt(train_labels_file)

    walking_indices = np.where(y_train == 1)[0]
    sitting_indices = np.where(y_train == 4)[0]

    n_walking = min(max_walking, len(walking_indices))
    n_sitting = min(max_sitting, len(sitting_indices))

    if n_walking <= 0 or n_sitting <= 0:
        raise ValueError(
            "Need at least one walking and one sitting sample "
            f"(got {n_walking} walking, {n_sitting} sitting)."
        )

    selected_walking = walking_indices[:n_walking]
    selected_sitting = sitting_indices[:n_sitting]
    combined_indices = np.concatenate([selected_walking, selected_sitting])

    # Univariate series: first feature column only.
    timeseries = X_train[combined_indices, 0]
    true_changepoint = n_walking

    return timeseries, true_changepoint


print("Loading Human Activity Dataset...")
# NOTE: this rebinds the global `timeseries` (previously the simulated Gaussian series).
timeseries, true_changepoint = load_had_dataset()

n_had, xi_had = len(timeseries), true_changepoint

print("HAD Dataset loaded:")
print(f"Total length: {n_had} samples")
print(f"True changepoint: {xi_had} (activity transition)")

# Raw series on a 1-based time axis; the dashed line is drawn at xi_had + 1.
time_indices = np.arange(n_had) + 1
plt.plot(time_indices, timeseries)
plt.axvline(x=xi_had + 1, color="red", linestyle="--", label=f"Changepoint ($\\xi = {xi_had}$)")
plt.xlabel("$t$")
plt.ylabel("Accelerometer reading")
plt.title("Human Activity Recognition accelerometer data")
plt.legend()
plt.savefig("images/had-examples.pdf")
plt.show()

# Summary statistics on each side of the changepoint.
pre_change, post_change = timeseries[:xi_had], timeseries[xi_had:]
pre_mean, post_mean = np.mean(pre_change), np.mean(post_change)

print("\nDataset Statistics:")
print(f"Pre-change activity mean: {pre_mean:.3f}, std: {np.std(pre_change):.3f}")
print(f"Post-change activity mean: {post_mean:.3f}, std: {np.std(post_change):.3f}")
print(f"Mean difference: {pre_mean - post_mean:.3f}")

In [None]:
def run_had_simulation_kde():
    """Compute a changepoint p-value for every candidate split of ``timeseries``.

    Reads the module-level ``timeseries`` and relies on ``compute_kde`` (defined
    elsewhere in the notebook; presumably a ``scipy.stats.gaussian_kde`` wrapper
    that is deterministic given its data -- TODO confirm).

    For a candidate split ``t`` (0-based; left segment ``timeseries[: t + 1]``):

    * left score of point ``r <= t``: density of ``timeseries[r]`` under a KDE
      fitted on ``timeseries[t + 1 :]`` divided by its density under a KDE
      fitted on ``timeseries[: r + 1]``;
    * right score of point ``r > t``: density under a KDE fitted on
      ``timeseries[: t + 1]`` divided by its density under a KDE fitted on
      ``timeseries[r:]``.

    Scores are turned into randomized rank p-values, each side is tested for
    uniformity with an exact one-sample KS test, and the two KS p-values are
    combined as ``1 - (1 - min(p_left, p_right)) ** 2``.

    Returns
    -------
    numpy.ndarray
        Array of length ``n - 1``; entry ``t`` is the combined p-value for the
        split after 0-based index ``t``.
    """
    import warnings

    def _point_density(kde, value):
        # The KDE may hand back a length-1 array; reduce it to a plain float so
        # we never rely on NumPy's deprecated size-1-array-to-scalar coercion.
        return float(np.ravel(kde(value))[0])

    n = len(timeseries)
    p_values = np.zeros(n - 1)

    # The "own segment" densities depend only on r, never on the split t, so
    # each KDE is fitted once here instead of once per (t, r) pair.
    prefix_density = np.zeros(n)  # density of x[r] under KDE(x[: r + 1])
    for r in range(n - 1):
        prefix_density[r] = _point_density(compute_kde(timeseries[: r + 1]), timeseries[r])

    suffix_density = np.zeros(n)  # density of x[r] under KDE(x[r:])
    for r in range(1, n):
        suffix_density[r] = _point_density(compute_kde(timeseries[r:]), timeseries[r])

    for t in tqdm(range(n - 1)):
        left_scores = np.zeros(n)
        right_scores = np.zeros(n)

        # The "other segment" KDEs depend only on t; fit each once per split.
        # Both segments are non-empty because t ranges over 0 .. n - 2.
        kde_after = compute_kde(timeseries[t + 1 :])
        kde_before = compute_kde(timeseries[: t + 1])

        for r in range(t + 1):
            left_scores[r] = _point_density(kde_after, timeseries[r]) / (
                prefix_density[r] + 1e-10
            )

        for r in range(t + 1, n):
            right_scores[r] = _point_density(kde_before, timeseries[r]) / (
                suffix_density[r] + 1e-10
            )

        p_values_curr = np.zeros(n)

        # Randomized rank of each left score among the scores seen so far.
        for r in range(t + 1):
            left_segment_scores = left_scores[: r + 1]
            rank = np.sum(left_scores[r] < left_segment_scores) + np.random.uniform(
                0, 1
            ) * np.sum(left_scores[r] == left_segment_scores)
            p_values_curr[r] = rank / (r + 1)

        # Randomized rank of each right score among the scores from r onwards.
        for r in range(t + 1, n):
            right_segment_scores = right_scores[r:]
            rank = np.sum(right_scores[r] < right_segment_scores) + np.random.uniform(
                0, 1
            ) * np.sum(right_scores[r] == right_segment_scores)
            p_values_curr[r] = rank / (n - r)

        try:
            # A single-point left segment is not tested; it contributes p = 1.
            if t > 0:
                p_left = ks_1samp(p_values_curr[: t + 1], uniform.cdf, method="exact")[1]
            else:
                p_left = 1.0
            p_right = ks_1samp(p_values_curr[t + 1 :], uniform.cdf, method="exact")[1]
        except Exception as exc:
            # Best-effort fallback kept from the original, but limited to
            # Exception (so Ctrl-C still interrupts the sweep) and reported.
            warnings.warn(
                f"KS test failed for split t={t}: {exc!r}; using p_left = p_right = 1.0",
                RuntimeWarning,
                stacklevel=2,
            )
            p_left = 1.0
            p_right = 1.0

        p_values[t] = 1 - (1 - min(p_left, p_right)) ** 2

    return p_values

# Sweep every candidate split of the HAD series; p_values[t] belongs to the
# split whose left segment is timeseries[: t + 1].
p_values = run_had_simulation_kde()

In [None]:
# p_values[t] (0-based) scores the split whose left segment is timeseries[: t + 1]
# and is plotted at x = t + 1. xi_had is the number of pre-change samples, so the
# true split is p_values[xi_had - 1], i.e. x = xi_had on this axis (not xi_had + 1).
time_indices_p = np.arange(1, n_had)
plt.plot(time_indices_p, p_values)
plt.axvline(x=xi_had, color="red", linestyle="--", label=f"Changepoint ($\\xi = {xi_had}$)")
plt.axhline(0.05, color="green", linestyle=":", label="Threshold ($\\alpha = 0.05$)")
plt.xlabel("$t$")
plt.title("p-values for HAR activity change")
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig("images/had-pvalues.pdf")

plt.show()

# Point estimate (1-based) and the 95% confidence set {t : p_values[t - 1] > 0.05}.
detected_changepoint = np.argmax(p_values) + 1
print(f"\nTrue changepoint: {xi_had} (activity transition)")
print(f"Detected changepoint: {detected_changepoint}")
print(f"Detection error: {abs(detected_changepoint - xi_had)} samples")
print(f"Size of confidence set: {np.sum(p_values > 0.05)}")
print(f"Changepoint in confidence set: {p_values[xi_had-1] > 0.05}")
print(f"CI: {np.where(p_values > 0.05)[0] + 1}")

# CIFAR-100 class change

In [None]:
def get_cifar100_pretrained_model(device="cpu"):
    """Create the pretrained ``resnet18_cifar100`` timm model ready for inference.

    Parameters
    ----------
    device : str
        Device the model is moved to (default ``"cpu"``).

    Returns
    -------
    The model, moved to ``device`` and switched to eval mode.
    """
    # `detectors` is never referenced directly; presumably it is imported for
    # its side effect of registering "resnet18_cifar100" with timm -- confirm.
    import detectors
    import timm

    print("Loading pretrained ResNet18 model from timm and adapting for CIFAR-100...")

    network = timm.create_model("resnet18_cifar100", pretrained=True, num_classes=100)
    return network.to(device).eval()

In [None]:
def predict_cifar100_class(model, image, device="cpu"):
    """Classify a single image and return its softmax probabilities.

    Parameters
    ----------
    model : callable
        Maps a batched image tensor to a 2-D tensor of logits.
    image : torch.Tensor
        A 3-D image (a batch axis is prepended) or a 4-D batch (only the first
        image is used).
    device : str
        Device the input is moved to before the forward pass.

    Returns
    -------
    tuple
        ``(predicted_class, probabilities)`` where ``predicted_class`` is a
        Python int and ``probabilities`` is the squeezed softmax output on CPU.
    """
    n_dims = image.ndim
    if n_dims == 3:
        batch = image.unsqueeze(0)
    elif n_dims == 4 and image.shape[0] != 1:
        batch = image[:1]
    else:
        batch = image

    with torch.no_grad():
        logits = model(batch.to(device))
        class_probs = torch.softmax(logits, dim=1).cpu()

    top_class = class_probs.argmax(dim=1).item()
    return (top_class, class_probs.squeeze())


def generate_cifar100_dataset(length, changepoint, class1=15, class2=47):
    """Build an image sequence that switches from ``class1`` to ``class2``.

    Images are drawn at random (via ``np.random.shuffle``) from the union of the
    CIFAR-100 train and test splits, after ``ToTensor`` and normalisation.

    Parameters
    ----------
    length : int
        Total number of images in the sequence.
    changepoint : int
        0-based index of the last ``class1`` image: the first
        ``changepoint + 1`` images are ``class1``, the remaining
        ``length - changepoint - 1`` are ``class2``.
    class1, class2 : int
        CIFAR-100 label ids before / after the change.

    Returns
    -------
    torch.Tensor
        Stacked images, ``class1`` block first.

    Raises
    ------
    ValueError
        If the requested segment sizes are negative or exceed the number of
        available images of a class.
    """
    from torchvision.datasets import CIFAR100

    n1 = changepoint + 1
    n2 = length - n1
    if n1 < 0 or n2 < 0:
        raise ValueError(
            f"Invalid segment sizes (pre-change={n1}, post-change={n2}) "
            f"for length={length}, changepoint={changepoint}"
        )

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5071, 0.4867, 0.4408], std=[0.2675, 0.2565, 0.2761])
    ])

    train_data = CIFAR100(root="./data", train=True, download=True, transform=transform)
    test_data = CIFAR100(root="./data", train=False, download=True, transform=transform)

    # Read labels from `.targets` so that no image is decoded or normalised
    # until it has actually been selected. Indices run over train then test.
    n_train = len(train_data)
    all_labels = list(train_data.targets) + list(test_data.targets)

    class1_indices = [i for i, label in enumerate(all_labels) if label == class1]
    class2_indices = [i for i, label in enumerate(all_labels) if label == class2]

    if n1 > len(class1_indices) or n2 > len(class2_indices):
        raise ValueError(
            f"Requested {n1} images of class {class1} and {n2} of class {class2}, "
            f"but only {len(class1_indices)} and {len(class2_indices)} are available"
        )

    np.random.shuffle(class1_indices)
    np.random.shuffle(class2_indices)

    def _image_at(index):
        # Map a combined index back to the split it came from.
        if index < n_train:
            return train_data[index][0]
        return test_data[index - n_train][0]

    selected_indices = class1_indices[:n1] + class2_indices[:n2]
    return torch.stack([_image_at(i) for i in selected_indices])

In [None]:
def compute_sequential_cifar100_scores(x, model, device="cpu"):
    """Compute left/right scores for every candidate split of an image sequence.

    Each image is classified with ``predict_cifar100_class``. For a split ``t``
    (left segment = indices ``0 .. t``):

    * ``left_scores[t, r]`` for ``r <= t`` is the probability image ``r`` assigns
      to the most frequent predicted class in ``predictions[: r + 1]``, divided
      by the probability it assigns to the most frequent predicted class in
      ``predictions[t + 1 :]`` (plus ``1e-10``);
    * ``right_scores[t, r]`` for ``r > t`` is the probability image ``r`` assigns
      to the most frequent predicted class in ``predictions[r:]``, divided by
      the probability it assigns to the most frequent predicted class in
      ``predictions[: t + 1]`` (plus ``1e-10``).

    Ties between equally frequent classes go to the class that occurs first in
    the segment being counted. All other entries stay zero.

    Parameters
    ----------
    x : sequence of image tensors
    model : classifier passed through to ``predict_cifar100_class``
    device : str

    Returns
    -------
    tuple
        ``(left_scores, right_scores, predictions, probabilities)`` where the
        score arrays have shape ``(length - 1, length)``, ``predictions`` is a
        list of ints and ``probabilities`` is the stacked probability tensor.
    """
    length = len(x)

    print("Getting CIFAR-100 model predictions...")
    predictions = []
    probabilities = []

    for i in tqdm(range(length)):
        pred, prob = predict_cifar100_class(model, x[i], device)
        predictions.append(pred)
        probabilities.append(prob)

    probabilities = torch.stack(probabilities)

    # Running modes replace the per-(t, r) recount. The key (count, -position)
    # picks, among the most frequent classes, the one occurring earliest in the
    # segment -- the same winner as `max` over an insertion-ordered dict or
    # over the list with `list.count` as key.
    prefix_mode = [0] * length  # mode of predictions[: r + 1]
    counts, first_seen = {}, {}
    for r, label in enumerate(predictions):
        counts[label] = counts.get(label, 0) + 1
        first_seen.setdefault(label, r)
        prefix_mode[r] = max(counts, key=lambda c: (counts[c], -first_seen[c]))

    suffix_mode = [0] * length  # mode of predictions[r:]
    counts, next_seen = {}, {}
    for r in range(length - 1, -1, -1):
        label = predictions[r]
        counts[label] = counts.get(label, 0) + 1
        next_seen[label] = r
        suffix_mode[r] = max(counts, key=lambda c: (counts[c], -next_seen[c]))

    # Work in the tensor's own dtype so the ratios match the element-wise torch
    # arithmetic they replace.
    probs = probabilities.detach().cpu().numpy()
    eps = probs.dtype.type(1e-10)
    rows = np.arange(length)
    prefix_prob = probs[rows, prefix_mode]  # probs[r, prefix_mode[r]]
    suffix_prob = probs[rows, suffix_mode]  # probs[r, suffix_mode[r]]

    left_scores = np.zeros((length - 1, length))

    for t in tqdm(range(length - 1), desc="Computing left scores for each t"):
        reference = suffix_mode[t + 1]
        left_scores[t, : t + 1] = prefix_prob[: t + 1] / (probs[: t + 1, reference] + eps)

    right_scores = np.zeros((length - 1, length))

    for t in tqdm(range(length - 1), desc="Computing right scores for each t"):
        reference = prefix_mode[t]
        right_scores[t, t + 1 :] = suffix_prob[t + 1 :] / (probs[t + 1 :, reference] + eps)

    return left_scores, right_scores, predictions, probabilities

In [None]:
from torchvision.datasets import CIFAR100

# Class-name lookup, taken from the untransformed test split.
cifar100_dataset = CIFAR100(root="./data", train=False, download=True)
class_names = cifar100_dataset.classes

print("CIFAR-100 class names:")
print(f"Class 3: {class_names[3]}")
print(f"Class 4: {class_names[4]}")

transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.5071, 0.4867, 0.4408], std=[0.2675, 0.2565, 0.2761]
        ),
    ]
)

train_data = CIFAR100(root="./data", train=True, download=True, transform=transform)
test_data = CIFAR100(root="./data", train=False, download=True, transform=transform)

# Count labels straight from `.targets`; iterating the datasets would decode
# and normalise every image just to read its label.
class3_count_train = sum(1 for label in train_data.targets if label == 3)
class4_count_train = sum(1 for label in train_data.targets if label == 4)
class3_count_test = sum(1 for label in test_data.targets if label == 3)
class4_count_test = sum(1 for label in test_data.targets if label == 4)

# The class 3 line reports class 3's own test count (it previously printed
# class 4's test count).
print(
    f"\nClass 3 ({class_names[3]}) - Train: {class3_count_train}, Test: {class3_count_test}, Total: {class3_count_train + class3_count_test}"
)
print(
    f"Class 4 ({class_names[4]}) - Train: {class4_count_train}, Test: {class4_count_test}, Total: {class4_count_train + class4_count_test}"
)


def generate_cifar100_dataset_fixed(length, changepoint, class1=3, class2=4):
    """Build a ``class1`` -> ``class2`` CIFAR-100 sequence and return its labels.

    Images are drawn at random (via ``np.random.shuffle``) from the union of the
    CIFAR-100 train and test splits, after ``ToTensor`` and normalisation. A
    short summary of the generated sequence is printed.

    Parameters
    ----------
    length : int
        Total number of images in the sequence.
    changepoint : int
        0-based index of the last ``class1`` image: the first
        ``changepoint + 1`` images are ``class1``, the remaining
        ``length - changepoint - 1`` are ``class2``.
    class1, class2 : int
        CIFAR-100 label ids before / after the change.

    Returns
    -------
    tuple
        ``(images, labels)``: the stacked image tensor and the matching list of
        label ids.

    Raises
    ------
    ValueError
        If the requested segment sizes are negative or exceed the number of
        available images of a class.
    """
    from torchvision.datasets import CIFAR100

    n1 = changepoint + 1
    n2 = length - n1
    if n1 < 0 or n2 < 0:
        raise ValueError(
            f"Invalid segment sizes (pre-change={n1}, post-change={n2}) "
            f"for length={length}, changepoint={changepoint}"
        )

    transform = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.5071, 0.4867, 0.4408], std=[0.2675, 0.2565, 0.2761]
            ),
        ]
    )

    train_data = CIFAR100(root="./data", train=True, download=True, transform=transform)
    test_data = CIFAR100(root="./data", train=False, download=True, transform=transform)

    # Name lookup comes from the dataset itself rather than a notebook global.
    class_names = test_data.classes

    # Select by label via `.targets` (train indices first, then test) so that
    # only the images that end up in the sequence are decoded and normalised.
    n_train = len(train_data)
    combined_targets = list(train_data.targets) + list(test_data.targets)
    class1_indices = [i for i, label in enumerate(combined_targets) if label == class1]
    class2_indices = [i for i, label in enumerate(combined_targets) if label == class2]

    if n1 > len(class1_indices) or n2 > len(class2_indices):
        raise ValueError(
            f"Requested {n1} images of class {class1} and {n2} of class {class2}, "
            f"but only {len(class1_indices)} and {len(class2_indices)} are available"
        )

    # Shuffling the index lists applies the same permutation (same list length,
    # same RNG stream) as shuffling the image lists themselves.
    np.random.shuffle(class1_indices)
    np.random.shuffle(class2_indices)

    def _image_at(index):
        # Map a combined index back to the split it came from.
        if index < n_train:
            return train_data[index][0]
        return test_data[index - n_train][0]

    selected_indices = class1_indices[:n1] + class2_indices[:n2]
    all_images = [_image_at(i) for i in selected_indices]
    all_labels = [class1] * n1 + [class2] * n2

    print(f"Generated dataset: {len(all_images)} images")
    print(f"First {n1} images are class {class1} ({class_names[class1]})")
    print(f"Last {n2} images are class {class2} ({class_names[class2]})")

    actual_labels_start = all_labels[:5]
    actual_labels_end = all_labels[-5:]
    print(f"First 5 labels: {actual_labels_start}")
    print(f"Last 5 labels: {actual_labels_end}")

    return torch.stack(all_images), all_labels


# Build the class 3 -> class 4 sequence using the notebook-level n and xi;
# the generator puts xi + 1 class-3 images first.
x_fixed, labels_fixed = generate_cifar100_dataset_fixed(n, xi, class1=3, class2=4)
print(f"\nSuccessfully generated dataset with shape: {x_fixed.shape}")

def show_cifar100_examples_with_labels(x, labels, changepoint, n_examples=3):
    """Plot five images around the changepoint, titled with time and class name.

    The panels show the 1-based times ``changepoint - 2 .. changepoint + 2``;
    the panel at ``t = changepoint`` is marked as ``\\xi``. Each title carries
    the class name looked up from ``labels``. The figure is saved to
    ``images/cifar100-examples.pdf`` and shown.

    Parameters
    ----------
    x : torch.Tensor
        Normalised images, indexable by position; image ``t`` (1-based) is
        ``x[t - 1]``.
    labels : sequence of int
        CIFAR-100 label id for each image in ``x``.
    changepoint : int
        1-based time index around which the panels are centred.
    n_examples : int
        Accepted for backward compatibility; currently unused.

    Raises
    ------
    ValueError
        If the five-panel window does not fit inside ``x``.
    """
    first_t, last_t = changepoint - 2, changepoint + 2
    if first_t < 1 or last_t > len(x):
        raise ValueError(
            f"changepoint={changepoint} leaves no room for panels t={first_t}..{last_t} "
            f"in a sequence of length {len(x)}"
        )

    mean = torch.tensor([0.5071, 0.4867, 0.4408])
    std = torch.tensor([0.2675, 0.2565, 0.2761])

    def denormalize_image(tensor):
        # Undo the per-channel normalisation and clip to the displayable range.
        denorm = tensor * std[:, None, None] + mean[:, None, None]
        return torch.clamp(denorm, 0, 1)

    fig, axes = plt.subplots(1, 5, figsize=(15, 3))

    class_names = CIFAR100(root="./data", train=False, download=True).classes

    # (time, title) for each panel; only the changepoint panel mentions \xi.
    panels = []
    for t in range(first_t, last_t + 1):
        if t == changepoint:
            panels.append((t, rf"$t = \xi = {t}$"))
        else:
            panels.append((t, rf"$t = {t}$"))

    for ax, (t, title) in zip(axes, panels):
        idx = t - 1  # 1-based time -> 0-based position
        img_denorm = denormalize_image(x[idx])
        class_name = class_names[labels[idx]]

        ax.imshow(img_denorm.permute(1, 2, 0).numpy())
        ax.set_title(f"{title}\n{class_name}")
        ax.axis("off")

    plt.tight_layout()
    plt.savefig("images/cifar100-examples.pdf")
    plt.show()

# Render the example panels for the sequence generated above.
show_cifar100_examples_with_labels(x_fixed, labels_fixed, xi)

In [None]:
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Module-level model, read as a global by run_cifar100_simulation.
cifar100_model = get_cifar100_pretrained_model(device)

In [None]:
# Sequence length and changepoint parameter for the CIFAR-100 experiment.
n_cifar, xi_cifar = 800, 300

# One print call, one line per item.
print(
    "CIFAR-100 simulation parameters:",
    f"Total length: {n_cifar}",
    f"Changepoint: {xi_cifar}",
    f"Pre-change images needed: {xi_cifar + 1}",
    f"Post-change images needed: {n_cifar - xi_cifar - 1}",
    "Available per class: ~600 (500 train + 100 test)",
    sep="\n",
)

In [None]:
def run_cifar100_simulation(n, xi, class1=15, class2=47):
    """Run the CIFAR-100 changepoint experiment and return per-split p-values.

    A sequence of ``n`` images is generated with ``generate_cifar100_dataset``
    and scored with the module-level ``cifar100_model`` on ``device``. For each
    split ``t`` the scores on either side are converted to randomized rank
    p-values, each side is tested for uniformity with an exact one-sample KS
    test, and the two results are combined as ``1 - (1 - min(...)) ** 2``.

    Returns
    -------
    numpy.ndarray
        Array of length ``n - 1``; entry ``t`` belongs to the split whose left
        segment is positions ``0 .. t``.
    """
    images = generate_cifar100_dataset(n, xi, class1, class2)
    left_scores, right_scores, _predictions, _probabilities = (
        compute_sequential_cifar100_scores(images, cifar100_model, device)
    )

    def randomized_rank(score, pool):
        # Strictly larger scores count fully; ties are broken by one uniform draw.
        n_larger = np.sum(score < pool)
        tie_weight = np.random.uniform(0, 1)
        return n_larger + tie_weight * np.sum(score == pool)

    split_p_values = np.zeros(n - 1)
    for t in tqdm(range(n - 1)):
        left_row = left_scores[t]
        right_row = right_scores[t]
        conformal = np.zeros(n)

        # Left side: rank each score among those at or before it.
        for r in range(t + 1):
            conformal[r] = randomized_rank(left_row[r], left_row[: r + 1]) / (r + 1)

        # Right side, walked from the end: rank each score among those at or after it.
        for r in reversed(range(t + 1, n)):
            conformal[r] = randomized_rank(right_row[r], right_row[r:]) / (n - r)

        ks_left = ks_1samp(conformal[: t + 1], uniform.cdf, method="exact")
        ks_right = ks_1samp(conformal[t + 1 :], uniform.cdf, method="exact")

        smaller = min(ks_left[1], ks_right[1])
        split_p_values[t] = 1 - (1 - smaller) ** 2

    return split_p_values

# Run the class 3 -> class 4 experiment; this rebinds the notebook-level p_values.
p_values = run_cifar100_simulation(n_cifar, xi_cifar, class1=3, class2=4)

In [None]:
# generate_cifar100_dataset(n, xi) puts xi + 1 pre-change images first, and
# p_values[t] (0-based) scores the split whose left segment is positions 0 .. t,
# plotted at x = t + 1. The true split is therefore p_values[xi_cifar], i.e.
# x = xi_cifar + 1 on this 1-based axis.
true_t_cifar = xi_cifar + 1

time_indices_p = np.arange(1, n_cifar)
plt.plot(time_indices_p, p_values)
plt.axvline(x=true_t_cifar, color="red", linestyle="--", label=f"Changepoint ($\\xi = {xi_cifar}$)")
plt.axhline(
    0.05, color="green", linestyle=":", label="Threshold ($\\alpha = 0.05$)"
)
plt.xlabel("$t$")
plt.title("p-values for CIFAR-100 class change (pretrained model)")
plt.legend()
plt.savefig("images/cifar100-pvalues.pdf")

plt.show()

# Point estimate (1-based) and the 95% confidence set {t : p_values[t - 1] > 0.05}.
detected_changepoint = np.argmax(p_values) + 1
print(f"\nTrue change point: {true_t_cifar}")
print(f"Detected change point: {detected_changepoint}")
print(f"Detection error: {abs(detected_changepoint - true_t_cifar)}")
print(f"Size of confidence set: {np.sum(p_values > 0.05)}")
print(f"Changepoint in confidence set: {p_values[true_t_cifar - 1] > 0.05}")
print(f"CI: {np.where(p_values > 0.05)[0] + 1}")