In [None]:
import torch
import torch.nn.functional as F
import pandas as pd
from scipy.stats import norm
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from bin_cp.helpers.storage import load_smooth_prediction
from bin_cp.helpers.tensor import get_smooth_scores, get_cal_mask, quantization_pdf, bound_tensor
from bin_cp.robust.confidence import bernstein_bound, dkw_cdf
from bin_cp.robust.confidence import clopper_pearson_lower
from bin_cp.robust.bounds import mean_bounds_l2, CDF_bounds_l2

from bin_cp.cp.core import ConformalClassifier as CP
from bin_cp.cp.scores import APSScore, TPSScore

from bin_cp.methods.robust_cp import RobustCP, VanillaSmoothCP
from bin_cp.methods.cas import CAS
from bin_cp.methods.bin import BinCP
from bin_cp.methods.binary import BinCPThresholds

from tqdm import tqdm

import logging
# Root logging goes to 'std.log'; filemode='w' truncates the file on every run.
logging.basicConfig(
    format='%(name)s - %(levelname)s - %(message)s',
    filemode='w',
    filename='std.log',
)

# Notebook-level logger: accepts records from DEBUG upwards and hands them
# on to the handlers of its ancestors (propagate=True).
logger = logging.getLogger(__name__)
logger.propagate = True
logger.setLevel(logging.DEBUG)

  from .autonotebook import tqdm as notebook_tqdm


In [50]:
#region primary configs of the experiment

# Base folder for result files (referenced by the commented-out `to_csv` call).
result_folder = "../../../output-results"

# --- identifies which stored smooth predictions are loaded ---
dataset_name = "cifar10"
n_classes = 10
n_datapoints = 2048
model_sigma = 0.25
smoothing_sigma = 0.25
n_samples = 10_000
# Number of entries kept along the last axis of the smooth-score tensor.
n_trial_samples = 10

# --- conformal evaluation settings ---
# "TPS" selects TPSScore; any other value selects APSScore.
score_method = "APS"
# Passed to `get_cal_mask` when drawing each calibration split.
calibration_budget = 0.1
# Number of random calibration/evaluation splits per setting.
n_iterations = 100
# Passed as `confidence_level` to the robust CP methods.
confidence = 0.99

#endregion

In [51]:
# Nominal coverage levels and perturbation radii swept by the experiments.
coverage_range = [0.9, 0.95]
r_range = [0.25]



#region loading smooth logit predictions
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

smooth_prediction = load_smooth_prediction(dataset_name=dataset_name,
    model_sigma=model_sigma,
    n_datapoints=n_datapoints,
    smoothing_sigma=smoothing_sigma,
    n_samples=n_samples)
# NOTE(review): this overrides the config-cell value and yields None for any
# dataset other than cifar10 -- extend this mapping before switching datasets.
n_classes = 10 if dataset_name == "cifar10" else None
#endregion

#region defining basic setup for conformal evaluation

score_pipeline = [
    TPSScore(softmax=True) if score_method == "TPS" else APSScore(softmax=True)] # defining the score function
cp = CP(score_pipeline=score_pipeline, coverage_guarantee=0.9) # the guarantee can vary later by cp.coverage_guarantee
smooth_scores = get_smooth_scores(smooth_prediction.logits, cp, mean=False)
# Keep only the first `n_trial_samples` entries along the last (sample) axis.
smooth_scores = smooth_scores[:, :, :n_trial_samples]
# One-hot mask of the true labels. Its width is taken from the class axis of
# the score tensor (dim 1) instead of a hard-coded 10, so it always matches
# the width of the prediction sets it is compared against.
y_true_mask = F.one_hot(smooth_prediction.y_true, num_classes=smooth_scores.shape[1]).bool().to(device)
# Per-datapoint, per-class score averaged over the sample axis.
mean_scores = smooth_scores.mean(dim=-1)
#endregion
print(f"Loading {dataset_name} dataset with {n_datapoints} datapoints and {n_trial_samples} samples: Score method: {score_method}")

# Draw one calibration split up front only to learn the calibration-set size;
# reuses `mean_scores` rather than recomputing the same mean.
cal_mask = get_cal_mask(mean_scores, calibration_budget)
n_dcal = cal_mask.sum().item()

# Vanilla (non-robust) smooth CP baseline; the nominal coverage is reset per experiment.
vanilla_cp = VanillaSmoothCP(nominal_coverage=0.9)
vanilla_results = []

vanilla_cp.pre_compute(smooth_scores, smooth_prediction.y_true)

# The per-datapoint mean score does not change between iterations,
# so compute it once instead of once per split.
split_scores = smooth_scores.mean(dim=-1)

# The vanilla baseline is recorded with radius 0.
r = 0

for coverage_guarantee in coverage_range:
    vanilla_cp.set_nominal_coverage(coverage_guarantee)

    for iter_i in range(n_iterations):
        # Fresh calibration/evaluation split for every iteration.
        cal_mask = get_cal_mask(split_scores, calibration_budget)
        eval_mask = ~cal_mask
        threshold = vanilla_cp.pre_compute_calibrate(cal_mask)
        pred_set = vanilla_cp.pre_compute_predict(eval_mask)

        empirical_coverage = vanilla_cp.internal_cp.coverage(pred_set, y_true_mask[eval_mask])
        average_set_size = pred_set.sum(dim=1).float().mean().item()

        vanilla_results.append({
            "method": "vanilla",
            "iteration": iter_i,
            "coverage_guarantee": coverage_guarantee,
            "r": r,
            "smoothing_sigma": smoothing_sigma,
            "model_sigma": model_sigma,
            "threshold": threshold,
            "empirical_coverage": empirical_coverage,
            "average_set_size": average_set_size,
            "score_method": score_method,
            "dataset_name": dataset_name,
            "calibration_budget": calibration_budget,
        })

vanilla_results = pd.DataFrame(vanilla_results)
# vanilla_results.to_csv(f"{result_folder}/vanilla_results-{dataset_name}-smooth{smoothing_sigma}-model{model_sigma}-{score_method}-nsamples{n_trial_samples}.csv", index=False)
# numeric_only=True: the frame contains string columns ("method", "score_method",
# "dataset_name"); without it pandas < 2.0 emits a FutureWarning and pandas >= 2.0
# raises a TypeError.
vanilla_results[vanilla_results["coverage_guarantee"] == 0.9].mean(numeric_only=True)

Loading cifar10 dataset with 2048 datapoints and 10 samples: Score method: APS


  vanilla_results[vanilla_results["coverage_guarantee"] == 0.9].mean()


iteration             49.500000
coverage_guarantee     0.900000
r                      0.000000
smoothing_sigma        0.250000
model_sigma            0.250000
threshold              0.210366
empirical_coverage     0.901123
average_set_size       1.505206
calibration_budget     0.100000
dtype: float64

In [56]:
# Inspect the score tensor layout; the last axis was sliced to `n_trial_samples` above.
smooth_scores.shape

torch.Size([2048, 10, 10])

In [66]:
# Perturbation radius used for both robust methods in this cell.
r = 0.25


cas_results = []
bin_results = []

cas_cp = CAS(nominal_coverage=0.9, r=r, smoothing_sigma=smoothing_sigma, confidence_level=confidence, n_dcal=n_dcal, n_classes=n_classes,
                        error_correction=False)
cas_cp.pre_compute(smooth_scores, smooth_prediction.y_true)

print("CAS pre-computed")

bin_cp = BinCP(nominal_coverage=0.9, smoothing_sigma=smoothing_sigma, n_dcal=n_dcal, n_classes=n_classes,
                    r=r, confidence_level=confidence,
                    error_correction=False,
                    p_base=0.5)

# bin_cp.pre_compute(smooth_scores, smooth_prediction.y_true)
# The pre-compute step above is disabled (BinCP is calibrated from raw scores
# inside the loop), so the message no longer claims that it ran.
print("bin initialized (pre-compute skipped)")

# TODO: loop over `coverage_range` / `r_range` here instead of a single setting.
coverage_guarantee = 0.9

print(f"Running for r={r}, coverage={coverage_guarantee}")
cas_cp.set_nominal_coverage(coverage_guarantee)
bin_cp.set_nominal_coverage(coverage_guarantee)


def _result_row(method, iteration, threshold, empirical_coverage, average_set_size):
    """Build one result record for `method`.

    The key order is kept identical for both methods because it determines
    the column order of the resulting DataFrames. Experiment-wide settings
    are read from the notebook globals at call time.
    """
    return {
        "method": method,
        "coverage_guarantee": coverage_guarantee,
        "iteration": iteration,
        "r": r,
        "smoothing_sigma": smoothing_sigma,
        "model_sigma": model_sigma,
        "threshold": threshold,
        "empirical_coverage": empirical_coverage,
        "average_set_size": average_set_size,
        "score_method": score_method,
        "confidence_level": confidence,
        "dataset_name": dataset_name,
        "calibration_budget": calibration_budget,
    }


# The per-datapoint mean score is loop-invariant; compute it once.
split_scores = smooth_scores.mean(dim=-1)

# Use the configured number of splits rather than a hard-coded 100.
for iter_i in tqdm(range(n_iterations)):
    # Fresh calibration/evaluation split, shared by both methods in this iteration.
    cal_mask = get_cal_mask(split_scores, calibration_budget)
    eval_mask = ~cal_mask

    # evaluating cas (uses the tensors cached by `pre_compute`)
    threshold_cas = cas_cp.pre_compute_calibrate(cal_mask)
    pred_set_cas = cas_cp.pre_compute_predict(eval_mask)

    empirical_coverage_cas = cas_cp.internal_cp.coverage(pred_set_cas, y_true_mask[eval_mask])
    average_set_size_cas = pred_set_cas.sum(dim=1).float().mean().item()

    cas_results.append(
        _result_row("cas", iter_i, threshold_cas, empirical_coverage_cas, average_set_size_cas))

    # evaluating bin (calibrated and evaluated directly from the raw score tensors)
    threshold_bin = bin_cp.calibrate_from_scores(smooth_scores[cal_mask], smooth_prediction.y_true[cal_mask])
    pred_set_bin = bin_cp.predict_from_scores(smooth_scores[eval_mask])

    empirical_coverage_bin = bin_cp.internal_cp.coverage(pred_set_bin, y_true_mask[eval_mask])
    average_set_size_bin = pred_set_bin.sum(dim=1).float().mean().item()

    bin_results.append(
        _result_row("bin", iter_i, threshold_bin, empirical_coverage_bin, average_set_size_bin))

cas_results = pd.DataFrame(cas_results)
bin_results = pd.DataFrame(bin_results)

CAS pre-computed
bin pre-computed
Running for r=0.25, coverage=0.9


100%|██████████| 100/100 [00:00<00:00, 472.04it/s]


In [67]:
# Average over iterations; numeric_only=True skips the string columns
# ("method", "score_method", "dataset_name"), which otherwise trigger a
# FutureWarning on pandas < 2.0 and a TypeError on pandas >= 2.0.
cas_results.mean(numeric_only=True)

  cas_results.mean()


coverage_guarantee     0.900000
iteration             49.500000
r                      0.250000
smoothing_sigma        0.250000
model_sigma            0.250000
threshold              0.046067
empirical_coverage     0.969740
average_set_size       2.546291
confidence_level       0.990000
calibration_budget     0.100000
dtype: float64

In [68]:
# Average over iterations; numeric_only=True skips the string columns
# ("method", "score_method", "dataset_name"), which otherwise trigger a
# FutureWarning on pandas < 2.0 and a TypeError on pandas >= 2.0.
bin_results.mean(numeric_only=True)

  bin_results.mean()


coverage_guarantee     0.900000
iteration             49.500000
r                      0.250000
smoothing_sigma        0.250000
model_sigma            0.250000
threshold              0.120033
empirical_coverage     0.961057
average_set_size       2.302207
confidence_level       0.990000
calibration_budget     0.100000
dtype: float64