### Importing libraries

In [None]:
import os
import sys

# Shared state used by later cells: ranked sample ids per method, and the
# directory prefix under which result files are written.
artifact_path = "artifacts"
model_ids = dict()

# Put the parent of the current working directory on the import path so the
# `src.*` imports in the following cells resolve.
module_path = os.path.abspath("..")
if module_path not in sys.path:
    sys.path.append(module_path)


In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from src.data.data_loader import load_electric
from src.models.benchmarks import comparison_methods
from src.models.conformal import conformal_class
from src.models.copula import fit_sample_copula
from src.models.representation import compute_representation
from src.utils.data_utils import (
    covariance_comparison,
    get_suspect_features,
    write_to_file,
)
from src.utils.helpers import inlier_outlier_dicts, sort_ci_vals


In [None]:
import logging

# Switch for the copula step: True fits a copula and uses its samples,
# False uses the training data as-is.
data_augment = False

# Lower the root logger threshold to INFO so the progress messages logged in
# later cells are emitted.
logger = logging.getLogger()
logger.setLevel(level=logging.INFO)


## Load the data

In [None]:
# Load the electric dataset as a train/test split of features (X) and labels (y).
logging.info("Loading data...")
X_train, X_test, y_train, y_test = load_electric()


## Fit baseline downstream model

In [None]:
# Fit the downstream classifier whose test accuracy is later compared between
# the "certain" and "uncertain" subsets of the test set.
logging.info("Training downstream model...")
# A fixed seed keeps the forest, and every accuracy figure derived from it,
# reproducible across notebook runs (64 matches the seed of the copula step).
clf = RandomForestClassifier(random_state=64)
clf.fit(X_train, y_train)


In [None]:
# Feature indices flagged by comparing train and test via `covariance_comparison`.
cov_suspects = covariance_comparison(
    clean_array=X_train.to_numpy(),
    noisy_array=X_test.to_numpy(),
)
# Feature indices flagged by `get_suspect_features` at alpha=0.1.
ks_suspect = get_suspect_features(
    clean_corpus=X_train.to_numpy(),
    test_dataset=X_test.to_numpy(),
    alpha=0.1,
)
# Sorted, de-duplicated union of both sets; feature 0 is always included.
suspect_features = np.union1d(np.union1d(cov_suspects, ks_suspect), [0])


## 1. Copula

In [None]:
logging.info("Running copula step...")

if not data_augment:
    # No augmentation: the raw training matrix stands in for copula samples.
    copula_samples = X_train.to_numpy()
else:
    # Augmentation: hand the training data to `fit_sample_copula` with a vine
    # copula, 10000 requested samples and a fixed seed.
    copula_samples = fit_sample_copula(
        clean_corpus=X_train,
        copula="vine",
        copula_n_samples=10000,
        columns=list(X_train.columns),
        random_seed=64,
    )


## 2. Representer

In [None]:
logging.info("Running representer...")
# Compute the 4-component PCA representation of train, test and the samples the
# conformal predictor is fitted on.
#
# With augmentation enabled, the conformal step takes its targets from
# `copula_samples`, so the matching representation must be computed from those
# same samples. The original always passed `X_train`, which left `pcs_copula`
# with rows that do not correspond to `copula_samples`. Without augmentation
# the original `X_train` argument is kept unchanged.
# NOTE(review): assumes `compute_representation` accepts whatever
# `fit_sample_copula` returns for `copula_samples` — confirm.
representation_source = copula_samples if data_augment else X_train
pcs_train, pcs_test, pcs_copula = compute_representation(
    train=X_train,
    test=X_test,
    copula_samples=representation_source,
    n_components=4,
    rep_type="pca",
)


## 3. Conformal Predictor

In [None]:
logging.info("Running conformal predictor...")

# Receive the "DS" mean/std once the evaluation cell has computed them.
means = []
stds = []

# Base model names accepted by the conformal predictor; the first one is used.
bases = ["tree", "rf", "svm", "knn"]
base = bases[0]

# The representation width does not depend on the feature, so read it once.
dim = pcs_copula.shape[1]

# One conformal predictor per suspect feature: fit it to map the representation
# to that feature's value, then predict for the test set.
conformal_dict = {}
for feat in map(int, suspect_features):
    conf = conformal_class(conformity_score="abs", input_dim=dim, base_name=base)
    conf.fit(x_train=pcs_copula, y_train=copula_samples[:, feat])
    conformal_dict[feat] = conf.predict(
        x_test=pcs_test,
        y_test=X_test.to_numpy()[:, feat],
    )
    logging.info(f"Running analysis for feature = {feat}")

inliers_dict, outliers_dict = inlier_outlier_dicts(conformal_dict, suspect_features)

# Order the inlier samples via `sort_ci_vals` (presumably by interval width —
# confirm) and record both orderings under the "DS" method.
small_ci_ids, large_ci_ids, df_out = sort_ci_vals(
    conformal_dict, inliers_dict, suspect_features, proportion=0.5
)
model_ids.update(DS_small=small_ci_ids, DS_large=large_ci_ids)


## Compute MPIs & store for later analysis

In [None]:
# Per-method summary of the certain-vs-uncertain accuracy gap.
benchmark_mean = {}
benchmark_std = {}

y_true = y_test
test_features = X_test.to_numpy()
test_labels = y_true.to_numpy()

# Classifier accuracy on the first n ids of `small_ci_ids` versus the last n ids
# of `large_ci_ids`, for n = 100, 200, ..., 900.
mean_cert = []
mean_uncert = []
for n_ids in range(100, 1000, 100):
    cert_ids = small_ci_ids[:n_ids]
    uncert_ids = large_ci_ids[-n_ids:]

    cert_pred = clf.predict(test_features[cert_ids, :])
    mean_cert.append(accuracy_score(test_labels[cert_ids], cert_pred))

    uncert_pred = clf.predict(test_features[uncert_ids, :])
    mean_uncert.append(accuracy_score(test_labels[uncert_ids], uncert_pred))

mean_cert = np.array(mean_cert)
mean_uncert = np.array(mean_uncert)

# Mean and spread of the accuracy gap across the subset sizes.
accuracy_gap = mean_cert - mean_uncert
benchmark_mean["DS"] = np.mean(accuracy_gap)
benchmark_std["DS"] = np.std(accuracy_gap)

means.append(benchmark_mean["DS"])
stds.append(benchmark_std["DS"])


In [None]:
# Display the mean certain-vs-uncertain accuracy gap recorded so far, keyed by method.
benchmark_mean


In [None]:
# Display the standard deviation of the accuracy gap recorded so far, keyed by method.
benchmark_std


## Run comparisons

In [None]:
comparison_models = ["qr", "bnn", "conformal", "mcd", "ensemble", "gp"]

# Convert once: these arrays are loop-invariant, whereas the original re-ran
# `to_numpy()` for every model, feature and subset size.
train_values = X_train.to_numpy()
test_values = X_test.to_numpy()
test_labels = y_test.to_numpy()
n_features = train_values.shape[1]
# All test row indices are passed as `inlier_ids`.
ids = range(test_values.shape[0])

# Sizes of the "most certain" / "most uncertain" subsets that are scored.
# NOTE(review): the DS evaluation cell sweeps range(100, 1000, 100) instead —
# confirm the differing sweep is intentional before comparing the numbers.
samples = np.arange(100, 5000, 100)

for model in comparison_models:
    logging.info(f"Evaluating benchmark: {model}")
    uncertainty_scores = []

    # Leave-one-feature-out: each feature in turn is the target and all other
    # features are the inputs; one uncertainty score sequence is collected per
    # feature.
    for feat in range(n_features):
        indices = [col for col in range(n_features) if col != feat]

        # Progress message with context (replaces a bare `print(feat)`).
        logging.info(f"Benchmark {model}: feature {feat}")

        uncertainty_score = comparison_methods(
            x_train=train_values[:, indices],
            y_train=train_values[:, feat],
            x_test=test_values[:, indices],
            y_test=test_values[:, feat],
            inlier_ids=ids,
            df_inlier=None,
            model_type=model,
            return_ids=False,
        )

        uncertainty_scores.append(uncertainty_score)

    # Sum the scores across features position by position, then rank the
    # positions from smallest to largest total; the ranks index test rows below.
    ordered_scores = np.argsort([sum(i) for i in zip(*uncertainty_scores)])
    model_ids[model] = ordered_scores

    model_certainty = []
    model_uncertainty = []

    for sample in samples:
        certain = ordered_scores[:sample]
        uncertain = ordered_scores[-sample:]

        y_pred = clf.predict(test_values[certain, :])
        model_certainty.append(accuracy_score(test_labels[certain], y_pred))

        y_pred = clf.predict(test_values[uncertain, :])
        model_uncertainty.append(accuracy_score(test_labels[uncertain], y_pred))

    # Accuracy gap (certain minus uncertain) summarised over all subset sizes.
    accuracy_gap = np.array(model_certainty) - np.array(model_uncertainty)
    diff_mean = np.mean(accuracy_gap)
    diff_std = np.std(accuracy_gap)

    benchmark_mean[model] = diff_mean
    benchmark_std[model] = diff_std

    print(model, " : ", diff_mean)


## Capture artifacts for later analysis

In [None]:
# Bundle the fitted classifier, the conformal outputs, the benchmark summaries
# and the data splits into a single dict for later analysis.
artifacts = {
    "clf": clf,
    "conformal_dict": conformal_dict,
    "inliers_dict": inliers_dict,
    "outliers_dict": outliers_dict,
    "benchmark_mean": benchmark_mean,
    "benchmark_std": benchmark_std,
    "small_ci_ids": small_ci_ids,
    "large_ci_ids": large_ci_ids,
    "df_out": df_out,
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
}

# Write the bundle and the per-method id rankings under `artifact_path`.
write_to_file(artifacts, f"{artifact_path}/electric_dataset_artifacts.p")
write_to_file(model_ids, f"{artifact_path}/electric_model_ids.p")


## Summary

In [None]:
# Display the mean certain-vs-uncertain accuracy gap for every evaluated method.
benchmark_mean


In [None]:
# Display the standard deviation of the accuracy gap for every evaluated method.
benchmark_std
