# RUN SYNTHETIC DATA EXPERIMENTS & LOG TO WANDB

In [None]:
import os
import sys

# Resolve the parent directory to an absolute path and make it importable
# if it is not on the import path yet.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)


# wandb settings: replace the two placeholder strings before running.
os.environ.update(WANDB_API_KEY="ADD YOUR API KEY HERE")
wandb_entity = "ADD YOUR ENTITY HERE"
project_name = "data_suite_synthetic"


### IMPORTS

In [None]:
import numpy as np
import pandas as pd
from copy import deepcopy
import wandb
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

from src.data.data_loader import load_synthetic_data
from src.models.benchmarks import comparison_methods
from src.models.conformal import conformal_class
from src.models.copula import fit_sample_copula
from src.models.representation import compute_representation
from src.utils.data_utils import covariance_comparison, get_suspect_features
from src.utils.helpers import inlier_outlier_dicts, sort_cis_synth
from src.utils.uncertainty_metrics import *


In [None]:
import logging

# Grab the root logger (no name given) and lower its threshold to INFO so
# INFO-level records are no longer filtered out by the logger itself.
logger = logging.getLogger(name=None)
logger.setLevel(level=logging.INFO)


## RUN SYNTHETIC EXPERIMENT WITH DIFFERENT PARAMETERIZATIONS

In [None]:
import itertools
import traceback

# Imported once here instead of once per run inside the sweep loop.
from src.models.benchmarks import uncertainty_benchmark

# Parameter grid: every combination below is run ``n_runs`` times.
props = [0.1, 0.25, 0.5, 0.75]
dists = ["normal", "beta", "gamma", "weibull"]
noise_vars = [1, 2, 5, 10]
copula_count_samples = [1000]

n_runs = 20  # this can be reduced to 10 too

# Settings shared by every run of the sweep.
n_synthetic = 1000
train_prop = 1
rep_type = "pca"

SEPARATOR = "-------------------------------"

# One entry per call to ``comparison_methods`` in the MSE experiment, in the
# order the results are printed and stored:
#   (model_type, target layout, (small-split wandb key, message template),
#    (large-split wandb key, message template))
# The message templates are filled with ``str.format(mse=...)``.
COMPARISON_SPECS = (
    (
        "mcd",
        "last_col_2d",
        ("mse_test_mcd_small_sigma", "MSE MCD model - Small sigma: {mse}\n "),
        ("mse_test_mcd_large_sigma", "MSE MCD model - Large sigma {mse}\n"),
    ),
    (
        "mcd",
        "first_col",
        ("mse_test_mcd_data_small_sigma", "MSE MCD Data - Small Sigma: {mse}\n "),
        ("mse_test_mcd_data_large_sigma", "MSE MCD Data - Large Sigma: {mse}\n"),
    ),
    (
        "ensemble",
        "last_col_2d",
        ("mse_test_ens_small_sigma", "MSE ENS model - Small sigma: {mse}\n "),
        ("mse_test_ens_large_sigma", "MSE ENS model - Large sigma {mse}\n"),
    ),
    (
        "ensemble",
        "first_col",
        ("mse_test_ens_data_small_sigma", "MSE ENS Data - Small Sigma: {mse}\n "),
        ("mse_test_ens_data_large_sigma", "MSE ENS Data - Large Sigma: {mse}\n"),
    ),
    (
        "conformal",
        "last_col_1d",
        ("mse_test_conformal_small_ci", "MSE Conformal Model Small CIs: {mse}\n "),
        ("mse_test_conformal_large_ci", "MSE Conformal Model LARGE CIs: {mse}\n"),
    ),
    (
        "gp",
        "last_col_2d",
        ("mse_test_gp_small_sigma", "MSE GP model - Small sigma: {mse}\n "),
        ("mse_test_gp_large_sigma", "MSE GP model - Large sigma {mse}\n"),
    ),
    (
        "gp",
        "first_col",
        ("mse_test_gp_data_small_sigma", "MSE GP Data - Small Sigma: {mse}\n "),
        ("mse_test_gp_data_large_sigma", "MSE GP Data - Large Sigma: {mse}\n"),
    ),
    (
        "qr",
        "last_col_2d",
        ("mse_test_qr_small_sigma", "MSE QR model - Small sigma: {mse}\n "),
        ("mse_test_qr_large_sigma", "MSE QR model - Large sigma {mse}\n"),
    ),
    (
        "qr",
        "first_col",
        ("mse_test_qr_data_small_sigma", "MSE QR Data - Small Sigma: {mse}\n "),
        ("mse_test_qr_data_large_sigma", "MSE QR Data - Large Sigma: {mse}\n"),
    ),
    (
        "bnn",
        "last_col_2d",
        ("mse_test_bnn_small_sigma", "MSE BNN model - Small sigma: {mse}\n "),
        ("mse_test_bnn_large_sigma", "MSE BNN model - Large sigma {mse}\n"),
    ),
    (
        "bnn",
        "first_col",
        ("mse_test_bnn_data_small_sigma", "MSE BNN Data - Small Sigma: {mse}\n "),
        ("mse_test_bnn_data_large_sigma", "MSE BNN Data - Large Sigma: {mse}\n"),
    ),
    (
        "copula",
        "last_col_2d",
        ("mse_test_copula_small_sigma", "MSE COPULA - Small Sigma: {mse}\n "),
        ("mse_test_copula_large_sigma", "MSE COPULA - Large Sigma: {mse}\n"),
    ),
)

# Model types passed to ``uncertainty_benchmark``, in call order.
UNCERTAINTY_BENCHMARK_MODELS = (
    "qr",
    "bnn",
    "gp",
    "mcd",
    "ensemble",
    "conformal",
    "copula",
)


def _subset_mse(model, data, row_ids):
    """Return the mean squared error of ``model`` on ``data[row_ids]``.

    All columns except the last are fed to ``model.predict``; the last column
    is used as the ground-truth target. ``row_ids`` is anything accepted as a
    row indexer for ``data`` (use ``slice(None)`` for all rows).
    """
    y_pred = model.predict(data[row_ids, 0:-1])
    return mean_squared_error(data[row_ids, -1], y_pred)


def _comparison_split(train_data, test_data, target):
    """Return ``(x_train, y_train, x_test, y_test)`` for one target layout.

    * ``"last_col_2d"``: last column is the target, kept as a column slice.
    * ``"last_col_1d"``: last column is the target, as a 1-D slice.
    * ``"first_col"``: first column is the target, remaining columns are inputs.

    Raises:
        ValueError: if ``target`` is not one of the layouts above.
    """
    if target == "last_col_2d":
        return (
            train_data[:, 0:-1],
            train_data[:, -1:],
            test_data[:, 0:-1],
            test_data[:, -1:],
        )
    if target == "last_col_1d":
        return (
            train_data[:, 0:-1],
            train_data[:, -1],
            test_data[:, 0:-1],
            test_data[:, -1],
        )
    if target == "first_col":
        return (
            train_data[:, 1:],
            train_data[:, 0],
            test_data[:, 1:],
            test_data[:, 0],
        )
    raise ValueError(f"Unknown comparison target: {target!r}")


# Same iteration order as five nested loops: the right-most iterable varies
# fastest. ``i`` is the repetition index.
for i, prop, dist, noise_variance, copula_n_samples in itertools.product(
    range(n_runs), props, dists, noise_vars, copula_count_samples
):
    # Everything logged to wandb for this run is accumulated in this dict.
    wandb_dict = {
        "prop": prop,
        "dist": dist,
        "copula_n_samples": copula_n_samples,
        "n_synthetic": n_synthetic,
        "train_prop": train_prop,
        "noise_variance": noise_variance,
        "rep_type": rep_type,
    }

    # Stays None until wandb.init succeeds, so the cleanup below never touches
    # an unbound name or a run left over from a previous iteration.
    run = None

    try:
        logging.info(
            f"Starting experiment {prop}, {dist}, {noise_variance}, {copula_n_samples}"
        )

        run = wandb.init(
            project=project_name,
            entity=wandb_entity,
        )

        logging.info("Loading synthetic data...")
        (
            train,
            test,
            orig_test,
            noise_bool,
            noise_matrix,
            noise_idx,
        ) = load_synthetic_data(
            n_synthetic=n_synthetic,
            mean=0,
            noise_variance=noise_variance,
            dim="small",
            prop=prop,
            dist=dist,
        )

        cov_suspects = covariance_comparison(clean_array=train, noisy_array=test)
        ks_suspect = get_suspect_features(
            clean_corpus=train, test_dataset=test, alpha=0.1
        )
        suspect_features = np.unique(np.append(cov_suspects, ks_suspect))
        # Feature 0 is always included: the rest of the run reads its results.
        suspect_features = np.unique(np.append(suspect_features, [0]))

        ##############################################################
        #
        # STEP 1: COPULA
        #
        ##############################################################
        logging.info("Fitting Copula")

        copula_samples = fit_sample_copula(
            clean_corpus=train,
            copula="vine",
            copula_n_samples=copula_n_samples,
        )

        ##############################################################
        #
        # STEP 2: REPRESENTER
        #
        ##############################################################
        logging.info("Running representer...")

        # Representation size: half the number of columns, rounded up.
        rep_dim = int(np.ceil(train.shape[1] / 2))
        pcs_train, pcs_test, pcs_copula = compute_representation(
            train,
            test,
            copula_samples,
            n_components=rep_dim,
            rep_type=rep_type,
        )

        ##############################################################
        #
        # STEP 3: CONFORMAL PREDICTOR
        #
        ##############################################################
        logging.info("Running conformal predictor...")

        # One conformal predictor per suspect feature, fitted on the copula
        # samples and evaluated on the test set.
        conformal_dict = {}
        dim = pcs_copula.shape[1]
        for feat in suspect_features:
            feat = int(feat)
            conf = conformal_class(conformity_score="sign", input_dim=dim)
            conf.fit(x_train=pcs_copula, y_train=copula_samples[:, feat])
            conformal_dict[feat] = conf.predict(
                x_test=pcs_test, y_test=test[:, feat]
            )
            logging.info(f"Running analysis for feature = {feat}")

        inliers_dict, outliers_dict = inlier_outlier_dicts(
            conformal_dict, suspect_features
        )

        # All downstream analysis is done on feature 0.
        feature = 0

        small_ci_ids, large_ci_ids, df_sorted = sort_cis_synth(
            conformal_dict, inliers_dict, suspect_features=[feature]
        )

        df_conformal = conformal_dict[feature]
        inlier_ids = inliers_dict[feature]
        outlier_ids = outliers_dict[feature]
        df_inlier = df_conformal.iloc[inlier_ids, :]

        ##############################################################
        #
        # MSE EXPERIMENT - SECTION 4.2
        #
        ##############################################################

        # Downstream linear model: predict the last column from the others,
        # trained on the clean training data only.
        regr = linear_model.LinearRegression()
        regr.fit(train[:, 0:-1], train[:, -1])

        mse = _subset_mse(regr, train, slice(None))
        print(f"MSE Train (CLEAN DATA): {mse} \n")
        wandb_dict["mse_train_clean"] = mse

        print(SEPARATOR)

        mse = _subset_mse(regr, test, slice(None))
        print(f"MSE Test (UNKNOWN SAMPLES - INLIERS+OUTLIERS): {mse} \n")
        wandb_dict["mse_test_unknown"] = mse

        print(SEPARATOR)

        mse = _subset_mse(regr, test, outlier_ids)
        print(f"MSE Outliers: {mse} \n")
        wandb_dict["mse_test_outliers"] = mse

        print(SEPARATOR)

        mse = _subset_mse(regr, test, inlier_ids)
        print(f"MSE Inliers: {mse} \n")
        wandb_dict["mse_test_inliers"] = mse

        mse = _subset_mse(regr, test, small_ci_ids)
        print(f"MSE Inliers w/ SMALL CIs: {mse}\n ")
        wandb_dict["mse_test_inliers_small_ci"] = mse

        mse = _subset_mse(regr, test, large_ci_ids)
        print(f"MSE Inliers w/  LARGE CIs: {mse}\n")
        wandb_dict["mse_test_inliers_large_ci"] = mse

        # Not read again in this cell; kept as notebook-level names.
        copula_noisy_ids = large_ci_ids
        copula_non_noisy_ids = small_ci_ids

        ##############################################################
        #
        # MSE EXPERIMENT - SECTION 4.2 (COMPARISON METHODS)
        #
        ##############################################################

        logging.info("MSE EXPERIMENTS COMPARISON METHODS")
        for spec_index, (model_type, target, small_spec, large_spec) in enumerate(
            COMPARISON_SPECS
        ):
            if spec_index > 0:
                # Separator between consecutive methods (none after the last).
                print(SEPARATOR)

            x_tr, y_tr, x_te, y_te = _comparison_split(train, test, target)
            non_noisy, noisy = comparison_methods(
                x_train=x_tr,
                y_train=y_tr,
                x_test=x_te,
                y_test=y_te,
                inlier_ids=inlier_ids,
                df_inlier=df_inlier,
                model_type=model_type,
            )

            if (model_type, target) == ("bnn", "first_col"):
                # Not read again in this cell; kept as notebook-level names.
                bnn_noisy_ids = noisy
                bnn_non_noisy_ids = non_noisy

            # The downstream model is always scored on the last-column task,
            # whichever target the comparison method itself was given.
            small_key, small_msg = small_spec
            mse = _subset_mse(regr, test, non_noisy)
            print(small_msg.format(mse=mse))
            wandb_dict[small_key] = mse

            large_key, large_msg = large_spec
            mse = _subset_mse(regr, test, noisy)
            print(large_msg.format(mse=mse))
            wandb_dict[large_key] = mse

        ##############################################################
        #
        # METRICS EXPERIMENT - SECTION 4.1
        #
        ##############################################################

        ids = inlier_ids
        y_test_ids = noise_bool

        x_train_uncert, y_train_uncert = train[:, 1:], train[:, 0]
        x_test_uncert = test[:, 1:]

        # Work on an independent copy of the inlier rows so that adding the
        # "pred" column never writes into ``conformal_dict``.
        dc = df_conformal.iloc[ids, :].copy()

        # Point prediction = midpoint of the interval.
        dc["pred"] = dc["min"] + (dc["conf_interval"] / 2)

        preds = dc["pred"]  # target predictions
        true = orig_test[ids, 0]  # ground truth taken from orig_test
        lb = dc["min"]  # lower bound of the prediction interval
        ub = dc["max"]  # upper bound of the prediction interval

        (
            uncert_metrics,
            excess,
            deficet,
            excess_all,
            deficet_all,
        ) = compute_uncertainty_metrics(
            preds=preds, lower_bound=lb, upper_bound=ub, true=true
        )

        # Rows ordered by increasing interval width.
        idx_ordered = list(dc.sort_values(by="conf_interval").index)
        results, roc = test_ood(np.array(y_test_ids)[ids], idx_ordered)

        wandb_dict = process_results(
            wandb_dict,
            results,
            roc,
            uncert_metrics,
            excess,
            deficet,
            excess_all,
            deficet_all,
            name="conformal_copula",
        )

        ##############################################################
        #
        # METRICS EXPERIMENT - SECTION 4.1 (COMPARISON METHODS)
        #
        ##############################################################

        # The return value is not used; each call is handed ``wandb_dict``
        # (presumably to record its results in it - TODO confirm).
        for benchmark_model in UNCERTAINTY_BENCHMARK_MODELS:
            uncertainty_benchmark(
                x_train=x_train_uncert,
                y_train=y_train_uncert,
                x_test=x_test_uncert,
                y_test=orig_test[:, 0],
                y_test_ids=y_test_ids,
                ids=ids,
                model_type=benchmark_model,
                wandb_dict=wandb_dict,
                conformal_dict=None,
            )

    except Exception as e:
        # Best effort: report the failure and move on to the next configuration.
        print(traceback.format_exc())
        logging.info(e)

    finally:
        # Log whatever was collected (complete or partial) and close the run,
        # but only if a run was actually started for this configuration.
        if run is not None:
            try:
                wandb.log(wandb_dict)
            finally:
                run.finish()
