# DETAILED TUTORIAL: USING DATA SUITE ON SYNTHETIC DATA

## Imports

In [1]:
import os
import sys

# Make the parent of the current working directory importable so that the
# `src.*` packages used below can be resolved from inside this notebook.
module_path = os.path.abspath("..")
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import wandb
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

from src.data.data_loader import load_adult_data
from src.models.benchmarks import comparison_methods
from src.models.conformal import conformal_class
from src.models.representation import compute_representation
from src.utils.data_utils import (
    covariance_comparison,
    get_suspect_features,
    read_from_file,
    write_to_file,
)
from src.utils.helpers import inlier_outlier_dicts, sort_ci_vals
from src.utils.data_utils import (
    covariance_comparison,
    get_suspect_features,
)


# Notebook-level placeholders: an (initially empty) registry of model ids and
# the name of the artifacts folder. Neither is referenced again in the part of
# the notebook visible here.
artifact_path = "artifacts"
model_ids = dict()



In [3]:
import logging
# Lower the root logger's threshold to INFO so that the INFO-level progress
# messages emitted by the steps below are not filtered out.
logger = logging.getLogger()
logger.setLevel(level=logging.INFO)

## Generate synthetic data

In [4]:
# Candidate values for each knob of the synthetic-data experiment.
props = [0.1, 0.25, 0.5, 0.75]
dists = ["normal", "beta", "gamma", "weibull"]
noise_vars = [1, 2, 5, 10]
copula_count_samples = [1000]

# This tutorial runs one configuration: the first entry of every grid above.
# `copula_n_samples` is taken from its grid as well (it was a separate
# hard-coded 1000), so all four selections follow the same pattern.
prop, dist, noise_variance = props[0], dists[0], noise_vars[0]
copula_n_samples = copula_count_samples[0]


# Size of the generated dataset and the representation used in STEP 2.
n_synthetic = 1000
train_prop = 1  # not referenced again in the visible part of the notebook
rep_type = "pca"

# Accumulates every metric computed in the notebook (displayed at the end).
wandb_dict = {}


from src.data.data_loader import load_synthetic_data

# Generate the synthetic dataset for the configuration chosen above. The loader
# returns six objects; the notebook later uses `train`, `test`, `orig_test`
# and `noise_bool` (exact semantics of each are defined by
# `load_synthetic_data` -- see src/data/data_loader).
synthetic_splits = load_synthetic_data(
    n_synthetic=n_synthetic,
    mean=0,
    noise_variance=noise_variance,
    dim="small",
    prop=prop,
    dist=dist,
)
train, test, orig_test, noise_bool, noise_matrix, noise_idx = synthetic_splits


# Every column of the training data is treated as a suspect feature.
suspect_features = [*range(train.shape[1])]




## STEP 1: COPULA - We fit a copula to the dataset and sample from it. This step is optional; its benefit is that the following steps can work from synthetic (copula-sampled) data instead of requiring the real data

In [5]:
from src.models.copula import fit_sample_copula

# Fit a vine copula on the training data and draw `copula_n_samples`
# synthetic rows from it.
copula_samples = fit_sample_copula(
    copula="vine",
    copula_n_samples=copula_n_samples,
    clean_corpus=train,
)

INFO:root:Vine...
INFO:copulas.multivariate.vine:Fitting VineCopula("direct")
INFO:root:Copula Samples = 1000


## STEP 2: REPRESENTER - learns a low-dimensional representation of the data. By default the representation dimension is half the number of features (rounded up), but it can be adjusted as a hyperparameter

In [6]:
from src.models.representation import compute_representation

# Representation size: half the number of columns, rounded up
# (integer ceiling division).
rep_dim = (train.shape[1] + 1) // 2

# Project the train, test and copula-sampled data into the same
# `rep_dim`-dimensional representation space.
pcs_train, pcs_test, pcs_copula = compute_representation(
    train,
    test,
    copula_samples,
    rep_type=rep_type,
    n_components=rep_dim,
)


## STEP 3: CONFORMAL PREDICTOR - a feature-wise conformal predictor is fit, and the reconstruction of each feature is assessed

In [7]:
from src.models.conformal import conformal_class

# One conformal predictor per suspect feature: each is fit on the copula
# samples (representation -> feature value) and then applied to the test set.
# `conformal_dict` maps feature index -> the object returned by `predict`.
conformal_dict = {}

# Input size of every predictor is the representation dimension; it does not
# change between features, so compute it once instead of on every iteration.
dim = pcs_copula.shape[1]

for feat in map(int, suspect_features):
    # Log before the work starts so the message reflects what is running.
    logging.info(f"Running analysis for feature = {feat}")

    conf = conformal_class(conformity_score="sign", input_dim=dim)
    conf.fit(x_train=pcs_copula, y_train=copula_samples[:, feat])
    conformal_dict[feat] = conf.predict(
        x_test=pcs_test, y_test=test[:, feat]
    )


INFO:root:Running analysis for feature = 0
INFO:root:Running analysis for feature = 1
INFO:root:Running analysis for feature = 2


## PROCESS CONFORMAL INTERVALS - we split the test samples into inliers and outliers, then sort the inliers by the size of their conformal intervals

In [8]:
from src.utils.helpers import *

# Passed to `sort_cis_synth` below; presumably the fraction of inliers assigned
# to the small-/large-interval groups -- confirm against src/utils/helpers.
proportion = 0.4

# Per-feature inlier / outlier ids derived from the conformal predictions.
inliers_dict, outliers_dict = inlier_outlier_dicts(conformal_dict, suspect_features)

# Sort the inliers of feature 0 and pick out the small-CI and large-CI ids.
small_ci_ids, large_ci_ids, df_sorted = sort_cis_synth(
    conformal_dict,
    inliers_dict,
    proportion=proportion,
    suspect_features=[0],
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_inlier[f"{feature}_contrib"] = df_inlier["norm_interval"]


## EXAMPLE: TRAIN A DOWNSTREAM REGRESSION MODEL & SHOW RESULTS ON THE DIFFERENT TYPES OF SAMPLES

In [9]:
from src.models.benchmarks import comparison_methods

# Inlier / outlier ids of the test set for feature 0.
inlier_ids = inliers_dict[0]
outlier_ids = outliers_dict[0]


#####################################
#
# FIT A DOWNSTREAM MODEL ON TRAINING DATA
# MAKE PREDICTIONS ON TEST DATA
#
#####################################

# Linear regression: every column except the last is an input,
# the last column is the regression target.
regr = linear_model.LinearRegression()
regr.fit(train[:, 0:-1], train[:, -1])


#####################################
#
# ASSESS MSE ON THE DIFFERENT TYPE OF SAMPLES IDENTIFIED
#
#####################################

SEPARATOR = "-------------------------------"

# One row per evaluation, replacing six copy-pasted predict/MSE/print/log
# stanzas:
#   (data, row selector, message template, wandb_dict key, print separator after?)
# `slice(None)` selects every row. The message templates reproduce the text
# printed by the previous version of this cell exactly.
mse_evaluations = [
    (train, slice(None), "MSE Train data: {} \n", "mse_train_clean", True),
    (
        test,
        slice(None),
        "MSE Test (ALL SAMPLES - INLIERS+OUTLIERS): {} \n",
        "mse_test_unknown",
        True,
    ),
    (test, outlier_ids, "MSE Outliers: {} \n", "mse_test_outliers", True),
    (test, inlier_ids, "MSE Inliers: {} \n", "mse_test_inliers", False),
    (
        test,
        small_ci_ids,
        "MSE Inliers w/ SMALL CIs: {}\n ",
        "mse_test_inliers_small_ci",
        False,
    ),
    (
        test,
        large_ci_ids,
        "MSE Inliers w/  LARGE CIs: {}\n",
        "mse_test_inliers_large_ci",
        False,
    ),
]

for eval_data, eval_rows, eval_message, eval_key, eval_separator in mse_evaluations:
    # Predict on the selected rows and score against the last (target) column.
    y_pred = regr.predict(eval_data[eval_rows, 0:-1])
    mse = mean_squared_error(eval_data[eval_rows, -1], y_pred)
    print(eval_message.format(mse))
    wandb_dict[eval_key] = mse
    if eval_separator:
        print(SEPARATOR)

MSE Train data: 0.06611016393596339 

-------------------------------
MSE Test (ALL SAMPLES - INLIERS+OUTLIERS): 0.10925554872133665 

-------------------------------
MSE Outliers: 0.16960072149325237 

-------------------------------
MSE Inliers: 0.10163516854194797 

MSE Inliers w/ SMALL CIs: 0.07927355182798156
 
MSE Inliers w/  LARGE CIs: 0.11329596405589415



### Note the difference in MSE between samples with small CIs and samples with large CIs - this indicates that we can trust predictions on samples with small CIs more

## EXAMPLE: Compute performance metrics

In [10]:
from src.utils.uncertainty_metrics import *
from copy import deepcopy

# Rows to evaluate: the test samples flagged as inliers for feature 0.
# (Use `range(test.shape[0])` here instead to evaluate on every test row.)
ids = inlier_ids

y_test_ids = noise_bool

x_train_uncert, y_train_uncert = train[:, 1:], train[:, 0]
x_test_uncert = test[:, 1:]
feature = 0

# Take an explicit copy of the selected rows. Adding the "pred" column to a
# bare `.iloc` slice is what raised pandas' SettingWithCopyWarning; with
# `.copy()` the new column is written to an independent frame and the frame
# stored in `conformal_dict` is left untouched.
df_conformal = conformal_dict[feature].iloc[ids, :].copy()

# Point prediction = lower bound + half the interval size (the interval
# midpoint, assuming conf_interval == max - min -- TODO confirm).
df_conformal["pred"] = df_conformal["min"] + (df_conformal["conf_interval"] / 2)

preds = df_conformal["pred"]  # target predictions
# reference values for feature 0, read from `orig_test`
# (presumably the test set before noise was injected -- confirm against
# load_synthetic_data)
true = orig_test[ids, 0]
# lower bound of the prediction interval
lb = df_conformal["min"]
# upper bound of the prediction interval
ub = df_conformal["max"]

print("COMPUTING PERFORMANCE METRICS")

(
    uncert_metrics,
    excess,
    deficet,
    excess_all,
    deficet_all,
) = compute_uncertainty_metrics(
    preds=preds, lower_bound=lb, upper_bound=ub, true=true
)

# Index labels of the evaluated rows ordered by increasing interval size.
# NOTE(review): these are index labels of `df_conformal`, while the label array
# passed alongside them is subset positionally by `ids` -- confirm this is the
# pairing `test_ood` expects.
idx_ordered = list(df_conformal.sort_values(by="conf_interval").index)
results, roc = test_ood(np.array(y_test_ids)[ids], idx_ordered)

# Merge all metrics into the running results dict under the
# "conformal_copula" name.
wandb_dict = process_results(
    wandb_dict,
    results,
    roc,
    uncert_metrics,
    excess,
    deficet,
    excess_all,
    deficet_all,
    name="conformal_copula",
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


COMPUTING PERFORMANCE METRICS
KNNs ROC:0.4772, precision @ rank n:0.9483
KNNs ROC:0.4772, precision @ rank n:0.9483
KNNs ROC:0.4806, precision @ rank n:0.9483
KNNs ROC:0.4806, precision @ rank n:0.9483
KNNs ROC:0.4836, precision @ rank n:0.9483
KNNs ROC:0.4864, precision @ rank n:0.9483
KNNs ROC:0.4864, precision @ rank n:0.9483
KNNs ROC:0.489, precision @ rank n:0.9483
KNNs ROC:0.489, precision @ rank n:0.9483
KNNs ROC:0.4914, precision @ rank n:0.9483
KNNs ROC:0.4914, precision @ rank n:0.9483
KNNs ROC:0.4936, precision @ rank n:0.9483
KNNs ROC:0.4957, precision @ rank n:0.9483
KNNs ROC:0.4957, precision @ rank n:0.9483
KNNs ROC:0.4976, precision @ rank n:0.9483
KNNs ROC:0.4976, precision @ rank n:0.9483
KNNs ROC:0.4995, precision @ rank n:0.9483
KNNs ROC:0.5012, precision @ rank n:0.9483
KNNs ROC:0.5012, precision @ rank n:0.9483
KNNs ROC:0.5028, precision @ rank n:0.9483
KNNs ROC:0.5028, precision @ rank n:0.9483
KNNs ROC:0.5043, precision @ rank n:0.9483
KNNs ROC:0.5057, precision

## This dict could be logged (e.g. to Weights & Biases) - note that PICP (Prediction Interval Coverage Probability) = Coverage

In [11]:
# Bare last expression: display every metric accumulated in `wandb_dict` so far.
wandb_dict

{'mse_train_clean': 0.06611016393596339,
 'mse_test_unknown': 0.10925554872133665,
 'mse_test_outliers': 0.16960072149325237,
 'mse_test_inliers': 0.10163516854194797,
 'mse_test_inliers_small_ci': 0.07927355182798156,
 'mse_test_inliers_large_ci': 0.11329596405589415,
 'excess_conformal_copula': 1.7270678235888246,
 'deficet_conformal_copula': 0.11122050922588227,
 'excess_all_conformal_copula': 1.6975956764286058,
 'deficet_all_conformal_copula': 0.0018979609082915062,
 'roc_conformal_copula': 0.4772,
 'rmse_conformal_copula': 0.5246920438634434,
 'nll_conformal_copula': 0.9067576930600207,
 'auucc_gain_conformal_copula': -0.33335677581719003,
 'picp_conformal_copula': 0.9829351535836177,
 'mpiw_conformal_copula': 4.190101260962822,
 'r2_conformal_copula': 0.9165079038050987,
 'TPR_conformal_copula': 0.9481123944147561,
 'FPR_conformal_copula': 0.9658907975227742,
 'TNR_conformal_copula': 0.03409026344198027,
 'FNR_conformal_copula': 0.05171522151353215,
 'Recall_conformal_copula': 0