# OOD Detection Pipeline

## 1. Load data to obtain hidden representations.

### Datasets

In [None]:
from constants import JORDAN_DATASET_FILEPATH, MAESTRO_DATASET_FILEPATH
from data.jordan_dataset import JordanDataset
from data.maestro_dataset import MaestroDataset


def _load_split(dataset_cls, data_dir, split, source, kind):
    """Instantiate one dataset split and report how many examples it holds.

    Args:
        dataset_cls: Dataset class to construct; it is called with the
            keyword arguments ``data_dir`` and ``split``.
        data_dir: Value forwarded as ``data_dir``.
        split: Value forwarded as ``split``.
        source: Label used in the progress messages,
            e.g. ``"Jordan dataset - train"``.
        kind: Word describing the examples in the summary message,
            e.g. ``"training"`` or ``"testing"``.

    Returns:
        The constructed dataset object.
    """
    print(f"Loading {source}")
    dataset = dataset_cls(data_dir=data_dir, split=split)
    print(f"Obtained {len(dataset)} {kind} examples from {source}")
    return dataset


# In-distribution (ID) training data.
id_train_dataset = _load_split(
    JordanDataset, JORDAN_DATASET_FILEPATH, "train",
    source="Jordan dataset - train", kind="training",
)

# In-distribution held-out data. Note that the ID "test" set is read from the
# dataset's "validation" split.
id_test_dataset = _load_split(
    JordanDataset, JORDAN_DATASET_FILEPATH, "validation",
    source="Jordan dataset - test", kind="testing",
)

# Out-of-distribution (OOD) data.
ood_dataset = _load_split(
    MaestroDataset, MAESTRO_DATASET_FILEPATH, "test",
    source="Maestro dataset - test", kind="testing",
)

  from .autonotebook import tqdm as notebook_tqdm


Loading Jordan dataset - train
Loading train split from /scratch/joel/jordan_dataset...
Loaded 4060 samples from train split
Sample keys: ['input_ids', 'labels']
Skipped 490 bad samples
Obtained 4060 training examples from Jordan dataset - train
Loading Jordan dataset - test
Loading validation split from /scratch/joel/jordan_dataset...
Loaded 84 samples from validation split
Sample keys: ['input_ids', 'labels']
Skipped 0 bad samples
Obtained 84 testing examples from Jordan dataset - test
Loading Maestro dataset - test
path /scratch/joel/maestrodata/test.txt
Obtained 962 testing examples from Maestro dataset - test


### Model

In [2]:
from constants import DEVICE, JORDAN_MODEL_NAME
import torch
from transformers import AutoModelForCausalLM


# Load the pretrained causal language model with float32 weights, then move it
# to the configured device in a separate step.
model = AutoModelForCausalLM.from_pretrained(JORDAN_MODEL_NAME, dtype=torch.float32)
model = model.to(DEVICE)

### Extract layers

In [3]:
from extract_layers.pooling_functions import pool_mean_std

# Settings used when extracting hidden representations.
pooling_function = pool_mean_std
batch_size = 8
n_layers = 24

# Layer indices 0..n_layers inclusive (n_layers + 1 entries in total).
# NOTE(review): the extra index presumably covers the embedding output - confirm.
layers_to_extract = [*range(n_layers + 1)]

### Extract representations: run this only once

In [4]:
# One-time extraction of pooled hidden representations to disk.
#
# The saved files are what the later "Retrieve extracted data from files" cell
# reads, so this only needs to run when those files do not exist yet (or must
# be regenerated). Flip the flag to True to run it; leaving it False keeps this
# cell a no-op on "Run All".
RUN_EXTRACTION = False

if RUN_EXTRACTION:
    from constants import SCRATCH_FILEPATH
    from torch.utils.data import DataLoader
    from utils.data_loading import collate_fn
    from extract_layers.extract_layers_main import extract_representations

    datasets_to_extract = [
        ("id_train_dataset", id_train_dataset),
        ("id_test_dataset", id_test_dataset),
        ("ood_dataset", ood_dataset),
    ]

    for dataset_name, dataset in datasets_to_extract:
        print(f"Creating {dataset_name} dataloader")
        # shuffle=False: this is inference-only extraction, so shuffling brings
        # no benefit and would make the batch order differ from run to run.
        # NOTE(review): files extracted earlier with shuffle=True may have a
        # different row order than a fresh extraction.
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=collate_fn,
        )
        print(f"Extracting representations from {dataset_name} dataloader")
        save_dir = f"{SCRATCH_FILEPATH}/{dataset_name}"
        extract_representations(
            model,
            dataloader,
            pooling_function=pooling_function,
            save_dir=save_dir,
            layers=layers_to_extract,
        )


## 2. OOD Detector

Refer to [OOD detection docs](../../docs/ood_detection.md) for more information about this.

In [5]:
from main.transformations import Transformations
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from main.scoring_functions import mahalanobis_distance

# Fixed seed for PCA. With this many input features scikit-learn's default
# svd_solver="auto" can select the randomized solver, whose result depends on
# random_state; seeding it keeps the projection reproducible across runs.
PCA_RANDOM_STATE = 0

# Transformations applied to the saved representations before scoring:
# reduce to 10 principal components, then standardize each component.
transformations = Transformations(
    [
        PCA(n_components=10, random_state=PCA_RANDOM_STATE),
        StandardScaler(),
    ]
)  # this is IN ADDITION to the pooling done during layer extraction

# Function used to score samples for OOD detection.
scoring_function = mahalanobis_distance

### Retrieve extracted data from files

In [11]:
from constants import SCRATCH_FILEPATH
import numpy as np

# Which saved layer to analyse and how many held-out rows to keep per set.
layer_idx = 1
num_test_samples = 50
first_rows = slice(None, num_test_samples)

# The ID training matrix is loaded in full; the ID test and OOD matrices are
# truncated to their first `num_test_samples` rows.
id_train_data = np.load(f"{SCRATCH_FILEPATH}/id_train_dataset/layer_{layer_idx}.npy")
id_test_data = np.load(f"{SCRATCH_FILEPATH}/id_test_dataset/layer_{layer_idx}.npy")[first_rows, :]
ood_data = np.load(f"{SCRATCH_FILEPATH}/ood_dataset/layer_{layer_idx}.npy")[first_rows, :]

# Report the shapes that were loaded.
print("Obtained hidden layer activations from files")
print("ID train data shape:", id_train_data.shape)
print("ID test data shape:", id_test_data.shape)
print("OOD data shape:", ood_data.shape)


Obtained hidden layer activations from files
ID train data shape: (4060, 2048)
ID test data shape: (50, 2048)
OOD data shape: (50, 2048)


### Create OOD detector

The detector is built from the transformations and the scoring function defined above, together with the ID training data.

In [12]:
from main.ood_detector import OODDetector

# Gather the detector's inputs: the transformation pipeline, the scoring
# function, and the in-distribution training representations.
detector_config = {
    "embedding_function": transformations,
    "scoring_function": scoring_function,
    "id_train_data": id_train_data,
}
ood_detector = OODDetector(**detector_config)

We can evaluate the detector!

`threshold = 0.7` with `threshold_type = "percentile"` means that the score threshold is set to the 70th percentile of all scores (ID test and OOD test combined).

In [13]:
# Threshold settings for this evaluation run.
eval_threshold = 0.7
eval_threshold_type = "percentile"

# evaluate() returns three values: confusion matrix, TPR and FPR.
evaluation = ood_detector.evaluate(
    id_test_data,
    ood_data,
    threshold=eval_threshold,
    threshold_type=eval_threshold_type,
)
confusion_matrix, true_positive_rate, false_positive_rate = evaluation

print("Confusion matrix:", confusion_matrix)
print("True positive rate:", true_positive_rate)
print("False positive rate:", false_positive_rate)


Confusion matrix: tensor([[19, 31],
        [11, 39]])
True positive rate: tensor(0.3800)
False positive rate: tensor(0.2200)


We can plot the ROC curve and get AUROC!

The ROC plot will be saved as `main/auroc.png`.

In [14]:
from eval.auroc import get_auroc

# Compute the AUROC of the detector on the held-out ID and OOD representations.
auroc = get_auroc(ood_detector, id_test_data, ood_data)

  all_scores, torch.tensor(threshold, dtype=all_scores.dtype)
  all_scores, torch.tensor(threshold, dtype=all_scores.dtype)
  all_scores, torch.tensor(threshold, dtype=all_scores.dtype)
  all_scores, torch.tensor(threshold, dtype=all_scores.dtype)
  all_scores, torch.tensor(threshold, dtype=all_scores.dtype)
  all_scores, torch.tensor(threshold, dtype=all_scores.dtype)
  all_scores, torch.tensor(threshold, dtype=all_scores.dtype)
  all_scores, torch.tensor(threshold, dtype=all_scores.dtype)
  all_scores, torch.tensor(threshold, dtype=all_scores.dtype)
  all_scores, torch.tensor(threshold, dtype=all_scores.dtype)
  all_scores, torch.tensor(threshold, dtype=all_scores.dtype)
  all_scores, torch.tensor(threshold, dtype=all_scores.dtype)
  all_scores, torch.tensor(threshold, dtype=all_scores.dtype)
  all_scores, torch.tensor(threshold, dtype=all_scores.dtype)
  all_scores, torch.tensor(threshold, dtype=all_scores.dtype)
  all_scores, torch.tensor(threshold, dtype=all_scores.dtype)
  all_sc

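**FIXME:** Something looks wrong here and needs to be debugged: the cell above prints the same message many times, and the AUROC reported below (about 0.51) is barely above chance.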

In [15]:
# Show the AUROC value returned by get_auroc; an AUROC of 0.5 corresponds to
# chance-level separation between ID and OOD samples.
print("AUROC:", auroc)



AUROC: 0.5143999457359314
