# Normality Tests

## View some data

In [2]:
from constants.data_constants import JORDAN_DATASET_FILEPATH, MAESTRO_DATASET_FILEPATH
from constants.real_time_constants import SLIDING_WINDOW_LEN, STRIDE
from data.jordan_dataset import JordanDataset
from data.maestro_dataset import MaestroDataset
from data.sliding_window import SlidingWindowDataset

## pure data
id_train_base_dataset = JordanDataset(
    data_dir=JORDAN_DATASET_FILEPATH,
    split="train",
    name="id_train_dataset"
)
id_test_base_dataset = JordanDataset(
    data_dir=JORDAN_DATASET_FILEPATH,
    split="validation",
    name="id_test_dataset"
)
ood_test_base_dataset = MaestroDataset(
    data_dir=MAESTRO_DATASET_FILEPATH,
    split="test",
    name="maestro_test_dataset"
)

## dataset that takes chunks of 120 tokens out of the above datasets
id_train_dataset = SlidingWindowDataset(
    base_dataset=id_train_base_dataset,
    name="id_train_dataset",
    k=SLIDING_WINDOW_LEN,
    stride=STRIDE,
)
id_test_dataset = SlidingWindowDataset(
    base_dataset=id_test_base_dataset,
    name="id_test_dataset",
    k=SLIDING_WINDOW_LEN,
    stride=STRIDE,
)   
ood_test_dataset = SlidingWindowDataset(
    base_dataset=ood_test_base_dataset,
    name="ood_test_dataset",
    k=SLIDING_WINDOW_LEN,
    stride=STRIDE,
)




  from .autonotebook import tqdm as notebook_tqdm


Sample tokens:
Detected 0 bad samples
Sample tokens:
Detected 0 bad samples


In [3]:
import random
import pandas as pd
from utils.process_tokens import get_readable_events

events_by_dataset = {}
for name, dataset in [("id_train", id_train_dataset), ("id_test", id_test_dataset), ("ood_test", ood_test_dataset)]:
    random_idx = random.randint(0, len(dataset) - 1)
    input_ids = dataset[random_idx]['input_ids']
    if hasattr(input_ids, 'tolist'):
        input_ids = input_ids.tolist()
    try:
        events = get_readable_events(input_ids)
    except Exception as e:
        print(f"Error: {e}")
        print(f"Input IDs: {input_ids}")
        continue
    events_by_dataset[name] = events

min_events = min(len(events) for events in events_by_dataset.values())
num_rows = min(min_events, 50)

display_keys = ["misc", "onset", "dur", "inst", "pitch", "antic", "vel"]
key_mapping = {
    "misc": "special_token",
    "onset": "onset",
    "dur": "duration",
    "inst": "instrument",
    "pitch": "pitch",
    "antic": "anticipated",
    "vel": "velocity"
}

columns = pd.MultiIndex.from_product([events_by_dataset.keys(), display_keys], names=['dataset', 'key'])

table_data = []
for event_idx in range(num_rows):
    row = []
    for dataset_name in events_by_dataset.keys():
        events = events_by_dataset[dataset_name]
        event = events[event_idx]
        for display_key in display_keys:
            actual_key = key_mapping[display_key]
            value = event.get(actual_key, "")
            if hasattr(value, 'item'):
                value = value.item()
            elif hasattr(value, 'tolist'):
                value = value.tolist()
            row.append(value)
    table_data.append(row)

df = pd.DataFrame(table_data, columns=columns)
df.to_csv("events.csv", index=False)
df

dataset,id_train,id_train,id_train,id_train,id_train,id_train,id_train,id_test,id_test,id_test,id_test,id_test,id_test,id_test,ood_test,ood_test,ood_test,ood_test,ood_test,ood_test,ood_test
key,misc,onset,dur,inst,pitch,antic,vel,misc,onset,dur,...,pitch,antic,vel,misc,onset,dur,inst,pitch,antic,vel
0,AR,,,,,,,AR,,,...,,,,AR,,,,,,
1,,5.23,0.15,0.0,G#4,True,,,7.97,0.23,...,C#2,False,,,15.96,0.05,0.0,G4,False,
2,,0.91,0.31,0.0,A0,False,,,8.06,0.07,...,A#4,False,,,16.28,0.16,0.0,D#3,False,
3,,5.58,0.18,0.0,G#4,True,,,8.07,0.04,...,F4,False,,,16.3,0.12,0.0,C4,False,
4,,5.59,0.1,0.0,G#1,True,,,8.07,0.05,...,C5,False,,,16.46,0.05,0.0,D#4,False,
5,,0.91,0.38,0.0,A1,False,,,8.07,0.05,...,F5,False,,,16.49,0.06,0.0,G3,False,
6,,1.53,0.03,0.0,C5,False,,,8.18,0.36,...,F#2,False,,,16.68,0.06,0.0,G4,False,
7,,5.93,0.09,0.0,B1,True,,,8.28,0.25,...,F5,False,,,16.69,0.09,0.0,C4,False,
8,,5.94,0.19,0.0,G#4,True,,,8.28,0.26,...,A#4,False,,,17.06,0.6,0.0,D#5,False,
9,,6.1,0.23,0.0,E4,True,,,8.28,0.26,...,C5,False,,,17.52,0.1,0.0,G2,False,


## Running all multivariate tests

In [4]:
from data_analysis.mardia import mardia
from data_analysis.royston import royston
from data_analysis.hz import hz
from extract_layers.pooling_functions import pool_mean_std

num_layers = 24

tests = [mardia, royston, hz]

pooling_function = pool_mean_std

## Extract layers

### Sanity check

On an actually multivariate normal distribution, all tests should return a p-value of close to 1.

In [5]:
import numpy as np

X = np.random.randn(4000, 100)
metrics = {}
for test in tests:
    metrics = metrics | test(X)

print(metrics)




  Subsampling to 1000 samples and 100 features for testing
p_value <0.001
{'skewness': np.float64(172368.188), 'kurtosis': np.float64(-0.209), 'skewness_p': 0.127, 'kurtosis_p': 0.835, 'royston_p': 0.102, 'hz_p_value': 0.001}


### Extract layers

In [6]:
from extract_layers.extract_layers_main import extract_representations, collate_fn
from constants.model_constants import JORDAN_MODEL_NAME, DEVICE
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(JORDAN_MODEL_NAME).to(DEVICE)
num_layers = 24

id_train_dataloader = DataLoader(
    id_train_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn
)

extract_representations(
    model=model,
    data=id_train_dataloader,
    pooling_function=pooling_function,
    layers=list(range(num_layers + 1)),
)

Representations already exist. Loading from disk.


Loading layers from disk:   0%|          | 0/25 [00:00<?, ?it/s]

Loading layer 0 from disk.


Loading layers from disk:   4%|▍         | 1/25 [00:00<00:06,  3.80it/s]

Loading layer 1 from disk.


Loading layers from disk:   8%|▊         | 2/25 [00:00<00:06,  3.81it/s]

Loading layer 2 from disk.


Loading layers from disk:  12%|█▏        | 3/25 [00:00<00:05,  3.76it/s]

Loading layer 3 from disk.


Loading layers from disk:  16%|█▌        | 4/25 [00:01<00:05,  3.72it/s]

Loading layer 4 from disk.


Loading layers from disk:  20%|██        | 5/25 [00:01<00:05,  3.68it/s]

Loading layer 5 from disk.


Loading layers from disk:  24%|██▍       | 6/25 [00:01<00:05,  3.70it/s]

Loading layer 6 from disk.


Loading layers from disk:  28%|██▊       | 7/25 [00:01<00:04,  3.74it/s]

Loading layer 7 from disk.


Loading layers from disk:  32%|███▏      | 8/25 [00:02<00:04,  3.73it/s]

Loading layer 8 from disk.


Loading layers from disk:  36%|███▌      | 9/25 [00:02<00:04,  3.72it/s]

Loading layer 9 from disk.


Loading layers from disk:  40%|████      | 10/25 [00:02<00:04,  3.70it/s]

Loading layer 10 from disk.


Loading layers from disk:  44%|████▍     | 11/25 [00:02<00:03,  3.70it/s]

Loading layer 11 from disk.


Loading layers from disk:  48%|████▊     | 12/25 [00:03<00:03,  3.70it/s]

Loading layer 12 from disk.


Loading layers from disk:  52%|█████▏    | 13/25 [00:03<00:03,  3.72it/s]

Loading layer 13 from disk.


Loading layers from disk:  56%|█████▌    | 14/25 [00:03<00:02,  3.69it/s]

Loading layer 14 from disk.


Loading layers from disk:  60%|██████    | 15/25 [00:04<00:02,  3.67it/s]

Loading layer 15 from disk.


Loading layers from disk:  64%|██████▍   | 16/25 [00:04<00:02,  3.66it/s]

Loading layer 16 from disk.


Loading layers from disk:  68%|██████▊   | 17/25 [00:04<00:02,  3.65it/s]

Loading layer 17 from disk.


Loading layers from disk:  72%|███████▏  | 18/25 [00:04<00:01,  3.67it/s]

Loading layer 18 from disk.


Loading layers from disk:  76%|███████▌  | 19/25 [00:05<00:01,  3.65it/s]

Loading layer 19 from disk.


Loading layers from disk:  80%|████████  | 20/25 [00:05<00:01,  3.65it/s]

Loading layer 20 from disk.


Loading layers from disk:  84%|████████▍ | 21/25 [00:05<00:01,  3.05it/s]

Loading layer 21 from disk.


Loading layers from disk:  88%|████████▊ | 22/25 [00:06<00:01,  2.73it/s]

Loading layer 22 from disk.


Loading layers from disk:  92%|█████████▏| 23/25 [00:06<00:00,  2.57it/s]

Loading layer 23 from disk.


Loading layers from disk:  96%|█████████▌| 24/25 [00:07<00:00,  2.54it/s]

Loading layer 24 from disk.


Loading layers from disk: 100%|██████████| 25/25 [00:07<00:00,  3.29it/s]


{0: array([[-5.9110806e-03,  6.3979891e-03, -6.0609970e-03, ...,
          3.4708261e-02,  3.5237722e-02,  3.5380285e-02],
        [-7.3599704e-03,  8.1306314e-03, -2.4221973e-03, ...,
          3.3040121e-02,  3.6365788e-02,  3.8337175e-02],
        [-5.6966753e-03,  8.0154659e-03, -1.6049905e-03, ...,
          3.3712570e-02,  3.4965046e-02,  4.0272471e-02],
        ...,
        [ 9.2563396e-03,  1.7316611e-03, -3.5579493e-03, ...,
          3.0467726e-02,  3.5387058e-02,  3.7506565e-02],
        [ 5.6256950e-03, -6.0085522e-04,  1.4028049e-03, ...,
          3.2196917e-02,  3.4391984e-02,  3.7280243e-02],
        [ 8.1770848e-03,  8.2212966e-05,  4.8882253e-03, ...,
          3.6085494e-02,  3.2617692e-02,  3.5533369e-02]],
       shape=(89238, 2048), dtype=float32),
 1: array([[ 0.00034859,  0.07378831,  0.00954882, ...,  0.08258595,
          0.09581736,  0.135085  ],
        [ 0.01209578,  0.04203761,  0.02592461, ...,  0.08891829,
          0.08475216,  0.12993735],
        [-0.

In [7]:
import numpy as np
import pandas as pd
from constants.file_format import get_extract_layers_file_path
from tqdm import tqdm


metrics = []

for layer_idx in tqdm(range(num_layers+1), desc="Processing layers"):
    layer_dir = get_extract_layers_file_path(
        dataset_name="id_train_dataset",
        pooling_function_name=pooling_function.__name__,
        layer_idx=layer_idx,
    )
    layer_data = np.load(layer_dir)
    layer_data=layer_data[..., :10]
    print("Processing layer", layer_idx)
    print("Layer data shape:", layer_data.shape)
    layer_metrics = {}
    for test in tests:
        print("Running test", test.__name__)
        layer_metrics = layer_metrics | test(layer_data)
    metrics.append(layer_metrics)

all_metrics = pd.DataFrame(metrics)


Processing layers:   0%|          | 0/25 [00:00<?, ?it/s]

Processing layer 0
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:   4%|▍         | 1/25 [00:03<01:21,  3.38s/it]

p_value <0.001
Processing layer 1
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:   8%|▊         | 2/25 [00:06<01:16,  3.31s/it]

p_value <0.001
Processing layer 2
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:  12%|█▏        | 3/25 [00:09<01:10,  3.22s/it]

p_value <0.001
Processing layer 3
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:  16%|█▌        | 4/25 [00:12<01:07,  3.21s/it]

p_value <0.001
Processing layer 4
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:  20%|██        | 5/25 [00:17<01:10,  3.52s/it]

p_value <0.001
Processing layer 5
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:  24%|██▍       | 6/25 [00:20<01:07,  3.56s/it]

p_value <0.001
Processing layer 6
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:  28%|██▊       | 7/25 [00:24<01:03,  3.54s/it]

p_value <0.001
Processing layer 7
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:  32%|███▏      | 8/25 [00:27<00:59,  3.50s/it]

p_value <0.001
Processing layer 8
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:  36%|███▌      | 9/25 [00:31<00:56,  3.50s/it]

p_value <0.001
Processing layer 9
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:  40%|████      | 10/25 [00:34<00:52,  3.50s/it]

p_value <0.001
Processing layer 10
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:  44%|████▍     | 11/25 [00:38<00:49,  3.51s/it]

p_value <0.001
Processing layer 11
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:  48%|████▊     | 12/25 [00:41<00:45,  3.52s/it]

p_value <0.001
Processing layer 12
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing
p_value <0.001


Processing layers:  52%|█████▏    | 13/25 [00:45<00:42,  3.57s/it]

Processing layer 13
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:  56%|█████▌    | 14/25 [00:48<00:39,  3.56s/it]

p_value <0.001
Processing layer 14
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:  60%|██████    | 15/25 [00:52<00:35,  3.59s/it]

p_value <0.001
Processing layer 15
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing
p_value <0.001


Processing layers:  64%|██████▍   | 16/25 [00:56<00:32,  3.63s/it]

Processing layer 16
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:  68%|██████▊   | 17/25 [00:59<00:28,  3.60s/it]

p_value <0.001
Processing layer 17
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:  72%|███████▏  | 18/25 [01:03<00:25,  3.64s/it]

p_value <0.001
Processing layer 18
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing
p_value <0.001


Processing layers:  76%|███████▌  | 19/25 [01:06<00:21,  3.57s/it]

Processing layer 19
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:  80%|████████  | 20/25 [01:10<00:17,  3.55s/it]

p_value <0.001
Processing layer 20
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing
p_value <0.001


Processing layers:  84%|████████▍ | 21/25 [01:13<00:14,  3.56s/it]

Processing layer 21
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:  88%|████████▊ | 22/25 [01:17<00:10,  3.54s/it]

p_value <0.001
Processing layer 22
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:  92%|█████████▏| 23/25 [01:20<00:07,  3.52s/it]

p_value <0.001
Processing layer 23
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing


Processing layers:  96%|█████████▌| 24/25 [01:24<00:03,  3.51s/it]

p_value <0.001
Processing layer 24
Layer data shape: (89238, 10)
Running test mardia
  Subsampling to 5000 samples and 10 features for testing
Running test royston
  Subsampling to 1000 samples and 10 features for testing
Running test hz
  Subsampling to 5000 samples and 10 features for testing
p_value <0.001


Processing layers: 100%|██████████| 25/25 [01:28<00:00,  3.52s/it]


In [8]:
all_metrics

Unnamed: 0,skewness,kurtosis,skewness_p,kurtosis_p,royston_p,hz_p_value
0,10115.173,60.084,0.001,0.001,0.001,0.001
1,4041.508,15.285,0.001,0.001,0.001,0.001
2,4051.336,17.622,0.001,0.001,0.001,0.001
3,3591.987,20.055,0.001,0.001,0.001,0.001
4,3703.463,20.0,0.001,0.001,0.001,0.001
5,3296.042,15.194,0.001,0.001,0.001,0.001
6,4071.404,20.792,0.001,0.001,0.001,0.001
7,3998.564,18.374,0.001,0.001,0.001,0.001
8,3122.209,14.031,0.001,0.001,0.001,0.001
9,2772.687,12.497,0.001,0.001,0.001,0.001


In [9]:

## optionally save it
all_metrics.to_csv("all_metrics.csv", index=False)