# Data Analysis

## View some data

In [1]:
from constants import JORDAN_DATASET_FILEPATH, MAESTRO_DATASET_FILEPATH
from data.jordan_dataset import JordanDataset
from data.maestro_dataset import MaestroDataset

# In-distribution (ID) data: the "train" and "validation" splits of the Jordan
# dataset, built in that order.
id_train_dataset, id_test_dataset = (
    JordanDataset(data_dir=JORDAN_DATASET_FILEPATH, split=split_name)
    for split_name in ("train", "validation")
)

# Out-of-distribution (OOD) data: the "test" split of the Maestro dataset.
ood_test_dataset = MaestroDataset(data_dir=MAESTRO_DATASET_FILEPATH, split="test")

  from .autonotebook import tqdm as notebook_tqdm


Loading train split from /scratch/joel/jordan_dataset...
Loaded 4060 samples from train split
Sample keys: ['input_ids', 'labels']
Skipped 490 bad samples
Loading validation split from /scratch/joel/jordan_dataset...
Loaded 84 samples from validation split
Sample keys: ['input_ids', 'labels']
Skipped 0 bad samples
path /scratch/joel/maestrodata/test.txt


In [11]:
import random
import pandas as pd
from utils.process_tokens import get_readable_events

# Fixed seed so that "Restart & Run All" always shows the same samples.
# Change SAMPLE_SEED to inspect a different random sample from each dataset.
SAMPLE_SEED = 0
# Upper bound on the number of events (table rows) shown per dataset.
MAX_DISPLAY_ROWS = 30

# Private generator: reproducible, and leaves the global `random` state untouched.
sample_rng = random.Random(SAMPLE_SEED)


def _to_python_value(value):
    """Unwrap array/tensor-like scalars into plain Python values for display.

    Objects exposing ``.item()`` are converted with it; otherwise objects
    exposing ``.tolist()`` are converted with that; anything else is
    returned unchanged.
    """
    if hasattr(value, 'item'):
        return value.item()
    if hasattr(value, 'tolist'):
        return value.tolist()
    return value


# Decode one randomly chosen sample per dataset into readable events.
events_by_dataset = {}
for name, dataset in [("id_train", id_train_dataset), ("id_test", id_test_dataset), ("ood_test", ood_test_dataset)]:
    random_idx = sample_rng.randrange(len(dataset))
    input_ids = dataset[random_idx]['input_ids']
    if hasattr(input_ids, 'tolist'):
        input_ids = input_ids.tolist()
    events_by_dataset[name] = get_readable_events(input_ids)

# Every dataset contributes the same number of rows: the shortest sample,
# capped at MAX_DISPLAY_ROWS.
min_events = min(len(events) for events in events_by_dataset.values())
num_rows = min(min_events, MAX_DISPLAY_ROWS)

# Short column headers and the event keys they are read from.
display_keys = ["misc", "onset", "dur", "inst", "pitch", "antic"]
key_mapping = {
    "misc": "special_token",
    "onset": "onset",
    "dur": "duration",
    "inst": "instrument",
    "pitch": "pitch",
    "antic": "anticipated"
}

# Two-level header: dataset name on top, event field underneath.
columns = pd.MultiIndex.from_product(
    [list(events_by_dataset), display_keys],
    names=['dataset', 'key'],
)

# One row per event index, laid out dataset-major to match `columns`.
# Fields missing from an event are shown as an empty string.
table_data = [
    [
        _to_python_value(events[event_idx].get(key_mapping[display_key], ""))
        for events in events_by_dataset.values()
        for display_key in display_keys
    ]
    for event_idx in range(num_rows)
]

df = pd.DataFrame(table_data, columns=columns)
df

dataset,id_train,id_train,id_train,id_train,id_train,id_train,id_test,id_test,id_test,id_test,id_test,id_test,ood_test,ood_test,ood_test,ood_test,ood_test,ood_test
key,misc,onset,dur,inst,pitch,antic,misc,onset,dur,inst,pitch,antic,misc,onset,dur,inst,pitch,antic
0,AAR,,,,,,AR,,,,,,AR,,,,,
1,,0.0,0.16,0.0,B2,False,,0.0,0.12,0.0,G#3,False,,0.0,0.11,0.0,A#4,False
2,,0.06,0.24,0.0,D#4,False,,0.01,0.09,0.0,G#4,False,,0.17,0.09,0.0,D#4,False
3,,0.17,0.07,0.0,G#4,False,,0.05,0.1,0.0,C#4,False,,0.32,0.11,0.0,C#4,False
4,,0.26,0.47,0.0,D#5,False,,0.07,0.11,0.0,C#5,False,,0.32,0.48,0.0,F5,False
5,,0.36,0.05,0.0,G#4,False,,0.51,0.1,0.0,F3,False,,0.32,0.06,0.0,C#5,False
6,,0.44,0.08,0.0,D#4,False,,0.55,0.11,0.0,G#3,False,,0.46,0.17,0.0,D#4,False
7,,0.5,0.05,0.0,G#4,False,,0.56,0.14,0.0,G#4,False,,0.66,0.18,0.0,C4,False
8,,0.54,0.15,0.0,C#4,False,,0.58,0.11,0.0,C#4,False,,0.7,0.38,0.0,C5,False
9,,0.59,0.4,0.0,C#5,False,,0.6,0.14,0.0,C#5,False,,0.7,0.41,0.0,D#5,False


## Running all multivariate tests

In [3]:
from data_analysis.mardia import mardia
from data_analysis.royston import royston
from data_analysis.hz import hz
from extract_layers.pooling_functions import pool_mean_std

# Multivariate-normality tests to run; each one is called directly on a data matrix.
tests = [mardia, royston, hz]

# Pooling function whose `__name__` selects the sub-directory of cached layer
# activations that the per-layer loop loads.
pooling_function = pool_mean_std

# The per-layer loop iterates over range(num_layers + 1), i.e. layers 0..24.
num_layers = 24

## Extract layers

### Sanity check

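On data drawn from an actual multivariate normal distribution, none of the tests should reject normality: every p-value should be large (well above the usual 0.05 threshold), although not necessarily close to 1.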

In [4]:
import numpy as np

# Seeded generator so the synthetic input is identical on every run.
# NOTE(review): the tests print that they subsample internally; if they use
# their own unseeded randomness the reported values may still vary — confirm.
SANITY_SEED = 0
sanity_rng = np.random.default_rng(SANITY_SEED)

# 4000 i.i.d. draws from a 100-dimensional standard normal, i.e. data that is
# multivariate normal by construction.
X = sanity_rng.standard_normal((4000, 100))

# Each test returns a dict of named results; merge them into a single dict.
metrics = {}
for test in tests:
    metrics |= test(X)

print(metrics)




  Subsampling to 1000 samples and 100 features for testing
p_value 1.0
{'skewness': np.float64(171153.895), 'kurtosis': np.float64(-0.131), 'skewness_p': 0.824, 'kurtosis_p': 0.896, 'royston_p': 0.965, 'hz_p_value': 1.0}


In [5]:
import os
import numpy as np
import pandas as pd

from constants import SCRATCH_FILEPATH
from tqdm import tqdm


# One dict of test results per layer, in layer order.
metrics = []

for layer_idx in tqdm(range(num_layers + 1), desc="Processing layers"):
    # Despite its name, `layer_dir` is the path of the layer's .npy file.
    layer_dir = os.path.join(
        SCRATCH_FILEPATH,
        pooling_function.__name__,
        f"layer_{layer_idx}.npy",
    )
    layer_data = np.load(layer_dir)
    print("Processing layer", layer_idx)
    print("Layer data shape:", layer_data.shape)

    # Merge the result dicts of all tests into a single row for this layer.
    layer_metrics = {}
    for test in tests:
        print("Running test", test.__name__)
        layer_metrics |= test(layer_data)
    metrics.append(layer_metrics)

# Rows are layers; columns are the result keys returned by the tests.
all_metrics = pd.DataFrame(metrics)


Processing layers:   0%|          | 0/25 [00:00<?, ?it/s]

Processing layer 0
Layer data shape: (4060, 2048)
Running test mardia
  Subsampling to 4060 samples and 100 features for testing
Running test royston
  Subsampling to 1000 samples and 100 features for testing
Running test hz
  Subsampling to 4060 samples and 100 features for testing


Processing layers:   4%|▍         | 1/25 [00:02<01:03,  2.66s/it]

p_value <0.001
Processing layer 1
Layer data shape: (4060, 2048)
Running test mardia
  Subsampling to 4060 samples and 100 features for testing
Running test royston
  Subsampling to 1000 samples and 100 features for testing
Running test hz
  Subsampling to 4060 samples and 100 features for testing


Processing layers:   8%|▊         | 2/25 [00:05<00:59,  2.61s/it]

p_value <0.001
Processing layer 2
Layer data shape: (4060, 2048)
Running test mardia
  Subsampling to 4060 samples and 100 features for testing


Processing layers:   8%|▊         | 2/25 [00:05<01:00,  2.63s/it]


KeyboardInterrupt: 

In [None]:
# Show the per-layer results table. `all_metrics` only exists once the
# per-layer loop above has run to completion.
all_metrics

In [None]:

# Optionally save the per-layer results as "all_metrics.csv" (a relative path,
# so it lands in the current working directory) without the index column.
all_metrics.to_csv("all_metrics.csv", index=False)