# Data Analysis

## Running all multivariate normality tests

In [11]:
from data_analysis.mardia import mardia
from data_analysis.royston import royston
from data_analysis.hz import hz
from extract_layers.pooling_functions import pool_mean_std

# Highest layer index to analyse: the per-layer loop iterates over
# range(num_layers + 1), i.e. layer files 0..num_layers inclusive.
num_layers = 24

# Test callables. Each one is called with a 2-D sample array and returns a
# mapping of metric names to values, which is merged with the other results.
tests = [
    mardia,
    royston,
    hz,
]

# Pooling variant under analysis; its __name__ selects the sub-directory of
# SCRATCH_FILEPATH from which the layer arrays are loaded.
pooling_function = pool_mean_std

## Extract layers

### Sanity check

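On data that actually follow a multivariate normal distribution, none of the tests should reject normality: the p-values should be large (well above the usual 0.05 significance level). Under the null hypothesis p-values are approximately uniform on [0, 1], so they need not be close to 1.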

In [None]:
import numpy as np

# Draw a standard-normal sample (4000 observations x 100 features) from a
# seeded generator so the input to the sanity check is the same after every
# kernel restart.
# NOTE(review): the tests report subsampling in their output; if they draw
# that subsample from their own random source, it is not controlled here.
rng = np.random.default_rng(seed=0)
X = rng.standard_normal((4000, 100))

# Merge every test's result mapping into one summary dict. A dedicated name is
# used so this dict is not confused with the per-layer `metrics` list that a
# later cell builds.
sanity_metrics = {}
for test in tests:
    sanity_metrics |= test(X)

print(sanity_metrics)




  Subsampling to 1000 samples and 100 features for testing
p_value 1.0
{'skewness': np.float64(171803.814), 'kurtosis': np.float64(-1.194), 'skewness_p': 0.429, 'kurtosis_p': 0.232, 'royston_p': 0.808, 'hz_p_value': 1.0}


In [12]:
import os
import numpy as np
import pandas as pd

from constants import SCRATCH_FILEPATH
from tqdm import tqdm


# Collect one dict of test results per layer, then tabulate them.
metrics = []

# All layer files for the selected pooling function live in one directory.
layer_root = os.path.join(SCRATCH_FILEPATH, pooling_function.__name__)

for layer_idx in tqdm(range(num_layers + 1), desc="Processing layers"):
    # Load the stored array for this layer (file name: layer_<idx>.npy).
    layer_path = os.path.join(layer_root, f"layer_{layer_idx}.npy")
    layer_data = np.load(layer_path)
    print("Processing layer", layer_idx)
    print("Layer data shape:", layer_data.shape)

    # Run every test on the layer and fold the results into a single record;
    # later tests overwrite earlier ones on key collisions.
    layer_metrics = {}
    for test in tests:
        print("Running test", test.__name__)
        layer_metrics.update(test(layer_data))
    metrics.append(layer_metrics)

# One row per processed layer, in layer order (row position == layer index).
all_metrics = pd.DataFrame(metrics)


Processing layers:   0%|          | 0/25 [00:00<?, ?it/s]

Processing layer 0
Layer data shape: (4060, 2048)
Running test mardia
  Subsampling to 4060 samples and 100 features for testing
Running test royston
  Subsampling to 1000 samples and 100 features for testing
Running test hz
  Subsampling to 4060 samples and 100 features for testing
p_value <0.001


Processing layers:   4%|▍         | 1/25 [00:02<01:11,  3.00s/it]

Processing layer 1
Layer data shape: (4060, 2048)
Running test mardia
  Subsampling to 4060 samples and 100 features for testing
Running test royston
  Subsampling to 1000 samples and 100 features for testing


Processing layers:   4%|▍         | 1/25 [00:04<01:37,  4.05s/it]


KeyboardInterrupt: 

In [None]:
# Display the per-layer results table (one row per processed layer).
all_metrics

Unnamed: 0,skewness,kurtosis,skewness_p,kurtosis_p,royston_p,hz_p_value
0,811981.901,189.521,0.001,0.001,0.001,0.001
1,846267.491,203.664,0.001,0.001,0.001,0.001
2,946212.099,159.081,0.001,0.001,0.001,0.001
3,1022056.993,160.237,0.001,0.001,0.001,0.001
4,1136691.815,172.796,0.001,0.001,0.001,0.001
5,1074922.945,159.802,0.001,0.001,0.001,0.001
6,1093980.844,161.981,0.001,0.001,0.001,0.001
7,989175.41,138.543,0.001,0.001,0.001,0.001
8,980938.668,120.633,0.001,0.001,0.001,0.001
9,895468.309,106.874,0.001,0.001,0.001,0.001


In [None]:

# Optional: write the per-layer metrics table to a CSV file in the current
# working directory. The DataFrame index is not written (index=False).
metrics_csv_path = "all_metrics.csv"
all_metrics.to_csv(metrics_csv_path, index=False)