In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import tonic

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset location and the recordings used throughout the notebook:
# one .npy file per (particle, trial) combination.
data_folder = '../data/bin_1ms_comp/'
chars = [*"AB"]          # particle identifiers
fidxs = [*range(1, 5)]   # trial indices 1..4
paths = [f'{data_folder}/{c}{fi}.npy' for c in chars for fi in fidxs]
paths.sort()

## checks

check potential dataset class imbalance

In [18]:
# Count, per recording and per particle, how many samples there are in total and
# how many of them contain at least 1000 events.
per_particle = dict.fromkeys(chars, (0, 0))  # particle -> (n with >= 1000 events, n total)
for ch in chars:
    for fi in fidxs:
        # NOTE(review): allow_pickle=True unpickles the file content; only load trusted files.
        data = np.load(f'{data_folder}/{ch}{fi}.npy', allow_pickle=True)
        over1k = [e for e in data if e.shape[0] >= 1000]
        # add this recording's counts to the running totals of its particle
        kept_so_far, total_so_far = per_particle[ch]
        per_particle[ch] = (kept_so_far + len(over1k), total_so_far + data.shape[0])
        print(f'{ch}{fi}: {data.shape[0]/1000:4.1f}k -> {len(over1k)/1000:4.1f}k '
              f'({len(over1k)/data.shape[0]:5.1%})', end=', ')
    # one output line per particle
    print()
for ch, (n_over, n_total) in per_particle.items():
    print(f'{ch}: {n_total/1000:5.1f}k -> {n_over/1000:5.1f}k ({n_over/n_total:5.1%})')

A1: 61.1k ->  7.4k (12.1%), A2: 62.6k ->  4.7k ( 7.5%), A3: 62.7k -> 22.1k (35.2%), A4: 70.4k -> 15.5k (22.0%), 
B1: 63.1k ->  6.9k (10.9%), B2: 62.8k -> 12.2k (19.4%), B3: 65.8k -> 20.1k (30.5%), B4: 62.3k ->  6.4k (10.2%), 
A: 256.9k ->  49.7k (19.3%)
B: 254.0k ->  45.5k (17.9%)


- no strong imbalance between particles
- trials 2 and 4 are imbalanced between particles 
    - can expect worse generalization over data folds -> balance when training? *not needed, see results below*

## load data

In [3]:
# Collect the filtered samples of every recording, together with a class index
# (position of the particle in `chars`) and the trial index for each sample.
data, label, trial = [], [], []
for class_idx, ch in enumerate(chars):
    for fi in fidxs:
        d = np.load(f'{data_folder}/{ch}{fi}.npy', allow_pickle=True)
        # keep only samples with at least 1000 events
        keep = [e.shape[0] >= 1_000 for e in d]
        d = d[keep]
        n_kept = len(d)
        data.append(d)
        label.append(n_kept * [class_idx])
        trial.append(n_kept * [fi])
# merge the per-recording pieces into flat arrays that stay aligned sample by sample
data, label, trial = (np.concatenate(parts) for parts in (data, label, trial))

In [4]:
# Convert every sample's events into one image; the shape displayed below shows
# each image is (2, 24, 32). Note: this overwrites `data`, so the cell is not
# meant to be re-run without re-loading the data first.
transform = tonic.transforms.ToImage(sensor_size=(32, 24, 2))
data = np.array(list(map(transform, data)))

In [5]:
# Sanity check of the image array: (n_samples, 2, 24, 32) is expected here.
data.shape

(95154, 2, 24, 32)

## train linear model

In [6]:
# Flatten each image into a single feature vector for the linear model
# (2 * 24 * 32 = 1536 features per sample).
data = data.reshape(len(data), -1)
data.shape

(95154, 1536)

### generalization to new trial

Train on three trials, test on the fourth one

In [50]:
# train linear classifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Leave-one-trial-out evaluation: one entry per held-out trial.
# Accuracies are stored as fractions in [0, 1], as returned by `pipeline.score`.
tracc = []
teacc = []

# Iterate over the configured trial indices (`fidxs`) rather than a hard-coded
# range, so the folds always match the recordings that were actually loaded.
for test_trial in fidxs:
    # train test split: boolean masks over the per-sample `trial` array
    train_idxs = trial != test_trial
    test_idxs = trial == test_trial
    train_data = data[train_idxs]
    train_labels = label[train_idxs]
    train_trials = trial[train_idxs]
    test_data = data[test_idxs]
    test_labels = label[test_idxs]
    test_trials = trial[test_idxs]

    # log fold sizes and the share of class 0 | class 1 in each split
    print(f'Testing on trial {test_trial}, training on all others')
    print(f'  Train: {train_data.shape[0]/1000:3.0f}k, Test: {test_data.shape[0]/1000:3.0f}k')
    ntot = np.sum(train_labels == 0) + np.sum(train_labels == 1)
    print('  Class balance: ')
    print(f'train {np.sum(train_labels == 0)/ntot:.0%} | {np.sum(train_labels == 1)/ntot:.0%}', end=', ')
    ntot = np.sum(test_labels == 0) + np.sum(test_labels == 1)
    print(f'test  {np.sum(test_labels == 0)/ntot:.0%} | {np.sum(test_labels == 1)/ntot:.0%}')

    # train pipeline: standard scaler + logistic regression
    # (a fresh pipeline per fold, so the scaler is fitted on the training trials only)
    pipeline = Pipeline([
        ('scaler', StandardScaler()), 
        ('model', LogisticRegression(max_iter=1000))
    ])
    pipeline.fit(train_data, train_labels)
    tracc.append(pipeline.score(train_data, train_labels))
    teacc.append(pipeline.score(test_data, test_labels))
    print(f'Train accuracy: {tracc[-1]:.2%}')
    print(f'Test accuracy:  {teacc[-1]:.2%}')
    print()

Testing on trial 1, training on all others
Train: 80.9k, Test: 14.2k
Train A|B: 52.26% | 47.74%
Test A|B:  51.77% | 48.23%
Train accuracy: 99.00%
Test accuracy: 94.78%

Testing on trial 2, training on all others
Train: 78.3k, Test: 16.9k
Train A|B: 57.45% | 42.55%
Test A|B:  27.83% | 72.17%
Train accuracy: 98.96%
Test accuracy: 96.11%

Testing on trial 3, training on all others
Train: 53.0k, Test: 42.2k
Train A|B: 52.03% | 47.97%
Test A|B:  52.39% | 47.61%
Train accuracy: 98.50%
Test accuracy: 96.98%

Testing on trial 4, training on all others
Train: 73.3k, Test: 21.9k
Train A|B: 46.61% | 53.39%
Test A|B:  70.89% | 29.11%
Train accuracy: 98.98%
Test accuracy: 96.33%



In [58]:
# Mean and standard deviation of the per-fold accuracies collected above.
# `tracc` / `teacc` are plain lists of fractions in [0, 1], so use np.mean / np.std
# (lists have no .mean()/.std()) and let the `%` format do the scaling to percent.
print(f'training: {np.mean(tracc):.2%} +- {np.std(tracc):.2%}')
print(f'testing:  {np.mean(teacc):.2%} +- {np.std(teacc):.2%}')

training: 98.86% +- 0.21%
testing:  96.05% +- 0.80%


### generalization within trials

Train on a random 80% of the samples of each recording (every particle/trial pair), test on the remaining 20%.

In [61]:
# Build a within-trial split: each recording is filtered, converted to flattened
# images and split 80/20 at random, so every trial contributes to both sets.
transform = tonic.transforms.ToImage(sensor_size=(32, 24, 2,))
# Seeded generator so the random split (and the accuracies derived from it)
# is reproducible under Restart & Run All.
split_rng = np.random.default_rng(0)
train_data = []
test_data = []
train_labels = []
test_labels = []
train_trials = []
test_trials = []
for class_idx, ch in enumerate(chars):
    for fi in fidxs:
        path = f'{data_folder}/{ch}{fi}.npy'
        d = np.load(path, allow_pickle=True)
        # filter (>1k events), transform to image, flatten
        d = d[[e.shape[0] >= 1_000 for e in d]]
        d = np.array([transform(img) for img in d]).reshape(-1, 1536)  # 2 * 24 * 32 = 1536

        # random 80/20 split (integer division: the test part gets the remainder)
        train_len = d.shape[0] * 4 // 5
        test_len = d.shape[0] - train_len
        shuffled_idxs = split_rng.permutation(d.shape[0])
        train_data.append(d[shuffled_idxs[:train_len]])
        test_data.append(d[shuffled_idxs[train_len:]])

        # labels / trial ids are constant within a recording, so only the counts matter
        train_labels += [class_idx] * train_len
        test_labels += [class_idx] * test_len
        train_trials += [fi] * train_len
        test_trials += [fi] * test_len
train_data = np.concatenate(train_data)
test_data = np.concatenate(test_data)
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)
train_trials = np.array(train_trials)
test_trials = np.array(test_trials)

In [62]:
# train linear classifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# train pipeline: standard scaler + logistic regression
# (a single model fitted on the training portion of all trials)
pipeline = Pipeline([
    ('scaler', StandardScaler()), 
    ('model', LogisticRegression(max_iter=1000))
])
pipeline.fit(train_data, train_labels)

tracc = pipeline.score(train_data, train_labels)
print(f'Train accuracy: {tracc:.2%}')

# Test accuracy is reported separately for each trial; the pair in parentheses is
# the class 0 : class 1 share of that trial's *training* samples.
print(f'Test accuracy: ', end='')
teacc = {}  # trial index -> test accuracy (fraction in [0, 1])
# Iterate over the configured trial indices (`fidxs`) instead of a hard-coded range.
for te_idx in fidxs:
    te_mask = test_trials == te_idx
    teacc[te_idx] = pipeline.score(test_data[te_mask], test_labels[te_mask])
    print(f'{teacc[te_idx]:.2%}', end=' ')
    tr_mask = train_trials == te_idx
    ntot = np.sum(train_labels[tr_mask] == 0) + np.sum(train_labels[tr_mask] == 1)
    print(f'({np.sum(train_labels[tr_mask] == 0)/ntot:.2f}:{np.sum(train_labels[tr_mask] == 1)/ntot:.2f})', end=', ')
print('\n')

Train accuracy: 98.75%
Test accuracy: 97.01% (0.52:0.48), 96.95% (0.28:0.72), 98.72% (0.52:0.48), 97.76% (0.71:0.29), 

