# Testing Dataset

In [1]:
from datasets import get_dataset
from encoding.transforms import get_transform

In [2]:
# get_transform("imagenet") yields a pair: the first transform is used for the
# training splits below, the second for every test/eval dataset.
transform_train, transform_val = get_transform("imagenet")

In [3]:
# Each ds_args entry is the positional argument tuple passed to get_dataset:
# (dataset name, mode, split, n per class, transform).
# The same four dataset/split combinations are listed once per mode,
# first for "single" and then for "dual".
_dataset_variants = [
    ("sml_lab", "train", 1000, transform_train),
    ("sml_lab", "test", 1000, transform_val),
    ("sml_lab_test", None, None, transform_val),
    ("sml_expo_eval", None, None, transform_val),
]
ds_args = [
    (name, mode, split, n_per_class, transform)
    for mode in ("single", "dual")
    for (name, split, n_per_class, transform) in _dataset_variants
]

In [4]:
# Build each configured dataset in turn (the generator is lazy, so a dataset is
# created right before its line is printed) and report its size next to its settings.
for args, ds in ((cfg, get_dataset(*cfg)) for cfg in ds_args):
    print(f'{args[0]}, {args[1]}, split: {args[2]}, n per class: {args[3]}, len: {len(ds)}')

sml_lab, single, split: train, n per class: 1000, len: 29532
sml_lab, single, split: test, n per class: 1000, len: 8595
sml_lab_test, single, split: None, n per class: None, len: 5000
sml_expo_eval, single, split: None, n per class: None, len: 1506
sml_lab, dual, split: train, n per class: 1000, len: 100000
sml_lab, dual, split: test, n per class: 1000, len: 100000
sml_lab_test, dual, split: None, n per class: None, len: 125000
sml_expo_eval, dual, split: None, n per class: None, len: 11998


### Single Image Dataset
Are the image paths unique within each dataset, or are there duplicates?

In [5]:
# Train split of the single-image "sml_lab" dataset, built with the training transform.
sml_lab_single_train_ds = get_dataset(
    "sml_lab",
    "single",
    "train",
    1000,
    transform_train,
)

In [6]:
# Walk the whole train dataset and keep the last field of every sample
# (used below as the image path).
sml_lab_single_train_image_paths = [
    sample[-1] for sample in sml_lab_single_train_ds
]

In [7]:
# Test split of the single-image "sml_lab" dataset, built with the validation transform.
sml_lab_single_test_ds = get_dataset(
    "sml_lab",
    "single",
    "test",
    1000,
    transform_val,
)

In [8]:
# Walk the whole test dataset and keep the last field of every sample
# (used below as the image path).
sml_lab_single_test_image_paths = [
    sample[-1] for sample in sml_lab_single_test_ds
]

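Neither the train nor the test split of the single-image dataset contains duplicate image paths: each list has as many distinct paths as entries. (Overlap between the two splits is not checked here.)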

In [10]:
# Number of collected paths per split: (train, test).
tuple(map(len, (sml_lab_single_train_image_paths, sml_lab_single_test_image_paths)))

(29532, 8595)

In [11]:
# Number of distinct paths per split: (train, test). A value equal to the raw
# count above means no path repeats within that split.
tuple(
    len(set(paths))
    for paths in (sml_lab_single_train_image_paths, sml_lab_single_test_image_paths)
)

(29532, 8595)

In [17]:
# The test and eval datasets are requested with identical arguments apart from
# the dataset name, so the shared tail is spelled out once.
single_eval_args = ("single", None, None, transform_val)
sml_lab_test_ds = get_dataset("sml_lab_test", *single_eval_args)
sml_expo_eval_ds = get_dataset("sml_expo_eval", *single_eval_args)

In [14]:
# Iterate both datasets in full, keeping the last field of every sample
# (used below as the image path).
sml_lab_test_image_paths = [sample[-1] for sample in sml_lab_test_ds]
sml_expo_eval_image_paths = [sample[-1] for sample in sml_expo_eval_ds]

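Neither the single-image test dataset nor the eval dataset contains duplicate image paths: each list has as many distinct paths as entries. (Overlap between the two datasets is not checked here.)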

In [15]:
# Number of collected paths per dataset: (sml_lab_test, sml_expo_eval).
tuple(map(len, (sml_lab_test_image_paths, sml_expo_eval_image_paths)))

(5000, 1506)

In [16]:
# Number of distinct paths per dataset: (sml_lab_test, sml_expo_eval). A value
# equal to the raw count above means no path repeats within that dataset.
tuple(
    len(set(paths))
    for paths in (sml_lab_test_image_paths, sml_expo_eval_image_paths)
)

(5000, 1506)

### Dual Image Dataset

Are the image pairs unique within each dataset, or are there duplicates?


In [23]:
# "dual" mode versions of the sml_lab train/test splits ...
sml_lab_dual_train_ds = get_dataset(
    "sml_lab", "dual", "train", 1000, transform_train
)
sml_lab_dual_test_ds = get_dataset(
    "sml_lab", "dual", "test", 1000, transform_val
)
# ... and of the test and eval datasets, which differ only in the dataset name.
dual_eval_args = ("dual", None, None, transform_val)
sml_lab_test_dual_ds = get_dataset("sml_lab_test", *dual_eval_args)
sml_expo_eval_dual_ds = get_dataset("sml_expo_eval", *dual_eval_args)

In [24]:
# Show only the first record of the dual train dataset to inspect its layout;
# the loop is left immediately after printing it.
for first_record in sml_lab_dual_train_ds:
    print(first_record)
    break

(None, None, 0, 'data/training/SML/sml_05-13/A1/B/20240513164918095.jpg', 'data/training/SML/sml_04-17/A1/A/20240417120538101.jpg')


In [25]:
# For every dual dataset, iterate it in full and keep the last two fields of
# each sample as a (second-to-last, last) tuple; the sample record printed
# earlier shows those two fields are image paths.
sml_lab_dual_train_image_paths = [
    (sample[-2], sample[-1]) for sample in sml_lab_dual_train_ds
]
sml_lab_dual_test_image_paths = [
    (sample[-2], sample[-1]) for sample in sml_lab_dual_test_ds
]
sml_lab_test_dual_image_paths = [
    (sample[-2], sample[-1]) for sample in sml_lab_test_dual_ds
]
sml_expo_eval_dual_image_paths = [
    (sample[-2], sample[-1]) for sample in sml_expo_eval_dual_ds
]

In [26]:
# Number of collected path pairs per dual dataset:
# (sml_lab train, sml_lab test, sml_lab_test, sml_expo_eval).
tuple(
    map(
        len,
        (
            sml_lab_dual_train_image_paths,
            sml_lab_dual_test_image_paths,
            sml_lab_test_dual_image_paths,
            sml_expo_eval_dual_image_paths,
        ),
    )
)

(100000, 100000, 125000, 11998)

In [27]:
# Number of distinct path pairs per dual dataset, in the same order as the raw
# counts above. Equal values mean no (path, path) pair repeats within a dataset.
tuple(
    len(set(path_pairs))
    for path_pairs in (
        sml_lab_dual_train_image_paths,
        sml_lab_dual_test_image_paths,
        sml_lab_test_dual_image_paths,
        sml_expo_eval_dual_image_paths,
    )
)

(100000, 100000, 125000, 11998)

In [None]:
# Check how the path-derived labels (path component 5) are paired in the dual train and test sets

In [34]:
# Reduce every (path, path) pair of the dual train set to the '/'-separated
# component at index 5 of each path.
# NOTE(review): the fixed index assumes every path has the same directory depth
# as the sample record printed earlier - confirm before reusing on other data roots.
train_pairs = [
    tuple(image_path.split('/')[5] for image_path in path_pair)
    for path_pair in sml_lab_dual_train_image_paths
]

In [35]:
# Reduce every (path, path) pair of the dual test set to the '/'-separated
# component at index 5 of each path.
# NOTE(review): the fixed index assumes every path has the same directory depth
# as the sample record printed earlier - confirm before reusing on other data roots.
test_pairs = [
    tuple(image_path.split('/')[5] for image_path in path_pair)
    for path_pair in sml_lab_dual_test_image_paths
]

In [36]:
from collections import Counter

In [37]:
# Tally how often each ordered (first, second) label pair occurs in the dual train set.
Counter(train_pairs)

Counter({('A', 'B'): 50164, ('B', 'A'): 49836})

In [38]:
# Tally how often each ordered (first, second) label pair occurs in the dual test set.
Counter(test_pairs)

Counter({('B', 'A'): 50121, ('A', 'B'): 49879})