In [1]:
import glob
import json
import os
import random

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

In [2]:
# Every entry of the train directory is treated as one class label
# (the directory name is used verbatim as the label string).
train_root = os.path.join("tiny-imagenet-200", "train")
labels = os.listdir(train_root)
print(len(labels))

200


In [4]:
# Classes removed from the candidate pool below, so they never appear in the
# sampled training label set; later cells build the "unseen" test pairs from them.
unseen_test_labels = ['n02814533', 'n02236044', 'n04328186', 'n02423022', 'n06596364', 'n02892201', 'n07749582', 'n01944390', 'n04532106', 'n02129165']

# Seed the global `random` stream so the sampled class subset is reproducible.
# Later cells draw from the same stream, so their output is reproducible too
# as long as the notebook is executed top to bottom from a fresh kernel.
SEED = 42
random.seed(SEED)

# os.listdir returns entries in arbitrary order; sorting first makes the draw
# depend only on the seed and the set of directory names, not on the filesystem.
seen_candidates = sorted(y for y in labels if y not in unseen_test_labels)

# Keep 100 of the remaining classes as the "seen" (training) label set.
labels = random.sample(seen_candidates, 100)

In [7]:
# Map each label string to a contiguous integer id (and back). Ids follow the
# lexicographic order of the labels, so the mapping is deterministic for a
# given label set regardless of the order in which `labels` was sampled.
sorted_labels = sorted(labels)
labels_dict = {
    "label2id": {label: i for i, label in enumerate(sorted_labels)},
    "id2label": {i: label for i, label in enumerate(sorted_labels)},
}

# Use a context manager so the file handle is flushed and closed deterministically.
# NOTE: JSON object keys are always strings, so the integer keys of "id2label"
# are written as "0", "1", ... and must be converted back with int() after loading.
with open("labels_dict.json", "w") as f:
    json.dump(labels_dict, f)

In [8]:
# One record per training image of every selected class. The stored path is
# relative to the dataset root and always uses forward slashes.
train_data = [
    {"image_path": f"train/{wnid}/images/{fname}", "label": wnid}
    for wnid in labels
    for fname in os.listdir(os.path.join("tiny-imagenet-200", "train", wnid, "images"))
]

In [9]:
# Turn the list of per-image records into a table with columns
# `image_path` and `label`, then display it as the cell output.
train_df = pd.DataFrame(train_data)
train_df

Unnamed: 0,image_path,label
0,train/n03976657/images/n03976657_0.JPEG,n03976657
1,train/n03976657/images/n03976657_1.JPEG,n03976657
2,train/n03976657/images/n03976657_10.JPEG,n03976657
3,train/n03976657/images/n03976657_100.JPEG,n03976657
4,train/n03976657/images/n03976657_101.JPEG,n03976657
...,...,...
49995,train/n02233338/images/n02233338_95.JPEG,n02233338
49996,train/n02233338/images/n02233338_96.JPEG,n02233338
49997,train/n02233338/images/n02233338_97.JPEG,n02233338
49998,train/n02233338/images/n02233338_98.JPEG,n02233338


In [10]:
# Sanity check: number of training images per class.
train_df["label"].value_counts()

n03976657    500
n02814860    500
n02437312    500
n04146614    500
n07875152    500
            ... 
n04023962    500
n03670208    500
n03814639    500
n03388043    500
n02233338    500
Name: label, Length: 100, dtype: int64

In [11]:
# Write the training manifest; index=False leaves the DataFrame row index out of the file.
train_df.to_csv('train.csv', index=False)

# Val Test Split

In [12]:
# 0-based positions within each class's list of validation images that are
# diverted into the "seen" test pool instead of the validation split
# (20 positions per class).
seen_test_indices = [
    5, 6, 10, 11, 12, 15, 16, 18, 22, 25,
    27, 29, 32, 33, 36, 37, 43, 44, 48, 49,
]

In [19]:
# Each annotation line is tab-separated; the first field is the image file
# name and the second is its class label.
with open("tiny-imagenet-200/val/val_annotations.txt", "r") as f:
    lines = f.readlines()

# Group validation image paths by label, keeping only the selected seen classes.
val_samples = {}
for row in lines:
    fields = row.strip().split('\t')
    fname, wnid = fields[0], fields[1]
    if wnid in unseen_test_labels or wnid not in labels:
        continue
    val_samples.setdefault(wnid, []).append(f"val/images/{fname}")

In [20]:
# Split each class's validation images in two: positions listed in
# `seen_test_indices` go to the seen-test pool (grouped by label), every other
# image becomes a validation record.
val_data = []
seen_test_samples = {}

for wnid, paths in val_samples.items():
    for pos, img_path in enumerate(paths):
        if pos not in seen_test_indices:
            val_data.append({"image_path": img_path, "label": wnid})
            continue
        seen_test_samples.setdefault(wnid, []).append(img_path)

In [21]:
# Turn the validation records into a table with columns `image_path` and
# `label`, then display it as the cell output.
val_df = pd.DataFrame(val_data)
val_df

Unnamed: 0,image_path,label
0,val/images/val_0.JPEG,n03444034
1,val/images/val_284.JPEG,n03444034
2,val/images/val_355.JPEG,n03444034
3,val/images/val_505.JPEG,n03444034
4,val/images/val_1007.JPEG,n03444034
...,...,...
2995,val/images/val_8204.JPEG,n07875152
2996,val/images/val_8334.JPEG,n07875152
2997,val/images/val_8543.JPEG,n07875152
2998,val/images/val_8608.JPEG,n07875152


In [22]:
# Sanity check: number of validation images left per class after the split.
val_df["label"].value_counts()

n03444034    30
n02814860    30
n03447447    30
n01882714    30
n03100240    30
             ..
n01945685    30
n01698640    30
n03970156    30
n01443537    30
n07875152    30
Name: label, Length: 100, dtype: int64

In [23]:
# Write the validation manifest; index=False leaves the DataFrame row index out of the file.
val_df.to_csv('val.csv', index=False)

# Create Seen Test

In [24]:
# Build verification pairs from the seen-test pool: for every class,
# `pairs_per_class` positive pairs (same class, label 1) and `pairs_per_class`
# negative pairs (different class, label 0).
seen_seen_test_data = []
pairs_per_class = 10

for y, x in seen_test_samples.items():
    # Positive pair i joins the i-th image with the i-th image from the end, so
    # fewer than 2 * pairs_per_class images would pair an image with itself.
    assert len(x) >= 2 * pairs_per_class, f"class {y} has only {len(x)} seen-test images"

    # Shuffle a copy: the lists inside `seen_test_samples` are reused by later
    # cells and must not be reordered as a side effect of running this one.
    shuffled = random.sample(x, len(x))
    for i in range(pairs_per_class):
        seen_seen_test_data.append({
            'image1': shuffled[i],
            'image2': shuffled[len(shuffled)-i-1],
            'label': 1
        })

    # Candidate negative classes are compared by label. The previous filter
    # compared an integer position with the label string, which is always
    # unequal, so the anchor's own class could be drawn as a "negative".
    other_classes = [c for c in seen_test_samples if c != y]

    # Distinct random anchors from this class, each paired with a random image
    # of a random other class.
    for anchor in random.sample(x, pairs_per_class):
        neg_id = random.choice(other_classes)
        neg_sample = random.choice(seen_test_samples[neg_id])
        seen_seen_test_data.append({
            'image1': anchor,
            'image2': neg_sample,
            'label': 0
        })

In [25]:
# Turn the pair records into a table with columns `image1`, `image2` and
# `label`, then display it as the cell output.
seen_seen_test_df = pd.DataFrame(seen_seen_test_data)
seen_seen_test_df

Unnamed: 0,image1,image2,label
0,val/images/val_9939.JPEG,val/images/val_9382.JPEG,1
1,val/images/val_9418.JPEG,val/images/val_5453.JPEG,1
2,val/images/val_1008.JPEG,val/images/val_2507.JPEG,1
3,val/images/val_2556.JPEG,val/images/val_2134.JPEG,1
4,val/images/val_4839.JPEG,val/images/val_3433.JPEG,1
...,...,...,...
1995,val/images/val_5889.JPEG,val/images/val_8247.JPEG,0
1996,val/images/val_2768.JPEG,val/images/val_7634.JPEG,0
1997,val/images/val_1402.JPEG,val/images/val_3451.JPEG,0
1998,val/images/val_5020.JPEG,val/images/val_2396.JPEG,0


In [26]:
# Write the seen/seen pair list; index=False leaves the DataFrame row index out of the file.
seen_seen_test_df.to_csv('seen_seen_test.csv', index=False)

# Create Unseen Test

In [27]:
# Each annotation line is tab-separated; the first field is the image file
# name and the second is its class label.
with open("tiny-imagenet-200/val/val_annotations.txt", "r") as f:
    lines = f.readlines()

# Group the validation image paths of the held-out (unseen) classes by label.
unseen_test_samples = {}
for row in lines:
    fields = row.strip().split('\t')
    wnid = fields[1]
    if wnid not in unseen_test_labels:
        continue
    # Normalise to forward slashes so the stored path is the same on every OS.
    rel_path = os.path.join("val", "images", fields[0]).replace('\\', '/')
    unseen_test_samples.setdefault(wnid, []).append(rel_path)

In [28]:
# Build verification pairs among the unseen classes: for every class, 10 random
# anchors, each emitted once with a same-class partner (label 1) and once with
# an image of a different unseen class (label 0).
unseen_unseen_test_data = []

for y, x in unseen_test_samples.items():
    # Candidate negative classes are compared by label. The previous filter
    # compared an integer position with the label string, which is always
    # unequal, so the anchor's own class could be drawn as a "negative".
    other_classes = [c for c in unseen_test_samples if c != y]

    for _ in range(10):
        # Anchors are drawn independently, so the same image may be picked more than once.
        aid = random.randrange(len(x))
        anchor = x[aid]

        # Positive partner: any image of the same class except the anchor itself.
        pos_sample = random.choice([elem for j, elem in enumerate(x) if j != aid])
        unseen_unseen_test_data.append({
            'image1': anchor,
            'image2': pos_sample,
            'label': 1
        })

        # Negative partner: a random image of a random other unseen class.
        neg_id = random.choice(other_classes)
        neg_sample = random.choice(unseen_test_samples[neg_id])
        unseen_unseen_test_data.append({
            'image1': anchor,
            'image2': neg_sample,
            'label': 0
        })

In [29]:
# Turn the pair records into a table with columns `image1`, `image2` and
# `label`, then display it as the cell output.
unseen_unseen_test_df = pd.DataFrame(unseen_unseen_test_data)
unseen_unseen_test_df

Unnamed: 0,image1,image2,label
0,val/images/val_217.JPEG,val/images/val_2313.JPEG,1
1,val/images/val_217.JPEG,val/images/val_7330.JPEG,0
2,val/images/val_3753.JPEG,val/images/val_898.JPEG,1
3,val/images/val_3753.JPEG,val/images/val_847.JPEG,0
4,val/images/val_8693.JPEG,val/images/val_1224.JPEG,1
...,...,...,...
195,val/images/val_5813.JPEG,val/images/val_235.JPEG,0
196,val/images/val_5163.JPEG,val/images/val_8155.JPEG,1
197,val/images/val_5163.JPEG,val/images/val_2024.JPEG,0
198,val/images/val_1666.JPEG,val/images/val_2541.JPEG,1


In [30]:
# Write the unseen/unseen pair list; index=False leaves the DataFrame row index out of the file.
unseen_unseen_test_df.to_csv('unseen_unseen_test.csv', index=False)

# Seen Unseen Test

In [31]:
# Distinct labels present in the validation table, in order of first appearance.
val_labels = val_df["label"].unique()

# Pick 10 of the seen classes to pair against the unseen classes.
seen_test_labels = random.sample(list(val_labels), k=10)

In [32]:
# Pair each sampled seen class with one unseen class in random order. A
# shuffled copy is used so `unseen_test_labels`, defined in an earlier cell, is
# not reordered in place as a side effect of running this cell.
shuffled_unseen_labels = random.sample(unseen_test_labels, len(unseen_test_labels))

# Every pair joins an image of a seen class with an image of an unseen class,
# so the two images never share a class and all pairs are labelled 0.
seen_unseen_test_data = []
for seen_label, unseen_label in zip(seen_test_labels, shuffled_unseen_labels):
    # 20 distinct images from each side, matched up position by position.
    seen_paths = random.sample(seen_test_samples[seen_label], 20)
    unseen_paths = random.sample(unseen_test_samples[unseen_label], 20)
    for sp, up in zip(seen_paths, unseen_paths):
        seen_unseen_test_data.append({
            'image1': sp,
            'image2': up,
            'label': 0
        })

In [33]:
# Turn the pair records into a table with columns `image1`, `image2` and
# `label`, then display it as the cell output.
seen_unseen_test_df = pd.DataFrame(seen_unseen_test_data)
seen_unseen_test_df

Unnamed: 0,image1,image2,label
0,val/images/val_8250.JPEG,val/images/val_2423.JPEG,0
1,val/images/val_7970.JPEG,val/images/val_545.JPEG,0
2,val/images/val_6757.JPEG,val/images/val_1603.JPEG,0
3,val/images/val_2307.JPEG,val/images/val_6178.JPEG,0
4,val/images/val_8743.JPEG,val/images/val_4533.JPEG,0
...,...,...,...
195,val/images/val_1613.JPEG,val/images/val_6167.JPEG,0
196,val/images/val_7241.JPEG,val/images/val_6849.JPEG,0
197,val/images/val_3571.JPEG,val/images/val_2398.JPEG,0
198,val/images/val_2193.JPEG,val/images/val_2079.JPEG,0


In [34]:
# Write the seen/unseen pair list; index=False leaves the DataFrame row index out of the file.
seen_unseen_test_df.to_csv('seen_unseen_test.csv', index=False)