In [1]:
import glob
import json
import os
import random

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

In [2]:
# Collect the class folders of the training split.
# os.listdir returns entries in arbitrary order, so sort them to keep every
# downstream step deterministic, and keep only directories so that stray
# files in the folder are never counted as a class.
train_root = "tiny-imagenet-200/train"
labels = sorted(
    entry for entry in os.listdir(train_root)
    if os.path.isdir(os.path.join(train_root, entry))
)
print(len(labels))

200


In [3]:
# Classes reserved for the "unseen" test pairs; they are removed from `labels`
# here so they never enter the train/val label set.
unseen_test_labels = [
    'n02814533', 'n02236044', 'n04328186', 'n02423022', 'n06596364',
    'n02892201', 'n07749582', 'n01944390', 'n04532106', 'n02129165',
]

# Fix the seed so the sampled class subset (and the later random draws on a
# fresh Restart & Run All) can be reproduced.
random.seed(42)

# Sort before sampling: for a given seed the result of random.sample depends
# on the order of the population, and the directory listing order is arbitrary.
labels = sorted(y for y in labels if y not in unseen_test_labels)
labels = random.sample(labels, 50)

In [4]:
# Build a label <-> integer id mapping; ids follow the sorted label order.
sorted_labels = sorted(labels)
labels_dict = {
    "label2id": {label: i for i, label in enumerate(sorted_labels)},
    "id2label": {i: label for i, label in enumerate(sorted_labels)},
}

# Write through a context manager so the file is flushed and closed
# deterministically (the previous bare open() handle was never closed).
# NOTE: JSON object keys are always strings, so the integer keys of
# "id2label" are stored as "0", "1", ...; convert them back with int()
# after loading.
with open("labels_dict.json", "w") as f:
    json.dump(labels_dict, f)

In [5]:
# One record per training image: its path (relative to the dataset folder)
# and the class folder it was found in.
train_data = [
    {"image_path": f"train/{cls}/images/{fname}", "label": cls}
    for cls in labels
    for fname in os.listdir(os.path.join("tiny-imagenet-200", "train", cls, "images"))
]

In [6]:
# Turn the list of records into a table; the bare name on the last line
# renders it as the cell output.
train_df = pd.DataFrame(data=train_data)
train_df

Unnamed: 0,image_path,label
0,train/n02769748/images/n02769748_0.JPEG,n02769748
1,train/n02769748/images/n02769748_1.JPEG,n02769748
2,train/n02769748/images/n02769748_10.JPEG,n02769748
3,train/n02769748/images/n02769748_100.JPEG,n02769748
4,train/n02769748/images/n02769748_101.JPEG,n02769748
...,...,...
24995,train/n03444034/images/n03444034_95.JPEG,n03444034
24996,train/n03444034/images/n03444034_96.JPEG,n03444034
24997,train/n03444034/images/n03444034_97.JPEG,n03444034
24998,train/n03444034/images/n03444034_98.JPEG,n03444034


In [7]:
# Sanity check: number of training rows per label.
train_df["label"].value_counts()

n02769748    500
n02950826    500
n02963159    500
n02099712    500
n04311004    500
n03126707    500
n02909870    500
n04501370    500
n03649909    500
n01945685    500
n01917289    500
n04146614    500
n02233338    500
n07579787    500
n03404251    500
n09332890    500
n04486054    500
n07768694    500
n02268443    500
n04560804    500
n03976657    500
n03930313    500
n03983396    500
n03355925    500
n07615774    500
n04399382    500
n02509815    500
n03584254    500
n04507155    500
n02883205    500
n03447447    500
n03617480    500
n04356056    500
n02190166    500
n02917067    500
n02106662    500
n04149813    500
n01983481    500
n04254777    500
n02403003    500
n04371430    500
n02165456    500
n07734744    500
n07873807    500
n07871810    500
n03814639    500
n02437312    500
n03662601    500
n04023962    500
n03444034    500
Name: label, dtype: int64

In [8]:
# Persist the training table; index=False leaves the row index out of the file.
train_df.to_csv(path_or_buf='train.csv', index=False)

# Val Test Split

In [9]:
# Positions within each class's list of validation images that are held out
# for the seen-class test pairs; images at all other positions go to val.
seen_test_indices = [
    5, 6, 10, 11, 12, 15, 16, 18, 22, 25,
    27, 29, 32, 33, 36, 37, 43, 44, 48, 49,
]

In [10]:
# Read the validation annotations; each line is tab-separated with the image
# file name in the first field and its label in the second.
with open("tiny-imagenet-200/val/val_annotations.txt", "r") as f:
    lines = f.readlines()

# Group validation image paths by label, keeping only the labels selected for
# training (and never the reserved unseen ones).
val_samples = {}
for raw_line in lines:
    fields = raw_line.strip().split('\t')
    fname, label = fields[0], fields[1]
    if label in unseen_test_labels or label not in labels:
        continue
    val_samples.setdefault(label, []).append(f"val/images/{fname}")

In [11]:
val_data = []
seen_test_samples = {}

# Split every class's validation images by position: positions listed in
# seen_test_indices are set aside (grouped by label) for the seen test set,
# everything else becomes a validation record.
for cls, img_paths in val_samples.items():
    for pos, img_path in enumerate(img_paths):
        if pos not in seen_test_indices:
            val_data.append({"image_path": img_path, "label": cls})
            continue
        seen_test_samples.setdefault(cls, []).append(img_path)

In [12]:
# Tabulate the validation records; the bare name renders the frame.
val_df = pd.DataFrame(data=val_data)
val_df

Unnamed: 0,image_path,label
0,val/images/val_0.JPEG,n03444034
1,val/images/val_284.JPEG,n03444034
2,val/images/val_355.JPEG,n03444034
3,val/images/val_505.JPEG,n03444034
4,val/images/val_1007.JPEG,n03444034
...,...,...
1495,val/images/val_8414.JPEG,n03976657
1496,val/images/val_9137.JPEG,n03976657
1497,val/images/val_9681.JPEG,n03976657
1498,val/images/val_9731.JPEG,n03976657


In [13]:
# Sanity check: number of validation rows per label.
val_df["label"].value_counts()

n03444034    30
n02268443    30
n02403003    30
n03662601    30
n04254777    30
n02233338    30
n02509815    30
n02883205    30
n03447447    30
n04356056    30
n07873807    30
n02917067    30
n03983396    30
n04399382    30
n03649909    30
n03404251    30
n04507155    30
n04149813    30
n02099712    30
n03617480    30
n09332890    30
n02963159    30
n02106662    30
n04501370    30
n04311004    30
n07768694    30
n07615774    30
n02950826    30
n04146614    30
n04371430    30
n04486054    30
n03814639    30
n01983481    30
n03930313    30
n03584254    30
n02909870    30
n01917289    30
n07579787    30
n03355925    30
n03126707    30
n02165456    30
n01945685    30
n02437312    30
n02769748    30
n04560804    30
n04023962    30
n02190166    30
n07871810    30
n07734744    30
n03976657    30
Name: label, dtype: int64

In [14]:
# Persist the validation table; index=False leaves the row index out of the file.
val_df.to_csv(path_or_buf='val.csv', index=False)

# Create Seen Test

In [15]:
# Build verification pairs from the held-out images of the seen classes:
# label 1 = both images share a class, label 0 = the classes differ.
seen_seen_test_data = []

for y, x in seen_test_samples.items():
    # Positive pairs: after shuffling, pair the i-th image from the front with
    # the i-th image from the back. Capping at len(x) // 2 guarantees an image
    # is never paired with itself and no pair is repeated when a class has
    # fewer than 20 held-out images (with 20 images this is still 10 pairs).
    random.shuffle(x)
    for i in range(min(10, len(x) // 2)):
        seen_seen_test_data.append({
            'image1': x[i],
            'image2': x[len(x)-i-1],
            'label': 1
        })

    # Negative pairs: the partner must come from a *different* class.
    # Candidates are filtered by label; the earlier filter compared an
    # enumerate index with the label string, which is always unequal, so the
    # anchor's own class could be drawn and the pair wrongly labelled 0.
    other_labels = [label for label in seen_test_samples if label != y]
    random.shuffle(x)
    for anchor in x[:10]:  # slicing tolerates classes with fewer than 10 images
        neg_id = random.sample(other_labels, 1)[0]
        neg_sample = random.sample(seen_test_samples[neg_id], 1)[0]
        seen_seen_test_data.append({
            'image1': anchor,
            'image2': neg_sample,
            'label': 0
        })

In [16]:
# Tabulate the seen/seen pairs; the bare name renders the frame.
seen_seen_test_df = pd.DataFrame(data=seen_seen_test_data)
seen_seen_test_df

Unnamed: 0,image1,image2,label
0,val/images/val_5453.JPEG,val/images/val_9939.JPEG,1
1,val/images/val_8190.JPEG,val/images/val_9382.JPEG,1
2,val/images/val_7470.JPEG,val/images/val_3433.JPEG,1
3,val/images/val_1008.JPEG,val/images/val_6988.JPEG,1
4,val/images/val_2134.JPEG,val/images/val_4839.JPEG,1
...,...,...,...
995,val/images/val_3626.JPEG,val/images/val_6368.JPEG,0
996,val/images/val_3078.JPEG,val/images/val_3341.JPEG,0
997,val/images/val_7299.JPEG,val/images/val_5041.JPEG,0
998,val/images/val_3635.JPEG,val/images/val_2072.JPEG,0


In [17]:
# Persist the seen/seen pairs; index=False leaves the row index out of the file.
seen_seen_test_df.to_csv(path_or_buf='seen_seen_test.csv', index=False)

# Create Unseen Test

In [18]:
# Re-read the validation annotations (tab-separated: file name, label, ...).
with open("tiny-imagenet-200/val/val_annotations.txt", "r") as f:
    lines = f.readlines()

# Group the validation image paths of the reserved unseen classes by label.
unseen_test_samples = {}
for raw_line in lines:
    fields = raw_line.strip().split('\t')
    label = fields[1]
    if label not in unseen_test_labels:
        continue
    # Join with os.path, then normalise separators to forward slashes.
    img_path = os.path.join("val", "images", fields[0]).replace('\\', '/')
    unseen_test_samples.setdefault(label, []).append(img_path)

In [19]:
# Build verification pairs among the unseen classes: for each class draw 10
# random anchors, and give each anchor one same-class partner (label 1) and
# one different-class partner (label 0).
unseen_unseen_test_data = []

for y, x in unseen_test_samples.items():
    # Labels a negative partner may come from. Filtering by label is the fix:
    # the earlier filter compared an enumerate index with the label string,
    # which is always unequal, so the anchor's own class could be drawn and
    # the pair wrongly labelled 0.
    other_labels = [label for label in unseen_test_samples if label != y]

    for _ in range(10):
        aid = random.randint(0, len(x)-1)
        anchor = x[aid]

        # Positive partner: any image of the same class except the anchor itself.
        pos_sample = random.sample([elem for j, elem in enumerate(x) if j != aid], 1)
        unseen_unseen_test_data.append({
            'image1': anchor,
            'image2': pos_sample[0],
            'label': 1
        })

        # Negative partner: a random image from a random other class.
        neg_id = random.sample(other_labels, 1)
        neg_sample = random.sample(unseen_test_samples[neg_id[0]], 1)
        unseen_unseen_test_data.append({
            'image1': anchor,
            'image2': neg_sample[0],
            'label': 0
        })

In [20]:
# Tabulate the unseen/unseen pairs; the bare name renders the frame.
unseen_unseen_test_df = pd.DataFrame(data=unseen_unseen_test_data)
unseen_unseen_test_df

Unnamed: 0,image1,image2,label
0,val/images/val_1671.JPEG,val/images/val_84.JPEG,1
1,val/images/val_1671.JPEG,val/images/val_4914.JPEG,0
2,val/images/val_4533.JPEG,val/images/val_4460.JPEG,1
3,val/images/val_4533.JPEG,val/images/val_6696.JPEG,0
4,val/images/val_2066.JPEG,val/images/val_2234.JPEG,1
...,...,...,...
195,val/images/val_1057.JPEG,val/images/val_7582.JPEG,0
196,val/images/val_6257.JPEG,val/images/val_3925.JPEG,1
197,val/images/val_6257.JPEG,val/images/val_567.JPEG,0
198,val/images/val_6198.JPEG,val/images/val_8155.JPEG,1


In [21]:
# Persist the unseen/unseen pairs; index=False leaves the row index out of the file.
unseen_unseen_test_df.to_csv(path_or_buf='unseen_unseen_test.csv', index=False)

# Create Seen-Unseen Test

In [22]:
# Distinct labels present in the validation table, then a random pick of 10
# of them to act as the seen side of the seen/unseen pairs.
val_labels = val_df["label"].unique()
seen_test_labels = random.sample(list(val_labels), k=10)

In [23]:
# Pair each sampled seen class with one unseen class and emit 20 cross-class
# image pairs per class pair; every pair is a negative (label 0).
#
# Shuffle a copy instead of unseen_test_labels itself: that list is defined in
# an earlier cell, and reordering it in place would make re-running cells
# depend on hidden state.
shuffled_unseen_labels = random.sample(unseen_test_labels, len(unseen_test_labels))

seen_unseen_test_data = []
for seen_label, unseen_label in zip(seen_test_labels, shuffled_unseen_labels):
    # random.sample raises ValueError if a class has fewer than 20 images.
    seen_paths = random.sample(seen_test_samples[seen_label], 20)
    unseen_paths = random.sample(unseen_test_samples[unseen_label], 20)
    for sp, up in zip(seen_paths, unseen_paths):
        seen_unseen_test_data.append({
            'image1': sp,
            'image2': up,
            'label': 0
        })

In [24]:
# Tabulate the seen/unseen pairs; the bare name renders the frame.
seen_unseen_test_df = pd.DataFrame(data=seen_unseen_test_data)
seen_unseen_test_df

Unnamed: 0,image1,image2,label
0,val/images/val_8508.JPEG,val/images/val_3925.JPEG,0
1,val/images/val_1528.JPEG,val/images/val_1475.JPEG,0
2,val/images/val_4799.JPEG,val/images/val_1232.JPEG,0
3,val/images/val_1516.JPEG,val/images/val_6593.JPEG,0
4,val/images/val_3859.JPEG,val/images/val_1850.JPEG,0
...,...,...,...
195,val/images/val_7043.JPEG,val/images/val_1763.JPEG,0
196,val/images/val_3473.JPEG,val/images/val_6263.JPEG,0
197,val/images/val_5237.JPEG,val/images/val_4266.JPEG,0
198,val/images/val_6988.JPEG,val/images/val_1378.JPEG,0


In [34]:
# Persist the seen/unseen pairs; index=False leaves the row index out of the file.
seen_unseen_test_df.to_csv(path_or_buf='seen_unseen_test.csv', index=False)