In [2]:
import os
import glob
from tqdm import tqdm_notebook
import pandas as pd
import numpy as np
import random

In [1]:
labels = os.listdir("tiny-imagenet-200/train")
print(len(labels))

200


In [7]:
sorted_labels = sorted(labels)
labels_dict = {"label2id": {}, "id2label": {}}
for i, label in enumerate(sorted_labels):
    labels_dict["label2id"][label] = i
    labels_dict["id2label"][i] = label

import json
json.dump(labels_dict, open("labels_dict.json", "w"))

In [3]:
unseen_test_labels = ['n02814533', 'n02236044', 'n04328186', 'n02423022', 'n06596364', 'n02892201', 'n07749582', 'n01944390', 'n04532106', 'n02129165']
labels = [y for y in labels if y not in unseen_test_labels]

In [4]:
train_data = []
for y in labels:
    images = os.listdir(os.path.join("tiny-imagenet-200", "train", y, "images"))
    for x in images:
        path = os.path.join("train", y, "images", x)
        train_data.append({
            "image_path": path,
            "label": y
        })

In [5]:
train_df = pd.DataFrame(train_data)
train_df

Unnamed: 0,image_path,label
0,train\n01443537\images\n01443537_0.JPEG,n01443537
1,train\n01443537\images\n01443537_1.JPEG,n01443537
2,train\n01443537\images\n01443537_10.JPEG,n01443537
3,train\n01443537\images\n01443537_100.JPEG,n01443537
4,train\n01443537\images\n01443537_101.JPEG,n01443537
...,...,...
94995,train\n12267677\images\n12267677_95.JPEG,n12267677
94996,train\n12267677\images\n12267677_96.JPEG,n12267677
94997,train\n12267677\images\n12267677_97.JPEG,n12267677
94998,train\n12267677\images\n12267677_98.JPEG,n12267677


In [6]:
train_df.label.value_counts()

n01443537    500
n03992509    500
n03891332    500
n03902125    500
n03930313    500
            ... 
n02808440    500
n02814860    500
n02815834    500
n02823428    500
n12267677    500
Name: label, Length: 190, dtype: int64

In [7]:
train_df.to_csv('train.csv', index=False)

# Val Test Split

In [20]:
seen_test_indices = [5, 6, 10, 11, 12, 15, 16, 18, 22, 25, 27, 29, 32, 33, 36, 37, 43, 44, 48, 49]

In [22]:
with open("tiny-imagenet-200/val/val_annotations.txt", "r") as f:
    lines = f.readlines()

val_samples = {}
for line in lines:
    tmp = line.strip().split('\t')
    path = os.path.join("val", "images", tmp[0])
    label = tmp[1]
    if label in unseen_test_labels: continue

    if label in val_samples: val_samples[label].append(path)
    else: val_samples[label] = [path]

In [23]:
val_data = []
seen_test_samples = {}

for y, x in val_samples.items():
    for i, path in enumerate(x):
        if i in seen_test_indices:
            if y in seen_test_samples: seen_test_samples[y].append(path)
            else: seen_test_samples[y] = [path]
        else: val_data.append({
            "image_path": path,
            "label": y
        })

In [24]:
val_df = pd.DataFrame(val_data)
val_df

Unnamed: 0,image_path,label
0,val\images\val_0.JPEG,n03444034
1,val\images\val_284.JPEG,n03444034
2,val\images\val_355.JPEG,n03444034
3,val\images\val_505.JPEG,n03444034
4,val\images\val_1007.JPEG,n03444034
...,...,...
5695,val\images\val_8278.JPEG,n04118538
5696,val\images\val_8424.JPEG,n04118538
5697,val\images\val_9202.JPEG,n04118538
5698,val\images\val_9254.JPEG,n04118538


In [25]:
val_df.label.value_counts()

n03444034    30
n02699494    30
n03980874    30
n02123394    30
n07711569    30
             ..
n02074367    30
n03026506    30
n03544143    30
n02002724    30
n04118538    30
Name: label, Length: 190, dtype: int64

In [26]:
val_df.to_csv('val.csv', index=False)

# Create Seen Test

In [34]:
seen_seen_test_data = []

for y, x in seen_test_samples.items():
    random.shuffle(x)
    for i in range(10):
        seen_seen_test_data.append({
            'image1': x[i],
            'image2': x[len(x)-i-1],
            'label': 1
        })
    
    random.shuffle(x)
    for i in range(10):
        neg_id = random.sample([elem for i, elem in enumerate(seen_test_samples.keys()) if i != y], 1)[0]
        neg_sample = random.sample(seen_test_samples[neg_id], 1)[0]
        seen_seen_test_data.append({
            'image1': x[i],
            'image2': neg_sample,
            'label': 0
        })

In [35]:
seen_seen_test_df = pd.DataFrame(seen_seen_test_data)
seen_seen_test_df

Unnamed: 0,image1,image2,label
0,val\images\val_7470.JPEG,val\images\val_9939.JPEG,1
1,val\images\val_9418.JPEG,val\images\val_8190.JPEG,1
2,val\images\val_3780.JPEG,val\images\val_1262.JPEG,1
3,val\images\val_6988.JPEG,val\images\val_9382.JPEG,1
4,val\images\val_2556.JPEG,val\images\val_5237.JPEG,1
...,...,...,...
3795,val\images\val_4841.JPEG,val\images\val_5131.JPEG,0
3796,val\images\val_7419.JPEG,val\images\val_6651.JPEG,0
3797,val\images\val_9166.JPEG,val\images\val_5053.JPEG,0
3798,val\images\val_8965.JPEG,val\images\val_6997.JPEG,0


In [36]:
seen_seen_test_df.to_csv('seen_seen_test.csv', index=False)

# Create Unseen Test

In [27]:
with open("tiny-imagenet-200/val/val_annotations.txt", "r") as f:
    lines = f.readlines()

unseen_test_samples = {}
for line in lines:
    tmp = line.strip().split('\t')
    path = os.path.join("val", "images", tmp[0])
    label = tmp[1]
    if label in unseen_test_labels:
        if label in unseen_test_samples: unseen_test_samples[label].append(path)
        else: unseen_test_samples[label] = [path]

In [37]:
unseen_unseen_test_data = []

for y, x in unseen_test_samples.items():
    for i in range(10):
        aid = random.randint(0, len(x)-1)
        anchor = x[aid]
        pos_sample = random.sample([elem for i, elem in enumerate(x) if i != aid], 1)
        unseen_unseen_test_data.append({
            'image1': anchor,
            'image2': pos_sample[0],
            'label': 1
        })

        neg_id = random.sample([elem for i, elem in enumerate(unseen_test_samples.keys()) if i != y], 1)
        neg_sample = random.sample(unseen_test_samples[neg_id[0]], 1)
        unseen_unseen_test_data.append({
            'image1': anchor,
            'image2': neg_sample[0],
            'label': 0
        })

In [38]:
unseen_unseen_test_df = pd.DataFrame(unseen_unseen_test_data)
unseen_unseen_test_df

Unnamed: 0,image1,image2,label
0,val\images\val_1603.JPEG,val\images\val_898.JPEG,1
1,val\images\val_1603.JPEG,val\images\val_1292.JPEG,0
2,val\images\val_235.JPEG,val\images\val_84.JPEG,1
3,val\images\val_235.JPEG,val\images\val_84.JPEG,0
4,val\images\val_5677.JPEG,val\images\val_4645.JPEG,1
...,...,...,...
195,val\images\val_4380.JPEG,val\images\val_567.JPEG,0
196,val\images\val_8956.JPEG,val\images\val_5298.JPEG,1
197,val\images\val_8956.JPEG,val\images\val_6914.JPEG,0
198,val\images\val_3943.JPEG,val\images\val_4307.JPEG,1


In [39]:
unseen_unseen_test_df.to_csv('unseen_unseen_test.csv', index=False)

# Seen Unseen Test

In [46]:
val_labels = val_df.label.unique()
seen_test_labels = random.sample(list(val_labels), 10)

In [47]:
random.shuffle(unseen_test_labels)
seen_unseen_test_data = []
for seen_label, unseen_label in zip(seen_test_labels, unseen_test_labels):
    seen_paths = random.sample(seen_test_samples[seen_label], 20)
    unseen_paths = random.sample(unseen_test_samples[unseen_label], 20)
    for sp, up in zip(seen_paths, unseen_paths):
        seen_unseen_test_data.append({
            'image1': sp,
            'image2': up,
            'label': 0
        })

In [50]:
seen_unseen_test_df = pd.DataFrame(seen_unseen_test_data)
seen_unseen_test_df

Unnamed: 0,image1,image2,label
0,val\images\val_868.JPEG,val\images\val_9106.JPEG,0
1,val\images\val_1661.JPEG,val\images\val_9829.JPEG,0
2,val\images\val_5595.JPEG,val\images\val_2870.JPEG,0
3,val\images\val_7692.JPEG,val\images\val_4493.JPEG,0
4,val\images\val_7336.JPEG,val\images\val_6004.JPEG,0
...,...,...,...
195,val\images\val_1565.JPEG,val\images\val_674.JPEG,0
196,val\images\val_8134.JPEG,val\images\val_9428.JPEG,0
197,val\images\val_2285.JPEG,val\images\val_4909.JPEG,0
198,val\images\val_882.JPEG,val\images\val_8924.JPEG,0


In [51]:
seen_unseen_test_df.to_csv('seen_unseen_test.csv', index=False)