# Find indices for Long-tail Datasets

First, we load the torchvision CIFAR10 training set. The indices we save later refer to positions in this dataset.

In [1]:
import numpy as np
from torchvision import transforms
from torchvision.datasets import CIFAR10

# ToTensor is the only transform, so each sample is returned as a float tensor
# that torchvision has scaled by 1/255 (see the torchvision ToTensor docs).
transform = transforms.Compose([transforms.ToTensor()])
cifar10_dataset = CIFAR10(root="../data", train=True, download=True, transform=transform)

# Walk the dataset a single time: the transform is applied on every sample access,
# so gathering images and labels in the same pass avoids doing that work twice.
cifar10_images, cifar10_labels = [], []
for image, label in cifar10_dataset:
    # Scale back by 255 and cast to uint8 so pixels are stored one byte each.
    cifar10_images.append((255 * image.numpy()).astype(np.uint8))
    cifar10_labels.append(label)

# np.in1d trick - https://stackoverflow.com/a/16216866
# Each image is serialised to a single bytes object so that a whole image becomes
# one comparable element of a 1-D array.
cifar10_images_bytes = np.array([cifar10_image.tobytes() for cifar10_image in cifar10_images])

Files already downloaded and verified


We do the same for CIFAR100.

In [2]:
from torchvision.datasets import CIFAR100

# Reuses `transform` (ToTensor only) defined in the CIFAR10 cell above.
cifar100_dataset = CIFAR100(root="../data", train=True, download=True, transform=transform)

# Walk the dataset a single time: the transform is applied on every sample access,
# so gathering images and labels in the same pass avoids doing that work twice.
cifar100_images, cifar100_labels = [], []
for image, label in cifar100_dataset:
    # Scale back by 255 and cast to uint8 so pixels are stored one byte each.
    cifar100_images.append((255 * image.numpy()).astype(np.uint8))
    cifar100_labels.append(label)

# np.in1d trick - https://stackoverflow.com/a/16216866
# Each image is serialised to a single bytes object so that a whole image becomes
# one comparable element of a 1-D array. The loop variable is deliberately named
# differently from the list it iterates over.
cifar100_images_bytes = np.array([cifar100_image.tobytes() for cifar100_image in cifar100_images])

Files already downloaded and verified


Then, we download the long-tailed CIFAR TFRecords released by Cui et al. (the class-balanced loss repository).

In [3]:
import os
import zipfile

import gdown

# Skip the download entirely when the extracted records are already on disk.
if not os.path.exists("tfrecords/data/"):
    # Download from Google Drive
    # https://github.com/richardaecn/class-balanced-loss/blob/master/README.md#datasets
    url = "https://drive.google.com/uc?id=1NY3lWYRfsTWfsjFPxJUlPumy-WFeD7zK"
    output = "tfrecords.zip"
    # NOTE(review): depending on the gdown version, a failed download either raises
    # or returns None; guard the None case so we fail here instead of in ZipFile.
    if gdown.download(url, output, quiet=False) is None:
        raise RuntimeError(f"Failed to download {url}")

    try:
        # Unzip to `tfrecords/data/`
        with zipfile.ZipFile(output, "r") as zip_ref:
            zip_ref.extractall("tfrecords")
    finally:
        # Cleanup: remove the archive even if extraction failed, so a broken
        # download is not left behind.
        os.remove(output)

We can now match each long-tailed image against the original dataset by its raw bytes, then save the indices of the matching images.

In [4]:
# Maps each long-tailed TFRecord file to the name of the index file written for it.
# The directory name carries the imbalance factor (e.g. 0.01) while the output
# name carries the matching ratio (e.g. 100); CIFAR-10 entries come first.
tfrecords_to_indices = {
    f"tfrecords/data/cifar-{num_classes}-data-im-{factor}/train.tfrecords": f"cifar{num_classes}ir{ratio}.indices"
    for num_classes in (10, 100)
    for factor, ratio in (("0.1", 10), ("0.05", 20), ("0.02", 50), ("0.01", 100), ("0.005", 200))
}

In [5]:
import tensorflow as tf

# For every long-tailed TFRecord file, recover which samples of the original
# CIFAR training set it contains and write those positions to a text file.
for tfrecords_filepath, indices_filepath in tfrecords_to_indices.items():
    # Load TF dataset
    images, labels = [], []
    for raw_record in tf.data.TFRecordDataset([tfrecords_filepath]):
        example = tf.train.Example()
        example.ParseFromString(raw_record.numpy())
        image_bytes = example.features.feature['image'].bytes_list.value
        image_tf = tf.io.decode_raw(image_bytes, tf.uint8)
        # Reshaped with a leading axis of 1 so that `extend` appends one
        # (3, 32, 32) array per record.
        image = tf.reshape(image_tf, [1, 3, 32, 32]).numpy()
        images.extend(image)
        label = example.features.feature['label'].int64_list.value
        labels.extend(label)
    cui_images = np.array(images)
    cui_labels = np.array(labels)
    # Same bytes-per-image serialisation as used for the reference datasets.
    cui_images_bytes = np.array([cui_image.tobytes() for cui_image in cui_images])

    # Find indices
    # The file path tells us which reference dataset the records were drawn from.
    if "cifar-100-" in tfrecords_filepath:
        reference_images_bytes = cifar100_images_bytes
    else:
        reference_images_bytes = cifar10_images_bytes
    # np.isin is the supported replacement for the deprecated np.in1d and gives
    # the same boolean mask for 1-D inputs.
    intersection = np.isin(reference_images_bytes, cui_images_bytes)
    indices = np.where(intersection)[0]

    # A count mismatch means some records were not found in the reference set
    # (or matched more than one reference image); surface it instead of
    # silently writing a wrong index file.
    if len(indices) != len(cui_images_bytes):
        print(
            f"Warning: {tfrecords_filepath}: matched {len(indices)} indices "
            f"for {len(cui_images_bytes)} records"
        )

    # Save indices
    # One index per line, in ascending order as returned by np.where.
    with open(indices_filepath, "w") as f:
        for i in indices:
            f.write(f"{i}\n")

2023-03-04 14:10:30.197526: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-04 14:10:30.676687: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-04 14:10:30.676730: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2023-03-04 14:10:31.074957: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but