## Download and unzip ZIP File

In [9]:
!pip install gdown -qqq

In [10]:
import os

import gdown

url = 'https://drive.google.com/uc?id=1NY3lWYRfsTWfsjFPxJUlPumy-WFeD7zK'
output = 'data.zip'

# The archive is about 1.5 GB, so skip the download when an earlier run
# has already fetched it; this keeps "Restart & Run All" cheap.
if not os.path.exists(output):
    # NOTE(review): some gdown releases return None instead of raising when
    # the fetch fails -- fail here rather than later in the unzip cell.
    if gdown.download(url, output, quiet=False) is None:
        raise RuntimeError('Failed to download {} to {}'.format(url, output))

Downloading...
From: https://drive.google.com/uc?id=1NY3lWYRfsTWfsjFPxJUlPumy-WFeD7zK
To: /root/pure-noise/data.zip
100%|██████████████████████████████████████████████████████████| 1.49G/1.49G [00:06<00:00, 217MB/s]


'data.zip'

In [11]:
import zipfile
# Unpack the downloaded archive into the current working directory.
with zipfile.ZipFile("data.zip") as archive:
    archive.extractall()

## Load TFRecords into PyTorch

In [14]:
!pip install tensorflow tfrecord -qqq

In [20]:
import torch
from tfrecord.torch.dataset import TFRecordDataset

# Wrap one TFRecord file in a PyTorch DataLoader; no index file is supplied.
tfrecord_path = "data/cifar-10-data-im-0.01/train.tfrecords"
index_path = None
# Feature name -> type string understood by the tfrecord reader.
description = {
    "image": "byte",
    "label": "int",
}
dataset = TFRecordDataset(
    tfrecord_path,
    index_path=index_path,
    description=description,
)
loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=128)

In [22]:
# Pull a single batch and report the tensor shape of each feature.
data = next(iter(loader))
for key in ("image", "label"):
    print(data[key].shape)

torch.Size([128, 3072])
torch.Size([128, 1])


## Convert TFRecords to JPEG images and JSON annotations

In [26]:
!pip install opencv-python -qqq

In [32]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import cv2, os, json
import numpy as np
from tqdm import tqdm

def read_and_decode(filename_queue):
    """Build the ops that read one tf.Example and decode it.

    Args:
        filename_queue: queue of TFRecord file names handed to
            ``tf.TFRecordReader.read``.

    Returns:
        A ``(image, label)`` pair of tensors: ``image`` is the raw "image"
        bytes decoded as uint8 with static shape ``[3*32*32]``; ``label`` is
        the "label" feature cast to int32.
    """
    # Both features are stored as scalars in the serialized example.
    feature_spec = {
        "image": tf.FixedLenFeature([], tf.string),
        "label": tf.FixedLenFeature([], tf.int64),
    }

    record_reader = tf.TFRecordReader()
    _key, raw_example = record_reader.read(filename_queue)
    parsed = tf.parse_single_example(raw_example, features=feature_spec)

    pixels = tf.decode_raw(parsed["image"], tf.uint8)
    # decode_raw yields an unknown length; pin it to the flattened image size.
    pixels.set_shape([3 * 32 * 32])

    return pixels, tf.cast(parsed["label"], tf.int32)


def convert_from_tfrecords(data_root, dir_name, num_class, mode, output_path, json_file_prefix):
    """Dump one TFRecord split to JPEG files plus a JSON annotation index.

    Reads ``eval.tfrecords`` when ``mode == 'valid'`` (``train.tfrecords`` for
    any other mode) from ``data_root/dir_name``. Each record is written to
    ``output_path/json_file_prefix/images/<label>/<mode>_<step>.jpg`` and the
    collected annotations are saved to
    ``output_path/json_file_prefix/<json_file_prefix>_<mode>.json``.

    Args:
        data_root: directory that contains the dataset folders.
        dir_name: dataset folder inside ``data_root``.
        num_class: value stored under ``'num_classes'`` in the JSON file.
        mode: split name; ``'valid'`` selects the eval records.
        output_path: root directory for the converted data.
        json_file_prefix: output sub-directory and JSON file name prefix.

    Raises:
        IOError: if OpenCV reports that an image could not be written.
    """
    record_file = 'eval.tfrecords' if mode == 'valid' else 'train.tfrecords'
    tfrecord_path = os.path.join(data_root, dir_name, record_file)
    out_dir = os.path.join(output_path, json_file_prefix)
    json_path = os.path.join(out_dir, json_file_prefix + '_{}.json'.format(mode))

    annotations = []

    # Build the input pipeline in a private graph. Re-using the default graph
    # would accumulate ops and queue runners from every earlier call, and each
    # new session would then start all of them again.
    graph = tf.Graph()
    with graph.as_default():
        filename_queue = tf.train.string_input_producer(
            [tfrecord_path], shuffle=False, num_epochs=1)
        image, label = read_and_decode(filename_queue)
        # num_epochs is tracked in a local variable, so local init is required.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # The context manager closes the session even if conversion fails.
        with tf.Session(graph=graph) as sess:
            sess.run(init_op)
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            try:
                step = 0
                while not coord.should_stop():
                    pixels, label_value = sess.run([image, label])
                    category = int(label_value)
                    # Records are stored channel-first (3, 32, 32) in RGB;
                    # OpenCV expects height x width x channel in BGR order.
                    bgr = cv2.cvtColor(
                        pixels.reshape(3, 32, 32).transpose(1, 2, 0),
                        cv2.COLOR_RGB2BGR)
                    im_path = os.path.join(out_dir, 'images', str(category))
                    os.makedirs(im_path, exist_ok=True)
                    save_path = os.path.join(im_path, '{}_{}.jpg'.format(mode, step))
                    # imwrite signals failure through its return value only.
                    if not cv2.imwrite(save_path, bgr):
                        raise IOError('Could not write image {}'.format(save_path))
                    annotations.append({'fpath': save_path, 'image_id': step,
                                        'category_id': category})
                    step += 1
            except tf.errors.OutOfRangeError:
                # Raised once the single epoch has been consumed.
                print('done')
            finally:
                coord.request_stop()
                # Wait for the queue-runner threads so none outlive this call.
                coord.join(threads)

    # An empty split creates no image folders, so make sure the target exists.
    os.makedirs(out_dir, exist_ok=True)
    with open(json_path, 'w') as f:
        json.dump({'annotations': annotations, 'num_classes': num_class}, f)

    print('Json has been saved to', json_path)

Instructions for updating:
non-resource variables are not supported in the long term


In [33]:
from types import SimpleNamespace

# Splits to convert, and where the raw / converted data live.
modes = ['train', 'valid']
args = SimpleNamespace(input_path="data/", output_path="converted_data/")

# One entry per dataset variant: source folder, output prefix, class count.
cifar10_im50 = {'dir': 'cifar-10-data-im-0.02', 'json': 'cifar10_imbalance50', 'class': 10}
cifar10_im100 = {'dir': 'cifar-10-data-im-0.01', 'json': 'cifar10_imbalance100', 'class': 10}
cifar100_im50 = {'dir': 'cifar-100-data-im-0.02', 'json': 'cifar100_imbalance50', 'class': 100}
cifar100_im100 = {'dir': 'cifar-100-data-im-0.01', 'json': 'cifar100_imbalance100', 'class': 100}

# Conversion order within each split.
dataset_configs = [cifar10_im50, cifar10_im100, cifar100_im100, cifar100_im50]

for m in modes:
    for cfg in dataset_configs:
        convert_from_tfrecords(
            args.input_path,
            cfg['dir'],
            cfg['class'],
            m,
            args.output_path,
            cfg['json'],
        )


Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensor_slices(string_tensor).shuffle(tf.shape(input_tensor, out_type=tf.int64)[0]).repeat(num_epochs)`. If `shuffle=False`, omit the `.shuffle(...)`.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensor_slices(input_tensor).shuffle(tf.shape(input_tensor, out_type=tf.int64)[0]).repeat(num_epochs)`. If `shuffle=False`, omit the `.shuffle(...)`.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensors(tensor).repeat(num_epochs)`.
Instructions for updating:
Prefer Dataset.range instead.
Instructions for updating:
Prefer Dataset.range instead.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
Queue-based inp

2022-12-22 10:03:24.524589: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib/python3.7/site-packages/cv2/../../lib64:
2022-12-22 10:03:24.524668: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-12-22 10:03:24.524905: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, reb

done
Json has been saved to converted_data/cifar10_imbalance50/cifar10_imbalance50_train.json
done
Json has been saved to converted_data/cifar10_imbalance100/cifar10_imbalance100_train.json
done
Json has been saved to converted_data/cifar100_imbalance100/cifar100_imbalance100_train.json
done
Json has been saved to converted_data/cifar100_imbalance50/cifar100_imbalance50_train.json


2022-12-22 10:03:46.158595: W tensorflow/c/c_api.cc:291] Operation '{name:'input_producer/limit_epochs/epochs/Assign' id:12 op device:{requested: '', assigned: ''} def:{{{node input_producer/limit_epochs/epochs/Assign}} = Assign[T=DT_INT64, _class=["loc:@input_producer/limit_epochs/epochs"], _has_manual_control_dependencies=true, use_locking=true, validate_shape=true](input_producer/limit_epochs/epochs, input_producer/limit_epochs/Const)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


done
Json has been saved to converted_data/cifar10_imbalance50/cifar10_imbalance50_valid.json
done
Json has been saved to converted_data/cifar10_imbalance100/cifar10_imbalance100_valid.json
done
Json has been saved to converted_data/cifar100_imbalance100/cifar100_imbalance100_valid.json
done
Json has been saved to converted_data/cifar100_imbalance50/cifar100_imbalance50_valid.json
