In [1]:
import tensorflow as tf
from tensorflow import keras

# Report the library versions this notebook was run with.
tf_version_message = 'TensorFlow version {}'.format(tf.__version__)
keras_version_message = 'Keras version {}'.format(keras.__version__)
print(tf_version_message)
print(keras_version_message)


TensorFlow version 1.12.0
Keras version 2.1.6-tf


In [4]:
"""
Run this cell to convert the training/validation/test set to tfrecords

If you want to run this code, 
1. copy this cell to a .py file and add a line to change sys.path
2. change FLAGS.DEFAULT_DOWNLOAD to:
/data/cephfs/punim0811/Datasets/iNaturalist/
3. Change output_folder accordingly because I intentionally did not grant
you write permission to the above folder.

Conversion time cost:
    train - 7800 seconds
    val - 100 seconds
    test - 1100 seconds
"""
from GeneralTools.misc_fun import FLAGS
FLAGS.DEFAULT_DOWNLOAD = '/media/richard/My Book/MyBackup/Data/Kaggle_iNaturalist_2019/'
from GeneralTools.inaturalist_func import images_to_tfrecords
import os.path
import json

# ---- configuration ----------------------------------------------------------
key = 'train'  # dataset split to convert; choose from {'train', 'val', 'test'}
target_size = 299  # change this if you want other image resolution
# per-split value that is handed to images_to_tfrecords as num_images_per_tfrecord
images_per_tfrecord_by_split = {'train': 11531, 'val': 3030, 'test': 17675}
num_images_per_tfrecord = images_per_tfrecord_by_split[key]
# set to True to do the actual conversion (slow; see the time cost listed in
# the docstring at the top of this cell)
run_conversion = False

# ---- read json file ---------------------------------------------------------
annotation_file = '{}2019.json'.format(key)
with open(os.path.join(FLAGS.DEFAULT_DOWNLOAD, annotation_file)) as data_file:
    image_annotations = json.load(data_file)

# ---- extract image file names and classes if provided -----------------------
images = image_annotations['images']
# the 'annotations' entry is optional; without it the classes are unknown
annotations = image_annotations.get('annotations')
image_names = [image['file_name'] for image in images]
# NOTE(review): assumes annotations[i] describes images[i] - confirm against
# the json schema
image_class = None if annotations is None else [
    annotation['category_id'] for annotation in annotations]

# sanity check: show one sample; the message names the split that is actually
# being processed instead of a hard-coded 'validation'
image_index = 4
print('The {}-th {} image locates at {}; its class is {}'.format(
    image_index, key, image_names[image_index],
    'unknown' if image_class is None else image_class[image_index]))
num_images = len(image_names)
print('There are {} images in {}'.format(num_images, annotation_file))

# ---- configure folders to save the data -------------------------------------
output_folder = os.path.join(
    FLAGS.DEFAULT_DOWNLOAD, 'tfrecords_{}/'.format(target_size))
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
# the split name is used as the file name prefix inside output_folder
output_filename = os.path.join(output_folder, key)
print('The tfrecords are saved to {}'.format(output_filename))

# ---- actual conversion (disabled by default) --------------------------------
if run_conversion:
    # target_size is passed through so that the image resolution always agrees
    # with the 'tfrecords_{target_size}' folder name chosen above
    images_to_tfrecords(
        image_names, output_filename, num_images_per_tfrecord,
        image_class=image_class, target_size=target_size)


The 4-th validation image locates at train_val2019/Plants/739/ffa06f951e99de9d220aee2c3309b66c.jpg; its class is 739
There are 265213 images in train2019.json
The tfrecords are saved to /media/richard/My Book/MyBackup/Data/Kaggle_iNaturalist_2019/tfrecords_299/train


In [3]:
"""
This cell reads the train/val/test tfrecords and visualizes one sample from
the batch.

If you want to run this code, 
1. copy this cell to a .py file and add a line to change sys.path
2. change FLAGS.DEFAULT_IN to:
/data/cephfs/punim0811/Datasets/iNaturalist/tfrecords_299/
or your local machine address accordingly if you decide to download 
some tfrecords file.
3. Note that on Spartan, im.show() would not work. However, you may save the 
example you want to visualize, e.g., im.save('test_image.jpg', 'JPEG')
"""
from GeneralTools.misc_fun import FLAGS
FLAGS.DEFAULT_IN = '/media/richard/My Book/MyBackup/Data/Kaggle_iNaturalist_2019/tfrecords_299/'
FLAGS.IMAGE_FORMAT = 'channels_last'
FLAGS.IMAGE_FORMAT_ALIAS = 'NHWC'
from GeneralTools.inaturalist_func import ReadTFRecords
import os
import tensorflow as tf
from PIL import Image
import numpy as np

# ---- configuration ----------------------------------------------------------
batch_size = 2
target_size = 299
key = 'val'  # dataset split to read; choose from {'train', 'val', 'test'}
data_size = {'train': 265213, 'val': 3030, 'test': 35350}
data_label = {'train': 1, 'val': 1, 'test': 0}
num_images = data_size[key]
# remainder of the split size modulo batch_size; handed to ReadTFRecords below
skip_count = num_images % batch_size
num_labels = data_label[key]

# ---- collect the tfrecords files of the chosen split ------------------------
# os.listdir returns entries in arbitrary order, so sort for a reproducible
# file list; the '.tfrecords' extension is stripped from each name
filenames = sorted(
    filename.replace('.tfrecords', '')
    for filename in os.listdir(FLAGS.DEFAULT_IN) if key in filename)
print(filenames)

# ---- build the input pipeline -----------------------------------------------
# batch_size is passed through (it was previously hard-coded to 1) so that the
# reader agrees with the batch_size used to compute skip_count above
dataset = ReadTFRecords(
    filenames, num_labels=num_labels, batch_size=batch_size,
    skip_count=skip_count, num_threads=8, decode_jpeg=True)
dataset.shape2image(3, target_size, target_size)
data_batch = dataset.next_batch()

# ---- fetch one batch --------------------------------------------------------
with tf.Session() as sess:
    if key == 'test':
        # only 'x' is fetched for the test split (it is configured with 0 labels)
        x = sess.run(data_batch['x'])
    else:
        x, y = sess.run([data_batch['x'], data_batch['y']])

# ---- visualize one sample from the batch ------------------------------------
# NOTE(review): the rescaling assumes pixel values lie in [-1, 1] - confirm
# against ReadTFRecords
x_im = (x[0] + 1.0) * 127.5
# clip before the cast so slightly out-of-range values cannot wrap around
x_im = np.clip(x_im, 0.0, 255.0)
im = Image.fromarray(x_im.astype(np.uint8), 'RGB')
im.show()


['val_000']
