In [3]:
import tensorflow as tf
import numpy as np
import os

In [4]:
def read_stl10_dataset(path_to_data):
    """
    Load an STL-10 binary image file into a single NumPy array.

    :param path_to_data: path of the binary file holding the STL-10 images
    :return: uint8 array of shape (N, 96, 96, 3), where N is inferred from
             the file size
    """
    with open(path_to_data, 'rb') as stream:
        # Slurp the entire file as a flat vector of unsigned bytes.
        raw_bytes = np.fromfile(stream, dtype=np.uint8)

    # Each image occupies 3 * 96 * 96 consecutive bytes: the first 96*96
    # values are the red channel, the next 96*96 green, the last 96*96 blue,
    # and each channel plane is stored in column-major order. Passing -1
    # lets numpy work out how many images the file contains.
    channel_first = raw_bytes.reshape(-1, 3, 96, 96)

    # Reverse the three trailing axes so every image becomes
    # (row, column, channel) -- the layout expected by e.g. matplotlib's
    # imshow. Skip or undo this step if a channel-first layout is preferred
    # (for instance when feeding some CNN implementations).
    return channel_first.transpose(0, 3, 2, 1)

In [5]:
# Location of the unlabeled STL-10 binary file. Set the STL10_UNLABELED_PATH
# environment variable to point at your own copy of the dataset; the fallback
# is the machine-specific path this notebook was originally run with.
data_path = os.environ.get(
    'STL10_UNLABELED_PATH',
    '/home/thalles/Downloads/stl10_binary/unlabeled_X.bin',
)

In [6]:
# Read the whole binary file into memory as one uint8 array; the reader
# reshapes and transposes it to (N, 96, 96, 3).
images = read_stl10_dataset(data_path)
# Quick sanity check on the number of images and their layout.
print(images.shape)

In [7]:
# Directory that will hold the generated TFRecord file.
TRAIN_DATASET_DIR="./tfrecords/"
# makedirs(exist_ok=True) is idempotent, creates missing parent directories
# and avoids the check-then-create race of os.path.exists() + os.mkdir().
os.makedirs(TRAIN_DATASET_DIR, exist_ok=True)

TRAIN_FILE = 'train.tfrecords'
# NOTE: the writer stays open after this cell; create_tfrecord_dataset()
# closes it once all records are written, so this cell must be re-run
# before writing the dataset again.
writer = tf.io.TFRecordWriter(os.path.join(TRAIN_DATASET_DIR, TRAIN_FILE))

In [8]:
def _bytes_feature(value):
    """Wrap one bytes value in a tf.train.Feature backed by a single-item BytesList."""
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)

def _int64_feature(value):
    """Wrap one integer in a tf.train.Feature backed by a single-item Int64List."""
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)

In [9]:
def create_tfrecord_dataset(images, writer):
    """
    Serialize every image into a tf.train.Example and write it with `writer`.

    Each record carries three features: 'height' and 'width' (int64, taken
    from the first two axes of the image array) and 'image_raw' (the raw
    bytes of the image array in C order).

    :param images: iterable of NumPy arrays, one per image, each with at
                   least two dimensions
    :param writer: object exposing write(bytes) and close(), such as a
                   tf.io.TFRecordWriter
    :return: None. Prints the number of records written. The writer is
             closed before returning, even if serialization fails part-way.
    """
    read_imgs_counter = 0
    try:
        for image in images:
            image_h = image.shape[0]
            image_w = image.shape[1]

            # ndarray.tostring() was a deprecated alias that NumPy 2.0
            # removed; tobytes() produces exactly the same byte string.
            img_raw = image.tobytes()

            example = tf.train.Example(features=tf.train.Features(feature={
                    'height': _int64_feature(image_h),
                    'width': _int64_feature(image_w),
                    'image_raw': _bytes_feature(img_raw)}))

            writer.write(example.SerializeToString())
            read_imgs_counter += 1

        print("End of TfRecord. Total of image written:", read_imgs_counter)
    finally:
        # Always release the file handle so a failure mid-way does not leave
        # the TFRecord file open.
        writer.close()

In [10]:
# Write all loaded images to the TFRecord file. The call closes `writer`
# when it finishes, so the writer must be re-created before running it again.
create_tfrecord_dataset(images, writer)

End of TfRecord. Total of image written: 100000
