# How To Create TFRecords
In this notebook, we learn how to create TFRecords to train TensorFlow models. We will create TFRecords from the Kaggle dataset of 512x512x3 jpegs [here][1]. This dataset contains the Melanoma Classification competition data (30,000 train and 10,000 test images) and an additional 30,000 external images. It was published by [Alex Shonenkov][2].

There is a discussion post about these TFRecords [here][3], and Alex discusses where these images came from [here][4].

[1]: https://www.kaggle.com/shonenkov/melanoma-merged-external-data-512x512-jpeg
[2]: https://www.kaggle.com/shonenkov
[3]: https://www.kaggle.com/c/siim-isic-melanoma-classification/discussion/156245
[4]: https://www.kaggle.com/c/siim-isic-melanoma-classification/discussion/155859

# Load Meta Data

In [None]:
# LOAD LIBRARIES
import numpy as np, pandas as pd, os
import matplotlib.pyplot as plt, cv2
import tensorflow as tf, re, math
import tensorflow.keras.backend as K

In [None]:
# Side length (in pixels) of the square images this notebook reads and writes.
tfrec_shape = 256

# Lookup tables keyed by that side length; the entry for `tfrec_shape` is
# copied into CFG below as 'crop_size' and 'net_size'.
crop_size = {256: 256, 384: 370, 512: 500, 768: 750}
net_size = {256: 256, 384: 370, 512: 500, 768: 750}

CFG = {
    'read_size': tfrec_shape,
    'crop_size': crop_size[tfrec_shape],
    'net_size': net_size[tfrec_shape],

    # DATA AUGMENTATION
    # transform() combines each value with a standard-normal sample:
    # rot/shr/hshift/wshift multiply the sample, hzoom/wzoom divide it.
    'rot': 180.0,
    'shr': 1.5,
    'hzoom': 6.0,
    'wzoom': 6.0,
    'hshift': 6.0,
    'wshift': 6.0,
    # Records written per positive image by write_tfrecord_aug
    # (the original plus num_aug - 1 augmented copies).
    'num_aug': 55,

    # HAIR AUGMENTATION:
    # 'hair_augm': hair_augm[tfrec_shape],
}

In [None]:
# PATHS TO IMAGES
# Directory holding the training JPEGs; later cells open each image as
# PATH + image_name + '.jpg'.
PATH = '../input/siim-isic-melanoma-classification/jpeg/train/'
# Directory listing; in this cell it is only used to report the file count.
IMGS = os.listdir(PATH)
print('There are %i train images'%(len(IMGS)))

In [None]:
# LOAD TRAIN META DATA
# Metadata table for the training images. Later cells read its image_name,
# patient_id, sex, age_approx, anatom_site_general_challenge, diagnosis,
# benign_malignant and target columns.
df = pd.read_csv('../input/siim-isic-melanoma-classification/train.csv')
df.head()

In [None]:
# LOAD TEST META DATA
# test = pd.read_csv('../input/siim-isic-melanoma-classification/test.csv')
# test.head()

# Label Encode Meta Data
It is more efficient to store this metadata as integers instead of strings. We will impute the Age NaNs with the mean age. Then all other NaNs will be converted to `-1` and the other strings will be converted to `0, 1, 2, 3, ...` in the order they appear in the printed lists below.

In [None]:
# COMBINE TRAIN AND TEST TO ENCODE TOGETHER
# cols = test.columns
# print([df[cols],test[cols]])
# comb = pd.concat([df[cols],test[cols]],ignore_index=True,axis=0).reset_index(drop=True)
# print(comb)

In [None]:
# LABEL ENCODE ALL STRINGS
# cats = ['patient_id','sex','anatom_site_general_challenge'] 
# for c in cats:
#     comb[c],mp = comb[c].factorize()
#     print(mp)
# print('Imputing Age NaN count =',comb.age_approx.isnull().sum())
# comb.age_approx.fillna(comb.age_approx.mean(),inplace=True)
# comb['age_approx'] = comb.age_approx.astype('int')

In [None]:
# REWRITE DATA TO DATAFRAMES
# df[cols] = comb.loc[:df.shape[0]-1,cols].values
# test[cols] = comb.loc[df.shape[0]:,cols].values

In [None]:
# LABEL ENCODE TRAIN SOURCE
# df.source,mp = df.source.factorize()
# print(mp)

# LABEL ENCODE ALL STRINGS
# factorize() replaces each distinct string with an integer code in order of
# first appearance and encodes missing values as -1; the printed index lists
# the strings in code order.
cats = ['patient_id','sex','anatom_site_general_challenge', 'diagnosis']
for c in cats:
    df[c],mp = df[c].factorize()
    print(mp)
print('Imputing Age NaN count =',df.age_approx.isnull().sum())
# Assign the imputed column back explicitly. Calling fillna(..., inplace=True)
# on `df.age_approx` is a chained in-place update, which pandas warns about and
# which leaves `df` unchanged when copy-on-write is enabled.
df['age_approx'] = df.age_approx.fillna(df.age_approx.mean()).astype('int')

In [None]:
# benign_malignant is not written to the TFRecords, so drop it here.
# Rebinding `df` with errors='ignore' keeps this cell safe to re-run: the
# in-place drop raised KeyError once the column was already gone.
df = df.drop(columns=['benign_malignant'], errors='ignore')

In [None]:
# Preview the frame after label encoding and dropping benign_malignant.
df.head()

In [None]:
# Split each class separately and in row order (no shuffling): the first 80%
# of the rows form the training part, the remaining rows the validation part.
df_0 = df.loc[df.target == 0]
df_1 = df.loc[df.target == 1]

cut_0 = int(0.8 * len(df_0))
cut_1 = int(0.8 * len(df_1))

df_0_80, df_0_20 = df_0.iloc[:cut_0], df_0.iloc[cut_0:]
df_1_80, df_1_20 = df_1.iloc[:cut_1], df_1.iloc[cut_1:]

# ignore_index=True already yields a fresh 0..n-1 index on the combined frames.
df_80 = pd.concat([df_0_80, df_1_80], ignore_index=True, axis=0)
val_20 = pd.concat([df_0_20, df_1_20], ignore_index=True, axis=0)

In [None]:
# Row counts of the four per-class parts, a blank line, then the two combined frames.
for part in (df_0_80, df_0_20, df_1_80, df_1_20):
    print(len(part))
print('')
for part in (df_80, val_20):
    print(len(part))

In [None]:
# Preview the first nine training images as stored on disk, in a 3x3 grid.
plt.figure(figsize=(16, 16))
for slot in range(9):
    plt.subplot(3, 3, slot + 1)
    raw_image = plt.imread(PATH + df_80.iloc[slot].image_name + '.jpg')
    plt.imshow(raw_image, cmap='gray')
    plt.axis('off')
plt.tight_layout()

In [None]:
IMG_SIZE = (256, 256)

def center_crop(img, new_size=IMG_SIZE):
    """Crop the largest centred square out of `img` and resize it to `new_size`.

    Args:
        img: image array whose first two axes are (height, width), such as the
            array returned by cv2.imread. Any further axes are kept as-is.
        new_size: target size handed to cv2.resize as `dsize`, i.e.
            (width, height). Defaults to IMG_SIZE.

    Returns:
        The centred square resized to `new_size` with INTER_AREA interpolation.
    """
    # Only the first two axes matter, so 2-D (single-channel) arrays work too.
    height, width = img.shape[:2]
    side = min(width, height)

    # Split the surplus on each axis between both edges; when the surplus is
    # odd, the extra pixel is trimmed from the left / top edge.
    left = (width - side + 1) // 2
    right = width - (width - side) // 2
    top = (height - side + 1) // 2
    bottom = height - (height - side) // 2

    square = img[top:bottom, left:right]

    # Honour the caller's new_size instead of always using the global IMG_SIZE.
    return cv2.resize(square, tuple(new_size), interpolation=cv2.INTER_AREA)


# Preview the first nine training images after centre-cropping and resizing.
plt.figure(figsize=(16, 16))
for slot in range(9):
    plt.subplot(3, 3, slot + 1)
    cropped = center_crop(cv2.imread(PATH + df_80.iloc[slot].image_name + '.jpg'))
    # cv2.imread yields BGR channel order; swap to RGB for matplotlib.
    # (COLOR_BGR2RGB and COLOR_RGB2BGR are the same conversion code.)
    plt.imshow(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB), cmap='gray')
    plt.axis('off')
plt.tight_layout()

# Write TFRecords - Train
All the code below comes from TensorFlow's docs [here][1]

[1]: https://www.tensorflow.org/tutorials/load_data/tfrecord

In [None]:
def _bytes_feature(value):
  """Wrap a single string / bytes value in a tf.train.Feature (bytes_list)."""
  eager_tensor_type = type(tf.constant(0))
  # BytesList cannot unpack a string held in an EagerTensor, so pull out the raw value first.
  raw = value.numpy() if isinstance(value, eager_tensor_type) else value
  bytes_list = tf.train.BytesList(value=[raw])
  return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
  """Wrap a single float / double in a tf.train.Feature (float_list)."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Wrap a single bool / enum / int / uint in a tf.train.Feature (int64_list)."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

In [None]:
def serialize_example(feature0, feature1, feature2, feature3, feature4, feature5, feature6, feature7):
  """Pack one image and its metadata into a serialized tf.train.Example.

  feature0 and feature1 are stored as bytes features ('image', 'image_name');
  feature2..feature7 are stored as int64 features, in the order listed below.
  Returns the serialized Example as a byte string.
  """
  int_fields = (
      ('patient_id', feature2),
      ('sex', feature3),
      ('age_approx', feature4),
      ('anatom_site_general_challenge', feature5),
      ('diagnosis', feature6),
      ('target', feature7),
  )
  feature = {
      'image': _bytes_feature(feature0),
      'image_name': _bytes_feature(feature1),
  }
  feature.update((key, _int64_feature(value)) for key, value in int_fields)
  features = tf.train.Features(feature=feature)
  return tf.train.Example(features=features).SerializeToString()

In [None]:
# Row counts of the combined train and validation frames.
for frame in (df_80, val_20):
    print(len(frame))

### Augmentation

In [None]:
def get_mat(rotation, shear, height_zoom, width_zoom, height_shift, width_shift):
    """Return the 3x3 transform matrix used to map pixel indices.

    `rotation` and `shear` are given in degrees and converted to radians here.
    The result is the matrix product
    (rotation_matrix . shear_matrix) . (zoom_matrix . shift_matrix).

    NOTE(review): transform() passes every argument as a float32 tensor of
    shape [1]; other input shapes are not exercised in this notebook.
    """

    # CONVERT DEGREES TO RADIANS
    rotation = math.pi * rotation / 180.
    shear = math.pi * shear / 180.

    def get_3x3_mat(lst):
        # Arrange the nine entries of `lst` row by row into a 3x3 tensor.
        return tf.reshape(tf.concat([lst], axis=0), [3, 3])

    # ROTATION MATRIX
    c1 = tf.math.cos(rotation)
    s1 = tf.math.sin(rotation)
    one = tf.constant([1], dtype='float32')
    zero = tf.constant([0], dtype='float32')

    rotation_matrix = get_3x3_mat([c1, s1, zero,
                                   -s1, c1, zero,
                                   zero, zero, one])
    # SHEAR MATRIX
    c2 = tf.math.cos(shear)
    s2 = tf.math.sin(shear)

    shear_matrix = get_3x3_mat([one, s2, zero,
                                zero, c2, zero,
                                zero, zero, one])
    # ZOOM MATRIX
    # The diagonal holds the reciprocals of the zoom factors.
    zoom_matrix = get_3x3_mat([one / height_zoom, zero, zero,
                               zero, one / width_zoom, zero,
                               zero, zero, one])
    # SHIFT MATRIX
    # The shifts sit in the last column, so they act on the constant third coordinate.
    shift_matrix = get_3x3_mat([one, zero, height_shift,
                                zero, one, width_shift,
                                zero, zero, one])

    return K.dot(K.dot(rotation_matrix, shear_matrix),
                 K.dot(zoom_matrix, shift_matrix))


def transform(image, cfg):
    """Randomly rotate, shear, zoom and shift one image.

    Args:
        image: a single image of size [dim, dim, 3] (not a batch of
            [b, dim, dim, 3]), where dim must equal cfg['read_size'] because
            the gathered pixels are reshaped to [DIM, DIM, 3] at the end.
        cfg: mapping providing 'read_size', 'rot', 'shr', 'hzoom', 'wzoom',
            'hshift' and 'wshift'.

    Returns:
        A [DIM, DIM, 3] tensor holding the transformed image.
    """
    DIM = cfg["read_size"]
    XDIM = DIM % 2  # fix for size 331

    # Draw one standard-normal sample per parameter and scale it by the config value.
    rot = cfg['rot'] * tf.random.normal([1], dtype='float32')
    shr = cfg['shr'] * tf.random.normal([1], dtype='float32')
    h_zoom = 1.0 + tf.random.normal([1], dtype='float32') / cfg['hzoom']
    w_zoom = 1.0 + tf.random.normal([1], dtype='float32') / cfg['wzoom']
    h_shift = cfg['hshift'] * tf.random.normal([1], dtype='float32')
    w_shift = cfg['wshift'] * tf.random.normal([1], dtype='float32')

    # GET TRANSFORMATION MATRIX
    m = get_mat(rot, shr, h_zoom, w_zoom, h_shift, w_shift)

    # LIST DESTINATION PIXEL INDICES
    # Coordinates are centred on the image; the row of ones is the constant
    # third coordinate that lets the 3x3 matrix apply the shift.
    x = tf.repeat(tf.range(DIM // 2, -DIM // 2, -1), DIM)
    y = tf.tile(tf.range(-DIM // 2, DIM // 2), [DIM])
    z = tf.ones([DIM * DIM], dtype='int32')
    idx = tf.stack([x, y, z])

    # ROTATE DESTINATION PIXELS ONTO ORIGIN PIXELS
    idx2 = K.dot(m, tf.cast(idx, dtype='float32'))
    idx2 = K.cast(idx2, dtype='int32')
    # Clamp to the centred coordinate range so the lookup below stays inside the image.
    idx2 = K.clip(idx2, -DIM // 2 + XDIM + 1, DIM // 2)

    # FIND ORIGIN PIXEL VALUES
    # Convert the centred coordinates back to array indices before gathering.
    idx3 = tf.stack([DIM // 2 - idx2[0,], DIM // 2 - 1 + idx2[1,]])
    d = tf.gather_nd(image, tf.transpose(idx3))

    return tf.reshape(d, [DIM, DIM, 3])


def prepare_image(img, cfg=None, augment=True):
    """Augment (or centre-crop) a single already-decoded image.

    Args:
        img: one image of size [read_size, read_size, 3]; transform() requires
            this shape when `augment` is True.
        cfg: configuration mapping. When omitted, the notebook-level CFG is
            used (previously the None default was subscripted and raised
            TypeError).
        augment: if True, apply the random geometric transform followed by
            random flip / hue / saturation / contrast / brightness changes;
            otherwise take a central crop of fraction crop_size / read_size.

    Returns:
        The processed image tensor.
    """
    if cfg is None:
        cfg = CFG

    # Decoding, resizing and normalisation are intentionally disabled here:
#     img = tf.image.decode_jpeg(img, channels=3)
#     img = tf.image.resize(img, [cfg['read_size'], cfg['read_size']])
#     img = tf.cast(img, tf.float32) / 255.0  # # Cast and normalize the image to [0,1]

    if augment:
        # Data augmentation
        img = transform(img, cfg)
#         img = tf.image.random_crop(img, [cfg['crop_size'], cfg['crop_size'], 3])
        # Coarse dropout
        # img = dropout(img, DIM=cfg['crop_size'], PROBABILITY=cfg['DROP_FREQ'], CT=cfg['DROP_CT'], SZ=cfg['DROP_SIZE'])
        # Other augmentations
        img = tf.image.random_flip_left_right(img)
        img = tf.image.random_hue(img, 0.01)
        img = tf.image.random_saturation(img, 0.7, 1.3)
        img = tf.image.random_contrast(img, 0.8, 1.2)
        img = tf.image.random_brightness(img, 0.1)
        # Hair augmentation
        # img = hair_aug_tf(img, augment=cfg['hair_augm'])
    else:
        img = tf.image.central_crop(img, cfg['crop_size'] / cfg['read_size'])

#     img = tf.image.resize(img, [cfg['net_size'], cfg['net_size']])
#     img = tf.reshape(img, [cfg['net_size'], cfg['net_size'], 3])
    return img

In [None]:
def _encode_jpeg(img, quality=94):
    """Encode an image array as JPEG and return the encoded bytes."""
    ok, buffer = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, quality))
    if not ok:
        raise ValueError('JPEG encoding failed')
    # tobytes() replaces ndarray.tostring(), which NumPy deprecated and then removed.
    return buffer.tobytes()


def _load_cropped_image(image_name):
    """Read PATH + image_name + '.jpg' with OpenCV and centre-crop/resize it."""
    image_path = PATH + image_name + '.jpg'
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals an unreadable or missing file by returning None.
        raise FileNotFoundError('Could not read image: %s' % image_path)
    return center_crop(img)


def _serialize_row(row, jpeg_bytes, record_name):
    """Serialize one record from a one-row DataFrame plus its encoded image."""
    return serialize_example(
        jpeg_bytes, str.encode(record_name),
        row.patient_id.values[0],
        row.sex.values[0],
        row.age_approx.values[0],
        row.anatom_site_general_challenge.values[0],
        row.diagnosis.values[0],
        row.target.values[0])


def write_tfrecord_aug(SIZE, df_w, tfrecord_name):
    """Write `df_w` to TFRecord files, adding augmented copies of positive rows.

    Args:
        SIZE: target number of records per file, counting augmented copies.
            It is divided by CFG['num_aug'] to get the number of source rows
            per file (at least one).
        df_w: DataFrame with image_name, patient_id, sex, age_approx,
            anatom_site_general_challenge, diagnosis and target columns.
        tfrecord_name: file name prefix. Augmented copies are only written
            when the prefix is exactly 'trainb' or 'valb' and the row's target
            is 1; each such row yields CFG['num_aug'] records in total.

    Files are named '<prefix><index>-<count>.tfrec', where <count> is
    rows_in_file * CFG['num_aug']. That count equals the records actually
    written only when every row in the file was augmented.
    """
    LEN = len(df_w)
    NUM_AUG_IMAGES = CFG['num_aug']
    # Clamp to one source row per file so a small SIZE cannot cause a division by zero.
    SIZE = max(1, SIZE // NUM_AUG_IMAGES)
    CT = LEN // SIZE + int(LEN % SIZE != 0)
    augment_positives = tfrecord_name in ('trainb', 'valb')
    for j in range(CT):
        print()
        print('Writing TFRecord %i of %i...' % (j, CT))

        CT2 = min(SIZE, LEN - j * SIZE)
        with tf.io.TFRecordWriter(tfrecord_name + '%.2i-%i.tfrec' % (j, CT2 * NUM_AUG_IMAGES)) as writer:
            for k in range(CT2):
                # Positional one-row slice: keeps column dtypes and avoids
                # re-scanning the whole frame by image_name for every record.
                row = df_w.iloc[[SIZE * j + k]]
                name = row.image_name.values[0]
                img = _load_cropped_image(name)
                writer.write(_serialize_row(row, _encode_jpeg(img), name))

                if augment_positives and row.target.values[0] == 1:
                    for l in range(NUM_AUG_IMAGES - 1):
                        imgAug = prepare_image(img, cfg=CFG).numpy()
                        writer.write(_serialize_row(row, _encode_jpeg(imgAug), name + "_" + str(l)))

                print(k * NUM_AUG_IMAGES, ', ', end='')


def write_tfrecord(SIZE, df_w, tfrecord_name):
    """Write `df_w` to TFRecord files of at most SIZE records, without augmentation.

    Args:
        SIZE: maximum number of records per file; must be at least 1.
        df_w: DataFrame with image_name, patient_id, sex, age_approx,
            anatom_site_general_challenge, diagnosis and target columns.
        tfrecord_name: file name prefix; files are named
            '<prefix><index>-<count>.tfrec'.

    Raises:
        ValueError: if SIZE is smaller than 1.
    """
    if SIZE < 1:
        raise ValueError('SIZE must be at least 1 record per file, got %r' % (SIZE,))
    LEN = len(df_w)
    CT = LEN // SIZE + int(LEN % SIZE != 0)
    for j in range(CT):
        print()
        print('Writing TFRecord %i of %i...' % (j, CT))
        CT2 = min(SIZE, LEN - j * SIZE)
        with tf.io.TFRecordWriter(tfrecord_name + '%.2i-%i.tfrec' % (j, CT2)) as writer:
            for k in range(CT2):
                row = df_w.iloc[[SIZE * j + k]]
                name = row.image_name.values[0]
                img = _load_cropped_image(name)
                writer.write(_serialize_row(row, _encode_jpeg(img), name))

                if k % 100 == 0:
                    print(k, ', ', end='')

In [None]:
# SIZE = 1766
# write_tfrecord(SIZE, df_80, 'train')

# SIZE = len(df_1_80) * CFG['num_aug'] // 15
# print(SIZE)
# write_tfrecord_aug(SIZE, df_1_80, 'trainb')

# SIZE = 1700
# write_tfrecord(SIZE, df_0_80, 'traina')

In [None]:
# SIZE = 441
# write_tfrecord(SIZE, val_20, 'val')

# Positive validation rows. write_tfrecord_aug divides SIZE by CFG['num_aug']
# again, so each file is built from len(df_1_20) // 15 source rows.
SIZE = len(df_1_20) * CFG['num_aug'] // 15
print(SIZE)
write_tfrecord_aug(SIZE, df_1_20, 'valb')

# Negative validation rows, written without augmentation.
SIZE = len(df_0_20) // 15
print(SIZE)
write_tfrecord(SIZE, df_0_20, 'vala')

In [None]:
# ! ls -l

In [None]:
# print(IMG_SIZE[0])
# print(IMG_SIZE[1])

In [None]:
# zipFileName = f'melanoma-{IMG_SIZE[0]}x{IMG_SIZE[1]}.zip'
# print(zipFileName)
# !zip -r zipFileName ./

# Write TFRecords - Test

In [None]:
# def serialize_example2(feature0, feature1, feature2, feature3, feature4, feature5): 
#   feature = {
#       'image': _bytes_feature(feature0),
#       'image_name': _bytes_feature(feature1),
#       'patient_id': _int64_feature(feature2),
#       'sex': _int64_feature(feature3),
#       'age_approx': _int64_feature(feature4),
#       'anatom_site_general_challenge': _int64_feature(feature5),
#   }
#   example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
#   return example_proto.SerializeToString()

In [None]:
# SIZE = 687
# CT = len(IMGS2)//SIZE + int(len(IMGS2)%SIZE!=0)
# for j in range(CT):
#     print(); print('Writing TFRecord %i of %i...'%(j,CT))
#     CT2 = min(SIZE,len(IMGS2)-j*SIZE)
#     with tf.io.TFRecordWriter('test%.2i-%i.tfrec'%(j,CT2)) as writer:
#         for k in range(CT2):
#             img = cv2.imread(PATH2+IMGS2[SIZE*j+k])
#             img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) # Fix incorrect colors
#             img = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, 94))[1].tostring()
#             name = IMGS2[SIZE*j+k].split('.')[0]
#             row = test.loc[test.image_name==name]
#             example = serialize_example2(
#                 img, str.encode(name),
#                 row.patient_id.values[0],
#                 row.sex.values[0],
#                 row.age_approx.values[0],                        
#                 row.anatom_site_general_challenge.values[0])
#             writer.write(example)
#             if k%100==0: print(k,', ',end='')

# Verify TFRecords
We will verify the TFRecords we just made by using code from the Flower Comp starter notebook [here][1] to display the TFRecords below.

[1]: https://www.kaggle.com/mgornergoogle/getting-started-with-100-flowers-on-tpu

In [None]:
# numpy and matplotlib defaults
np.set_printoptions(threshold=15, linewidth=80)
CLASSES = [0,1]

def batch_to_numpy_images_and_labels(data):
    """Split an (images, labels) pair and convert both parts with .numpy()."""
    image_part, label_part = data
    # Labels are returned as-is; the None-for-missing-labels handling is disabled:
    #if numpy_labels.dtype == object: # binary string in this case, these are image ID strings
    #    numpy_labels = [None for _ in enumerate(numpy_images)]
    return image_part.numpy(), label_part.numpy()

def title_from_label_and_target(label, correct_label):
    """Build a plot title for a prediction and say whether it is correct.

    Returns (CLASSES[label], True) when no reference label is given. Otherwise
    returns ("<pred> [OK]", correct) for a match and
    ("<pred> [NO\u2192<expected>]", correct) for a mismatch.
    """
    if correct_label is None:
        return CLASSES[label], True
    correct = (label == correct_label)
    if correct:
        verdict, arrow, expected = 'OK', '', ''
    else:
        verdict, arrow, expected = 'NO', u"\u2192", CLASSES[correct_label]
    return "{} [{}{}{}]".format(CLASSES[label], verdict, arrow, expected), correct

def display_one_flower(image, title, subplot, red=False, titlesize=16):
    """Draw one image in the given subplot slot and return the next slot.

    `subplot` is a (rows, cols, index) triple. A non-empty title is drawn in
    black, or in red with a slightly smaller font when `red` is True.
    """
    plt.subplot(*subplot)
    plt.axis('off')
    plt.imshow(image)
    if len(title) > 0:
        if red:
            font_size, colour = int(titlesize/1.2), 'red'
        else:
            font_size, colour = int(titlesize), 'black'
        plt.title(title, fontsize=font_size, color=colour,
                  fontdict={'verticalalignment':'center'}, pad=int(titlesize/1.5))
    n_rows, n_cols, position = subplot[0], subplot[1], subplot[2]
    return (n_rows, n_cols, position + 1)
    
def display_batch_of_images(databatch, predictions=None):
    """Show a batch of images in a square-ish grid, one title per image.

    This will work with:
    display_batch_of_images((images, labels))
    display_batch_of_images((images, labels), predictions)

    Args:
        databatch: an (images, labels) pair whose members expose .numpy();
            it is unpacked by batch_to_numpy_images_and_labels.
        predictions: optional sequence of predicted class indices. When given,
            titles come from title_from_label_and_target and wrong
            predictions are drawn in red.

    Images that do not fit into the rows x cols grid are not displayed.
    An empty batch prints a message and returns without plotting.
    """
    def _title_text(title):
        # display_one_flower calls len() on the title, so always hand it a str.
        # Bytes (e.g. image names read back from TFRecords) are decoded so they
        # are not rendered as "b'...'".
        if title is None:
            return ''
        if isinstance(title, bytes):
            return title.decode('utf-8', errors='replace')
        return str(title)

    # data
    images, labels = batch_to_numpy_images_and_labels(databatch)
    if labels is None:
        labels = [None for _ in enumerate(images)]

    if len(images) == 0:
        # Nothing to lay out; the grid arithmetic below would divide by zero.
        print('No images to display.')
        return

    # auto-squaring: this will drop data that does not fit into square or square-ish rectangle
    rows = int(math.sqrt(len(images)))
    cols = len(images)//rows

    # size and spacing
    FIGSIZE = 13.0
    SPACING = 0.1
    subplot=(rows,cols,1)
    if rows < cols:
        plt.figure(figsize=(FIGSIZE,FIGSIZE/cols*rows))
    else:
        plt.figure(figsize=(FIGSIZE/rows*cols,FIGSIZE))

    # display
    label = None
    for i, (image, label) in enumerate(zip(images[:rows*cols], labels[:rows*cols])):
        title = label
        correct = True
        if predictions is not None:
            title, correct = title_from_label_and_target(predictions[i], label)
        dynamic_titlesize = FIGSIZE*SPACING/max(rows,cols)*40+3 # magic formula tested to work from 1x1 to 10x10 images
        subplot = display_one_flower(image, _title_text(title), subplot, not correct, titlesize=dynamic_titlesize)

    #layout
    plt.tight_layout()
    # `label` is the last displayed label: no titles were drawn, so pack the images tightly.
    if label is None and predictions is None:
        plt.subplots_adjust(wspace=0, hspace=0)
    else:
        plt.subplots_adjust(wspace=SPACING, hspace=SPACING)
    plt.show()

In [None]:
def decode_image(image_data):
    """Decode JPEG bytes into a float32 image of shape [*IMAGE_SIZE, 3] in [0, 1]."""
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    # A channel-order swap was tried here and left disabled:
#     channels = tf.unstack (image, axis=-1)
#     image = tf.stack([channels[2], channels[1], channels[0]], axis=-1)
    scaled = tf.cast(pixels, tf.float32) / 255.0  # convert image to floats in [0, 1] range
    return tf.reshape(scaled, [*IMAGE_SIZE, 3]) # explicit size needed for TPU

def read_labeled_tfrecord(example):
    """Parse one serialized example into an (image, image_name) pair.

    Only the 'image' and 'image_name' features are read; the image name is
    returned in the label position.
    """
    schema = {
        "image": tf.io.FixedLenFeature([], tf.string), # tf.string means bytestring
        "image_name": tf.io.FixedLenFeature([], tf.string),  # shape [] means single element
    }
    parsed = tf.io.parse_single_example(example, schema)
    return decode_image(parsed['image']), parsed['image_name']

def load_dataset(filenames, labeled=True, ordered=False):
    """Build a tf.data pipeline of (image, image_name) pairs from TFRecord files.

    Files are read in parallel. Unless `ordered` is True, deterministic
    ordering is switched off so records are used as soon as they stream in.
    `labeled` is accepted but not used: every record is parsed with
    read_labeled_tfrecord.
    """
    read_options = tf.data.Options()
    if not ordered:
        read_options.experimental_deterministic = False # disable order, increase speed

    # TFRecordDataset interleaves reads from multiple files automatically.
    return (
        tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
        .with_options(read_options)
        .map(read_labeled_tfrecord)
    )

def get_training_dataset():
    """Return the TRAINING_FILENAMES dataset, batched by BATCH_SIZE and prefetched.

    Repeating and shuffling are left disabled:
    """
#     dataset = dataset.repeat() # the training dataset must repeat for several epochs
#     dataset = dataset.shuffle(2048)
    batched = load_dataset(TRAINING_FILENAMES, labeled=True).batch(BATCH_SIZE)
    # prefetch next batch while the current one is consumed (autotuned buffer size)
    return batched.prefetch(AUTO)

def count_data_items(filenames):
    """Sum the record counts embedded in .tfrec file names.

    The number of data items is written in the name of each file, between the
    dash and the dot, i.e. flowers00-230.tfrec = 230 data items.
    """
    count_pattern = re.compile(r"-([0-9]*)\.")
    counts = []
    for filename in filenames:
        counts.append(int(count_pattern.search(filename).group(1)))
    return np.sum(counts)

In [None]:
# INITIALIZE VARIABLES
IMAGE_SIZE = [256, 256]
BATCH_SIZE = 32
AUTO = tf.data.experimental.AUTOTUNE
# NOTE(review): the active writer cell only produces 'valb*' and 'vala*' files
# (the 'train*' writes are commented out), so this pattern matches nothing on a
# fresh run unless train files already exist in the working directory.
TRAINING_FILENAMES = tf.io.gfile.glob('train*.tfrec')
print('There are %i train images' % count_data_items(TRAINING_FILENAMES))

In [None]:
# DISPLAY TRAIN IMAGES
# Re-batch to 600 images so a single batch fills the preview grid.
training_dataset = get_training_dataset().unbatch().batch(600)
train_batch = iter(training_dataset)

first_batch = next(train_batch)
display_batch_of_images(first_batch)