## Import Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import math
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
from google.cloud import storage
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [None]:
# Directories holding the competition's train and test images.
PATH = '../input/dog-breed-identification/train/'
PATH2 = '../input/dog-breed-identification/test/'

# File listings for each split; only their lengths are reported here.
IMGS = os.listdir(PATH)
IMGS2 = os.listdir(PATH2)
image_counts = (len(IMGS), len(IMGS2))
print('There are %i train images and %i test images' % image_counts)

In [None]:
# Load the sample submission and expose its `id` column as `image_name`.
df = (
    pd.read_csv('../input/dog-breed-identification/sample_submission.csv')
    .rename(columns={'id': 'image_name'})
)
df.head()

In [None]:
# Summarise the columns, dtypes and non-null counts of the frame loaded from
# sample_submission.csv.
df.info()

In [None]:
# Both arrays are taken from the same `image_name` column: X is later used as
# the file stem to read, y as the name stored alongside each image.
X = df['image_name'].values
y = df['image_name'].values

In [None]:
# Number of entries in y (one per row of df).
len(y)

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Short alias for tf.data's autotune sentinel; passed as num_parallel_reads /
# num_parallel_calls when the TFRecords are read back.
AUTO = tf.data.experimental.AUTOTUNE
from PIL import Image
import os
import IPython.display as display

Transforming the Test Dataset into TFRecords for TPU usage

In [None]:
def _bytestring_feature(list_of_bytestrings):
    """Wrap a list of byte strings in a ``tf.train.Feature`` (``bytes_list``)."""
    bytes_list = tf.train.BytesList(value=list_of_bytestrings)
    return tf.train.Feature(bytes_list=bytes_list)

def _int_feature(list_of_ints):
    """Wrap a list of integers in a ``tf.train.Feature`` (``int64_list``)."""
    int64_list = tf.train.Int64List(value=list_of_ints)
    return tf.train.Feature(int64_list=int64_list)

def _float_feature(list_of_floats):
    """Wrap a list of floats in a ``tf.train.Feature`` (``float_list``)."""
    float_list = tf.train.FloatList(value=list_of_floats)
    return tf.train.Feature(float_list=float_list)

In [None]:
# One row per test image. Both columns hold the image id (X and y are the same
# column of df): `image_path` is the file stem to read, `image_name` is the
# value stored with each record.
test_df = pd.DataFrame({'image_path': X, 'image_name': y})

test_image_paths = test_df['image_path']
# Double brackets keep this 2-D (n, 1); to_tfrecord reads the name as label[0].
test_labels = test_df[['image_name']]

# Output directory for the TFRecord shards.
tfrecord_test_dir = './tfrecords/test/'
# exist_ok=True keeps this cell re-runnable: without it a second run raises
# FileExistsError because the directory is already there.
os.makedirs(tfrecord_test_dir, exist_ok=True)

# Last expression of the cell so the preview is actually displayed.
test_df.head()

In [None]:
# Target number of output files; each shard holds ceil(images / SHARDS) records.
SHARDS = 128
nb_images = len(test_df)
shard_size = math.ceil(nb_images / SHARDS)

shard_summary = "Pattern matches {} images which will be rewritten as {} .tfrec files containing {} images each."
print(shard_summary.format(nb_images, SHARDS, shard_size))

In [None]:
def test_parse_function(filename, label):
    """Load the raw (still encoded) file contents for one test image.

    Args:
        filename: image id without extension; '.jpg' is appended and the test
            image directory is prepended to build the path.
        label: passed through unchanged.

    Returns:
        Tuple ``(img_raw, label)`` where ``img_raw`` is the result of
        ``tf.io.read_file`` on the assembled path.
    """
    jpeg_path = '../input/dog-breed-identification/test/' + filename + '.jpg'
    return tf.io.read_file(jpeg_path), label

In [None]:
# Pair every image id with its name, read the file contents, then group the
# records into batches of `shard_size` -- each batch is written out as one shard.
files = tf.data.Dataset.from_tensor_slices((test_image_paths, test_labels))
dataset = files.map(test_parse_function).batch(shard_size)

In [None]:
def to_tfrecord(tfrec_filewriter, img_bytes, label):
    """Build a ``tf.train.Example`` holding one image and its name.

    Args:
        tfrec_filewriter: not used by this function; kept in the signature for
            existing callers.
        img_bytes: encoded image bytes, stored under the "image" key.
        label: indexable whose first element is stored under "image_name".

    Returns:
        The assembled ``tf.train.Example`` (not yet serialized).
    """
    image_feature = _bytestring_feature([img_bytes])  # one image in the list
    name_feature = _bytestring_feature([label[0]])
    payload = tf.train.Features(
        feature={"image": image_feature, "image_name": name_feature}
    )
    return tf.train.Example(features=payload)

In [None]:
print("Writing TFRecords")
for shard, (image, label) in enumerate(dataset):
    # Convert each batch to NumPy once, instead of once per record.
    image_batch = image.numpy()
    label_batch = label.numpy()

    # One batch becomes one shard (the last one may be smaller). A dedicated
    # name is used so the module-level `shard_size` used for batching is not
    # overwritten, which would change the batching if that cell were re-run.
    records_in_shard = image_batch.shape[0]

    # good practice to have the number of records in the filename
    filename = tfrecord_test_dir + "{:03d}-{}.tfrec".format(shard, records_in_shard)

    with tf.io.TFRecordWriter(filename) as out_file:
        for img_bytes, img_label in zip(image_batch, label_batch):
            example = to_tfrecord(out_file, img_bytes, img_label)
            out_file.write(example.SerializeToString())

    # Reported after the writer is closed, so the file is complete on disk.
    print("Wrote file {} containing {} records".format(filename, records_in_shard))

Read back the TFRecords just written to verify them

In [None]:
# Target size handed to tf.image.resize in read_tfrecord.
IMAGE_SIZE = [224,224]
# Number of decoded records per batch of test_dataset.
BATCH_SIZE = 128

def read_tfrecord(example):
    """Parse one serialized ``tf.train.Example`` into an ``(image, label)`` pair.

    Args:
        example: a serialized record containing the "image" and "image_name"
            byte-string features.

    Returns:
        Tuple ``(image, label)``: ``image`` is the JPEG decoded to 3 channels,
        converted to float32 and resized to ``IMAGE_SIZE``; ``label`` is the
        stored "image_name" string.
    """
    features = {
        "image": tf.io.FixedLenFeature([], tf.string),  # tf.string = bytestring (not text string)
        "image_name": tf.io.FixedLenFeature([], tf.string),   # shape [] means scalar
    }

    parsed = tf.io.parse_single_example(example, features)
    # The debugging print of the parsed dict was removed: inside Dataset.map it
    # only adds noise to the cell output.
    image = tf.image.decode_jpeg(parsed['image'], channels=3)
    image = tf.image.convert_image_dtype(image, tf.float32)
    image = tf.image.resize(image, IMAGE_SIZE)
    label = parsed['image_name']
    return image, label

    
# Reading pipeline: all shard files are read in parallel, and deterministic
# ordering is switched off (experimental_deterministic = False) so tf.data may
# apply order-altering optimizations.
option_no_order = tf.data.Options()
option_no_order.experimental_deterministic = False

test_path = tf.io.gfile.glob(tfrecord_test_dir+ "*.tfrec")

test_dataset = (
    tf.data.TFRecordDataset(test_path, num_parallel_reads=AUTO)
    .with_options(option_no_order)
    .map(read_tfrecord, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
)


In [None]:
# Pull a single batch and print the decoded image array's shape together with
# the label tensor, as a sanity check of the read pipeline.
for image, label in test_dataset.take(1):
    print(image.numpy().shape)
    print(label)

Google Cloud credentials

In [None]:
from google.cloud import storage

# For uploading to GCS buckets:
# The client is built from a service-account key file read from the notebook's
# input directory.
# NOTE(review): the key-file path is hard-coded; consider taking it from an
# environment variable or a secrets store -- confirm how this is deployed.
STORAGE_CLIENT = storage.Client.from_service_account_json('../input/cz4041/My Project 78884-3c1398ad9056.json')

In [None]:
def create_bucket(dataset_name):
    """Create a GCS bucket called ``dataset_name`` and print a confirmation.

    Uses the module-level ``STORAGE_CLIENT``. Any exception raised by the
    client is propagated to the caller.
    See https://cloud.google.com/storage/docs/
    """
    new_bucket = STORAGE_CLIENT.create_bucket(dataset_name)
    confirmation = 'Bucket {} created'.format(new_bucket.name)
    print(confirmation)

In [None]:
bucket_name = 'cz4041_test'
try:
    create_bucket(bucket_name)
except Exception as exc:
    # Best-effort: creation fails when the bucket already exists, and the
    # notebook should carry on in that case. The reason is reported instead of
    # being swallowed, so auth or naming problems are visible here rather than
    # at upload time. `Exception` (not a bare except) lets KeyboardInterrupt
    # and SystemExit through.
    print('Bucket {} was not created: {}'.format(bucket_name, exc))

In [None]:
def list_blobs(bucket_name):
    """Print the name of every blob in ``bucket_name``, one per line.

    Uses the module-level ``STORAGE_CLIENT``.
    See https://cloud.google.com/storage/docs/
    """
    for entry in STORAGE_CLIENT.list_blobs(bucket_name):
        print(entry.name)

In [None]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a bucket and print a confirmation.

    Args:
        bucket_name: name of the destination bucket.
        source_file_name: path of the local file to upload.
        destination_blob_name: name the blob gets inside the bucket.

    Uses the module-level ``STORAGE_CLIENT``.
    See https://cloud.google.com/storage/docs/
    """
    target = STORAGE_CLIENT.get_bucket(bucket_name).blob(destination_blob_name)
    target.upload_from_filename(source_file_name)
    report = 'File {} uploaded to {}.'.format(source_file_name, destination_blob_name)
    print(report)


In [None]:
# Names of the entries in the local TFRecord output directory; these are the
# files uploaded to the bucket in the next step.
test_files = os.listdir('./tfrecords/test')
print(test_files)

In [None]:
# Upload every local shard; the blob keeps the same name as the local file.
for file in test_files:
    upload_blob(bucket_name, './tfrecords/test/'+file, file)

# Show what the bucket now contains.
print('\nData inside of the GCS Bucket ',bucket_name,':\n')
list_blobs(bucket_name)