## Import Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import math
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
from google.cloud import storage
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))


In [None]:
# Folders holding the competition's train and test images
PATH = '../input/dog-breed-identification/train/'
PATH2 = '../input/dog-breed-identification/test/'

# File listings of both folders
IMGS = os.listdir(PATH)
IMGS2 = os.listdir(PATH2)

print('There are %i train images and %i test images' % (len(IMGS), len(IMGS2)))

In [None]:
# Read the label table and rename its `id` column to `image_name`
df = pd.read_csv('../input/dog-breed-identification/labels.csv').rename(columns={'id': 'image_name'})
df.head()


In [None]:
# Column dtypes and non-null counts of the label table
df.info()

In [None]:
# Number of images per breed as a one-column frame.
# The column is named 'breed' explicitly: pandas >= 2.0 names the result of
# value_counts() 'count', which would break code that looks the column up as
# 'breed'. (The former `x.astype('int64').dtypes` line was dropped: its result
# was discarded, so it had no effect.)
x = df['breed'].value_counts().rename('breed').to_frame()
x.info()

In [None]:
# Bar chart of images per breed. The counts column is selected by position so
# the plot works whether pandas named it 'breed' (pandas < 2.0) or 'count'
# (pandas >= 2.0) when value_counts() produced it.
ax = x.plot.bar(figsize=(20, 8), y=x.columns[0], rot=90)

Split the Kaggle training data into training and validation sets (90% / 10%, stratified by breed).

In [None]:
# Hold out 10% of the labelled images, stratified on the breed column.
# X_* are image ids, y_* are breed names.
X_train, X_test, y_train, y_test = train_test_split(
    df.image_name.values,
    df.breed.values,
    test_size=0.10,
    random_state=42,
    stratify=df[['breed']],
)

In [None]:
# Number of image ids in the training split
len(X_train)

In [None]:
# Number of image ids in the held-out split
len(X_test)

In [None]:
# Batch size, and the number of full batches each split yields per epoch
# (floor division: a trailing partial batch is not counted).
BATCH_SIZE = 64
STEPS_PER_EPOCH, VAL_STEPS_PER_EPOCH = (
    len(split) // BATCH_SIZE for split in (X_train, X_test)
)

In [None]:
# Full training batches per epoch
STEPS_PER_EPOCH

In [None]:
# Full validation batches per epoch
VAL_STEPS_PER_EPOCH

In [None]:
# Training split as a two-column frame: image id and breed name
train_df = pd.DataFrame({'image_name': X_train, 'breed': y_train})
train_df.head()

In [None]:
# Breed distribution of the training split.
# The counts column is renamed to 'breed' because pandas >= 2.0 calls the
# value_counts() result 'count', which would make y='breed' raise a KeyError.
# (The former `x.astype('int64').dtypes` line was a no-op and was dropped.)
x = train_df['breed'].value_counts().rename('breed').to_frame()
ax = x.plot.bar(figsize=(17, 6), y='breed', rot=90)

In [None]:
# Held-out split as a two-column frame: image id and breed name
test_df = pd.DataFrame({'image_name': X_test, 'breed': y_test})
test_df.head()

In [None]:
# Breed distribution of the held-out split.
# The counts column is renamed to 'breed' because pandas >= 2.0 calls the
# value_counts() result 'count', which would make y='breed' raise a KeyError.
# (The former `x.astype('int64').dtypes` line was a no-op and was dropped.)
x = test_df['breed'].value_counts().rename('breed').to_frame()
ax = x.plot.bar(figsize=(17, 6), y='breed', rot=90)

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
AUTO = tf.data.experimental.AUTOTUNE
from PIL import Image
import os
import IPython.display as display

In [None]:
def _bytestring_feature(list_of_bytestrings):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=list_of_bytestrings))

def _int_feature(list_of_ints): # int64
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list_of_ints))

def _float_feature(list_of_floats): # float32
    return tf.train.Feature(float_list=tf.train.FloatList(value=list_of_floats))

Encode the Dog Breed Classes into integer classes for classification

In [None]:
# Fit a single encoder on every breed in the label table so both splits share
# the same breed -> integer mapping (the encoder used to be fitted twice).
label_encoder = LabelEncoder().fit(df.breed.astype(str))

# Encode from the original string labels (y_train / y_test) rather than from the
# frame columns: the columns are overwritten with integers here, so encoding
# them again on a re-run of this cell would fail with unseen labels.
train_df['breed'] = label_encoder.transform(pd.Series(y_train).astype(str))
test_df['breed'] = label_encoder.transform(pd.Series(y_test).astype(str))

# Breed name -> integer class lookup
keys = label_encoder.classes_
values = label_encoder.transform(label_encoder.classes_)
dictionary = dict(zip(keys, values))

In [None]:
# Breed column of the held-out split after label encoding
test_df.breed

The dictionary of the 120 Dog Breeds and their respective numerical classes

In [None]:
# Breed name -> integer class mapping built from the label encoder
dictionary

In [None]:
import csv

# Persist the breed -> class-id mapping: one header row of breed names followed
# by one row holding the matching integer ids.
csv_columns = dictionary.keys()
dict_data = [dictionary]

csv_file = "./classes_mapping.csv"
try:
    # newline='' is what the csv module asks for on files it writes; without it
    # extra blank rows can appear on platforms that translate line endings.
    with open(csv_file, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()
        writer.writerows(dict_data)
except IOError as err:
    # Keep the notebook running, but report what actually went wrong.
    print("I/O error:", err)

In [None]:
# Image ids and integer labels of each split
train_image_paths = train_df['image_name']
train_labels = train_df[['breed']]

val_image_paths = test_df['image_name']
val_labels = test_df[['breed']]

# Output folders for the TFRecord shards
tfrecord_train_dir = './tfrecords/train/'
tfrecord_val_dir = './tfrecords/val/'

# exist_ok=True lets this cell be re-run without failing because the folders
# were already created by an earlier run.
os.makedirs(tfrecord_train_dir, exist_ok=True)
os.makedirs(tfrecord_val_dir, exist_ok=True)

## Train Data

Transforming the Training Dataset into TFRecords for TPU usage

In [None]:
# Spread the training images over a fixed number of shard files
SHARDS = 144
nb_images = len(train_df)
shard_size = math.ceil(nb_images / SHARDS)  # images per shard, rounded up

summary = "Pattern matches {} images which will be rewritten as {} .tfrec files containing {} images each.".format(nb_images, SHARDS, shard_size)
print(summary)

In [None]:
def train_parse_function(filename, label, image_dir='../input/dog-breed-identification/train/'):
    """Read the raw bytes of one image file and pair them with its label.

    Args:
        filename: image id without extension; '.jpg' is appended.
        label: label belonging to the image; returned unchanged.
        image_dir: folder prefix the image is read from. Defaults to the
            competition's train folder, which is what the function always
            used before this parameter existed.

    Returns:
        Tuple `(img_raw, label)` where `img_raw` is whatever
        `tf.io.read_file` returns for the assembled path (the file is not
        decoded here).
    """
    # The leftover debug `print(label)` was removed: inside Dataset.map it only
    # ran while the function was traced and printed a symbolic tensor.
    img_raw = tf.io.read_file(image_dir + filename + '.jpg')
    return img_raw, label

In [None]:
# Pair every training image id with its label, load the image bytes, and group
# the records into batches of `shard_size` (one batch becomes one shard file).
files = tf.data.Dataset.from_tensor_slices((train_image_paths, train_labels))
dataset = files.map(train_parse_function).batch(shard_size)

In [None]:
def to_tfrecord(tfrec_filewriter, img_bytes, label, num_classes=120):
    """Build the `tf.train.Example` for one image.

    Args:
        tfrec_filewriter: unused; kept so existing callers keep working.
        img_bytes: encoded image bytes stored under the "image" key.
        label: indexable whose first element is the integer class id.
        num_classes: length of the one-hot vector (defaults to the 120
            breeds this function was previously hard-coded for).

    Returns:
        A `tf.train.Example` with the features "image" (bytes), "breed"
        (int64 class id) and "breed_oh" (one-hot floats of length
        `num_classes`).

    Raises:
        IndexError: if the class id does not fit in `num_classes`.
    """
    class_id = int(label[0])

    # Fill a single vector instead of building a full identity matrix and
    # throwing away all but one of its rows for every record.
    one_hot_class = np.zeros(num_classes)
    one_hot_class[class_id] = 1.0

    feature = {
        "image": _bytestring_feature([img_bytes]),  # one image in the list
        "breed": _int_feature([class_id]),
        "breed_oh": _float_feature(one_hot_class.tolist())
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))

In [None]:
print("Writing TFRecords")
for shard, (image, label) in enumerate(dataset):
    # Convert the batch to NumPy once; calling .numpy() for every record
    # converted the whole shard again on each iteration.
    image_batch = image.numpy()
    label_batch = label.numpy()

    # batch size used as shard size here
    shard_size = image_batch.shape[0]
    # good practice to have the number of records in the filename
    filename = tfrecord_train_dir + "{:02d}-{}.tfrec".format(shard, shard_size)

    # One TFRecord file per batch; the writer is closed when the block exits.
    with tf.io.TFRecordWriter(filename) as out_file:
        for i in range(shard_size):
            example = to_tfrecord(out_file, image_batch[i], label_batch[i])
            out_file.write(example.SerializeToString())

    print("Wrote file {} containing {} records".format(filename, shard_size))

## Validation Data

Transforming the Validation Dataset into TFRecords for TPU usage

In [None]:
# Spread the held-out images over a fixed number of shard files
SHARDS = 16
nb_images = len(test_df)
shard_size = math.ceil(nb_images / SHARDS)  # images per shard, rounded up

summary = "Pattern matches {} images which will be rewritten as {} .tfrec files containing {} images each.".format(nb_images, SHARDS, shard_size)
print(summary)

In [None]:
# Pair every held-out image id with its label, load the image bytes, and group
# the records into batches of `shard_size` (one batch becomes one shard file).
files = tf.data.Dataset.from_tensor_slices((val_image_paths, val_labels))
dataset = files.map(train_parse_function).batch(shard_size)

In [None]:
print("Writing TFRecords")
for shard, (image, label) in enumerate(dataset):
    # Convert the batch to NumPy once; calling .numpy() for every record
    # converted the whole shard again on each iteration.
    image_batch = image.numpy()
    label_batch = label.numpy()

    # batch size used as shard size here
    shard_size = image_batch.shape[0]
    # good practice to have the number of records in the filename
    filename = tfrecord_val_dir + "{:02d}-{}.tfrec".format(shard, shard_size)

    # One TFRecord file per batch; the writer is closed when the block exits.
    with tf.io.TFRecordWriter(filename) as out_file:
        for i in range(shard_size):
            example = to_tfrecord(out_file, image_batch[i], label_batch[i])
            out_file.write(example.SerializeToString())

    print("Wrote file {} containing {} records".format(filename, shard_size))

Read the TFRecords back to check that they were written correctly

In [None]:
# Size passed to tf.image.resize in read_tfrecord
IMAGE_SIZE = [224,224]
# NOTE(review): this replaces the BATCH_SIZE = 64 set earlier in the notebook;
# STEPS_PER_EPOCH and VAL_STEPS_PER_EPOCH were computed from that earlier
# value — confirm the mismatch is intended.
BATCH_SIZE = 128

def read_tfrecord(example):
    """Parse one serialized example into an image, its class id and its one-hot label.

    Args:
        example: a serialized `tf.train.Example` holding the features
            "image", "breed" and "breed_oh".

    Returns:
        Tuple `(image, label, one_hot_class)`:
        image is the JPEG decoded to 3 channels, converted to float32 and
        resized to `IMAGE_SIZE`; label is the scalar int64 "breed" value;
        one_hot_class is the float32 "breed_oh" vector of length 120.
    """
    features = {
        "image": tf.io.FixedLenFeature([], tf.string),  # tf.string = bytestring (not text string)
        "breed": tf.io.FixedLenFeature([], tf.int64),   # shape [] means scalar
        # The vector always has 120 entries, so it is parsed as a dense
        # fixed-length feature instead of sparse + to_dense + reshape.
        "breed_oh": tf.io.FixedLenFeature([120], tf.float32),
    }

    # The leftover debug `print(feature)` was removed: inside Dataset.map it
    # only printed symbolic tensors while the function was traced.
    feature = tf.io.parse_single_example(example, features)
    image = tf.image.decode_jpeg(feature['image'], channels=3)
    image = tf.image.convert_image_dtype(image, tf.float32)
    image = tf.image.resize(image, [*IMAGE_SIZE])
    label = feature['breed']
    one_hot_class = feature['breed_oh']
    return image, label, one_hot_class

    
# Read from several TFRecord files at once and switch off deterministic
# ordering so tf.data may apply order-altering optimizations.
option_no_order = tf.data.Options()
option_no_order.experimental_deterministic = False

# Shard files written for each split
train_path = tf.io.gfile.glob(tfrecord_train_dir+ "*.tfrec")
val_path = tf.io.gfile.glob(tfrecord_val_dir + "*.tfrec")

# Same pipeline for both splits: read shards -> apply options -> parse -> batch
training_dataset = (
    tf.data.TFRecordDataset(train_path, num_parallel_reads=AUTO)
    .with_options(option_no_order)
    .map(read_tfrecord, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
)

val_dataset = (
    tf.data.TFRecordDataset(val_path, num_parallel_reads=AUTO)
    .with_options(option_no_order)
    .map(read_tfrecord, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
)

In [None]:
# Pull a single batch from the training pipeline and show the image batch
# shape, the label tensor and the one-hot batch shape.
for image, label,one_hot_class in training_dataset.take(1):
    print(image.numpy().shape)
    print(label)
    print(one_hot_class.numpy().shape)

## Upload to GS Bucket

Google Cloud credentials

In [None]:
from google.cloud import storage

# For uploading to GCS buckets.
# The service-account key file can be overridden with the
# GCS_SERVICE_ACCOUNT_JSON environment variable, so the notebook is not tied to
# one hard-coded key path; the original path remains the default.
SERVICE_ACCOUNT_JSON = os.environ.get(
    'GCS_SERVICE_ACCOUNT_JSON',
    '../input/cz4041/My Project 78884-3c1398ad9056.json',
)
STORAGE_CLIENT = storage.Client.from_service_account_json(SERVICE_ACCOUNT_JSON)

In [None]:
def create_bucket(dataset_name):
    """Create a bucket named `dataset_name` and print a confirmation.

    See https://cloud.google.com/storage/docs/
    """
    new_bucket = STORAGE_CLIENT.create_bucket(dataset_name)
    message = 'Bucket {} created'.format(new_bucket.name)
    print(message)

In [None]:
bucket_name = 'cz4041_train_10'
try:
    create_bucket(bucket_name)
except Exception as err:
    # Creation stays best-effort (for example when the bucket is already there),
    # but the reason is now shown instead of being silently discarded, and
    # KeyboardInterrupt is no longer swallowed as it was by the bare `except:`.
    print('Bucket {} was not created: {}'.format(bucket_name, err))

In [None]:
def list_blobs(bucket_name):
    """Print the name of every blob in `bucket_name`, one per line.

    See https://cloud.google.com/storage/docs/
    """
    blob_names = (blob.name for blob in STORAGE_CLIENT.list_blobs(bucket_name))
    for name in blob_names:
        print(name)

In [None]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to `bucket_name` as `destination_blob_name` and print a confirmation.

    See https://cloud.google.com/storage/docs/
    """
    target = STORAGE_CLIENT.get_bucket(bucket_name).blob(destination_blob_name)
    target.upload_from_filename(source_file_name)
    print('File {} uploaded to {}.'.format(source_file_name, destination_blob_name))

In [None]:
# Names of the files written to the training TFRecord folder
train_files = os.listdir('./tfrecords/train')
print(train_files)

In [None]:
# Upload every training file to the bucket under its own file name
for file in train_files:
    file_name = file
    local_data = './tfrecords/train/' + file_name
    upload_blob(bucket_name, local_data, file_name)

# Show what the bucket contains after the upload
print('\nData inside of the GCS Bucket ',bucket_name,':\n')
list_blobs(bucket_name)

In [None]:
# Names of the files written to the validation TFRecord folder
test_files = os.listdir('./tfrecords/val')
print(test_files)

In [None]:
bucket_name = 'cz4041_val_10'
try:
    create_bucket(bucket_name)
except Exception as err:
    # Creation stays best-effort (for example when the bucket is already there),
    # but the reason is now shown instead of being silently discarded, and
    # KeyboardInterrupt is no longer swallowed as it was by the bare `except:`.
    print('Bucket {} was not created: {}'.format(bucket_name, err))

In [None]:
# Upload every validation file to the bucket under its own file name
for file in test_files:
    file_name = file
    local_data = './tfrecords/val/' + file_name
    upload_blob(bucket_name, local_data, file_name)

# Show what the bucket contains after the upload
print('\nData inside of the GCS Bucket ',bucket_name,':\n')
list_blobs(bucket_name)

In [None]:
def download_to_kaggle(bucket_name,destination_directory,file_name):
    """Download the blob named `file_name` from a GCS bucket into a local folder.

    Args:
        bucket_name: bucket to read from.
        destination_directory: local folder; created if it does not exist.
        file_name: name of the blob to fetch, also used as the local file name.
    """
    os.makedirs(destination_directory, exist_ok = True)
    full_file_path = os.path.join(destination_directory, file_name)
    # Fetch only the requested blob. The previous loop downloaded every blob in
    # the bucket to this same path, so the file ended up holding whichever blob
    # was listed last.
    blob = STORAGE_CLIENT.bucket(bucket_name).blob(file_name)
    blob.download_to_filename(full_file_path)

In [None]:
# Call download_to_kaggle once per validation file name, targeting ./from/
# and whichever bucket `bucket_name` currently points at.
destination_directory = './from/'       
for file_name in test_files:
    download_to_kaggle(bucket_name,destination_directory,file_name)