In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# End to End Multi-class Dog Breed Classification

This notebook builds an end-to-end multi-class image classifier using TensorFlow 2.0 and TensorFlow Hub

## 1. Problem
Identifying the breed of a dog, given the image of a dog.

## 2. Data
Data is available in this competition

## 3. Evaluation
The evaluation is a file with prediction probabilities for each dog breed of each test image.

## Features
This is unstructured data, since we're dealing with images here, so it is a deep learning problem. 

## Importing the necessary libraries

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image
import matplotlib.pyplot as plt
print('TensorFlow Version :', tf.__version__)
print('TensorFlow_Hub Version :', hub.__version__)

# Check if we're using GPU
print('GPU','Available!!, Noice' if tf.config.list_logical_devices('GPU') else 'Not Available')

## Getting our data(images) ready, by converting it into tensors.

Let's take a look at our labels.

In [None]:
labels_csv = pd.read_csv('../input/dog-breed-identification/labels.csv')

In [None]:
labels_csv[70:80]

### EDA

In [None]:
labels_csv.describe()

* Let's take a look at most common dog breed.
* Also this helps us figure out number of images for each breed.

In [None]:
print(labels_csv['breed'].value_counts()[:10])
labels_csv['breed'].value_counts()[:20].plot.bar(figsize=(20,10))

Hmmm, looks like these are the top 5 most common dog breeds in this data :


 `scottish_deerhound      126
 maltese_dog             117
 afghan_hound            116
 entlebucher             115
 bernese_mountain_dog    114`

* Overall frequency of `breed` column

In [None]:
labels_csv['breed'].value_counts(normalize=True).plot.bar(logx=False, figsize=(20,10))
print(labels_csv['breed'].value_counts().median())

As a rule of thumb for model training, it's recommended to have about 100 annotations per label, with a minimum of 10 annotations.

What this means is that if we had at least 100 images for each breed, our model would learn really well.

But that's not the case here. We have roughly 82 images per breed.

Anyway, let's continue.

Let's try and view an image.

Since `scottish_deerhound` is the most common breed, let's take a look at it.

In [None]:
Image('../input/dog-breed-identification/train/0042188c895a2f14ef64a918ed9c7b64.jpg')

Let's look at another image,

In [None]:
Image('../input/dog-breed-identification/train/01e787576c003930f96c966f9c3e1d44.jpg')

Now as humans, we can identify some patterns/features of this breed. 

Let's look at some of the features of this breed :

1. It looks like a large greyhound cloaked in a wiry coat. 
2. This breed has long, slender legs, like a greyhound.
3. It has relatively narrow body, deep chest, tucked abdomen, arched loin      and long tail.
4. It is Dolichocephalic (long face).

Now let's see if our model can similarly figure out different features from this data.

Now let's map `images` and their `labels` 

In [None]:
# Create pathnames from image ID's
filenames = []

In [None]:
# Build the full path of every training image from its ID in labels.csv
filenames = ['../input/dog-breed-identification/train/' + image_id + '.jpg'
             for image_id in labels_csv['id']]
filenames[:10]

In [None]:
# Check if number of filenames are equal to number of actual image files.
import os
if len(os.listdir('../input/dog-breed-identification/train/')) == len(filenames):
    print('Yes ! they match')
else:
    print('No, they don\'t')

Awesome, number of filenames are equal to number of image files.

Let's make one last check.

In [None]:
print(labels_csv['breed'][9000])
Image(filenames[9000])

WOAHHH ! What a beast, looks like a lion.

Since we've a list of training image filepaths, it's time to prepare our labels.

In [None]:
labels = labels_csv['breed']
labels = np.array(labels)
labels

# Or we can ,
# labels = labels_csv['breed'].to_numpy()

In [None]:
len(labels)

Now in case we had structured data, in order to find the missing values, we had `df.isnull().sum()`.

But since we're dealing with unstructured data, we have to check for missing values by comparing length of `filenames` and `labels`.

In [None]:
# Check if number of labels are equal to number of filenames.
import os
if len(filenames) == len(labels):
    print('Yessssss ! no missing values ;-)')
else:
    print('Nooooo ! Look\'s like we have missing values to deal with')

Beautiful, looks like we don't have any missing data.

In [None]:
# Let's find unique label values.
unique_breeds = np.unique(labels)
unique_breeds


In [None]:
len(unique_breeds)

Awesome! We have 120 unique dog breeds, as expected.

Now let's turn labels into boolean array.

In [None]:
print(labels[0])
labels[0] == unique_breeds

In [None]:
boolean_labels = [label == unique_breeds for label in labels]
boolean_labels[:2]

## Creating a validation set

In [None]:
from sklearn.model_selection import train_test_split
X = filenames
y = boolean_labels

Let's first train on 1000 images.

In [None]:
# Splitting into train and validation set.
X_train, X_valid, y_train, y_valid = train_test_split(X[:1000], y[:1000], test_size = 0.2, random_state = 42)

# Checking the dimensions of train and validation set
len(X_train), len(X_valid), len(y_train), len(y_valid)

In [None]:
# Let's look if everything is fine.
X_train[:1], y_train[:1]

## Preprocessing the images.

It's time to convert images into tensors :
1. Take an image filepath as input
2. Use tensorflow to read the file and save it to a variable, `image`.
3. Turn `image` ('.jpg') into tensor
4. Normalize image (convert colour channels from 0-255 to 0-1)
5. Resize `image` to a shape of (224, 224)
6. Return the modified `image`.

Now before that, let's import an image

In [None]:
from matplotlib.pyplot import imread
image = imread(filenames[30])
image.shape

In [None]:
image.max(), image.min()

We can almost convert anything into a tensor using `tf.constant()`

In [None]:
tf.constant(image)

In [None]:
# Define image size
#IMG_SIZE = 224

# Create a function that preprocessess images
def image_process(image_path, img_size=224):
    '''
    Reads a JPEG file and converts it into a float tensor ready for the model.

    Args:
        image_path: path of the JPEG file to load.
        img_size: height and width (in pixels) of the square output image.
            Defaults to 224, the size used throughout this notebook.

    Returns:
        A float32 image tensor with 3 colour channels, resized to
        (img_size, img_size).
    '''
    # Read the raw file contents
    raw_bytes = tf.io.read_file(image_path)

    # Decode the JPEG into a numerical tensor with 3 colour channels (red, green, blue)
    decoded = tf.image.decode_jpeg(raw_bytes, channels = 3)

    # Convert the colour channel values from 0-255 to 0-1 floats
    scaled = tf.image.convert_image_dtype(decoded, tf.float32)

    # Resize to the square shape requested by the caller
    return tf.image.resize(scaled, size = [img_size, img_size])

## Turning data into batches (default=32)

If you're trying to process 10222 images in one go, they won't fit into the memory.

Hence process 32 images at a time.

In order to use tensorflow effectively, our data has to be in the form of tensor tuples, like this :
`(image, label)`

In [None]:
# Creating a function that return a tuple of image and tensor [(image, tensor)]
def get_image_label(image_path, label):
    '''
    Pairs a preprocessed image with its label.

    The image at `image_path` is loaded through `image_process`; `label`
    is passed through unchanged. Returns the tuple (image, label).
    '''
    return image_process(image_path), label

Now that we've converted our data into tuple of tensors `(image, label)`, let's turn all of our data(`X & y`) into batches.

In [None]:
# Creating a function to turn data into batches.
def data_batcher(X, y=None, batch_size = 32, valid_data = False, test_data = False):
    '''
    Creates batches of data out of image (X) and label (y) pairs.

    Args:
        X: sequence of image filepaths.
        y: labels matching X. Required unless `test_data` is True.
        batch_size: number of samples per batch (default 32).
        valid_data: if True, build validation batches (no shuffling).
        test_data: if True, build test batches (filepaths only, no labels,
            no shuffling). Takes precedence over `valid_data`.

    Returns:
        A batched `tf.data.Dataset`. Elements are preprocessed images for
        test data and (image, label) tuples otherwise.

    Raises:
        ValueError: if labels are needed but `y` is None.
    '''
    # Test data has no labels: only the filepaths are mapped to images
    if test_data:
        print('Creating test data batches...')
        data = tf.data.Dataset.from_tensor_slices(tf.constant(X))
        return data.map(image_process).batch(batch_size)

    # Fail early with a clear message instead of deep inside tf.constant(None)
    if y is None:
        raise ValueError('Labels (y) are required unless test_data=True')

    if valid_data:
        print('Creating validation data batches...')
    else:
        print('Creating training data batches...')

    # Turn filepaths and labels into a dataset of (filepath, label) pairs
    data = tf.data.Dataset.from_tensor_slices((tf.constant(X),  # filepaths
                                               tf.constant(y)))  # labels

    if not valid_data:
        # Shuffle the filepaths and labels before mapping the image processor:
        # this is faster than shuffling decoded images
        data = data.shuffle(buffer_size = len(X))

    # Turn each (filepath, label) into (image, label), then batch
    return data.map(get_image_label).batch(batch_size)

In [None]:
# Create training and validation data batches.
train_data = data_batcher(X_train, y_train)
val_data = data_batcher(X_valid, y_valid, valid_data = True)

In [None]:
# Check different attributes of our data batches.
train_data.element_spec, val_data.element_spec

## Visualizing Data Batches

Although our data is now in batches, it can be hard to understand.

So let's visualize it.

In [None]:
# Creating a function to view images in a data batch
def show_lim_images(images,labels):
    '''
    Displays up to 25 images from a data batch in a 5x5 grid, each titled
    with its breed.

    Args:
        images: indexable batch of images accepted by `plt.imshow`.
        labels: indexable batch of label arrays; the position of the largest
            value in each one is looked up in `unique_breeds` for the title.
    '''
    # Setting up the fig
    plt.figure(figsize=(10,10))
    # The grid holds 25 images, but a batch can be smaller than that
    n_shown = min(25, len(images), len(labels))
    for i in range(n_shown):
        # Create subplots (5 rows, 5 columns)
        plt.subplot(5,5,i+1)
        # Display an image
        plt.imshow(images[i])
        # Add the image label as title
        plt.title(unique_breeds[labels[i].argmax()])
        # Turn the axes off
        plt.axis('off')

Now let's visualize it.

In [None]:
train_images, train_labels = next(train_data.as_numpy_iterator()) 
show_lim_images(train_images, train_labels)

In [None]:
valid_images, valid_labels = next(val_data.as_numpy_iterator()) 
show_lim_images(valid_images, valid_labels)

## Building the model

Before building a model, let's define a few things
1. The input shape(our images in tensors) to our model.
2. The output shape(image labels, in tensors) of our model.
3. URL of the model we want to use : https://tfhub.dev/google/imagenet/mobilenet_v2_130_224/classification/4

In [None]:
# Setup input shape to the model.
INPUT_SHAPE = [None, 224, 224,3] #batch, height, width, colour channels.
# Setup the output shape of the model.
OUTPUT_SHAPE = len(unique_breeds)
# Setup model URL from tensorflow hub.
MODEL_URL = 'https://tfhub.dev/google/imagenet/mobilenet_v2_130_224/classification/4'

Now that we've our inputs, outputs and model all set up, let's use `Keras` api from `TensorFlow` for creating deep learning model.

Let's create a function which:
1. Takes input, output shape along with model we've chosen.
2. Defines layers in the `Keras` model(Sequential fashion).
3. Compiles the model.
4. Builds the model.
5. Returns the model.

Source : https://www.tensorflow.org/guide/keras/overview

In [None]:
def create_model(input_shape = INPUT_SHAPE, output_shape = OUTPUT_SHAPE, model_url = MODEL_URL):
    '''
    Creates, compiles and builds a Keras Sequential model.

    The model is a TensorFlow Hub layer loaded from `model_url`, followed by
    a softmax Dense layer with `output_shape` units.

    Args:
        input_shape: shape passed to `model.build`.
        output_shape: number of units in the output layer.
        model_url: TensorFlow Hub URL of the layer to load.

    Returns:
        The compiled and built model.
    '''
    print('Building model with : ', model_url, '...')

    # Setting up the model layers
    model = tf.keras.Sequential([
        hub.KerasLayer(model_url), # 1st layer/input layer
        tf.keras.layers.Dense(units = output_shape,
                              activation='softmax')]) # 2nd/output layer

    # Compiling the model
    model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam(),
                  metrics=['accuracy'])

    # Building the model
    model.build(input_shape)

    return model

In [None]:
model = create_model()
model.summary()

## Callbacks

Callbacks are like the checkpoints we have in games: they check progress, or stop training if there's no significant improvement in the model.

So now we create a callback, 
* To stop the model early, if it's training for too long, because that'll lead to overfitting.


### Stopping Callback

Sometimes the model keeps on training even though there is no improvement, which will cause overfitting. In those cases we use a callback to stop the training.

In [None]:
# Create early stopping 
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy",
                                                  patience=3) 

## Training

Now we won't train on all of our data, just 1000 images (800-training, 200-validation), As this will save us time.

The final parameter we'll define before training is NUM_EPOCHS (also known as number of epochs).

NUM_EPOCHS defines number of passes of the data we'd like our model to do.

In [None]:
# Build a function to train and return a trained model (100 epochs)
def train_model():
    """
    Creates a fresh model, fits it and returns the fitted model.

    Uses the notebook-level `train_data`, `val_data` and `early_stopping`
    objects; training runs for at most 100 epochs.
    """
    fresh_model = create_model()

    fit_options = dict(
        x=train_data,
        epochs=100,
        validation_data=val_data,
        validation_freq=1,  # check validation metrics every epoch
        callbacks=[early_stopping],
    )
    fresh_model.fit(**fit_options)

    return fresh_model

In [None]:
# Fit the model to the data
model = train_model()

Looking at the accuracy and difference between accuracy and val_accuracy, it seems that the model is overfitting.

While it is doing exceptionally good on training data, it's not doing that good on validation data.

## Predictions

Making predictions with our model returns an array with a different value for each label.

In this case, making predictions on the validation data (200 images) returns an array (predictions) of arrays, each containing 120 different values (one for each unique dog breed).

These different values are the probabilities or the likelihood the model has predicted a certain image being a certain breed of dog. The higher the value, the more likely the model thinks a given image is a specific breed of dog.

In [None]:
# Make predictions on the validation data (not used to train on)
predictions = model.predict(val_data, verbose=1)
predictions[0] # Predictions of one image

In [None]:
def pred(index):
    '''
    Prints the most confident prediction stored at `predictions[index]`.

    Reads row `index` of the notebook-level `predictions` array and prints
    its highest value, the position of that value, and the entry of
    `unique_breeds` at that position.
    '''
    probabilities = predictions[index]
    max_value_index = probabilities.argmax()
    max_value = probabilities[max_value_index]
    breed_at_that_index = unique_breeds[max_value_index]
    # Report the index that was actually asked for, not always "first image"
    print(f"Confidence Level for image {index} : {max_value}")
    print(f"Index for the Max Value : {max_value_index}")
    print(f"Breed at that Index :  {breed_at_that_index}")

In [None]:
pred(0)

Hmmm, lets see according to the model, it's highly probable that the dog in the first image is a `cairn`, with a confidence of 45.13%.  Now this is variable, because we've shuffled the training data.

Now let's take our `val_data`(which is in batches) and get a list of it, or basically unbatch it.

In [None]:
# Turn prediction probabilities into their respective label (easier to understand)
def get_pred_label(prediction_probabilities):
  """
  Returns the entry of `unique_breeds` at the position of the largest value
  in `prediction_probabilities`.
  """
  best_index = np.argmax(prediction_probabilities)
  return unique_breeds[best_index]

# Get a predicted label based on an array of prediction probabilities
pred_label = get_pred_label(predictions[0])

In [None]:
# Create a function to unbatch.
def unbatch(data):
    '''
    Splits a batched dataset of (image, label) pairs into two lists.

    Returns (images, labels), where each label has been converted to the
    entry of `unique_breeds` at the position of its largest value.
    '''
    pairs = list(data.unbatch().as_numpy_iterator())
    images = [image for image, _ in pairs]
    labels = [unique_breeds[np.argmax(encoded)] for _, encoded in pairs]
    return images, labels

In [None]:
val_images, val_labels = unbatch(val_data)
val_images[0], val_labels[0]

## Let's visualize the predictions.

Now we've got ways to get:
1. Prediction labels
2. Validation labels(truth labels)
3. Validation images

Lets visualize it with a function.

In [None]:
def plot(pred_probs, labels, images, n=1):
    '''
    Shows image n with a title giving the predicted label, its confidence
    in percent and the true label. The title is green when the prediction
    matches the true label and red otherwise.
    '''
    pred_prob = pred_probs[n]
    true_label = labels[n]
    image = images[n]

    # Predicted breed for this sample
    pred_label = get_pred_label(pred_prob)

    # Draw the image without axis ticks
    plt.imshow(image)
    plt.xticks([])
    plt.yticks([])

    # Green title for a correct prediction, red for a wrong one
    color = 'green' if pred_label == true_label else 'red'

    title = '{} - {:2.0f}%\n{}'.format(pred_label,
                                       np.max(pred_prob)*100,
                                       true_label)
    plt.title(title, color=color)

In [None]:
plot(predictions, val_labels, val_images,73)

Now that we've visualized the model's predictions, let's look at the top 10 predictions.

In [None]:
def plot_pred_conf(prediction_probabilities, labels, n=1):
  """
  Plots the top 10 highest prediction confidences for sample n as a bar
  chart. If the true label is among them, its bar is coloured red.

  Args:
    prediction_probabilities: indexable collection of per-sample probability arrays.
    labels: indexable collection of true labels.
    n: index of the sample to plot.
  """
  pred_prob, true_label = prediction_probabilities[n], labels[n]

  # Indexes of the 10 largest confidences, highest first
  top_10_pred_indexes = pred_prob.argsort()[-10:][::-1]
  top_10_pred_values = pred_prob[top_10_pred_indexes]
  top_10_pred_labels = unique_breeds[top_10_pred_indexes]

  # Setup plot
  bar_positions = np.arange(len(top_10_pred_labels))
  top_plot = plt.bar(bar_positions,
                     top_10_pred_values,
                     color="black")
  plt.xticks(bar_positions,
             labels=top_10_pred_labels,
             rotation="vertical")

  # Highlight the true label's bar when it made the top 10
  if np.isin(true_label, top_10_pred_labels):
    top_plot[np.argmax(top_10_pred_labels == true_label)].set_color("red")

In [None]:
plot_pred_conf(prediction_probabilities=predictions,
               labels=val_labels,
               n=20)

In [None]:
# Show a few samples side by side: the image with its predicted/true label
# on the left, the top-10 confidence bar chart on the right
i_multiplier = 0
num_rows = 3
num_cols = 2
num_images = num_rows*num_cols
plt.figure(figsize=(5*2*num_cols, 5*num_rows))
for i in range(num_images):
  sample_index = i + i_multiplier
  image_slot = 2*i + 1

  plt.subplot(num_rows, 2*num_cols, image_slot)
  plot(pred_probs=predictions,
       labels=val_labels,
       images=val_images,
       n=sample_index)

  plt.subplot(num_rows, 2*num_cols, image_slot + 1)
  plot_pred_conf(prediction_probabilities=predictions,
                 labels=val_labels,
                 n=sample_index)
plt.tight_layout(h_pad=1.0)
plt.show()

## Saving a model

In [None]:
from datetime import datetime
def save_model(model, suffix=None):
  """
  Saves a model as a timestamped .h5 file and returns the file path.

  The file name is "<YYYYmmdd-HHMMSS>-<suffix>.h5" (or
  "<YYYYmmdd-HHMMSS>.h5" when no suffix is given). The path is relative,
  so the file is written to the current working directory.

  Args:
    model: object exposing a `save(path)` method.
    suffix: optional text appended to the file name for clarity and reuse.

  Returns:
    The path the model was saved to.
  """
  # %S is seconds; the lowercase %s is a non-portable platform extension
  timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
  name_parts = [timestamp] if suffix is None else [timestamp, str(suffix)]
  model_path = "-".join(name_parts) + ".h5" # save format of model
  print(f"Saving model to: {model_path}...")
  model.save(model_path)
  return model_path

In [None]:
def load_model(model_path):
  """
  Loads a saved Keras model from `model_path` and returns it.

  `hub.KerasLayer` is registered as a custom object for loading.
  """
  print(f"Loading saved model from: {model_path}")
  custom_layers = {"KerasLayer": hub.KerasLayer}
  return tf.keras.models.load_model(model_path, custom_objects=custom_layers)

In [None]:
# Save our model trained on 1000 images
save_model(model, suffix="1000-images-Adam")

# Training a model on the full dataset

In [None]:
# Turn full training data in a data batch
full_data = data_batcher(X, y)

In [None]:
# Instantiate a new model for training on the full dataset
full_model = create_model()

Lets make some callbacks too.

In [None]:
# Create full model callbacks
full_model_early_stopping = tf.keras.callbacks.EarlyStopping(monitor="accuracy",
                                                             patience=3)

Fitting the model on full data.

In [None]:
# Fit the full model to the full training data
full_model.fit(x=full_data,
               epochs=5,
               callbacks=[full_model_early_stopping])

Let's save this model.

In [None]:
save_model(full_model, suffix="all-images-Adam")

In [None]:
# Load test image filenames (since we're using os.listdir(), these already have .jpg)
test = "/kaggle/input/dog-breed-identification/test/"
# The directory variable is `test`; the original referenced an undefined `test_path`
test_filenames = [test + fname for fname in os.listdir(test)]

test_filenames[:10]

In [None]:
# Create test data batch
test_data = data_batcher(test_filenames, test_data=True)

In [None]:
# Make predictions on test data batch using the loaded full model
test_preds = full_model.predict(test_data,
                                             verbose=1)

In [None]:
# Build the submission frame: one probability column per breed
subm_df = pd.DataFrame(test_preds, columns=list(unique_breeds))
# Take the image IDs from the same file list the predictions were made on,
# so every row of probabilities is paired with the right image
test_ids = [os.path.splitext(os.path.basename(path))[0] for path in test_filenames]
subm_df.insert(0, "id", test_ids)
subm_df.head()

In [None]:
# Taking a .csv output for submission
subm_df.to_csv("Submissions.csv",
                 index=False)