In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# standard imports
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf

# Get a look at labels

In [None]:
labels_csv = pd.read_csv('/kaggle/input/dog-breed-identification/labels.csv')
labels_csv.head()

In [None]:
labels_csv.describe()

In [None]:
labels_csv['breed'].value_counts().plot.bar(figsize=(20,12));

In [None]:
# Median number of images per class.
labels_csv['breed'].value_counts().median()

In [None]:
#viewing any image from the train data.
from IPython.display import Image
Image('/kaggle/input/dog-breed-identification/train/0a0c223352985ec154fd604d7ddceabd.jpg')

# Getting images and their labels

In [None]:
filenames = ['/kaggle/input/dog-breed-identification/train/' + fname + '.jpg' for fname in labels_csv['id']]
filenames[:5]

In [None]:
# Check whether the number of filenames matches the number of actual image files.
if len(os.listdir('/kaggle/input/dog-breed-identification/train/')) == len(filenames):
    print('Number of file matches number of actual images!')
else:
    print('Number of file doesnot matches number of actual images!!')

In [None]:
#visualizing images according to their index.
Image(filenames[900])

In [None]:
#finding the name of the above displayed dog.
labels_csv['breed'][900]

# Turning our data into numbers

In [None]:
labels = labels_csv['breed']
labels = np.array(labels)
labels

In [None]:
# Check whether the number of labels matches the number of filenames.
if len(labels) == len(filenames):
    print('Number of labels matches the number of filenames.')
else:
    print('Number of labels doesnot matches the number of filenames')

In [None]:
#Finding the unique labels values
unique_breed = np.unique(labels) 
unique_breed

In [None]:
#Turn single label into an array of boolean.
print(labels[0])
labels[0] == unique_breed

In [None]:
#Turning every label into an array of boolean
# Compare each sample's breed against every unique breed -> one boolean (one-hot) array per sample
boolean_labels = [breed == unique_breed for breed in labels]
boolean_labels[:2]

In [None]:
# Turning boolean arrays into integers.
print(labels[0])   # original label (breed name) of the first sample
print(np.where(unique_breed==labels[0]))    # index in unique_breed where that label occurs
print(boolean_labels[0].argmax())     # index of the True entry in the sample's boolean array
print(boolean_labels[0].astype(int))   # one-hot vector: 1 where the sample's label occurs, 0 elsewhere

# Creating our own validation set

In [None]:
# setup x and y variables.
X = filenames
y = boolean_labels

In [None]:
# Start with ~1000 images: the full dataset is large, so a small subset keeps the very first attempt fast.

#set number of images to set for the experiment.
NUM_IMAGES = 1000 #@param {type:"slider",min:1000,max:10000}

In [None]:
#let's split our data into train and validation.
from sklearn.model_selection import train_test_split

#spliting into training and validation of total size NUM_IMAGES.

X_train,X_val,y_train,y_val = train_test_split(X[:NUM_IMAGES],
                                                y[:NUM_IMAGES],
                                                test_size=0.2,
                                                random_state=42)
len(X_train),len(X_val),len(y_train),len(y_val)

In [None]:
X_train[:5],y_train[:5]

# Preprocessing Images
Turning images into tensors

Let's write a function to preprocess an image. The function will do the following tasks:

    Take an image filepath as input.
    Use TensorFlow to read the file and save it to a variable.
    Turn our variable (the .jpg image) into a tensor.
    Normalize our image (convert colour channel values from 0-255 to 0-1).
    Resize the image.
    Return the modified variable.


In [None]:
# converting images to numpy array

from matplotlib.pyplot import imread
image = imread(filenames[42])
image.shape

In [None]:
image

In [None]:
# Let's convert the image into a tensor
tf.constant(image)[:2]

In [None]:
# Making a function to preprocess the data

# Define image size
IMG_SIZE = 224

def process_image(image_path):
  """
  Load the JPEG stored at `image_path` and return it as an image Tensor.

  The file is read, decoded with 3 colour channels, converted to tf.float32
  and resized to IMG_SIZE x IMG_SIZE.
  """
  raw_bytes = tf.io.read_file(image_path)
  # Decode the JPEG bytes into a numerical Tensor with Red, Green and Blue channels
  decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
  # Rescale the colour channel values from the 0-255 range to 0-1 floats
  as_float = tf.image.convert_image_dtype(decoded, tf.float32)
  # Resize to the square (IMG_SIZE, IMG_SIZE) shape used throughout the notebook
  return tf.image.resize(as_float, size=[IMG_SIZE, IMG_SIZE])


# Turning our data into batches

Why turn our data into batches?

We are trying to fit 10,000+ images, and they might not all fit into memory at once.

That's why we process 32 images at a time (32 is the batch size). We can change the batch size whenever we need to.

To use TensorFlow effectively, we need to convert our data into tensor tuples that look like (image, label).


In [None]:
# Create a simple function to return a tuple (image, label)
def get_image_label(image_path, label):
  """
  Pair a preprocessed image with its label.

  Runs `process_image` on `image_path` and returns the resulting image
  together with the unchanged `label` as an (image, label) tuple.
  """
  return process_image(image_path), label

In [None]:
# Define the batch size, 32 is a good default
BATCH_SIZE = 32

# Create a function to turn data into batches
def create_data_batches(x, y=None, batch_size=BATCH_SIZE, valid_data=False, test_data=False):
  """
  Creates batches of data out of image (x) and label (y) pairs.
  Shuffles the data if it's training data but doesn't shuffle it if it's validation data.
  Also accepts test data as input (no labels).

  Args:
    x: sequence of image file paths.
    y: sequence of labels aligned with `x`; not needed when `test_data=True`.
    batch_size: number of samples per batch (defaults to BATCH_SIZE).
    valid_data: if True, build an unshuffled validation dataset.
    test_data: if True, build an unshuffled, label-free dataset; checked
      before `valid_data`, so it wins when both are set.

  Returns:
    A batched tf.data.Dataset yielding images (test data) or
    (image, label) tuples (training / validation data).

  Raises:
    ValueError: if training or validation batches are requested without labels.
  """
  # If the data is a test dataset, we probably don't have labels
  if test_data:
    print("Creating test data batches...")
    data = tf.data.Dataset.from_tensor_slices(tf.constant(x)) # only filepaths
    # Honour the caller's batch_size instead of the global BATCH_SIZE
    return data.map(process_image).batch(batch_size)

  # Training and validation data both need labels; fail early with a clear message
  if y is None:
    raise ValueError("Labels (y) are required for training and validation data.")

  if valid_data:
    print("Creating validation data batches...")
  else:
    print("Creating training data batches...")

  # Turn filepaths and labels into Tensors
  data = tf.data.Dataset.from_tensor_slices((tf.constant(x), # filepaths
                                             tf.constant(y))) # labels

  # Only training data is shuffled. Shuffling pathnames and labels before mapping
  # the image processor function is faster than shuffling decoded images.
  if not valid_data:
    data = data.shuffle(buffer_size=len(x))

  # Create (image, label) tuples (this also turns the image path into a
  # preprocessed image), then group them into batches of the requested size
  return data.map(get_image_label).batch(batch_size)

In [None]:
# Create training and validation data batches
train_data = create_data_batches(X_train, y_train)
val_data = create_data_batches(X_val, y_val, valid_data=True)

In [None]:
# Check out the different attributes of our data batches
train_data.element_spec, val_data.element_spec

# Creating and training a model.

Now that our data is ready, let's build a model.

Before we build a model, there are a few things we need to define:

    The input shape (images, in the form of Tensors) to our model.
    The output shape (image labels, in the form of Tensors) of our model.



In [None]:
# Setting up input shape to the model
INPUT_SHAPE = [BATCH_SIZE, IMG_SIZE, IMG_SIZE, 3] # batch, height, width, colour channels

# Setting up output shape of the model
OUTPUT_SHAPE = len(unique_breed) # number of unique labels

In [None]:
INPUT_SHAPE

In [None]:
from tensorflow import keras
# Pretrained MobileNetV2 feature extractor (ImageNet weights, no classification top).
# The input size is derived from IMG_SIZE so it always matches what process_image produces.
# NOTE(review): keras.applications MobileNetV2 is documented to expect inputs scaled to
# [-1, 1], while process_image yields values in [0, 1] — confirm whether a rescale is needed.
pretrained_base = keras.applications.MobileNetV2(
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    include_top=False, weights='imagenet'
)

# Freeze the backbone so only layers added on top of it are trained
pretrained_base.trainable = False

In [None]:
from tensorflow.keras import layers

# Backbone + classification head.
# Without a head the model outputs the backbone's feature maps, not one probability per
# breed, which the categorical cross-entropy loss and the prediction cells below require.
# NOTE: the head is randomly initialised; predictions are only meaningful after the
# (currently commented-out) model.fit cell has been run.
model = keras.Sequential([
    pretrained_base,
    # Collapse each spatial feature map to a single value per channel
    layers.GlobalAveragePooling2D(),
    # One softmax probability per unique breed (OUTPUT_SHAPE == len(unique_breed))
    layers.Dense(OUTPUT_SHAPE, activation='softmax'),
])

In [None]:
# Compile the model
model.compile(
      loss=tf.keras.losses.CategoricalCrossentropy(), # Our model wants to reduce this (how wrong its guesses are)
      optimizer=tf.keras.optimizers.Adam(), # A friend telling our model how to improve its guesses
      metrics=["accuracy"] # We'd like this to go up
  )

In [None]:
# Check details of model
model.summary()

In [None]:
# early_stopping = tf.keras.callbacks.EarlyStopping(
#     patience=5,
#     min_delta=0.01,
#     restore_best_weights=True,
#     )

In [None]:
#history =  model.fit(
#             x=train_data,
#             epochs=10,
#             validation_data=val_data,
#             validation_freq=1, # check validation metrics every epoch
#             verbose=1,
#             callbacks=[early_stopping])

In [None]:
#history_df = pd.DataFrame(history.history)
#history_df.loc[:, ['loss', 'val_loss']].plot();

In [None]:
#history_df.loc[:, ['accuracy', 'val_accuracy']].plot();

In [None]:
predictions = model.predict(val_data, verbose=1) # verbose shows us how long there is to go
predictions

In [None]:
# Indices of the five highest-probability classes for the first validation sample,
# ordered from most to least likely (no "+1": class indices are zero-based).
top_5_classes_index = np.argsort(predictions[0])[::-1][:5]

print(top_5_classes_index)

# Class indices refer to positions in unique_breed (the axis used to build the one-hot
# labels), not to rows of the per-sample `labels` array.
top_5_classes = unique_breed[top_5_classes_index]
print(top_5_classes)

In [None]:
labels.shape

In [None]:
# First prediction
# print(predictions[0])
# print(f"Max value (probability of prediction): {np.max(predictions[0])}") # the max probability value predicted by the model
# print(f"Sum: {np.sum(predictions[0])}") # because we used softmax activation in our model, this will be close to 1
# print(f"Max index: {np.argmax(predictions[0])}") # the index of where the max value in predictions[0] occurs
# print(f"Predicted label: {unique_breed[np.argmax(predictions[0])]}") # the predicted label

In [None]:
# Turn prediction probabilities into their respective label (easier to understand)
def get_pred_label(prediction_probabilities, class_names=None):
  """
  Turns an array of prediction probabilities into a label.

  Args:
    prediction_probabilities: 1-D array-like with one score per class.
    class_names: sequence of class names indexed by class position.
      Defaults to the notebook-level `unique_breed` array, which is the axis
      the one-hot training labels were built against.

  Returns:
    The class name at the position of the highest score.
  """
  if class_names is None:
    # The per-sample `labels` array is NOT indexed by class; unique_breed is.
    class_names = unique_breed
  return class_names[np.argmax(prediction_probabilities)]

# Get a predicted label based on an array of prediction probabilities
pred_label = get_pred_label(predictions[0])
pred_label

In [None]:
# Create a function to unbatch a batched dataset
def unbatchify(data):
  """
  Split a batched dataset of (image, label) Tensors into two parallel lists.

  Returns a list of image arrays and a list of breed names, where each breed
  name is looked up in `unique_breed` at the argmax position of the sample's label.
  """
  # Walk the dataset one sample at a time, as NumPy values
  samples = [
      (picture, unique_breed[np.argmax(one_hot)])
      for picture, one_hot in data.unbatch().as_numpy_iterator()
  ]
  image_list = [picture for picture, _ in samples]
  breed_names = [breed for _, breed in samples]
  return image_list, breed_names

# Unbatchify the validation data
val_images, val_labels = unbatchify(val_data)
val_images[0], val_labels[0]

In [None]:
def plot_pred(prediction_probabilities, labels, images, n=1):
  """
  Show sample n's image titled with its predicted label, the top probability
  as a percentage, and the ground truth label in parentheses.

  The title is green when the prediction matches the truth and red otherwise.
  """
  sample_probs = prediction_probabilities[n]
  actual_breed = labels[n]
  picture = images[n]

  predicted_breed = get_pred_label(sample_probs)

  # Draw the image without axis ticks
  plt.imshow(picture)
  plt.xticks([])
  plt.yticks([])

  # Colour-code the title by correctness
  title_colour = "green" if predicted_breed == actual_breed else "red"
  caption = "{} {:2.0f}% ({})".format(predicted_breed,
                                      np.max(sample_probs)*100,
                                      actual_breed)
  plt.title(caption, color=title_colour)

In [None]:
# View an example prediction, original image and truth label
plot_pred(prediction_probabilities=predictions,
          labels=val_labels,
          images=val_images)

In [None]:
plot_pred(prediction_probabilities=predictions,
          labels=val_labels,
          images=val_images, n=2)

In [None]:
plot_pred(prediction_probabilities=predictions,
          labels=val_labels,
          images=val_images, n=3)

In [None]:
plot_pred(prediction_probabilities=predictions,
          labels=val_labels,
          images=val_images, n=42)

In [None]:
# Class indices map to unique_breed (the one-hot axis), not to the per-sample `labels` array
print(f"Predicted label: {unique_breed[np.argmax(predictions[0])]}") # the predicted label

In [None]:
predictions.shape

In [None]:
# Predicted label for every row of `predictions`; sized from the data rather than a
# hard-coded 200 so it stays correct when NUM_IMAGES (and the validation size) changes.
pred_labels = [get_pred_label(pred) for pred in predictions]

In [None]:
# Ground truth must come from the validation set the predictions were made on.
# The first rows of the full `labels` array are different samples, because
# train_test_split shuffles before splitting.
true_labels = list(val_labels)

In [None]:
from sklearn.metrics import classification_report

print(classification_report(true_labels, pred_labels))

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(true_labels, pred_labels)

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(true_labels, pred_labels)

In [None]:
# Predicted label for every validation sample; sized from `predictions` rather than a
# hard-coded 200 so the lengths always match val_labels.
pred_labels = [get_pred_label(pred) for pred in predictions]
y_test=val_labels
y_pred=pred_labels
#importing confusion matrix
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix\n')
print(confusion)

#importing accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, y_pred)))

# Report precision / recall / F1 under each averaging strategy,
# with a blank line between the groups.
for position, average in enumerate(('micro', 'macro', 'weighted')):
    if position:
        print()
    heading = average.capitalize()
    print('{} Precision: {:.2f}'.format(heading, precision_score(y_test, y_pred, average=average)))
    print('{} Recall: {:.2f}'.format(heading, recall_score(y_test, y_pred, average=average)))
    print('{} F1-score: {:.2f}'.format(heading, f1_score(y_test, y_pred, average=average)))

from sklearn.metrics import classification_report
print('\nClassification Report\n')
print(classification_report(y_test, y_pred))