# Dog Breed classification

## Imports

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import keras
import os
import albumentations as A
import PIL


%matplotlib inline

from PIL import Image as PIL_Image
from numpy import asarray
from albumentations.pytorch import ToTensorV2 
from keras.preprocessing.image import ImageDataGenerator, load_img
from IPython.display import Image
from functools import partial
from matplotlib.pyplot import imread
from keras.models import Sequential
from keras.layers import Dense, Conv2D , MaxPool2D , Flatten , Dropout 
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

## Constants

In [None]:
# Number of samples used for the experiments (a subset of the full dataset)
NUM_IMAGES = 3000

# Side length, in pixels, of the square images fed to the model
IMG_SIZE = 224

# Define the batch size, 32 is a good default
BATCH_SIZE = 32

# tf.data autotuning sentinel (not referenced by the cells visible in this notebook)
AUTOTUNE = tf.data.experimental.AUTOTUNE

## Data

In [None]:
# Load the label table (image id and breed columns) and preview its first rows

labels_df = pd.read_csv('../input/dog-breed-identification/labels.csv')
labels_df.head()

In [None]:
# Summary statistics of the label table
labels_df.describe()

In [None]:
# Bar chart of the number of images available for each breed

labels_df['breed'].value_counts().plot.bar(figsize=(10,10))

In [None]:
# Display one training image

Image('../input/dog-breed-identification/train/001cdf01b096e06d78e9e5112d419397.jpg')

## Format data

In [None]:
# Build the full path of every training image from its id in the label table

train_dir = "../input/dog-breed-identification/train/"
filenames = [train_dir + fname + ".jpg" for fname in labels_df['id']]
filenames[22]

In [None]:
# Verify that the number of files on disk matches the number of image paths

n_files_on_disk = len(os.listdir("../input/dog-breed-identification/train/"))
print("ok" if n_files_on_disk == len(filenames) else "not ok")

In [None]:
# Check one image and its label

Image(filenames[25])

In [None]:
# Breed recorded for row 25, the same row the image path above was built from
print(labels_df['breed'][25])

## turning labels into numbers

In [None]:
# Turn the breed column into a NumPy array

labels = np.array(labels_df['breed'])
labels

In [None]:
# Sanity check: there must be exactly one label per image path

print('everything is ok' if len(labels) == len(filenames) else 'ouch')

In [None]:
# Label encoding: collect the distinct breed names and count them

unique_breed = np.unique(labels)
len(unique_breed)

In [None]:
# Turn every label into a boolean array that is True only at the position of
# its breed in unique_breed (a one-hot encoding).
boolean_labels = [label == unique_breed for label in labels]
boolean_labels[:2]

In [None]:
# Turning boolean arrays into integers.
print(labels[0])   # original label
print(np.where(unique_breed==labels[0]))    # index where the label occurs in unique_breed
print(boolean_labels[0].argmax())     # index where the label occurs in the boolean array
print(boolean_labels[0].astype(int))   # there will be a 1 where the sample's label occurs

## creating sets

In [None]:
# Set up X (image file paths) and y (boolean one-hot labels).

X = filenames
y = boolean_labels

In [None]:
# Split the data into training and validation sets.
from sklearn.model_selection import train_test_split

# Only the first NUM_IMAGES samples are used; 20% of them are held out for
# validation, with a fixed random_state so the split is reproducible.

X_train,X_val,y_train,y_val = train_test_split(X[:NUM_IMAGES],
                                                y[:NUM_IMAGES],
                                                test_size=0.2,
                                                random_state=42)
len(X_train),len(X_val),len(y_train),len(y_val)

## Preprocessing 


In [None]:
# Read one image from disk and show the shape of its pixel array

image = imread(filenames[42])
image.shape

In [None]:
# Let's convert them into a tensor
# tf.constant(image)[:2]

In [None]:
# def transfrom_image(image_path):    
#     transform = A.Compose(
#         [A.CLAHE(),
#          A.RandomRotate90(),
#          A.Transpose(),
#          A.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.50,
#                             rotate_limit=45, p=.75),
#          A.Blur(blur_limit=3),
#          A.OpticalDistortion(),
#          A.GridDistortion(),
#          A.HueSaturationValue()
#         ]      
#     )
#     image = image_to_array(image_path)
#     augmented_image = transform(image=image)['image']
    
#     return augmented_image

In [None]:
# def image_to_array(image_path):
#     """Return array that represents the images"""
    
#     image = tf.keras.preprocessing.image.load_img(image_path)
#     input_arr = keras.preprocessing.image.img_to_array(image)
#     image = np.array([input_arr])
   
#     return image
    

In [None]:
def process_image(image_path):
    """
    Takes an image file path and turns it into a Tensor.

    Args:
        image_path: path of a JPEG file (presumably a Python string or a
            scalar string Tensor when called through `Dataset.map`).

    Returns:
        A float32 image Tensor of size IMG_SIZE x IMG_SIZE with 3 colour
        channels and values scaled to the 0-1 range.
    """
    # Read in image file
    image = tf.io.read_file(image_path)
    # Turn the jpeg image into numerical Tensor with 3 colour channels (Red, Green, Blue)
    image = tf.image.decode_jpeg(image, channels=3)
    # Convert the colour channel values from 0-255 integers to 0-1 floats.
    # The target dtype must be a float type: converting to tf.int32 would
    # instead stretch the values over the int32 range.
    image = tf.image.convert_image_dtype(image, tf.float32)
    # Resize the image to our desired size (IMG_SIZE, IMG_SIZE)
    image = tf.image.resize(image, size=[IMG_SIZE, IMG_SIZE])

    return image



In [None]:
# Create a simple function to return a tuple (image, label)

def get_image_label(image_path, label):
    """
    Pair a preprocessed image with its label.

    The image stored at `image_path` is loaded and preprocessed with
    `process_image`; `label` is passed through untouched.
    Returns the tuple (image, label).
    """
    return process_image(image_path), label

## create batches

In [None]:
# Create a function to turn data into batches

def create_data_batches(x, y=None, batch_size=BATCH_SIZE, valid_data=False, test_data=False):
    """
    Creates batches of data out of image (x) and label (y) pairs.

    Shuffles the data if it's training data but doesn't shuffle it if it's
    validation data. Also accepts test data as input (no labels).

    Args:
        x: image file paths.
        y: labels matching `x`; required unless `test_data` is True.
        batch_size: number of samples per batch (defaults to BATCH_SIZE).
        valid_data: True to build an unshuffled (image, label) dataset.
        test_data: True to build an image-only dataset (takes precedence
            over `valid_data`).

    Returns:
        A batched `tf.data.Dataset`.

    Raises:
        ValueError: if labels are missing for training/validation data.
    """
    # If the data is a test dataset, we probably don't have labels
    if test_data:
        print("Creating test data batches...")
        data = tf.data.Dataset.from_tensor_slices(x)  # only filepaths
        return data.map(process_image).batch(batch_size)

    if y is None:
        raise ValueError("y (labels) is required for training and validation data")

    # If the data is a validation dataset, we don't need to shuffle it
    if valid_data:
        print("Creating validation data batches...")
        data = tf.data.Dataset.from_tensor_slices((x, y))  # filepaths, labels
        return data.map(get_image_label).batch(batch_size)

    # Otherwise the data is a training dataset, so we shuffle it
    print("Creating training data batches...")
    data = tf.data.Dataset.from_tensor_slices((x, y))  # filepaths, labels

    # Shuffling pathnames and labels before mapping the image processor
    # function is faster than shuffling decoded images
    data = data.shuffle(buffer_size=len(x))

    # Create (image, label) tuples (this also turns each image path into a
    # preprocessed image), then batch using the requested batch size
    return data.map(get_image_label).batch(batch_size)

In [None]:
# Create a function to unbatch a batched dataset
def unbatchify(data):
    """
    Split a batched dataset of (image, label) pairs into two parallel lists.

    Each label is mapped back to its breed name by looking up the position
    of its largest entry in `unique_breed`.
    Returns (images, labels).
    """
    # Materialise the individual (image, label) samples as NumPy values
    pairs = list(data.unbatch().as_numpy_iterator())
    images = [image for image, _ in pairs]
    labels = [unique_breed[np.argmax(one_hot)] for _, one_hot in pairs]
    return images, labels


In [None]:
# Create training (shuffled) and validation (unshuffled) data batches

train_data = create_data_batches(X_train, y_train)
val_data = create_data_batches(X_val, y_val, valid_data=True)

In [None]:
# Inspect the element specification (shapes and dtypes) of both batched datasets

train_data.element_spec, val_data.element_spec

## create model

In [None]:
# Input shape of the model: batch, height, width, colour channels.
# The batch dimension is left as None so the model is not tied to
# BATCH_SIZE-sized batches (e.g. a smaller final batch or single images).
INPUT_SHAPE = [None, IMG_SIZE, IMG_SIZE, 3]

# Output size of the model: one unit per unique label
OUTPUT_SHAPE = len(unique_breed)

# Early stopping: halt training once val_accuracy has not improved for
# 3 consecutive epochs
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy",
                                                  patience=3)
# Upper bound on the number of training epochs
NUM_EPOCHS = 100

# MobileNetV2 pretrained on ImageNet, without its classification top,
# used as a frozen feature extractor
pretrained_model = tf.keras.applications.MobileNetV2(input_shape=INPUT_SHAPE[1:],
                                                     include_top=False,
                                                     weights="imagenet")
pretrained_model.trainable = False

In [None]:
  # Setup the model layers
# Stack a small classification head on top of the pretrained base model
model = tf.keras.Sequential()
model.add(pretrained_model)
# Collapse the spatial feature maps into one feature vector per image
model.add(tf.keras.layers.GlobalAveragePooling2D())
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(OUTPUT_SHAPE, activation="relu"))
# One softmax probability per breed
model.add(tf.keras.layers.Dense(OUTPUT_SHAPE, activation="softmax"))

In [None]:
  # Compile the model
# Configure training: Adam optimiser, categorical cross-entropy loss and
# accuracy as the reported metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=["accuracy"],
)

# Let the model know what kind of inputs it will be getting
model.build(INPUT_SHAPE)



In [None]:
# `model.layers` only contains the top-level layers (the pretrained base plus
# the classification head), so slicing it with [:20] would freeze every layer,
# head included, and leave nothing to train. Freeze the pretrained base only
# and keep every head layer trainable.
for layer in model.layers:
    layer.trainable = layer is not pretrained_model

In [None]:
# Augmenting image generator.
# NOTE(review): the featurewise_* options need `datagen.fit(...)` on sample
# data before use, and `datagen` is reassigned further down before this
# instance is ever used -- confirm whether this cell is still needed.
datagen = ImageDataGenerator(
    # fraction of the data reserved for validation
    validation_split=0.2,
    # random geometric augmentation
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    # dataset-wide normalisation
    featurewise_center=True,
    featurewise_std_normalization=True,
)

In [None]:
from os.path import isfile, join, abspath, exists, isdir, expanduser

# Extracting different classes
dog_breeds = sorted(labels_df['breed'].unique())
n_classes = len(dog_breeds)

# Converting classes to numbers
class_to_num = dict(zip(dog_breeds, range(n_classes)))

image_size = (224, 224, 3)
data_dir = "../input/dog-breed-identification/train"

# Build the arrays that hold the images and the encoded targets, given where
# the images are saved, the dataframe with ids and breeds, and the image size.
image_names = labels_df['id']
image_labels = labels_df['breed']
data_size = len(image_names)

X = np.zeros([data_size, image_size[0], image_size[1], image_size[2]], dtype=np.uint8)
y = np.zeros([data_size, 1], dtype=np.uint8)

for i, (img_name, breed) in enumerate(zip(image_names, image_labels)):
    img_dir = join(data_dir, img_name + '.jpg')
    # load_img takes target_size as (height, width); the channel count is
    # not part of it
    img_pixels = load_img(img_dir, target_size=image_size[:2])
    X[i] = img_pixels
    y[i] = class_to_num[breed]

# One-hot encode the class indices. `to_categorical` is never imported in this
# notebook, so it is reached through the already imported `tf` module;
# num_classes pins the width of the encoding to the number of breeds.
y = tf.keras.utils.to_categorical(y, num_classes=n_classes)

# Shuffle images and labels with the same random permutation
ind = np.random.permutation(data_size)
X = X[ind]
y = y[ind]
print('Ouptut Data Size: ', X.shape)
print('Ouptut Label Size: ', y.shape)
 

In [None]:
# Split the first NUM_IMAGES in-memory samples into training and validation
# sets (20% held out, fixed random_state).

X_train,X_val,y_train,y_val = train_test_split(X[:NUM_IMAGES],
                                                y[:NUM_IMAGES],
                                                test_size=0.2,
                                                random_state=42)
len(X_train),len(X_val),len(y_train),len(y_val)

In [None]:
# The data has already been split with train_test_split, so the generator only
# rescales pixels to 0-1. Combining `validation_split` with `subset=` here
# would split each array a second time, silently dropping 20% of X_train and
# 80% of X_val.
datagen = ImageDataGenerator(rescale=1./255.)


train_generator = datagen.flow(X_train,
                               y_train,
                               batch_size=BATCH_SIZE,
                               shuffle=True,
                               seed=42)

# Validation data is not shuffled so every epoch is scored on the same samples
valid_generator = datagen.flow(X_val,
                               y_val,
                               batch_size=BATCH_SIZE,
                               shuffle=False,
                               seed=42)


# Number of full batches per epoch for each generator
STEP_SIZE_TRAIN = train_generator.n // train_generator.batch_size
STEP_SIZE_VALID = valid_generator.n // valid_generator.batch_size

history = model.fit(train_generator,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=valid_generator,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=NUM_EPOCHS,
                    validation_freq=1,
                    callbacks=[early_stopping])  # Keras expects a list of callbacks


In [None]:
# Plot the training curves: one figure for the losses, one for the accuracies
history_df = pd.DataFrame(history.history)
for columns in (['loss', 'val_loss'], ['accuracy', 'val_accuracy']):
    history_df.loc[:, columns].plot()

## Predictions

In [None]:
# # Make predictions on the validation data (not used to train on)
# predictions = model.predict(X_val, verbose=1) # verbose shows us how long there is to go
# predictions.shape

In [None]:
# # Turn prediction probabilities into their respective label (easier to understand)

# def get_pred_label(prediction_probabilities):
#   """
#   Turns an array of prediction probabilities into a label.
#   """
#   return unique_breed[np.argmax(prediction_probabilities)]


In [None]:
# # Unbatchify the validation data
# val_images, val_labels = unbatchify(val_data)

In [None]:
# def plot_pred(prediction_probabilities, labels, images, n=1):
#     """
#     View the prediction, ground truth label and image for sample n.
#     """
#     pred_prob, true_label, image = prediction_probabilities[n], labels[n], images[n]

#     # Get the pred label
#     pred_label = get_pred_label(pred_prob)

#     # Plot image & remove ticks
#     plt.imshow(image)
#     plt.xticks([])
#     plt.yticks([])

#     # Change the color of the title depending on if the prediction is right or wrong
#     if pred_label == true_label:
#         color = "green"
#     else:
#         color = "red"

#     plt.title("{} {:2.0f}% ({})".format(pred_label,
#                                   np.max(pred_prob)*100,
#                                   true_label),
#                                   color=color)

In [None]:
# plot_pred(predictions, y_val, X_val, n=2
#          )

In [None]:
# # pred_labels=[]
# for i in range(len(val_labels)): 
#     pred_labels.append(get_pred_label(predictions[i]))
# y_test=val_labels
# y_pred=pred_labels

# confusion = confusion_matrix(val_labels, pred_labels)
# print('Confusion Matrix\n')
# print(confusion)


# print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, y_pred)))

# print('Micro Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='micro')))
# print('Micro Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='micro')))
# print('Micro F1-score: {:.2f}\n'.format(f1_score(y_test, y_pred, average='micro')))

# print('Macro Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='macro')))
# print('Macro Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='macro')))
# print('Macro F1-score: {:.2f}\n'.format(f1_score(y_test, y_pred, average='macro')))

# print('Weighted Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='weighted')))
# print('Weighted Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='weighted')))
# print('Weighted F1-score: {:.2f}'.format(f1_score(y_test, y_pred, average='weighted')))



In [None]:
# print('\nClassification Report\n')
# print(classification_report(y_test, y_pred))
