# Dog Breed classification

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plte
%matplotlib inline
import tensorflow as tf
import tensorflow_hub as hub
import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D , MaxPool2D , Flatten , Dropout 

## Data

In [None]:
# Load the training labels (image id -> breed) and preview the first rows

labels_df = pd.read_csv('../input/dog-breed-identification/labels.csv')
labels_df.head()

In [None]:
# Summary statistics for the label table
labels_df.describe()

In [None]:
# Visualize the data: bar chart of the number of rows per breed

labels_df['breed'].value_counts().plot.bar(figsize=(10,10))

In [None]:
# Median number of rows per breed
labels_df['breed'].value_counts().median()

In [None]:
# Display one training image inline
from IPython.display import Image
Image('../input/dog-breed-identification/train/001cdf01b096e06d78e9e5112d419397.jpg')

## Format data

In [None]:
# Build the full path of every training image from its id in the label table

filenames = ["../input/dog-breed-identification/train/" + fname + ".jpg"for fname in labels_df['id']]
# Spot-check one of the generated paths
filenames[22]

In [None]:
# Verify that the train folder holds exactly as many files as we built paths

import os

if len(os.listdir("../input/dog-breed-identification/train/")) != len(filenames):
    print("not ok")
else:
    print("ok")

In [None]:
# Check one image against its label: display the image at index 25

Image(filenames[25])

In [None]:
# Breed recorded at index 25 of the label table
print(labels_df['breed'][25])

## turning data into numbers

In [None]:
# Turn the breed column into a NumPy array

labels = np.array(labels_df['breed'])
labels

In [None]:
# Sanity check: there must be exactly one label per image path

print('everything is ok' if len(labels) == len(filenames) else 'ouch')

In [None]:
# Label encoding: collect the distinct breed names

unique_breed = np.unique(labels)
unique_breed

In [None]:
# Turn a single label into an array of booleans by comparing it with every unique breed.
print(labels[0])
labels[0] == unique_breed

In [None]:
# Turn every label into an array of booleans (one comparison against
# unique_breed per sample). The loop variable is named `label` so it does not
# shadow the `labels` array being iterated.
boolean_labels = [label == unique_breed for label in labels]
boolean_labels[:2]

In [None]:
# Turning boolean arrays into integers.
print(labels[0])   # original label
print(np.where(unique_breed==labels[0]))    # index where the label occurs in unique_breed
print(boolean_labels[0].argmax())     # index where the label occurs in the boolean array
print(boolean_labels[0].astype(int))   # there will be a 1 where the sample label occurs

## creating sets

In [None]:
# Set up the X (image file paths) and y (boolean label arrays) variables.
X = filenames
y = boolean_labels

In [None]:
# Number of samples to work with: just a sample, not the whole data set
NUM_IMAGES = 1000 

In [None]:
# Let's split our data into train and validation sets.
from sklearn.model_selection import train_test_split

# Split the first NUM_IMAGES samples: 20% go to validation (test_size=0.2),
# with a fixed random_state.

X_train,X_val,y_train,y_val = train_test_split(X[:NUM_IMAGES],
                                                y[:NUM_IMAGES],
                                                test_size=0.2,
                                                random_state=42)
# Sizes of the resulting splits
len(X_train),len(X_val),len(y_train),len(y_val)

## Preprocessing


Preprocessing Images

Turning images into tensors

Let's write a function to preprocess the image. The function will do the following tasks.

    The function will take an image filepath as input.
    Use TensorFlow to read the file and store its contents in a variable.
    Decode the JPEG data in that variable into a tensor.
    Normalize the image (convert colour channel values from 0-255 to 0-1).
    Resize the image.
    Return the processed image.



In [None]:
# Read one image with matplotlib and look at the shape of the resulting array
from matplotlib.pyplot import imread
image = imread(filenames[42])
image.shape

In [None]:
# Let's convert the image into a tensor and show its first two entries along the first axis
tf.constant(image)[:2]

In [None]:
# Function for preprocessing
# Side length (in pixels) of the square images produced by process_image
IMG_SIZE = 224

def process_image(image_path):
  """
  Load the JPEG stored at `image_path` and return it as a float Tensor.

  The file is decoded with 3 colour channels (Red, Green, Blue), converted
  to tf.float32 and resized to IMG_SIZE x IMG_SIZE (224 x 224).
  """
  raw_bytes = tf.io.read_file(image_path)
  decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
  # Converting to float32 rescales the 0-255 colour values to the 0-1 range
  scaled = tf.image.convert_image_dtype(decoded, tf.float32)
  return tf.image.resize(scaled, size=[IMG_SIZE, IMG_SIZE])

## Create batches

In [None]:
# Pair a processed image with its label
def get_image_label(image_path, label):
  """
  Process the image stored at `image_path` with process_image and return it
  together with `label` as an (image, label) tuple.
  """
  return process_image(image_path), label

In [None]:
# Define the batch size, 32 is a good default
BATCH_SIZE = 32

# Create a function to turn data into batches
def create_data_batches(x, y=None, batch_size=BATCH_SIZE, valid_data=False, test_data=False):
  """
  Creates batches of data out of image (x) and label (y) pairs.

  Args:
    x: sequence of image file paths.
    y: sequence of labels matching `x`; may be omitted only for test data.
    batch_size: number of samples per batch (defaults to BATCH_SIZE).
    valid_data: if True, build validation batches (not shuffled).
    test_data: if True, build test batches from file paths only (no labels,
      not shuffled). Takes precedence over `valid_data`.

  Returns:
    A batched tf.data.Dataset yielding images for test data, or
    (image, label) pairs otherwise. Training data (neither flag set) is
    shuffled before the images are loaded.

  Raises:
    ValueError: if `y` is None for training or validation data.
  """
  # If the data is a test dataset, we probably don't have labels
  if test_data:
    print("Creating test data batches...")
    data = tf.data.Dataset.from_tensor_slices(tf.constant(x))  # only filepaths
    return data.map(process_image).batch(batch_size)

  if y is None:
    raise ValueError("Labels (y) are required for training and validation data.")

  if valid_data:
    print("Creating validation data batches...")
  else:
    print("Creating training data batches...")

  # Turn filepaths and labels into Tensors
  data = tf.data.Dataset.from_tensor_slices((tf.constant(x),   # filepaths
                                             tf.constant(y)))  # labels

  if not valid_data:
    # Only training data is shuffled. Shuffling pathnames and labels before
    # mapping the image processor function is faster than shuffling images.
    data = data.shuffle(buffer_size=len(x))

  # Create (image, label) tuples (this also turns each image path into a
  # preprocessed image), then batch with the requested batch size.
  return data.map(get_image_label).batch(batch_size)

In [None]:
# Create training (shuffled) and validation (unshuffled) data batches
train_data = create_data_batches(X_train, y_train)
val_data = create_data_batches(X_val, y_val, valid_data=True)

In [None]:
# Check out the element specs of our training and validation data batches
train_data.element_spec, val_data.element_spec

## create model

In [None]:
# Setting up input shape to the model
INPUT_SHAPE = [BATCH_SIZE, IMG_SIZE, IMG_SIZE, 3] # batch, height, width, colour channels

# Setting up output shape of the model
OUTPUT_SHAPE = len(unique_breed) # number of unique labels

# Setting up model URL from TensorFlow Hub
# NOTE(review): MODEL_URL is not referenced by any cell visible here - confirm whether it is still needed
MODEL_URL = "https://tfhub.dev/google/imagenet/mobilenet_v2_130_224/classification/4"



In [None]:
# Display the configured input shape
INPUT_SHAPE

In [None]:
def create_model():
    """
    Build and return an uncompiled Sequential CNN classifier.

    The network stacks three 3x3 "same"-padded ReLU convolutions (32, 32 and
    64 filters), each followed by max pooling, then dropout (0.4), a flatten
    layer, a 128-unit ReLU dense layer and a softmax output with
    OUTPUT_SHAPE units. The input shape is INPUT_SHAPE without its batch
    dimension.
    """
    conv_options = dict(padding="same", activation="relu")
    stack = [
        Conv2D(32, 3, input_shape=INPUT_SHAPE[1:], **conv_options),
        MaxPool2D(),
        Conv2D(32, 3, **conv_options),
        MaxPool2D(),
        Conv2D(64, 3, **conv_options),
        MaxPool2D(),
        Dropout(0.4),
        Flatten(),
        Dense(128, activation="relu"),
        Dense(units=OUTPUT_SHAPE, activation="softmax"),
    ]
    return Sequential(stack)

In [None]:
# `model` only exists inside create_model()/train_model(), so build an
# (untrained) instance here before printing the architecture summary.
model = create_model()
model.summary()

In [None]:
# Create early stopping (once our model stops improving, stop training).
# The monitored quantity is "val_accuracy".
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy",
                                                  patience=3) # stops after 3 rounds of no improvements

In [None]:
# Upper bound on training epochs; early stopping may end training sooner
NUM_EPOCHS = 100

# Build a function to train and return a trained model
def train_model():
  """
  Creates a new model with create_model(), compiles it, trains it on
  train_data and returns the trained model.

  Validation metrics are computed on val_data after every epoch, and the
  early_stopping callback can end training before NUM_EPOCHS is reached.
  """
  # Create a model
  model = create_model()

  # Compile the model
  model.compile(
      loss=tf.keras.losses.CategoricalCrossentropy(), # Our model wants to reduce this (how wrong its guesses are)
      optimizer=tf.keras.optimizers.Adam(), # A friend telling our model how to improve its guesses
      metrics=["accuracy"] # We'd like this to go up
  )

  # Fit the model to the data. Keras documents `callbacks` as a list of
  # callback instances, so the single callback is wrapped in a list.
  model.fit(x=train_data,
            epochs=NUM_EPOCHS,
            validation_data=val_data,
            validation_freq=1, # check validation metrics every epoch
            callbacks=[early_stopping])

  return model

In [None]:
# Build, compile and train a model; keep the trained instance
my_model = train_model()

In [None]:
# train_model() returns only the model, so no `history` variable exists here.
# Keras attaches the History object of the last fit() call to the model,
# so read the per-epoch metrics from there.
history_df = pd.DataFrame(my_model.history.history)
history_df.loc[:, ['loss', 'val_loss']].plot();
history_df.loc[:, ['accuracy', 'val_accuracy']].plot();