
**Forked from** https://www.kaggle.com/orangutan/keras-vgg19-starter

**For details**, see https://www.kaggle.com/c/dog-breed-identification


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import keras
from keras.applications.vgg19 import VGG19
from keras.applications.resnet50 import ResNet50
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten,  Conv2D, MaxPooling2D
from keras import backend as K
from keras.preprocessing.image import img_to_array, load_img


import os
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import cv2
import sys
import bcolz
import random
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# List the input directory from Python rather than spawning an external `ls`
# process, so the cell does not depend on a POSIX shell utility being present.
print("\n".join(sorted(os.listdir("../input"))))

# Any results you write to the current directory are saved as output.

First we will read in the CSV files so we can see some more information on the filenames and breeds.

In [None]:
# labels.csv: one row per training image (unpacked later as image id, breed).
df_train = pd.read_csv('../input/dog-breed-identification/labels.csv')
# sample_submission.csv: its 'id' column lists the test images to predict on.
df_test = pd.read_csv('../input/dog-breed-identification/sample_submission.csv')

In [None]:
# Peek at the first ten rows of the training labels.
df_train.head(10)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from glob import glob
from mpl_toolkits.axes_grid1 import ImageGrid

In [None]:
# glob() returns paths in arbitrary, filesystem-dependent order; sorting makes
# positional look-ups such as train_files[100] select the same image on every run.
train_files = sorted(glob('../input/dog-breed-identification/train/*.jpg'))
test_files = sorted(glob('../input/dog-breed-identification/test/*.jpg'))

In [None]:
# Display one training image (the entry at index 100 of train_files) as a visual sanity check.
plt.imshow(plt.imread(train_files[100]))

In [None]:
# One-hot encode the breed column: one indicator column per distinct breed.
targets_series = pd.Series(df_train['breed'])
one_hot = pd.get_dummies(targets_series, sparse = True)
# Plain array view of the encoding; row i is later paired with row i of df_train.
one_hot_labels = np.asarray(one_hot)

In [None]:
# List the contents of the keras-pretrained-models input directory.
!ls ../input/keras-pretrained-models/

Next we will read in all of the images for test and train, using a for loop through the values of the csv files. I have also set an im_size variable, which sets the size the images are resized to (400x400 px below); you should play with this number to see how it affects accuracy.

In [None]:
# Side length in pixels: images are resized to im_size x im_size, and the same
# value is used for the model's input_shape.
im_size = 400

In [None]:
# Label rows for the two partitions, filled by the loading loop below.
y_train = []
y_val = []

# Initially empty (length-0) float32 bcolz containers for the images; each
# appended item has shape (im_size, im_size, 3).
x_train_raw = bcolz.zeros((0,im_size,im_size,3),np.float32)
x_val_raw = bcolz.zeros((0,im_size,im_size,3),np.float32)

In [None]:
# Load every labelled training image, resize it to im_size x im_size and route it
# (together with its one-hot label row) to either the training or the hold-out
# container. The seed makes the random routing identical on every fresh run.
# NOTE(review): in the cells visible in this notebook the hold-out container is
# only inspected for its shape; validation during fitting uses a separate
# train_test_split of x_train_raw.
random.seed(1)
for i, (f, breed) in enumerate(tqdm(df_train.values)):
    # load an image from file, resized to the network input size
    image = load_img('../input/dog-breed-identification/train/{}.jpg'.format(f), target_size=(im_size, im_size))
    # No preprocess_input / rescaling is applied here; the array is stored as loaded.
    image = img_to_array(image)
    # Row i of one_hot_labels corresponds to row i of df_train.
    label = one_hot_labels[i]
    # Exact 80% chance of landing in the training partition. The previous
    # randint(1, 101) < 80 drew from 101 inclusive values and accepted only 79.
    if random.random() < 0.8:
        x_train_raw.append(image)
        y_train.append(label)
    else:
        x_val_raw.append(image)
        y_val.append(label)

In [None]:
# Stack the collected label rows into compact uint8 matrices.
y_train_raw = np.array(y_train, dtype=np.uint8)
y_val_raw = np.array(y_val, dtype=np.uint8)

# The Python lists are no longer needed; drop them and reclaim the memory.
del y_train, y_val
import gc
gc.collect()

We check the shape of the outputs to make sure everything went as expected.

In [None]:
# One shape per line: train images, train labels, hold-out images, hold-out labels.
print(x_train_raw.shape, y_train_raw.shape, x_val_raw.shape, y_val_raw.shape, sep="\n")

In [None]:
def plotImages( images_arr, n_images=4):
    """Draw images on an ``n_images`` x ``n_images`` grid with axis ticks hidden.

    Images are paired with grid cells in order; pairing stops when either the
    images or the cells run out, so surplus images are not drawn and surplus
    cells stay empty.

    Args:
        images_arr: iterable of images in a form ``Axes.imshow`` accepts.
        n_images: number of rows and of columns in the grid.
    """
    fig, grid = plt.subplots(n_images, n_images, figsize=(12,12))
    for picture, cell in zip(images_arr, grid.flatten()):
        cell.imshow(picture)
        # Hide tick marks on both axes so only the picture is visible.
        cell.set_xticks(())
        cell.set_yticks(())
    plt.tight_layout()
# Preview the first 16 training images, scaled by 1/255 for display.
plotImages(x_train_raw[0:16,]/255.)

In [None]:
# Augmentation configuration for the training batches: zoom plus horizontal and
# vertical flips. The shift and shear options are currently disabled (commented
# out below). No rescaling option is set here.
datagen = ImageDataGenerator(
#         width_shift_range=0.2,
#         height_shift_range=0.2,
#         shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        vertical_flip=True,
        fill_mode='nearest')


We can see above that there are 120 different breeds. We can put this in a num_class variable below that can then be used when creating the CNN model.

In [None]:
# Number of classes = width of the one-hot label matrix; used as the size of the softmax layer.
num_class = y_train_raw.shape[1]

In [None]:
# Hold out 10% of the training partition for validation during fitting; random_state fixes the shuffle.
X_train, X_valid, Y_train, Y_valid = train_test_split(x_train_raw, y_train_raw, test_size=0.1, random_state=1)

In [None]:
# x_train_raw is not referenced again below; drop the name and run a collection pass.
del x_train_raw
gc.collect()

In [None]:
# Create the base pre-trained model
base_model = VGG19(weights = 'imagenet', include_top=False, input_shape=(im_size, im_size, 3))
#base_model = ResNet50(weights = 'imagenet', include_top=False, input_shape=(im_size, im_size, 3))
base_model.summary()

In [None]:
# Number of layers in the base model before any are removed.
len(base_model.layers)

In [None]:
# Pop the last `layers_to_remove` entries off the base model's layer list and
# print the summary again; presumably so the new head attaches to an earlier layer.
# NOTE(review): whether popping from `base_model.layers` actually mutates the
# model depends on the Keras version — confirm the layer count really shrinks.
layers_to_remove = 2
if layers_to_remove >0:
    for i in range(0,layers_to_remove):
        base_model.layers.pop()
    base_model.summary()

In [None]:
# Number of trailing base-model layers to leave trainable; with 0 every base layer is frozen.
fine_tuning_layers = 0
layers_to_freeze = len(base_model.layers) - fine_tuning_layers
print(layers_to_freeze)

# Mark the first `layers_to_freeze` base layers as non-trainable.
for layer in base_model.layers[0:layers_to_freeze]:
    layer.trainable = False

In [None]:
# Attach a new classification head to the chosen base-model layer.
x = base_model.layers[layers_to_freeze-1+fine_tuning_layers].output

# Head layers in application order: dropout / pooling / convolution stages,
# then flatten and a final dropout before the classifier.
head_layers = [
    Dropout(0.4),
    MaxPooling2D((2, 2), strides=(2, 2), padding='same'),
    Conv2D(32, (5, 5), padding='same'),
    Dropout(0.4),
    MaxPooling2D((2, 2), strides=(2, 2), padding='same'),
    Dropout(0.4),
    Conv2D(16, (2, 2), padding='same'),
    Flatten(),
    Dropout(0.4),
]
for head_layer in head_layers:
    x = head_layer(x)

# Softmax over the num_class breeds.
predictions = Dense(num_class, activation='softmax')(x)

# The trainable model: base-model input through to the new softmax output.
model = Model(inputs=base_model.input, outputs=predictions)

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Mini-batch size used by the training generator.
batch_size = 16

# Keep only the weights of the epoch with the lowest validation loss.
filepath = "weights.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# Stop training once validation loss has not improved for 7 epochs.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=7, verbose=1)
callbacks_list = [early_stopping, checkpoint]

In [None]:
# Inspect the optimizer's current learning rate.
K.get_value(model.optimizer.lr)

In [None]:
# First training pass: 2 epochs on augmented batches, validated on the held-out
# split. No callbacks are attached, so no checkpoint is written by this cell.
train_batches = datagen.flow(X_train, Y_train, batch_size=batch_size)
steps = X_train.shape[0] // batch_size
model.fit_generator(
    train_batches,
    steps_per_epoch=steps,
    epochs=2,
    verbose=1,
    validation_data=(X_valid, Y_valid),
)

In [None]:
# Second training pass: set the learning rate explicitly, then train 3 more
# epochs with early stopping and best-weights checkpointing enabled.
K.set_value(model.optimizer.lr, 0.001)
train_batches = datagen.flow(X_train, Y_train, batch_size=batch_size)
steps = X_train.shape[0] // batch_size
model.fit_generator(
    train_batches,
    steps_per_epoch=steps,
    epochs=3,
    verbose=1,
    validation_data=(X_valid, Y_valid),
    callbacks=callbacks_list,
)

In [None]:
# Load the test images exactly the way the training images were loaded, so the
# model sees inputs in the same channel order and value range at inference time:
#   * load_img + img_to_array (as in the training loop) instead of cv2.imread,
#     which returns channels in BGR order;
#   * no division by 255, because the training and validation images were fed
#     to the network without rescaling.
# The array is allocated up front. Previously x_test was appended to without
# ever being created, which raises NameError on a fresh kernel.
test_ids = df_test['id'].values
x_test = np.zeros((len(test_ids), im_size, im_size, 3), np.float32)
for row, f in enumerate(tqdm(test_ids)):
    image = load_img('../input/dog-breed-identification/test/{}.jpg'.format(f), target_size=(im_size, im_size))
    x_test[row] = img_to_array(image)
print(x_test.shape)

In [None]:
# Restore the weights saved by the ModelCheckpoint callback (lowest validation loss).
model.load_weights("weights.best.hdf5")

In [None]:
# Class probabilities for every test image (one row per image, one column per class).
preds = model.predict(x_test, verbose=1)

In [None]:
# Column names are those generated by the one-hot encoding earlier, so each
# prediction column is labelled with its breed.
col_names = one_hot.columns.values
sub = pd.DataFrame(preds, columns=col_names)
# Put the image ids from the sample submission in front as the first column.
sub.insert(0, 'id', df_test['id'])
sub.head(5)

In [None]:
# Show the current working directory (relative output paths are written there).
%pwd

In [None]:
# Write the submission to the working directory, without the DataFrame index column.
sub.to_csv("My first submission.csv",index =False)