# Experiment 8 - Demo example of negative transfer using a MobileNet pre-trained on Dogs vs Cats

In this experiment we train a MobileNet model from scratch to perform classification using the Dogs vs Cats competition dataset from Kaggle (https://www.kaggle.com/c/dogs-vs-cats/data). 

This model is then used to transfer learning to task using a different domain of images, using the Chest X-Ray Dataset (https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia/download).

The models trained within the ImageNet dataset sometimes perform very good even with images in a very different domain, due the large number of features present in the ImageNet dataset. In this experiment we show a more evident case of negative transfer.

In [1]:
%tensorflow_version 1.x

import os
import numpy as np
import tensorflow as tf
import random as rn
import keras
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)

from os import walk
from tqdm import *
from keras.utils import to_categorical
from keras.applications.imagenet_utils import preprocess_input
from keras.applications import mobilenet
from keras.models import Model, Sequential
from keras.layers import Flatten, Dense, GlobalAveragePooling2D, Reshape, Conv2D, Dropout, Activation, MaxPooling2D
from keras import optimizers
from keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

Using TensorFlow backend.


#### Set some variables to allow reproducibility

In [None]:
os.environ["PYTHONHASHSEED"] = "0"
# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.
np.random.seed(42)
# The below is necessary for starting core Python generated random numbers
# in a well-defined state.
rn.seed(12345)
# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see:
# https://www.tensorflow.org/api_docs/python/tf/set_random_seed
tf.set_random_seed(1234)

## Download the Dogs vs Cats competition dataset from kaggle

### Set kaggle username and key

In [None]:
os.environ["KAGGLE_USERNAME"] = "{username}"
os.environ["KAGGLE_KEY"] = "{key}"

### Download using the Kaggle API

https://www.kaggle.com/docs/api

In [None]:
!mkdir dogs_vs_cats
!kaggle competitions download -c dogs-vs-cats -p dogs_vs_cats
!rm dogs_vs_cats/sampleSubmission.csv && rm dogs_vs_cats/test1.zip
!unzip dogs_vs_cats/train.zip -d dogs_vs_cats
!rm dogs_vs_cats/train.zip
!mkdir dogs_vs_cats/dogs && mkdir dogs_vs_cats/cats
!mv dogs_vs_cats/train/dog.* dogs_vs_cats/dogs && mv dogs_vs_cats/train/cat.* dogs_vs_cats/cats && rm -rf dogs_vs_cats/train

## Organize data and split into train, validation and test

In [6]:
dataset_path = 'dogs_vs_cats'
for (dirpath, dirnames, filenames) in walk(dataset_path):
    if (len(dirnames) > 0):
        folders_with_labels =  np.empty([len(dirnames), 3], dtype=object)
        i = 0
        for dir_name in sorted(dirnames, key=str.lower):
            folders_with_labels[i][0] = dir_name
            folders_with_labels[i][1] = i
            folders_with_labels[i][2] = []
            i += 1
print(folders_with_labels)

for path, label, imgs in folders_with_labels:
    for (dirpath, dirnames, filenames) in walk(os.path.join(dataset_path, path)):
        for file_name in filenames:
            imgs.append('{}/{},{}'.format(path, file_name, label))
            
print('-----------------------')
print("Total images per class...")
total = 0
for path, label, imgs in folders_with_labels:
    print(path, ":", len(imgs))
    total += len(imgs)
print('-----------------------')
print('Total of images:', total)

split_validation = 2500
split_test = 2500
train = []
val = []
test = []
for path, label, imgs in folders_with_labels:
    test.extend(imgs[0:split_test])
    val.extend(imgs[split_test:split_test+split_validation])
    train.extend(imgs[split_test+split_validation:len(imgs)])
    
print('-----------------------')
print('Total per set...')
print('x_train:', len(train))
print('x_test:', len(test))
print('x_val:', len(val))

from random import shuffle
shuffle(test)
shuffle(val)
shuffle(train)

print('-----------------------')
with open(os.path.join(dataset_path, 'test.txt'), 'w') as f:  
    f.writelines("%s\n" % item for item in test)
    print('File test.txt created.')
    
with open(os.path.join(dataset_path, 'val.txt'), 'w') as f:  
    f.writelines("%s\n" % item for item in val)
    print('File val.txt created.')
    
with open(os.path.join(dataset_path, 'train.txt'), 'w') as f:  
    f.writelines("%s\n" % item for item in train)
    print('File train.txt created.')

[['cats' 0 list([])]
 ['dogs' 1 list([])]]
-----------------------
Total images per class...
cats : 12500
dogs : 12500
-----------------------
Total of images: 25000
-----------------------
Total per set...
x_train: 15000
x_test: 5000
x_val: 5000
-----------------------
File test.txt created.
File val.txt created.
File train.txt created.


## Set initial parameters, functions and generators

In [None]:
dataset_path = 'dogs_vs_cats'
width = 224
height = 224
input_shape = (width, height, 3)
batch_size = 10
verbose = 1
classes_dict = {
    0: 'cat',
    1: 'dog'
}

Here we define 3 functions:


*   read_training_file() : Read the paths to images and labels for the dataset using each file (train, validation and test).
*   preprocess_image() : Preprocess the images in batch, set the size, the type and put in the TF format (scale pixels between -1 and 1).
*   data_generator() : Returns a generator that reads the images in batch and call the preprocess function, to be used in the training process with the fit_generator function.



In [None]:
def read_training_file(path, separator=' '):
    images = []
    labels = []
    with open(path, 'r') as f:
        for i in tqdm(f.readlines()):
            img_name, cls = i.strip().split(separator)
            cls = int(cls)
            images.append(img_name)
            labels.append(cls)
    return images, labels

def preprocess_image(images, images_path, width, height):
    pp_images = []

    for image in images:
        img = cv2.imread(os.path.join(images_path, image))
        if img.shape[0] != 224 or img.shape[1] != 224:
            img = cv2.resize(img, (width, height), interpolation=cv2.INTER_NEAREST)
        img = np.asarray(img, 'float32')
        img = preprocess_input(img, mode='tf')
        pp_images.append(img)

    return np.asarray(pp_images)

def data_generator(images, labels, batch_size, images_path, width, height):
    batch_start = 0
    while batch_start < len(images):
        batch_end = min(batch_start + batch_size, len(images))
        y_batch = labels[batch_start : batch_end]
        x_batch = preprocess_image(images[batch_start : batch_end],
                                   images_path,
                                   width,
                                   height)
        batch_start += batch_size
        if batch_start >= len(images):
            batch_start = 0
        yield (x_batch, y_batch)

Read the label files (train, val and test) and set two arrays:

1.   paths to images.
2.   labels as a binary class matrix, for use with categorical_crossentropy.



In [9]:
train_path = os.path.join(dataset_path, 'train.txt')
val_path = os.path.join(dataset_path, 'val.txt')
test_path = os.path.join(dataset_path, 'test.txt')

train_images, train_labels = read_training_file(train_path, ',')
val_images, val_labels = read_training_file(val_path, ',')
test_images, test_labels = read_training_file(test_path, ',')

print('Train: {} documents'.format(len(train_images)))
print('Validation: {} documents'.format(len(val_images)))
print('Test: {} documents'.format(len(test_images)))

100%|██████████| 15000/15000 [00:00<00:00, 845375.83it/s]
100%|██████████| 5000/5000 [00:00<00:00, 628982.06it/s]
100%|██████████| 5000/5000 [00:00<00:00, 842703.53it/s]

Train: 15000 documents
Validation: 5000 documents
Test: 5000 documents





Create generators for the training set and validation set.

In [None]:
train_gen = data_generator(train_images, train_labels, batch_size, dataset_path, width, height)
val_gen = data_generator(val_images, val_labels, batch_size, dataset_path, width, height)

num_batches_per_epoch = int(round(len(train_images) / float(batch_size)))
num_batches_per_epoch_val = int(round(len(val_images) / float(batch_size)))

## 1 - Training MobileNet with regularization + augmentation

### Create augmentation generator

Here we use the ImageDataGenerator from Keras to apply different types of augmentation.

In [None]:
image_gen = ImageDataGenerator(zoom_range=0.3, 
                               rotation_range=50,
                               width_shift_range=0.2, 
                               height_shift_range=0.2, 
                               shear_range=0.2, 
                               horizontal_flip=True, 
                               fill_mode='nearest')

In [None]:
def augmentation_generator(in_generator):

    for in_x, in_y in in_generator:
        g_x = image_gen.flow(in_x, in_y,
                             batch_size=in_x.shape[0])
        x, y = next(g_x)

        yield x, y

In [None]:
train_gen = augmentation_generator(train_gen)
val_gen = augmentation_generator(val_gen)

### Define and compile the MobileNet model architecture

Here we define and compile the MobileNet-based model architecture using alpha=0.5 with a GlobalAveragePooling + Conv2D 1x1 for classification.

In [None]:
base_mobilenet = mobilenet.MobileNet(include_top=False, alpha=0.5, weights=None, input_shape=input_shape)

x = base_mobilenet.output
x = GlobalAveragePooling2D()(x)
x = Reshape((1, 1, 512), name='reshape_1')(x)
x = Dropout(0.3, name='dropout')(x)
x = Conv2D(1, (1, 1),
           padding='same', name='conv_preds')(x)
x = Activation('sigmoid', name='act_sigmoid')(x)
x = Reshape((1,), name='reshape_2')(x)

model = Model(inputs=base_mobilenet.input, outputs=x)

In [15]:
# from keras.models import load_model
# model = load_model('8_mobilenet_dogs_cats.h5')





In [None]:
model.compile(optimizer=optimizers.SGD(lr=0.02), loss='binary_crossentropy', metrics=['accuracy'])

### Train the model.

In [None]:
num_epochs = 60

print('Start training...')
history = model.fit_generator(train_gen,
                                  steps_per_epoch=num_batches_per_epoch,
                                  epochs=num_epochs,
                                  verbose=verbose,
                                  validation_data=val_gen,
                                  validation_steps=num_batches_per_epoch_val)
print('Model trained.')

Train acc: 92.29%

Val acc: 91.97%

In [47]:
model.save('mobilenet_dogs_cats_60epochs.h5')
print('Model saved.')

Model saved.


### Plot training metrics 

Plot accuracy and loss for the training and validation sets.

In [None]:
sns.set(font_scale=1.5)
plt.figure(figsize=(10, 7))
plt.title('Accuracy')
plt.ylim((0.0, 1.0))
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.legend(['Train', 'Val'], loc='upper left')

sns.set(font_scale=1.5)
plt.figure(figsize=(10, 7))
plt.title('Loss')
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.ylim((0.0, 3.0))
plt.legend(['Train', 'Val'], loc='upper left')

### Evaluate the trained model with the test set.

In [17]:
num_batches_per_epoch_test = int(round(len(test_images) / float(batch_size)))

print('Evaluating trained model...')
result = model.evaluate_generator(generator=data_generator(test_images,
                                                               test_labels,
                                                               batch_size,
                                                               dataset_path, width, height),
                                      steps=num_batches_per_epoch_test)

print("Finished mobilenet.evaluate_generator")
print(model.metrics_names)
print(result)

Evaluating trained model...
Finished mobilenet.evaluate_generator
['loss', 'acc']
[0.20123412051331252, 0.9197999929189682]


## 2 - Training a MobileNet from scratch to classify the Chest X-Ray Dataset

### Download the Dataset using the Kaggle API

In [None]:
!kaggle datasets download -d paultimothymooney/chest-xray-pneumonia
!unzip chest-xray-pneumonia.zip
!rm chest-xray-pneumonia.zip
!rm -rf chest_xray/__MACOSX/
!rm -rf chest_xray/chest_xray/

### Organize data and split into train, validation and test

In [19]:
dataset_path = 'chest_xray'
for dset in ['train','val','test']:
    print('-----------------------')
    print('Set:', dset)
    set_path = os.path.join(dataset_path, dset)
    print(set_path)

    for (dirpath, dirnames, filenames) in walk(set_path):
        if (len(dirnames) > 0):
            folders_with_labels = np.empty([len(dirnames), 3], dtype=object)
            i = 0
            for dir_name in sorted(dirnames, key=str.lower):
                folders_with_labels[i][0] = dir_name
                folders_with_labels[i][1] = i
                folders_with_labels[i][2] = []
                i += 1
    print(folders_with_labels)

    # Fill image paths in the array from images in the folders
    for path, label, imgs in folders_with_labels:
        for (dirpath, dirnames, filenames) in walk(os.path.join(set_path, path)):
            for file_name in filenames:
                imgs.append('{}/{}/{},{}'.format(dset, path, file_name, label))

    print('')
    print("Total images per class...")
    total = 0
    for path, label, imgs in folders_with_labels:
        print(path, ":", len(imgs))
        total += len(imgs)
    print('')
    print('Total of images:', total)

    all_imgs = []
    for path, label, imgs in folders_with_labels:
        all_imgs.extend(imgs)
    print('images:', len(all_imgs))

    shuffle(all_imgs)

    with open(os.path.join(dataset_path, '{}.txt'.format(dset)), 'w') as f:  
        f.writelines("%s\n" % item for item in all_imgs)

-----------------------
Set: train
chest_xray/train
[['NORMAL' 0 list([])]
 ['PNEUMONIA' 1 list([])]]

Total images per class...
NORMAL : 1341
PNEUMONIA : 3875

Total of images: 5216
images: 5216
-----------------------
Set: val
chest_xray/val
[['NORMAL' 0 list([])]
 ['PNEUMONIA' 1 list([])]]

Total images per class...
NORMAL : 8
PNEUMONIA : 8

Total of images: 16
images: 16
-----------------------
Set: test
chest_xray/test
[['NORMAL' 0 list([])]
 ['PNEUMONIA' 1 list([])]]

Total images per class...
NORMAL : 234
PNEUMONIA : 390

Total of images: 624
images: 624


### Set initial parameters and generators

In [None]:
dataset_path = 'chest_xray'
width = 224
height = 224
input_shape = (width, height, 3)
batch_size = 10
verbose = 1
classes_dict = {
    0: 'NORMAL',
    1: 'PNEUMONIA'
}

In [21]:
train_path = os.path.join(dataset_path, 'train.txt')
val_path = os.path.join(dataset_path, 'val.txt')
test_path = os.path.join(dataset_path, 'test.txt')

train_images, train_labels = read_training_file(train_path, ',')
val_images, val_labels = read_training_file(val_path, ',')
test_images, test_labels = read_training_file(test_path, ',')

print('Train: {} documents'.format(len(train_images)))
print('Validation: {} documents'.format(len(val_images)))
print('Test: {} documents'.format(len(test_images)))

100%|██████████| 5216/5216 [00:00<00:00, 555830.53it/s]
100%|██████████| 16/16 [00:00<00:00, 5398.07it/s]
100%|██████████| 624/624 [00:00<00:00, 251730.85it/s]

Train: 5216 documents
Validation: 16 documents
Test: 624 documents





In [None]:
train_gen = data_generator(train_images, train_labels, batch_size, dataset_path, width, height)
val_gen = data_generator(val_images, val_labels, batch_size, dataset_path, width, height)

num_batches_per_epoch = int(round(len(train_images) / float(batch_size)))
num_batches_per_epoch_val = int(round(len(val_images) / float(batch_size)))

### Define model architecture

In [None]:
base_mobilenet = mobilenet.MobileNet(include_top=False, alpha=0.5, weights=None, input_shape=input_shape)

x = base_mobilenet.output
x = GlobalAveragePooling2D(name='gap_xray')(x)
x = Reshape((1, 1, 512), name='reshape_1_xray')(x)
x = Dropout(0.3, name='dropout_xray')(x)
x = Conv2D(1, (1, 1), padding='same', name='conv_preds_xray')(x)
x = Activation('sigmoid', name='act_sigmoid_xray')(x)
x = Reshape((1,), name='reshape_2_xray')(x)

model = Model(inputs=base_mobilenet.input, outputs=x)

In [None]:
# from keras.models import load_model
# model = load_model('8_mobilenet_xray_from_scratch.h5')

In [None]:
model.compile(optimizer=optimizers.SGD(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])

### Train the model.

In [None]:
num_epochs = 40

print('Start training...')
history = model.fit_generator(train_gen,
                                  steps_per_epoch=num_batches_per_epoch,
                                  epochs=num_epochs,
                                  verbose=verbose,
                                  validation_data=val_gen,
                                  validation_steps=num_batches_per_epoch_val)
print('Model trained.')

Train: 99.98%

Val: 81.25%

In [28]:
# model.save('mobilenet_xray_from_scratch.h5')
# print('Model saved.')

Model saved.


### Evaluate the model.

In [26]:
num_batches_per_epoch_test = int(round(len(test_images) / float(batch_size)))

print('Evaluating trained model...')
result = model.evaluate_generator(generator=data_generator(test_images,
                                                               test_labels,
                                                               batch_size,
                                                               dataset_path, width, height),
                                      steps=num_batches_per_epoch_test)

print("Finished mobilenet.evaluate_generator")
print(model.metrics_names)
print(result)

Evaluating trained model...
Finished mobilenet.evaluate_generator
['loss', 'acc']
[0.9401362669549042, 0.7693548370753566]


## 3 - Training a MobileNet to classify the Chest X-Ray Dataset using the feature extraction from Dogs vs Cats

### Load the MobileNet model trained on Dogs vs Cats

In [None]:
from keras.models import load_model
base_dogs_cats = load_model('8_mobilenet_dogs_cats.h5')

In [None]:
base_dogs_cats.summary()

### Define model architecture

In [None]:
x = base_dogs_cats.get_layer('conv_pw_13_relu').output

x = GlobalAveragePooling2D(name='gap_xray')(x)
x = Reshape((1, 1, 512), name='reshape_1_xray')(x)
x = Dropout(0.3, name='dropout_xray')(x)
x = Conv2D(1, (1, 1), padding='same', name='conv_preds_xray')(x)
x = Activation('sigmoid', name='act_sigmoid_xray')(x)
x = Reshape((1,), name='reshape_2_xray')(x)

model = Model(inputs=base_dogs_cats.input, outputs=x)

In [None]:
# from keras.models import load_model
# model = load_model('8_mobilenet_xray_from_dogs_cats.h5')

### Set trainable and non-trainable layers


In [37]:
print('Not trainable')
for layer in model.layers[:87]:
    print(layer)
    layer.trainable = False
print('')
print('Trainable')
for layer in model.layers[87:]:
    print(layer, layer.name)
    layer.trainable = True


Not trainable
<keras.engine.input_layer.InputLayer object at 0x7fec3af855c0>
<keras.layers.convolutional.ZeroPadding2D object at 0x7fec3af85630>
<keras.layers.convolutional.Conv2D object at 0x7fec3af85828>
<keras.layers.normalization.BatchNormalization object at 0x7fec3af85668>
<keras.layers.advanced_activations.ReLU object at 0x7fec3af85978>
<keras.layers.convolutional.DepthwiseConv2D object at 0x7fec3af85a58>
<keras.layers.normalization.BatchNormalization object at 0x7fec3af85a90>
<keras.layers.advanced_activations.ReLU object at 0x7fec3af85b00>
<keras.layers.convolutional.Conv2D object at 0x7fec3af85dd8>
<keras.layers.normalization.BatchNormalization object at 0x7fec3af85e10>
<keras.layers.advanced_activations.ReLU object at 0x7fec3af85f98>
<keras.layers.convolutional.ZeroPadding2D object at 0x7fec3cebc128>
<keras.layers.convolutional.DepthwiseConv2D object at 0x7fec3cebc160>
<keras.layers.normalization.BatchNormalization object at 0x7fec3cebc1d0>
<keras.layers.advanced_activations.

In [None]:
model.compile(optimizer=optimizers.SGD(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])

### Train the model.

In [None]:
num_epochs = 40

print('Start training...')
history = model.fit_generator(train_gen,
                                  steps_per_epoch=num_batches_per_epoch,
                                  epochs=num_epochs,
                                  verbose=verbose,
                                  validation_data=val_gen,
                                  validation_steps=num_batches_per_epoch_val)
print('Model trained.')

Train: 80.38%

Val: 50.00%

In [49]:
# model.save('mobilenet_xray_from_dogs_cats.h5')
# print('Model saved.')

Model saved.


### Evaluate the model.

In [33]:
num_batches_per_epoch_test = int(round(len(test_images) / float(batch_size)))

print('Evaluating trained model...')
result = model.evaluate_generator(generator=data_generator(test_images,
                                                               test_labels,
                                                               batch_size,
                                                               dataset_path, width, height),
                                      steps=num_batches_per_epoch_test)

print("Finished mobilenet.evaluate_generator")
print(model.metrics_names)
print(result)

Evaluating trained model...
Finished mobilenet.evaluate_generator
['loss', 'acc']
[0.9360114345627446, 0.62580645432876]
