# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Load-Libraries" data-toc-modified-id="Load-Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load Libraries</a></div><div class="lev1 toc-item"><a href="#Load-data/Create-data-Generators" data-toc-modified-id="Load-data/Create-data-Generators-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load data/Create data Generators</a></div><div class="lev1 toc-item"><a href="#AUC-callback-function" data-toc-modified-id="AUC-callback-function-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>AUC callback function</a></div><div class="lev1 toc-item"><a href="#Load-the-model-&amp;-weights" data-toc-modified-id="Load-the-model-&amp;-weights-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load the model &amp; weights</a></div><div class="lev1 toc-item"><a href="#Training" data-toc-modified-id="Training-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Training</a></div><div class="lev1 toc-item"><a href="#Prediction" data-toc-modified-id="Prediction-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Prediction</a></div>

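This notebook continues training the model with class weights applied to the loss. After every epoch it saves the weights and reports AUC, log-loss and accuracy on the validation set.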

# Load Libraries

In [1]:
import keras
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import image
from keras.models import Sequential, load_model, Model
from keras.layers import Activation, Dropout, Flatten, Dense, GlobalAveragePooling2D
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.applications.vgg16 import VGG16

from keras_tqdm import TQDMNotebookCallback

from datetime import datetime
import os

import numpy as np
import pandas as pd
import math

# Show at most 40 rows when a DataFrame is displayed.
pd.set_option("display.max_rows", 40)

Using TensorFlow backend.


# Load data/Create data Generators

In [2]:
# Generator created with no arguments (no augmentation options are passed).
# It is reused below to read the validation, training and test images.
# NOTE(review): no rescaling/preprocessing function is configured here, so
# images reach the network with whatever pixel range the loader yields - confirm
# this matches how the loaded weights were trained.
validgen = ImageDataGenerator()

In [3]:
# Network input resolution (width, height).
# Candidate sizes: 600/450 _ 500/375 _ 400/300 _ 300/225
img_width, img_height = 600, 450

# Root folders of the three dataset splits.
train_data_dir, validation_data_dir, test_data_dir = (
    "data/train",
    "data/valid",
    "data/test",
)

# Mini-batch sizes.
batch_size_train, batch_size_val = 16, 32

In [4]:
# Pull the whole validation split into memory as one (images, labels) batch:
# batch_size is set to the number of validation images and shuffling is off,
# so a single draw from the iterator returns everything in directory order.
val_data = next(validgen.flow_from_directory(
    validation_data_dir,
    target_size=(img_height, img_width),
    batch_size=568,
    class_mode="binary",
    shuffle=False,
))

Found 568 images belonging to 2 classes.


In [5]:
# Same single-batch trick for the training split (unaugmented, unshuffled):
# batch_size is set to the number of training images.
# NOTE(review): train_data is not referenced by any other cell visible in
# this notebook - confirm it is still needed, since it holds every training
# image in memory.
train_data = next(validgen.flow_from_directory(
    train_data_dir,
    target_size=(img_height, img_width),
    batch_size=1727,
    class_mode="binary",
    shuffle=False,
))

Found 1727 images belonging to 2 classes.


In [6]:
# Augmenting generator used for training: random rotations (range 20),
# horizontal and vertical shifts (range 0.2) and random horizontal flips.
datagen = ImageDataGenerator(rotation_range=20,
                             width_shift_range=0.2,
                             height_shift_range=0.2,
                             horizontal_flip=True)

In [7]:
# Shuffled, augmented stream of training batches.
train_gen = datagen.flow_from_directory(
    train_data_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size_train,
    class_mode="binary",
    shuffle=True,
)

# Number of training images the generator found on disk.
train_samples = len(train_gen.filenames)

Found 1727 images belonging to 2 classes.


# AUC callback function

In [8]:
from sklearn.metrics import roc_auc_score

In [9]:
from sklearn.metrics import accuracy_score

In [10]:
from sklearn.metrics import log_loss

In [11]:
class auc_callback(keras.callbacks.Callback):
    """Save the weights and report hold-out metrics at the end of every epoch.

    After each epoch the callback:

    1. writes the current weights to
       ``vgg-class-weights-epoch-<init_epoch + epoch>.hdf5`` in the working
       directory;
    2. predicts on the held-out data and prints ROC AUC, log-loss and
       accuracy (threshold 0.5);
    3. stores the three values in the epoch ``logs`` dict under ``val_auc``,
       ``val_loss`` and ``val_acc`` (existing keys are not overwritten), so
       the training history and any callback listed after this one can read
       them.

    All other hooks are inherited unchanged from ``keras.callbacks.Callback``.

    Parameters
    ----------
    val_data : sequence
        ``val_data[0]`` holds the inputs passed to ``model.predict`` and
        ``val_data[1]`` the matching binary labels.
    init_epoch : int
        Offset added to the epoch index Keras passes to ``on_epoch_end`` when
        naming checkpoint files, so a resumed run does not overwrite the
        files written by an earlier run.
    """

    def __init__(self, val_data, init_epoch):
        # Let the base class set up its own attributes first.
        super().__init__()

        self.val_x = val_data[0]
        self.val_y = val_data[1]
        self.init_epoch = init_epoch

    def on_epoch_end(self, epoch, logs=None):
        self.model.save_weights('vgg-class-weights-epoch-' + str(self.init_epoch + epoch) + '.hdf5')

        val_pred = self.model.predict(self.val_x, batch_size=32, verbose=0)

        val_roc = roc_auc_score(self.val_y, val_pred[:, 0])
        # log_loss gets one column per class: [P(class 0), P(class 1)].
        val_loss = log_loss(self.val_y,
                            np.concatenate((1 - val_pred, val_pred), axis=1))
        val_acc = accuracy_score(self.val_y, val_pred >= 0.5)

        print('\nVal AUC: ' + str(val_roc))
        print('\nVal Los: ' + str(val_loss))
        print('\nVal Acc: ' + str(val_acc) + '\n')

        # Expose the metrics to Keras; setdefault keeps any value Keras itself
        # already computed under the same key.
        if logs is not None:
            logs.setdefault('val_auc', val_roc)
            logs.setdefault('val_loss', val_loss)
            logs.setdefault('val_acc', val_acc)

# Load the model & weights

In [12]:
# Convolutional base: VGG16 with ImageNet weights, without its dense top.
vgg16 = VGG16(include_top=False, weights='imagenet')

# Classification head attached to the 'block5_conv3' output:
# global average pooling -> Dense(256, relu) -> Dropout(0.5) -> Dense(1, sigmoid).
x = vgg16.get_layer('block5_conv3').output
for head_layer in (GlobalAveragePooling2D(),
                   Dense(256, activation='relu'),
                   Dropout(0.5),
                   Dense(1, activation='sigmoid')):
    x = head_layer(x)

model_final = Model(inputs=vgg16.input, outputs=x)
model_final.compile(
    optimizer=SGD(lr=0.0001, momentum=0.9, decay=1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [20]:
# Start from weights saved earlier on disk.
# NOTE(review): presumably produced by a previous training iteration
# ("iter-6", "epoch-05" in the file name) - confirm the provenance.
model_final.load_weights('./weights/weights-iter-6-epoch-05.hdf5')

In [21]:
# Predictions of the freshly loaded model on the in-memory validation images;
# the next cells score them to establish a baseline before further training.
val_pred = model_final.predict(val_data[0], batch_size=32)

In [22]:
# Validation log-loss; the two columns passed are [1 - p, p] per image.
log_loss(val_data[1], np.concatenate((1 - val_pred, val_pred), axis=1))

0.054385487479521967

In [23]:
# Validation accuracy with predictions thresholded at 0.5.
accuracy_score(val_data[1], val_pred >= 0.5)

0.98415492957746475

In [24]:
# Validation ROC AUC computed from the first (only) output column.
roc_auc_score(val_data[1], val_pred[:,0])

0.99718010109071553

# Training

In [25]:
# Re-compile for training: SGD with lr 1e-4, momentum 0.9, decay 1e-5 and
# Nesterov momentum switched on.
model_final.compile(
    optimizer=SGD(lr=0.0001, momentum=0.9, decay=1e-5, nesterov=True),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [26]:
# First training run. Validation is handled by auc_callback (which also saves
# a checkpoint per epoch, numbered from 0), so validation_data stays None.
# steps_per_epoch is derived from the number of images the generator actually
# found instead of a hard-coded count.
# NOTE(review): 1090 and 637 add up to 1727 and look like per-class image
# counts - confirm which class each count belongs to, so that the weighting
# favours the intended class.
model_final.fit_generator(
    generator=train_gen,
    epochs=10,
    steps_per_epoch=math.ceil(train_samples / batch_size_train),
    validation_data=None,
    verbose=2,
    callbacks=[auc_callback(val_data, 0), TQDMNotebookCallback()],
    class_weight={0: 1090/1727, 1: 637/1727},
)

Epoch 1/10

Val AUC: 0.997073689811

Val Los: 0.0747811062345

Val Acc: 0.978873239437

567s - loss: 0.0084 - acc: 0.9931


Epoch 2/10

Val AUC: 0.998230912477

Val Los: 0.0558442376276

Val Acc: 0.982394366197

565s - loss: 0.0064 - acc: 0.9942


Epoch 3/10

Val AUC: 0.997605746209

Val Los: 0.0569440585768

Val Acc: 0.985915492958

565s - loss: 0.0042 - acc: 0.9959


Epoch 4/10

Val AUC: 0.996900771482

Val Los: 0.0673487726995

Val Acc: 0.984154929577

565s - loss: 0.0073 - acc: 0.9931


Epoch 5/10

Val AUC: 0.996209098164

Val Los: 0.0905393810899

Val Acc: 0.980633802817

565s - loss: 0.0054 - acc: 0.9965


Epoch 6/10


KeyboardInterrupt: 

In [27]:
# Roll back to the checkpoint written after the second epoch of the run above
# (checkpoint numbering is zero-based), before validation loss rose again.
model_final.load_weights('./vgg-class-weights-epoch-1.hdf5')

In [28]:
# Confirm the restored checkpoint reproduces the expected validation log-loss.
val_pred = model_final.predict(val_data[0], batch_size=32)
log_loss(val_data[1], np.concatenate((1 - val_pred, val_pred), axis=1))

0.055844237627639806

In [29]:
# Re-compile with a learning rate of 1e-5 for the next training run;
# momentum 0.9, decay 1e-5 and Nesterov momentum are also set.
model_final.compile(
    optimizer=SGD(lr=0.00001, momentum=0.9, decay=1e-5, nesterov=True),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Second training run at the lower learning rate. Checkpoint numbering starts
# at 5 so the files written by the first run (epochs 0-4) are not overwritten.
# steps_per_epoch is derived from the number of images the generator found.
model_final.fit_generator(
    generator=train_gen,
    epochs=10,
    steps_per_epoch=math.ceil(train_samples / batch_size_train),
    validation_data=None,
    verbose=2,
    callbacks=[auc_callback(val_data, 5), TQDMNotebookCallback()],
    class_weight={0: 1090/1727, 1: 637/1727},
)




Epoch 1/10

Val AUC: 0.998057994147

Val Los: 0.0521872286755

Val Acc: 0.984154929577

566s - loss: 0.0059 - acc: 0.9965


Epoch 2/10


KeyboardInterrupt: 

In [13]:
# Resume from the checkpoint numbered 6.
# NOTE(review): the saved output of the second run shows only one completed
# epoch (checkpoint 5) before the interrupt - confirm which run produced this
# file.
model_final.load_weights('./vgg-class-weights-epoch-6.hdf5')

In [14]:
# Validation log-loss of the restored checkpoint.
val_pred = model_final.predict(val_data[0], batch_size=32)
log_loss(val_data[1], np.concatenate((1 - val_pred, val_pred), axis=1))

0.056707300425681326

In [15]:
# Validation accuracy of the restored checkpoint (threshold 0.5).
accuracy_score(val_data[1], val_pred >= 0.5)

0.98063380281690138

In [16]:
# Validation ROC AUC of the restored checkpoint.
roc_auc_score(val_data[1], val_pred[:,0])

0.99811119978717744

In [17]:
# Compile again before resuming training: SGD with lr 1e-5, momentum 0.9,
# decay 1e-5 and Nesterov momentum.
model_final.compile(
    optimizer=SGD(lr=0.00001, momentum=0.9, decay=1e-5, nesterov=True),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [18]:
# Third training run, continuing from the checkpoint loaded above; new
# checkpoints are numbered from 7 onwards.
# steps_per_epoch is derived from the number of images the generator found
# instead of a hard-coded count.
model_final.fit_generator(
    generator=train_gen,
    epochs=10,
    steps_per_epoch=math.ceil(train_samples / batch_size_train),
    validation_data=None,
    verbose=2,
    callbacks=[auc_callback(val_data, 7), TQDMNotebookCallback()],
    class_weight={0: 1090/1727, 1: 637/1727},
)

Epoch 1/10

Val AUC: 0.998310720936

Val Los: 0.0665429110042

Val Acc: 0.982394366197

594s - loss: 0.0041 - acc: 0.9965


Epoch 2/10

Val AUC: 0.998151104017

Val Los: 0.0651814477338

Val Acc: 0.982394366197

565s - loss: 0.0042 - acc: 0.9965


Epoch 3/10


KeyboardInterrupt: 

# Prediction

In [19]:
# Use checkpoint 5 for the submission and re-check its validation log-loss
# before predicting on the test set.
model_final.load_weights('./vgg-class-weights-epoch-5.hdf5')
val_pred = model_final.predict(val_data[0], batch_size=32)
log_loss(val_data[1], np.concatenate((1 - val_pred, val_pred), axis=1))

0.052187228675489271

In [20]:
batch_size_test = 32

# Iterator over the test images with shuffling off, so the prediction order
# matches test_gen.filenames.
test_gen = validgen.flow_from_directory(
    test_data_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size_test,
    class_mode="binary",
    shuffle=False,
)

# Number of test images discovered on disk.
test_samples = len(test_gen.filenames)

Found 1531 images belonging to 1 classes.


In [21]:
# Predict over the test iterator; the step count is rounded up so the final
# partial batch is included.
preds = model_final.predict_generator(test_gen, math.ceil(test_samples / batch_size_test))

In [22]:
from IPython.display import FileLink

# Image ids are the numeric file stems, e.g. "unknown/12.jpg" -> 12.
# os.path is used instead of string replacement so the parsing does not depend
# on the sub-folder name, the path separator or the file extension.
preds_filenames = [
    int(os.path.splitext(os.path.basename(path))[0])
    for path in test_gen.filenames
]

# One row per test image, ordered and indexed by image id; the CSV therefore
# has the columns "name" and "invasive".
df_result = (
    pd.DataFrame({'name': preds_filenames, 'invasive': preds[:, 0]})
    .sort_values("name")
    .set_index("name")
)
df_result.to_csv("submission_10.csv", encoding="utf8", index=True)

# Render a download link for the submission file as the cell output.
FileLink('submission_10.csv')

In [None]:
# Got 0.99179 on LB