### Imports and data paths (validation split currently disabled)

In [0]:
import os
import numpy as np
import glob

In [0]:
# Root directory of the input data; passed to data_getter below.
data_path = '../input/'
# Directories with the training / test images and the CSV holding the training targets.
TRAIN = '../input/images_training_rev1/images_training_rev1'
TEST = '../input/images_test_rev1/images_test_rev1'
LABELS = '../input/training_solutions_rev1/training_solutions_rev1.csv'
# Disabled: would move 2000 randomly chosen training images into a `valid/`
# directory to serve as a hold-out set. No validation data exists while this
# stays commented out.
# g = glob.glob(data_path+'train/*.jpg')
# shuf = np.random.permutation(g)
# for i in range(2000):
#     os.rename(shuf[i], data_path+ 'valid/' + shuf[i].split("/")[-1])

## Building VGG model in Keras

In [0]:
from keras.models import Sequential, Model
from keras.layers.core import Flatten, Dense, Dropout, Lambda, Reshape
from keras.layers import Input, MaxoutDense
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.layers import Conv2D, MaxPooling2D, Activation
from keras.optimizers import SGD, RMSprop, Adam
from keras import applications
import matplotlib.pylab as plt
%matplotlib inline

In [0]:
# Build the network: ImageNet-pretrained VGG16 convolutional base followed by a
# small regression head with 37 sigmoid outputs (one per target column).
optimizer = Adam(lr=1e-5)

# Keep the pretrained base under its own name so `model` only ever refers to
# the final network (the original rebound `model` half-way through the cell).
base_model = applications.VGG16(weights='imagenet', include_top=False, input_shape=(212, 212, 3))

# Custom head on top of the convolutional features.
x = base_model.output
x = Flatten()(x)
x = Dropout(0.5)(x)
predictions = Dense(37, activation='sigmoid')(x)

# Final model. `inputs=` / `outputs=` are the Keras 2 keyword names; the
# legacy `input=` / `output=` spellings are deprecated.
model = Model(inputs=base_model.input, outputs=predictions)
# 'accuracy' is kept because a later cell plots it from the training history.
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy', 'mse'])

In [0]:
from random import shuffle
from scipy.misc import imresize  

class data_getter:
    """
    Bundles the image file listings, the training targets and the helpers
    that map an image file name to its target row.

    Note that `path` is only stored on the instance: the image directories
    and the label file are taken from the module-level TRAIN, TEST and
    LABELS constants.
    """
    def __init__(self, path):
        import csv

        self.path = path
        self.train_path = TRAIN
        self.test_path = TEST

        # Bare file names (no directory prefix) in os.listdir order.
        self.training_images_paths = list(os.listdir(self.train_path))
        self.test_images_paths = list(os.listdir(self.test_path))

        # First CSV column is the image id, the remaining columns are the
        # float targets; the header row is skipped.
        with open(LABELS, 'r') as label_file:
            rows = csv.reader(label_file, delimiter=",")
            next(rows)
            self.all_solutions = {
                row[0]: [float(value) for value in row[1:]] for row in rows
            }

    def get_id(self, fname):
        """Return `fname` with every '.jpg' and then every 'data' substring removed."""
        image_id = fname
        for token in (".jpg", "data"):
            image_id = image_id.replace(token, "")
        return image_id

    def find_label(self, val):
        """Return the list of targets stored for image id `val` (KeyError if unknown)."""
        return self.all_solutions[val]
        
# Shared data accessor used by every generator below.
# The argument is only stored as `fetcher.path`; the image directories come
# from the TRAIN / TEST constants, so switching to the sample path shown in the
# commented line would not change which images are listed.
# fetcher = data_getter('data/sample/')
fetcher = data_getter(data_path)
# Show which training directory is in use.
print(fetcher.train_path)

In [0]:
# This import was commented out, which made `ImageDataGenerator()` below fail
# with a NameError on a fresh kernel.
from keras.preprocessing.image import ImageDataGenerator
from skimage.filters import gaussian 
import random 
from skimage.transform import resize

# Default-configured generator; only its `apply_transform` method is used, to
# apply explicitly chosen rotations / flips to single images.
IDG = ImageDataGenerator()
def process_images(paths):
    """
    Load the images at `paths`, centre-crop them and add one augmented copy each.

    For every input image two entries are produced: the plain centre crop and
    a randomly rotated / flipped, lightly blurred copy of that crop.

    Parameters
    ----------
    paths : sequence of str
        Image files readable by `plt.imread`. The crop indices assume
        424x424 inputs with 3 colour channels.

    Returns
    -------
    numpy.ndarray
        Float array of shape (2 * len(paths), 3, 212, 212). Entries
        [0, len(paths)) are the plain crops; entries
        [len(paths), 2 * len(paths)) are the augmented copies in the same
        order. Each entry is the `.T` of the height x width x channel crop,
        i.e. laid out as (channel, width, height). Both halves are on the
        scale produced by `img_as_float` (integer images become [0, 1]).
    """
    # Function-scope import keeps this cell's import block untouched.
    from skimage import img_as_float

    length_paths = len(paths)
    arr = np.zeros(shape=(2 * length_paths, 3, 212, 212))

    for c, path in enumerate(paths):
        # Work in the height x width x channel layout returned by imread:
        # the default ImageDataGenerator and a per-channel blur both need the
        # channel axis to be last.
        img = plt.imread(path)
        img = img[106:106*3, 106:106*3]  # crop 424x424 -> 212x212
        # Put the plain copy on the same value scale as the blurred copy
        # (gaussian converts integer images to floats as well).
        img = img_as_float(img)
        arr[c] = img.T

        # Fresh augmentation parameters for every image. `theta` is a rotation
        # angle in degrees; the previous radian conversion followed by int()
        # collapsed it to -1, 0 or 1.
        transform_parameters = {
            'theta': np.random.uniform(-90.0, 90.0),
            'flip_horizontal': bool(random.getrandbits(1)),
            'flip_vertical': bool(random.getrandbits(1)),
        }
        augmented = IDG.apply_transform(img, transform_parameters)
        # Blur each colour channel on its own so the filter never mixes
        # channels, independent of how gaussian treats 3-D inputs.
        augmented = np.stack(
            [gaussian(augmented[..., k], sigma=0.5) for k in range(augmented.shape[-1])],
            axis=-1,
        )
        arr[c + length_paths] = augmented.T
    return arr

def no_process_images(paths):
    """
    Load the images at `paths`, centre-crop and lightly blur them (no augmentation).

    The previous body was a leftover fragment: it was inconsistently indented
    and used `path`, `c` and `arr` without defining them. The allocation and
    the loop over `paths` are restored here.

    Parameters
    ----------
    paths : sequence of str
        Image files readable by `plt.imread`. The crop indices assume
        424x424 inputs with 3 colour channels.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(paths), 3, 212, 212); each entry is the
        `.T` of the blurred height x width x channel crop, i.e. laid out as
        (channel, width, height).
    """
    arr = np.zeros(shape=(len(paths), 3, 212, 212))
    for c, path in enumerate(paths):
        img = plt.imread(path)
        img = img[106:106*3, 106:106*3]  # crop 424x424 -> 212x212
        # Blur each colour channel separately so the filter never mixes
        # channels. gaussian also converts integer images to floats.
        img = np.stack(
            [gaussian(img[..., k], sigma=0.5) for k in range(img.shape[-1])],
            axis=-1,
        )
        arr[c] = img.T
    return arr

In [0]:
## Show one training image before and after augmentation

im = plt.imread(fetcher.train_path + '/' + fetcher.training_images_paths[3])

# Rotation angle in degrees (the previous radian conversion followed by int()
# collapsed it to -1, 0 or 1) plus random horizontal / vertical flips.
theta = np.random.uniform(-90.0, 90.0)
fh = bool(random.getrandbits(1))
fv = bool(random.getrandbits(1))
transform_parameters = {'theta': theta, 'flip_horizontal': fh, 'flip_vertical': fv}

plt.imshow(im)
plt.title("Before")
plt.show()

# Transform in the height x width x channel layout returned by imread, which
# is what the default ImageDataGenerator operates on, then blur each colour
# channel separately so the filter never mixes channels.
im_aug = IDG.apply_transform(im, transform_parameters)
im_aug = np.stack(
    [gaussian(im_aug[..., k], sigma=0.5) for k in range(im_aug.shape[-1])],
    axis=-1,
)

# The processed image was previously computed but never displayed.
plt.imshow(im_aug)
plt.title("After")
plt.show()


In [0]:
# Create generator that yields (current features X, current labels y)
def BatchGenerator(getter):
    """
    Endlessly yield (X, y) training batches, one source image per step.

    Each step holds the plain and the augmented copy of a single training
    image as returned by `process_images`, so X has shape (2, 212, 212, 3)
    and y has shape (2, 37) with the same target row repeated for both copies.

    Parameters
    ----------
    getter : data_getter
        Supplies `training_images_paths`, `train_path`, `get_id` and
        `find_label`.
    """
    while 1:
        for f in getter.training_images_paths:
            X_train = process_images([getter.train_path + '/' + f])
            # process_images stores each sample as the `.T` of its
            # height x width x channel crop. Swap the axes back to
            # channels-last; a reshape would only reinterpret the buffer and
            # scramble the pixels.
            X_train = np.transpose(X_train, (0, 3, 2, 1))
            id_ = getter.get_id(f)
            y_train = np.array(getter.find_label(id_))
            y_train = np.reshape(y_train, (1, 37))
            # One label row per sample: stack along the batch axis. The
            # previous hstack produced a single row of 74 values.
            y_train = np.repeat(y_train, X_train.shape[0], axis=0)
            yield (X_train, y_train)
            
def NoBatchGenerator(getter):
    """
    Endlessly yield (X, y) training batches without augmentation.

    Each step holds one training image prepared by `no_process_images`:
    X has shape (1, 212, 212, 3) and y has shape (1, 37).

    Parameters
    ----------
    getter : data_getter
        Supplies `training_images_paths`, `train_path`, `get_id` and
        `find_label`.
    """
    while 1:
        for f in getter.training_images_paths:
            X_train = no_process_images([getter.train_path + '/' + f])
            # The helper returns channels-first samples of shape
            # (1, 3, 212, 212). Swap the axes to channels-last; a reshape
            # would only reinterpret the buffer and scramble the pixels.
            X_train = np.transpose(X_train, (0, 3, 2, 1))
            id_ = getter.get_id(f)
            y_train = np.array(getter.find_label(id_))
            y_train = np.reshape(y_train, (1, 37))
            yield (X_train, y_train)
            
def ValBatchGenerator(getter):
    """
    Endlessly yield (X, y) evaluation batches, one un-augmented image per step.

    X has shape (1, 212, 212, 3) and y has shape (1, 37).

    NOTE(review): this iterates over `getter.training_images_paths`, so it
    evaluates on training data. Point it at a held-out listing once one exists.

    Parameters
    ----------
    getter : data_getter
        Supplies `training_images_paths`, `train_path`, `get_id` and
        `find_label`.
    """
    while 1:
        for f in getter.training_images_paths:
            paths = [getter.train_path + '/' + f]
            # process_images returns the plain crops first, followed by their
            # augmented copies. Keep only the plain ones so X has exactly one
            # sample per label row.
            X_train = process_images(paths)[:len(paths)]
            # Samples are stored as the `.T` of the height x width x channel
            # crop; swap the axes back to the channels-last layout.
            X_train = np.transpose(X_train, (0, 3, 2, 1))
            id_ = getter.get_id(f)
            y_train = np.array(getter.find_label(id_))
            y_train = np.reshape(y_train, (1, 37))
            yield (X_train, y_train)

### Train model

In [0]:
from keras.callbacks import Callback
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping

class LossHistory(Callback):
    """
    Callback that records metrics after every training batch.

    Attributes (reset at the start of each training run):
        losses     -- `logs['loss']` per batch
        val_losses -- `logs['val_loss']` per batch; None whenever the key is
                      absent from the batch logs
        acc        -- `logs['acc']` per batch; None whenever the key is absent
    """
    def on_train_begin(self, logs=None):
        self.losses = []
        self.val_losses = []
        self.acc = []

    def on_batch_end(self, batch, logs=None):
        # Avoid a shared mutable default and tolerate `logs` being None.
        logs = logs or {}
        self.losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))
        self.acc.append(logs.get('acc'))
    
# Defined for convenience but not included in the callbacks list passed to
# fit_generator below.
early_stopping = EarlyStopping(monitor='loss', patience=7, verbose=1, mode='auto')
history = LossHistory()

from keras.callbacks import ModelCheckpoint
# Training runs without validation data, so the default monitor ('val_loss')
# never appears in the logs and save_best_only would skip every save.
# Monitor the training loss instead, and make sure the target directory exists.
os.makedirs('tmp', exist_ok=True)
checkpointer = ModelCheckpoint(filepath='tmp/weights.hdf5', monitor='loss', verbose=1, save_best_only=True)

from keras.callbacks import ReduceLROnPlateau
reduceLR = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2, verbose=0, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)

batch_size = 32
# Number of generator steps per epoch.
# NOTE(review): BatchGenerator yields one source image per step, so with this
# value an "epoch" covers only 1/batch_size of the training images. Use
# len(fetcher.training_images_paths) for a full pass per epoch.
steps_to_take = len(fetcher.training_images_paths) // batch_size

#model = load_model('tmp/weights.hdf5')

# `steps_per_epoch` / `epochs` are the Keras 2 names of the legacy
# `samples_per_epoch` / `nb_epoch` arguments.
hist = model.fit_generator(BatchGenerator(fetcher),
                    steps_per_epoch=steps_to_take,
                    epochs=12,
                    verbose=1,
                    callbacks=[history, checkpointer, reduceLR],
                   )

### Plot training/validation loss

In [0]:
# Training accuracy per epoch. Only training metrics exist because the model
# is fitted without validation data.
plt.figure(figsize=(12,8))
# Depending on the Keras version the metric is logged as 'acc' or 'accuracy'.
acc_history = hist.history['acc'] if 'acc' in hist.history else hist.history['accuracy']
# The curve comes from the training run, so it is labelled 'Train' (was 'Test').
plt.plot(hist.epoch, acc_history, label='Train')
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

In [0]:
# Training loss per epoch. Only training metrics exist because the model is
# fitted without validation data.
plt.figure(figsize=(12,8))
# The curve comes from the training run, so it is labelled 'Train' (was 'Test').
plt.plot(hist.epoch, hist.history['loss'], label='Train')
plt.xlabel("Epochs")
# The model is compiled with loss='mean_squared_error', so this is MSE, not RMSE.
plt.ylabel("MSE")
plt.legend()
plt.show()

### Model Predict

In [0]:
# # Load best model weights
# from keras.models import load_model
# model = load_model('tmp/weights.hdf5')

In [0]:
def TestBatchGenerator(getter):
    """
    Endlessly yield test inputs, one un-augmented image per step.

    Each yielded array has shape (1, 212, 212, 3), and images are visited in
    the order of `getter.test_images_paths`.

    Parameters
    ----------
    getter : data_getter
        Supplies `test_images_paths` and `test_path`.
    """
    while 1:
        for f in getter.test_images_paths:
            paths = [getter.test_path + '/' + f]
            # process_images returns the plain crops first, followed by their
            # augmented copies. Keep only the plain ones: reshaping both
            # copies into a single-sample batch raised a ValueError.
            X_test = process_images(paths)[:len(paths)]
            # Samples are stored as the `.T` of the height x width x channel
            # crop; swap the axes back to the channels-last layout.
            X_test = np.transpose(X_test, (0, 3, 2, 1))
            yield (X_test)

# One generator step per test image. `steps` / `max_queue_size` are the
# Keras 2 names of the legacy `val_samples` / `max_q_size` arguments.
predictions = model.predict_generator(TestBatchGenerator(fetcher),
                                      steps=len(fetcher.test_images_paths),
                                      max_queue_size=32,
                                      use_multiprocessing=True,
                                      verbose=1)

In [0]:
# Sanity check of the prediction array; presumably one row per test image and
# one column per model output (37) — confirm against the displayed value.
predictions.shape

In [0]:
# Reuse the header row of the benchmark submission so the column names match.
# The file is opened in a `with` block so the handle is closed (it was
# previously left open).
with open('../input/all_zeros_benchmark/all_zeros_benchmark.csv', 'r') as benchmark:
    header = benchmark.readline()

# Rows are matched to images by position, so the two sequences must line up.
assert len(predictions) == len(fetcher.test_images_paths), "one prediction row per test image expected"

with open('submission_1.csv', 'w') as outfile:
    outfile.write(header)
    for fname, pred in zip(fetcher.test_images_paths, predictions):
        id_ = fetcher.get_id(fname)
        outline = id_ + "," + ",".join([str(x) for x in pred])
        outfile.write(outline + "\n")