In [1]:
import csv
import gzip
import json
import os
import pickle
import sys
from os.path import join

import numpy as np
import pandas as pd
from keras.applications import VGG16, InceptionV3
from keras.layers import GlobalAveragePooling2D, Dense, Dropout
from keras.models import Model, load_model
from keras.utils.np_utils import to_categorical
from matplotlib import pyplot as plt
from sklearn.metrics import recall_score, precision_score, f1_score

#sys.path.append("../data_preparation/")
from batch_generator import BatchGenerator, BatchSequence

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Data folder

In [2]:
#datadir = os.getcwd()
input_path = os.path.abspath('data/')

# Parse the JSON description of each split. The three names start out as
# empty dicts and are rebound to the parsed content below.
train = {}
test = {}
validation = {}
with open(os.path.join(input_path, 'train.json')) as json_data:
    train = json.load(json_data)
with open(os.path.join(input_path, 'test.json')) as json_data:
    test = json.load(json_data)
with open(os.path.join(input_path, 'validation.json')) as json_data:
    validation = json.load(json_data)

# Report how many image records each split contains.
for split_name, split in (('Train', train), ('Test', test), ('Validation', validation)):
    print('%s No. of images: %d' % (split_name, len(split['images'])))

# JSON TO PANDAS DATAFRAME
# train data: inner join of image records and annotations on imageId
train_img_url = pd.DataFrame(train['images'])
train_ann = pd.DataFrame(train['annotations'])
train = train_img_url.merge(train_ann, on='imageId', how='inner')

# test data: only the image records are read for this split
test = pd.DataFrame(test['images'])

# Validation Data: same join as for the training split
val_img_url = pd.DataFrame(validation['images'])
val_ann = pd.DataFrame(validation['annotations'])
validation = val_img_url.merge(val_ann, on='imageId', how='inner')

# Store the image ids as unsigned 32-bit integers in every frame.
datas = {'Train': train, 'Test': test, 'Validation': validation}
for data in datas.values():
    data['imageId'] = data['imageId'].astype(np.uint32)

In [42]:
# np.array(train.labelId)
#images_path_train = os.path.abspath('data/train/')
# Absolute image directories for the validation and test splits, resolved
# against the current working directory. `images_path_validation` is
# reassigned further down in the "Validation" section.
images_path_validation = os.path.abspath('data/validation/')
images_path_test = os.path.abspath('data/test/')

In [3]:
def multiple_batch_generator(generator_num, **kwargs):
    """Feed a multi-input model from several identical batch generators.

    ``generator_num`` instances of :class:`batch_generator.BatchGenerator`
    are created, each with the same keyword arguments. On every step one
    batch is drawn from each instance; the feature arrays are collected
    into a list (one entry per model input) and the labels are taken from
    the first instance only.

    NOTE(review): this assumes all instances deliver their batches in the
    same order, so that the labels of the first one also describe the
    features of the others -- confirm against BatchGenerator.

    Arguments:
        generator_num {int} -- number of generators to create
        **kwargs -- forwarded unchanged to :class:`batch_generator.BatchGenerator`

    Yields:
        ([ndarray,...,ndarray], ndarray) -- list with the feature batch of
        every generator, followed by the label batch of the first generator
    """
    batchers = [BatchGenerator(**kwargs) for _ in range(generator_num)]

    while True:
        batches = [batcher.next() for batcher in batchers]
        features = [batch[0] for batch in batches]
        labels = batches[0][1]
        yield features, labels

# Testing pretrained models

In [17]:
# Run this on the predictions to fix the label order
def remap(predictions):
    """Reorder prediction columns from string-sorted to numeric label order.

    Column ``i`` of ``predictions`` is taken to belong to the ``i``-th label
    when the label ids ``1..n`` are sorted as strings ("1", "10", "100",
    ..., "2", "20", ...). The returned array has the score of label ``k``
    in column ``k - 1``.

    For 228 columns the permutation is exactly the one that used to be
    hard-coded here; it is now derived from the number of columns, so other
    label counts work as well.

    Arguments:
        predictions {ndarray} -- 2-D array, one row per sample and one
            column per label (in string-sorted label order)

    Returns:
        ndarray -- float64 array of the same shape with columns in numeric
        label order
    """
    n_labels = predictions.shape[1]
    # ordering[i] is the label id whose score sits in input column i.
    ordering = sorted(range(1, n_labels + 1), key=str)
    target_columns = np.asarray(ordering, dtype=int) - 1

    fixed = np.zeros(predictions.shape)
    # One vectorised column scatter instead of a Python loop per cell.
    fixed[:, target_columns] = predictions
    return fixed

# VGG16

In [None]:
# Placeholder labels handed to BatchSequence as `y`; only the model outputs
# are used below. 39706 is presumably the number of test images -- TODO confirm.
y_test = np.zeros((39706,1))
predict_gen = BatchSequence(input_dir=images_path_test, y=y_test, batch_size=128)
# NOTE(review): this model is read from '../models/', the other cells use 'models/'.
modelvgg16 = load_model('../models/VGG16-finetuned-1_epochs.h5')
predictions = modelvgg16.predict_generator(predict_gen, verbose=1)



In [118]:
# Binarize the scores with a single global threshold.
threshold = 0
y_pred = predictions > threshold

# Multiplying the boolean matrix by 1..228 leaves the (1-based) column number
# where a label was predicted and 0 elsewhere.
label_grid = y_pred * range(1, 229)
# NOTE(review): the ids written here are `column number - 1`, whereas the
# later submission cell writes the column number itself -- confirm which
# numbering the submission format expects.
results = pd.Series(
    [" ".join([str(label - 1) for label in row if label != 0]) for row in label_grid],
    name='label_id',
)
# Image ids are simply 1..N in prediction order.
image_ids = pd.Series(range(1, y_pred.shape[0] + 1), name='image_id', dtype=object)
submission = pd.concat([image_ids, results], axis=1)
submission.to_csv("vgg16.csv", index=False, quoting=csv.QUOTE_NONE)

# VGG19

In [17]:
# Placeholder labels handed to BatchSequence as `y`; only the model outputs
# are used below.
y_test = np.zeros((39706,1))
predict_gen = BatchSequence(input_dir=images_path_test, y=y_test, batch_size=128)
modelvgg19 = load_model('models/VGG19-finetuned-4000_steps.h5')
# The loaded model is compiled again before predicting.
# NOTE(review): presumably not required for inference -- confirm.
modelvgg19.compile(optimizer='adam', loss='binary_crossentropy')
predictions = modelvgg19.predict_generator(predict_gen, verbose=1)





In [29]:
# Rebind `predictions` to an array saved earlier on disk. Every later cell
# that reads `predictions` works on whichever assignment ran last.
predictions = np.load('stacked_predictions_15000_test.npy')

In [18]:
# Persist the current `predictions` array under the VGG19 name.
# NOTE(review): in top-to-bottom order `predictions` was just loaded from
# 'stacked_predictions_15000_test.npy', so a Run All would store that array
# here. The execution counts (In [18] here, In [29] for the load) suggest this
# cell originally ran right after the VGG19 prediction cell -- confirm and reorder.
np.save('vgg19_compiled_predictions_test', predictions)

In [32]:
# Column-reordered copy of the predictions.
# NOTE(review): `preds` is not read by any other visible cell.
preds = remap(predictions)

In [4]:
# Sanity check: dimensions of the current predictions array.
predictions.shape

(39706, 228)

In [30]:
def apply_thresholds(predictions, thresholds):
    """Binarize predictions with one threshold per column.

    Entry ``[r, c]`` of the result is 1 when ``predictions[r, c]`` is
    strictly greater than ``thresholds[c]`` and 0 otherwise.

    The result used to be allocated with ``np.empty`` and filled column by
    column, so any column without a matching threshold kept uninitialised
    memory. The comparison is now a single broadcast operation and a length
    mismatch is reported instead.

    Arguments:
        predictions {ndarray} -- 2-D array, one column per label
        thresholds {sequence} -- one threshold per column of ``predictions``

    Returns:
        ndarray -- integer array of 0/1 with the shape of ``predictions``

    Raises:
        ValueError -- if ``predictions`` is not 2-D or the number of
            thresholds differs from the number of columns
    """
    predictions = np.asarray(predictions)
    cutoffs = np.ravel(thresholds)
    if predictions.ndim != 2:
        raise ValueError(
            "predictions must be 2-D, got %d dimension(s)" % predictions.ndim)
    if cutoffs.shape[0] != predictions.shape[1]:
        raise ValueError(
            "expected %d thresholds (one per column), got %d"
            % (predictions.shape[1], cutoffs.shape[0]))
    # Broadcasting compares every row against the per-column cutoffs.
    return (predictions > cutoffs).astype(int)

In [42]:
#predictions = remap(predictions)

def _to_submission(binary_predictions):
    """Convert a 0/1 (or boolean) label matrix into submission columns.

    Every row becomes a space-separated string of the 1-based column
    numbers whose entry is non-zero; image ids are 1..N in row order.

    Returns:
        (Series, DataFrame) -- the 'label_id' series and the frame with the
        columns 'image_id' and 'label_id'
    """
    mask = np.asarray(binary_predictions).astype(bool)
    # Derived from the matrix width instead of a hard-coded 228.
    label_ids = np.arange(1, mask.shape[1] + 1)
    labels = pd.Series(
        [" ".join(str(label) for label in label_ids[row]) for row in mask],
        name='label_id',
    )
    image_ids = pd.Series(range(1, mask.shape[0] + 1), name='image_id', dtype=object)
    return labels, pd.concat([image_ids, labels], axis=1)


# One global threshold for all labels ...
threshold = 0.15
# ... versus per-label thresholds stored on disk.
t = np.load('optimize_thresholds.npy')

y_pred = (predictions > threshold)
y_pred2 = apply_thresholds(predictions, t)

# Both variants go through the same conversion; each frame is sized from
# its own prediction matrix.
results_glb, submission_glb = _to_submission(y_pred)
results_opt, submission_opt = _to_submission(y_pred2)

In [40]:
# Preview only the first rows instead of rendering the whole frame.
submission_opt.head(10)

Unnamed: 0,image_id,label_id
0,1,17 20 62 66 105 214
1,2,17 62 66 105 153
2,3,2 20 44 66 154 171 180 186
3,4,53 66 138 153 164 190 222
4,5,44 62 66 70 133 153 171 184
5,6,17 20 62 66 98 105 153 222
6,7,66 95 171 175 184 189 214
7,8,20 62 66 105 116 222
8,9,66 74 138 144 171 176 193 217
9,10,19 49 61 105 106 222


In [43]:
# Write the global-threshold submission; the per-label-threshold variant is
# kept below as a commented-out alternative.
submission_glb.to_csv("stacked_final_glb_15.csv", index=False, quoting=csv.QUOTE_NONE)
#submission_opt.to_csv("stacked_final_opt_1530.csv", index=False, quoting=csv.QUOTE_NONE)

In [None]:
# Drop the first element of every row of y_train.
# NOTE(review): `y_train` is not defined in any visible cell, and this cell
# has no execution count -- it raises NameError on a fresh kernel.
y_train = np.array([j[1:] for j in y_train])


### Define the multi-input sequence generator

In [19]:
def multiple_sequence_generator(generator_num, **kwargs):
    """Feed a multi-input model from several identical sequence generators.

    ``generator_num`` ``SequenceGenerator`` objects are built with the same
    keyword arguments. Each step draws one batch from every object via
    ``next()``, gathers the feature arrays into a list (one per model input)
    and pairs that list with the labels of the first object.

    NOTE(review): ``SequenceGenerator`` is not imported in the visible
    notebook cells (only ``BatchGenerator`` and ``BatchSequence`` are), so
    calling this function may raise NameError -- confirm where it lives.

    Arguments:
        generator_num {int} -- number of generators to create
        **kwargs -- forwarded unchanged to every ``SequenceGenerator``

    Yields:
        ([ndarray,...,ndarray], ndarray) -- list with the feature batch of
        every generator, followed by the label batch of the first generator
    """
    sources = []
    for _ in range(generator_num):
        sources.append(SequenceGenerator(**kwargs))

    while True:
        drawn = [source.next() for source in sources]
        inputs = [pair[0] for pair in drawn]
        yield inputs, drawn[0][1]

### Create testing predictions

In [8]:
# Load the stacked model and prepare the settings for predicting the test set.
model = load_model('models/stacked-700_steps.h5')
# Placeholder labels; the generator takes a `y` argument.
y_test = np.zeros((39706,1))
batch_size = 64
# NOTE(review): the division is truncated, so 39706 / 64 gives 620 steps,
# i.e. 39680 samples; the remaining 26 test images are not requested.
# Confirm whether the generator covers them in another way.
test_steps = int(len(y_test)/batch_size)
# Number of model inputs, hard-coded; the computed alternative is commented out.
input_num = 4#len(model.input_layers)

In [9]:
# One BatchGenerator per model input, all reading the test images.
test_generator_multi = multiple_batch_generator(generator_num=input_num,
                                               input_dir=images_path_test,
                                               y=y_test,
                                               batch_size=batch_size)

In [10]:
# Draw `test_steps` batches from the multi-input generator and predict them.
predictions = model.predict_generator(test_generator_multi, steps=test_steps, verbose=1)



In [None]:
# Persist the stacked-model test predictions computed in the previous cell.
# That cell stores them in `predictions`; the name `test_predictions` used
# here before is not defined anywhere in the notebook.
np.save('stacked_predictions_test', predictions)

### Create validation predictions

In [None]:
# gzip and pickle are imported in the notebook's import cell.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with gzip.open('data/y_validation.pickle','rb') as fp:
    y_validation = pickle.load(fp)

# Placeholder labels handed to BatchSequence as `y`; only the model outputs
# are used below.
y_valid = np.zeros((9897,1))

#valid_steps = int(len(y_valid)/batch_size)+1
predict_gen = BatchSequence(input_dir=images_path_validation, y=y_valid, batch_size=128)
modelvgg19 = load_model('models/VGG19-finetuned-4000_steps.h5')

vgg19_valid_predictions = modelvgg19.predict_generator(predict_gen, verbose=1)

# Keep the validation predictions on disk for later threshold tuning / stacking.
np.save('vgg19_validation_predictions', vgg19_valid_predictions)


In [21]:
# Sanity check: dimensions of the VGG19 validation predictions.
vgg19_valid_predictions.shape

(9897, 228)

In [None]:
# Save the stacked-model validation predictions.
# NOTE(review): `valid_predictions` is not defined in any visible cell, and
# this cell has no execution count -- it raises NameError on a fresh kernel.
np.save('stacked_predictions_validation', valid_predictions)

#### predict_generator with multiple inputs

In [43]:
# Evaluate on the first 500 validation samples only: 500 / 50 = 10 steps.
batch_size = 50
val_steps = int(len(y_validation[:500])/batch_size)

In [44]:
# One generator is needed per input layer of the loaded model.
input_num = len(model.input_layers)

In [45]:
# One BatchGenerator per model input, restricted to the first 500 validation labels.
val_generator_multi = multiple_batch_generator(generator_num=input_num,
                                               input_dir=images_path_validation,
                                               y=y_validation
                                               [:500],
                                               batch_size=batch_size)

In [38]:
# NOTE(review): '..' is a directory, not a model file -- this looks like an
# unfinished placeholder. `stackedmodel` is not read by any visible cell.
stackedmodel = load_model('..')

# Validation

In [39]:
# Set the paths
input_path = os.path.abspath('../../mlipdata/')

with open('pickles/binarizer.pickle', 'rb') as pickle_file:
    binarizer = pickle.load(pickle_file)

In [40]:
# Validation images live below the new data root; this rebinds the name set
# earlier from 'data/validation/'.
images_path_validation = os.path.join(input_path, 'files/validation/')

In [41]:
# Read the gzip-compressed pickle with the standard library. The earlier
# version went through `file_io.FileIO` and `cPickle`, neither of which is
# imported anywhere in the notebook.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with gzip.open('../../mlipdata/server_validation.pickle', 'rb') as fp:
    y_validation = pickle.load(fp)
# Drop the first element of every row.
# NOTE(review): presumably an id column preceding the label vector -- confirm.
y_validation = np.array([j[1:] for j in y_validation])

In [42]:
validation = {}
with open(os.path.join(input_path, 'validation.json')) as json_data:
    validation = json.load(json_data)

# Inner join of image records and annotations on imageId, with the ids
# stored as unsigned 32-bit integers.
validation_img_url = pd.DataFrame(validation['images'])
validation_ann = pd.DataFrame(validation['annotations'])
validation = validation_img_url.merge(validation_ann, on='imageId', how='inner')
validation['imageId'] = validation['imageId'].astype(np.uint32)

#y_validation = np.array(validation.labelId)
#y_validation_bin = binarizer.transform(y_validation)

# With the two lines above commented out nothing built in this cell is
# kept: all three names are removed again.
del validation_img_url, validation_ann, validation

#### predict_generator with multiple inputs

In [43]:
# Evaluate on the first 500 validation samples only: 500 / 50 = 10 steps.
# (Same statements as the earlier "predict_generator with multiple inputs" cell.)
batch_size = 50
val_steps = int(len(y_validation[:500])/batch_size)

In [44]:
# One generator is needed per input layer of the loaded model.
input_num = len(model.input_layers)

In [45]:
# One BatchGenerator per model input, restricted to the first 500 validation labels.
val_generator_multi = multiple_batch_generator(generator_num=input_num,
                                               input_dir=images_path_validation,
                                               y=y_validation
                                               [:500],
                                               batch_size=batch_size)

In [46]:
# Predict `val_steps` batches of validation data; this rebinds `predictions`.
predictions = model.predict_generator(val_generator_multi, steps=val_steps, verbose=1)



In [47]:
# Sanity check: number of predicted samples.
len(predictions)

500

In [19]:
# Ground truth for the first 500 validation samples and predictions
# binarized at a fixed 0.5 cut-off.
# The recorded output of this cell is a NameError: it needs `y_validation`
# from the loading cell above to have been run in the same kernel session.
y_true = y_validation[:500]
y_pred = (predictions > 0.5).astype(int)

NameError: name 'y_validation' is not defined

In [49]:
# Micro-averaged precision, recall and F1 over all labels, in that order.
pr, rc, f1 = [
    metric(y_true, y_pred, average='micro')
    for metric in (precision_score, recall_score, f1_score)
]

print("Precision: {} Recall: {} F1: {}".format(pr, rc, f1))

Precision: 0.0715523392111 Recall: 0.181536074476 F1: 0.102646585758


In [46]:
# Predict `val_steps` batches of validation data; this rebinds `predictions`.
# (Same statement as the earlier validation prediction cell.)
predictions = model.predict_generator(val_generator_multi, steps=val_steps, verbose=1)



In [48]:
# Ground truth for the first 500 validation samples and predictions
# binarized at a fixed 0.5 cut-off.
y_true = y_validation[:500]
y_pred = (predictions > 0.5).astype(int)

In [49]:
# Micro-averaged scores over all labels, computed by the three sklearn
# metric functions in a fixed order: precision, recall, F1.
scorers = (precision_score, recall_score, f1_score)
pr, rc, f1 = [scorer(y_true, y_pred, average='micro') for scorer in scorers]

print("Precision: {} Recall: {} F1: {}".format(pr, rc, f1))

Precision: 0.0715523392111 Recall: 0.181536074476 F1: 0.102646585758


ValueError: Error when checking model : the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 4 array(s), but instead got the following list of 1 arrays: [array([[[[146, 143, 138],
         [202, 173, 159],
         [185, 147, 124],
         ...,
         [197, 182, 149],
         [198, 187, 151],
         [192, 185, 148]],

        [[147, 144, 137],
 ...