In [1]:
import time
import io
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import bson
from skimage.data import imread
from sklearn.preprocessing import LabelEncoder
import cv2
from keras.utils import to_categorical
from keras.models import Model, Sequential
from keras.layers import Conv2D, Dense, Dropout, Flatten, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


### Previous attempts (superseded by the cells under "Current")

In [None]:
'''
Decode bson data from a file to multiple documents as a generator. 
Reads from the file object in chunks and parses bson in chunks, yielding one document at a time.
'''
# `data` is a module-level, one-shot iterator over the documents of the example
# file: once a consumer has iterated it to the end it produces nothing more and
# has to be re-created (a later cell does exactly that before calling data_gen).
# The file handle is opened inline and is never explicitly closed.
data = bson.decode_file_iter(open('input/train_example.bson', 'rb'))

In [None]:
def data_generator(batch_size=4):
    """Yield ``(images, one_hot_labels)`` batches read from the global ``data`` iterator.

    Documents are pulled from the module-level ``data`` iterator one at a time.
    Every picture of a document is decoded with ``imread`` and labelled with the
    document's ``category_id``. A batch is emitted as soon as at least
    ``batch_size`` pictures have been collected; because all pictures of a
    document are kept together, a batch can contain more than ``batch_size``
    pictures.

    The generator stops once ``data`` is exhausted and no pictures are left to
    emit (previously it would have kept yielding empty batches).

    Args:
        batch_size: minimum number of pictures per yielded batch.

    Yields:
        tuple: ``(images, y_label)`` where ``images`` is ``np.asarray`` of the
        decoded pictures and ``y_label`` is ``to_categorical(labels, 36)``.
    """
    # NOTE(review): `imread` is called with a file-like object here, which
    # matches the skimage import at the top of the notebook, not the
    # buffer-based `imread` defined further down -- confirm which one is bound
    # when this cell is run.
    print('in data_generator')

    while True:
        # Fresh accumulators for every batch; reusing the previous batch's
        # (ndarray) objects would break `.append`.
        images = []
        y_label = []
        count = 0
        print('count: ' + str(count))

        for d in data:
            category_id = d['category_id']
            for pic in d['imgs']:
                picture = imread(io.BytesIO(pic['picture']))
                images.append(picture)
                y_label.append(category_id)
                count = count + 1
            if count >= batch_size:
                break

        if not images:
            # `data` is exhausted: end the generator instead of yielding
            # empty batches forever.
            return

        batch_images = np.asarray(images)
        print('images.shape: ' + str(batch_images.shape))
        print(y_label)

        # assumes category_id values are already integer class indices in
        # [0, 36) -- TODO confirm against the dataset
        batch_labels = to_categorical(y_label, 36)

        yield (batch_images, batch_labels)

In [None]:
def data_gen(generator=None):
    """Yield ``(images, one_hot_labels)`` for every picture in a document iterator.

    All documents of ``generator`` are consumed in one pass; each picture is
    decoded with ``imread`` and paired with its document's ``category_id``.
    The collected pictures and their ``encode_labels`` encoding are yielded as
    a single pair. The generator ends when a pass over ``generator`` produces
    no pictures (i.e. the iterator is exhausted).

    Args:
        generator: iterable of decoded bson documents with ``'category_id'``
            and ``'imgs'`` keys. When omitted, the module-level ``data``
            iterator is used (looked up when iteration starts, not when this
            function is defined).

    Yields:
        tuple: ``(np.array(images), encode_labels(np.asarray(categories)))``.
    """
    if generator is None:
        # Fall back to the notebook-level iterator at call time so that a
        # re-created `data` is picked up.
        generator = data

    while True:
        images = []
        category = []

        for d in generator:
            category_id = d['category_id']  # This won't be in Test data
            for pic in d['imgs']:
                category.append(category_id)
                images.append(imread(io.BytesIO(pic['picture'])))

        if not images:
            # Nothing left to read: stop instead of encoding empty batches forever.
            return

        yield np.array(images), encode_labels(np.asarray(category))

In [None]:
# Re-create the document iterator so the read starts from the beginning of the file.
data = bson.decode_file_iter(open('input/train_example.bson', 'rb'))
# data_gen returns a generator object; take its first yielded (images, labels)
# pair rather than unpacking the generator itself.
i, c = next(data_gen(generator=data))

### Current approach

In [2]:
def encode_labels(labels, num_classes=523, encoder=None):
    """One-hot encode category labels.

    Args:
        labels: array-like of category labels.
        num_classes: width of the returned one-hot matrix.
        encoder: optional, already fitted ``LabelEncoder``. Pass one that was
            fitted on the full set of categories so that a given category maps
            to the same column in every call.

    Returns:
        The result of ``to_categorical`` applied to the integer-encoded labels,
        with ``num_classes`` columns.

    Note:
        When ``encoder`` is omitted, a new ``LabelEncoder`` is fitted on
        ``labels`` alone. The integer assigned to a category then depends on
        which other categories are present in that call, so columns are not
        comparable between calls (e.g. between training batches).
    """
    if encoder is None:
        encoder = LabelEncoder()
        encoder.fit(labels)

    encoded_Y = encoder.transform(labels)
    return to_categorical(encoded_Y, num_classes)

In [12]:
def imread(buf):
    """Decode an encoded image held in a bytes-like buffer.

    This definition shadows the ``imread`` imported from ``skimage.data`` at
    the top of the notebook.

    The image is always decoded with ``cv2.IMREAD_COLOR`` so the result has
    three channels even when the encoded image is grayscale; downstream code
    flattens 32x32x3 features and relies on a fixed channel count.

    Args:
        buf: bytes-like object containing the encoded image data.

    Returns:
        The decoded image as returned by ``cv2.imdecode``.

    Raises:
        ValueError: if the buffer cannot be decoded as an image.
    """
    image = cv2.imdecode(np.frombuffer(buf, np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        # cv2.imdecode signals failure by returning None rather than raising.
        raise ValueError('imread: buffer could not be decoded as an image')
    return image

def img2feat(im):
    """Turn a decoded image into a 32x32 float32 feature array.

    The image is resized to 32x32 with area interpolation, converted to
    float32 and divided by 255.
    """
    resized = cv2.resize(im, (32, 32), interpolation=cv2.INTER_AREA)
    return np.float32(resized) / 255

def _make_batch(images, category):
    """Stack per-image features into a 2-D matrix and one-hot encode their labels."""
    X = np.asarray(images)
    X = X.reshape(X.shape[0], -1)  # one flattened feature row per image
    return X, encode_labels(np.asarray(category))


def datagen(batch_size, path='input/train.bson'):
    """Endlessly yield ``(X, y)`` training batches read from a bson file.

    Each document's pictures are decoded with ``imread``, converted with
    ``img2feat`` and labelled with the document's ``category_id``. A batch is
    yielded as soon as at least ``batch_size`` pictures have been collected;
    all pictures of a document stay in the same batch, so a batch can hold
    more than ``batch_size`` rows. Labels are one-hot encoded per batch via
    ``encode_labels``.

    When the end of the file is reached, any remaining pictures are yielded as
    a final (possibly smaller) batch and the file is read again from the
    start, so the generator can serve any number of steps. If a full pass
    over the file produces no pictures at all, the generator ends instead of
    looping forever.

    Args:
        batch_size: minimum number of pictures per batch.
        path: bson file to read; defaults to the training file.

    Yields:
        tuple: ``(X, y)`` where ``X`` has one flattened feature row per picture
        and ``y`` is the ``encode_labels`` encoding of the matching categories.
    """
    while True:
        images = []
        category = []
        produced = False

        # Re-open the file on every pass; the context manager closes it when
        # the pass is finished (or the generator is closed).
        with open(path, 'rb') as bson_file:
            for d in bson.decode_file_iter(bson_file):
                category_id = d['category_id']  # This won't be in Test data

                for pic in d['imgs']:
                    category.append(category_id)
                    images.append(img2feat(imread(pic['picture'])))

                # Check after adding the current document so that no document
                # is skipped when a batch boundary is reached.
                if images and len(images) >= batch_size:
                    produced = True
                    yield _make_batch(images, category)
                    images = []
                    category = []

        if images:
            # Trailing pictures that did not fill a whole batch.
            produced = True
            yield _make_batch(images, category)

        if not produced:
            # Empty file: stop rather than spin without ever yielding.
            return

In [4]:
# Fully connected classifier on flattened image features:
# 3072 inputs (32 * 32 * 3) -> 1024 ReLU units -> 523-way softmax.
model = Sequential([
    Dense(1024, activation='relu', input_dim=3072, name='fc1'),
    Dense(523, activation='softmax', name='classifier'),
])
model.summary()

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
fc1 (Dense)                  (None, 1024)              3146752   
_________________________________________________________________
classifier (Dense)           (None, 523)               536075    
Total params: 3,682,827
Trainable params: 3,682,827
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Train the model from the streaming generator and time the run.
#
# Earlier experiments (kept for reference, not executed):
#   data = bson.decode_file_iter(open('input/train_example.bson', 'rb'))
#   model.fit_generator(generator=datagen(batch_size=64), steps_per_epoch=2000, epochs=1, verbose=2)

# Stop when val_loss has not improved for 2 epochs; keep only the best weights on disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model_checkpoint = ModelCheckpoint(filepath='cdiscount', verbose=1, save_best_only=True)

start = time.time()

# NOTE(review): the validation generator is a second datagen() over the same
# file as the training generator, so val_loss is not measured on held-out data.
model.fit_generator(
    generator=datagen(batch_size=50),
    steps_per_epoch=20,
    validation_data=datagen(batch_size=50),
    validation_steps=2,
    callbacks=[early_stopping, model_checkpoint],
    epochs=10,
    verbose=2,
)
end = time.time()
print("Model took %0.2f seconds to train"%(end - start))

# Follow-up steps not wired up yet (X_test / Y_test are not defined in the cells shown):
#   model = load_model('cdiscount_vgg16')   # re-instantiate the best saved model
#   y_pred = model.predict(X_test, batch_size=64)
#   score = model.evaluate(X_test, Y_test, verbose=1)
#   print(score)


Epoch 1/10
Epoch 00000: val_loss improved from inf to 15.32017, saving model to cdiscount
1s - loss: 14.7408 - acc: 0.0855 - val_loss: 15.3202 - val_acc: 0.0495
Epoch 2/10
Epoch 00001: val_loss did not improve
0s - loss: 14.9925 - acc: 0.0698 - val_loss: 15.7989 - val_acc: 0.0198
Epoch 3/10
Epoch 00002: val_loss did not improve
0s - loss: 14.8172 - acc: 0.0807 - val_loss: 15.6346 - val_acc: 0.0300
Epoch 4/10
Epoch 00003: val_loss improved from 15.32017 to 12.79967, saving model to cdiscount
1s - loss: 14.6895 - acc: 0.0886 - val_loss: 12.7997 - val_acc: 0.2059
Epoch 5/10
Epoch 00004: val_loss improved from 12.79967 to 12.44764, saving model to cdiscount
0s - loss: 14.3049 - acc: 0.1125 - val_loss: 12.4476 - val_acc: 0.2277
Epoch 6/10
Epoch 00005: val_loss did not improve
0s - loss: 14.6863 - acc: 0.0888 - val_loss: 15.7957 - val_acc: 0.0200
Epoch 7/10
Epoch 00006: val_loss did not improve
0s - loss: 14.1557 - acc: 0.1217 - val_loss: 15.7957 - val_acc: 0.0200
Epoch 8/10
Epoch 00007: val