In [24]:
import tensorflow as tf
import pandas as pd
import numpy as np

import gc
import os

from skimage.transform import resize
from matplotlib import pyplot as plt
from tqdm import tqdm
%matplotlib inline

# Side length, in pixels, of the square images fed to the network;
# every image is resized to IMG_SIZE x IMG_SIZE before training/prediction.
IMG_SIZE = 64
# NOTE(review): the cells visible in this notebook hard-code './data/' and
# never reference DATA_PATH -- confirm whether it is still needed.
DATA_PATH = './data/processed/'

### Data

In [25]:
# File names of the raw training shards; they are read from './data/'.
dataset_files = ['train-1.npy', 'train-2.npy', 'train-3.npy', 'train-4.npy']
# Maps each original label to a consecutive integer class index.
# Filled in by prepare_data() and inverted by predict2csv().
labels_remap = {}

In [26]:
def prepare_data(dataset_files, save=False, delete_source=True):
    """Resize all images to [IMG_SIZE x IMG_SIZE] and encode labels as ints.

    Every file in `dataset_files` is read from './data/' and is expected to
    hold (image, label) records. Labels are mapped to consecutive integer
    indices through the module-level `labels_remap` dict, which is updated
    in place.

    Parameters:
        dataset_files -- iterable of file names located in './data/'
        save          -- if True, write the processed arrays to
                         './data/train_test.npz' and the label mapping to
                         './misc/labels_remap.npy'
        delete_source -- if True (default, the historical behaviour), remove
                         the raw files once processing -- and saving, when
                         requested -- has succeeded.
                         WARNING: with save=False this discards the only
                         on-disk copy of the data.

    Returns:
        images -- np.array() of resized images
        labels -- np.array() of integer image labels
    """
    images = []
    labels = []

    for name in dataset_files:
        print("processing file '{}'".format(name))
        # The records are (image, label) pairs, i.e. an object array, which
        # NumPy >= 1.16.3 refuses to load unless pickling is allowed.
        # NOTE(review): unpickling can execute arbitrary code -- only load
        # files from a trusted source.
        data = np.load('./data/' + name, allow_pickle=True)
        for img, label in tqdm(data):
            images.append(resize(img, (IMG_SIZE, IMG_SIZE), mode='constant'))
            # The next free index is the current size of the mapping, so a
            # repeated call never reuses an index that is already taken.
            labels.append(labels_remap.setdefault(label, len(labels_remap)))

    images = np.array(images)
    labels = np.array(labels)

    if save:
        print('Saving on disk')
        np.savez('./data/train_test', images=images, labels=labels)
        np.save('./misc/labels_remap', labels_remap)

    # Remove the raw files only after everything above succeeded, so that a
    # failed save cannot leave us without any copy of the data.
    if delete_source:
        print('Deleting *.npy files')
        for name in dataset_files:
            os.remove('./data/' + name)

    # Already ndarrays: returning them directly avoids copying several GB.
    return images, labels

In [61]:
def predict2csv(model, x_test, filename='pred.csv', labels_map=None):
    """Predict classes for `x_test` and write them to a CSV file.

    Parameters:
        model      -- object whose `predict(x, verbose=1)` returns a 2-D array
                      of per-class scores (one row per sample)
        x_test     -- input samples passed straight to `model.predict`
        filename   -- path of the CSV file to write
        labels_map -- dict {original label: class index}; defaults to the
                      module-level `labels_remap`

    The file has the columns 'Id' (1-based row number) and 'Category'
    (original label of the highest-scoring class).

    Raises:
        KeyError -- if a predicted class index is absent from the mapping
                    (e.g. the mapping was not restored after a kernel restart)
    """
    if labels_map is None:
        labels_map = labels_remap

    # Invert the mapping: class index -> original label.
    inv_labels = {v: k for k, v in labels_map.items()}
    pred = np.argmax(model.predict(x_test, verbose=1), axis=1)

    categories = [inv_labels[int(label)] for label in pred]

    # Build the frame column-wise so that 'Id' stays an integer column;
    # going through a single np.array would coerce it to the label dtype.
    df = pd.DataFrame(
        {'Id': list(range(1, len(categories) + 1)), 'Category': categories},
        columns=['Id', 'Category'],
    )
    df.to_csv(filename, index=False)

Now we can process all the data and save it to one file (this requires more than 8 GB of RAM).

In [29]:
# WARNING: besides resizing and saving, prepare_data() removes the raw
# train-*.npy files from ./data/, so this cell cannot simply be re-run.
images, labels = prepare_data(dataset_files, save=True)

processing file 'train-1.npy'


100%|██████████| 83246/83246 [01:07<00:00, 1231.40it/s]


processing file 'train-2.npy'


100%|██████████| 83247/83247 [01:07<00:00, 1232.12it/s]


processing file 'train-3.npy'


100%|██████████| 83247/83247 [01:09<00:00, 1190.80it/s]


processing file 'train-4.npy'


100%|██████████| 83247/83247 [01:10<00:00, 1174.66it/s]


Deleting *.npy files
Saving on disk


If the notebook has been reloaded and the processed data is already on disk, load it directly:

In [31]:
# Load the arrays written by prepare_data(..., save=True); the context
# manager closes the underlying .npz file handle once they are in memory.
with np.load('data/train_test.npz') as data:
    images = data['images']
    labels = data['labels']

# Restore the label mapping as well: after a kernel restart `labels_remap`
# is empty and predict2csv() could not translate class indices back.
# The dict was stored with np.save, i.e. as a pickled 0-d object array.
# NOTE(review): allow_pickle=True must only be used on trusted files.
labels_remap = np.load('./misc/labels_remap.npy', allow_pickle=True).item()

### Creating model

In [63]:
import keras

from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D

from sklearn.model_selection import train_test_split

Final data preparation

In [34]:
# Add the trailing channel axis expected by Conv2D (single-channel images).
images = images.reshape(images.shape[0], IMG_SIZE, IMG_SIZE, 1)

# One-hot encode the integer labels. The guard keeps this cell idempotent:
# re-running it must not one-hot encode labels that are already one-hot.
if labels.ndim == 1:
    labels = keras.utils.to_categorical(labels, num_classes=1000)

In [40]:
# Hold out 25% of the samples; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    images,
    labels,
    test_size=0.25,
    random_state=42,
)

In [64]:
# Small VGG-style CNN: three conv stages, each closed by batch normalisation
# and 2x2 max pooling (64 -> 32 -> 16 -> 8 pixels per side), followed by a
# dense classifier head with a 1000-way softmax output.
model = Sequential([
    # Stage 1: two 16-filter convolutions.
    Conv2D(16, kernel_size=(3, 3), activation='relu', padding='same',
           input_shape=(IMG_SIZE, IMG_SIZE, 1)),
    Conv2D(16, kernel_size=(3, 3), activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),

    # Stage 2: two 32-filter convolutions.
    Conv2D(32, kernel_size=(3, 3), activation='relu', padding='same'),
    Conv2D(32, kernel_size=(3, 3), activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),

    # Stage 3: a single 64-filter convolution.
    Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),

    # Classifier head.
    Flatten(),
    Dense(2000, activation='relu'),
    BatchNormalization(),
    Dense(1000, activation='softmax'),
])

model.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.Adam(),
    metrics=['accuracy'],
)

### Training and evaluation

In [65]:
# First training pass: 4 epochs on the training split only.
model.fit(
    x_train,
    y_train,
    batch_size=150,
    epochs=4,
    verbose=1,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fc825503eb8>

In [66]:
# evaluate() returns [loss, accuracy] -- the loss plus the single metric
# the model was compiled with.
score = model.evaluate(x_test, y_test, verbose=1)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Test loss: 0.28862170740878323
Test accuracy: 0.9361898927288671


Now we continue training on the rest of the set (which we call the test set here). The `.fit` method resumes training from the weights of the previous run.

In [None]:
# Second training pass: 3 more epochs on the held-out split, continuing
# from the current weights, so the final model has seen all labelled data.
model.fit(
    x_test,
    y_test,
    batch_size=150,
    epochs=3,
    verbose=1,
)

Epoch 1/3
Epoch 2/3

Finally, we create the predictions CSV file.

In [59]:
# Preprocess the competition test images exactly like the training images:
# resize to IMG_SIZE x IMG_SIZE, then add the trailing channel axis.
# allow_pickle=True is needed on NumPy >= 1.16.3 if the file is stored as an
# object array (presumably the case when image sizes vary -- TODO confirm).
# NOTE(review): only unpickle files from a trusted source.
kg_test = np.array([
    resize(img, (IMG_SIZE, IMG_SIZE), mode='constant')
    for img in tqdm(np.load('./data/test.npy', allow_pickle=True))
])
kg_test = kg_test.reshape(kg_test.shape[0], IMG_SIZE, IMG_SIZE, 1)

predict2csv(model, kg_test)

100%|██████████| 83247/83247 [01:09<00:00, 1204.99it/s]




NameError: name 'ans' is not defined

In [67]:
# Repeats the export from the previous cell, whose recorded output ends in
# a NameError; kg_test is reused, so the images are not resized again.
predict2csv(model, kg_test)

