In [1]:
from os import listdir
from numpy import array
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential, model_from_json
from keras.utils import to_categorical
from keras.layers.core import Dense, Dropout, Flatten
from keras.optimizers import RMSprop
from keras.layers.convolutional import Conv2D
from keras.callbacks import ModelCheckpoint
from keras.layers import Embedding, TimeDistributed, RepeatVector, LSTM, concatenate , Input, Reshape, Dense
from keras.preprocessing.image import array_to_img, img_to_array, load_img
import numpy as np

Using TensorFlow backend.


In [5]:
# Directory whose non-.png files are read as token text by load_data.
# The trailing slash is required: paths are built by plain string concatenation.
XML_dir_name = '../Spline_Out/XML/'
# Directory whose .png files are loaded as 256x256 images by load_data
# (trailing slash required for the same reason).
images_dir_name = '../Spline_Out/Pics/'

# Read a file and return a string
def load_doc(filename):
    """Return the entire contents of a text file as one string.

    Args:
        filename: Path of the file to read (opened in text mode, 'r').

    Returns:
        The file's full text.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises
    with open(filename, 'r') as file:
        return file.read()

def load_data(vocab_data_dir, images_data_dir):
    """Load all images and all token files from two directories.

    Every entry whose name ends in "png" is read from ``images_data_dir`` and
    converted to a 256x256 array; every other (non-hidden) entry is read from
    ``vocab_data_dir`` as text, wrapped in ``<START>``/``<END>`` markers and
    whitespace-normalised.

    Both directory arguments are joined to file names by plain string
    concatenation, so they must end with a path separator.

    Args:
        vocab_data_dir: Directory containing the token/markup files.
        images_data_dir: Directory containing the .png images.

    Returns:
        A tuple ``(images, text)`` where ``images`` is a float NumPy array
        built from the loaded image arrays and ``text`` is a list of the
        prepared token strings. Both follow sorted file-name order, so
        ``images[i]`` corresponds to ``text[i]`` only if every image has a
        token file that sorts into the same relative position — this is
        assumed, not checked.
    """
    text = []
    images = []
    # Load all the file names and order them
    XML_filenames = listdir(vocab_data_dir)
    images_filenames = listdir(images_data_dir)
    # Filter out hidden entries (e.g. .DS_Store, .ipynb_checkpoints) rather than
    # calling list.remove(), which raises ValueError when an entry is absent
    imagesAndXML = sorted(
        name for name in images_filenames + XML_filenames
        if not name.startswith('.')
    )
    # Echo the merged, sorted listing so the image/token pairing can be checked by eye
    for f in imagesAndXML:
        print(f)
    for filename in imagesAndXML:
        if filename[-3:] == "png":
            # Load the image and convert it to a NumPy array
            image = img_to_array(load_img(images_data_dir+filename, target_size=(256, 256)))
            images.append(image)
        else:
            # Load the token file and wrap it in a start and end tag
            syntax = '<START> ' + load_doc(vocab_data_dir+filename) + ' <END>'
            # Separate all the words with a single space
            syntax = ' '.join(syntax.split())
            # Insert a space before each comma so it is detached from the preceding word
            syntax = syntax.replace(',', ' ,')
            text.append(syntax)
    images = np.array(images, dtype=float)
    return images, text

# Load the image arrays and the <START>/<END>-wrapped token strings.
# NOTE(review): `texts` is not referenced again in the cells shown here — the
# tokenizer below is fitted on files it re-reads itself; confirm that is intended.
train_features, texts = load_data(XML_dir_name, images_dir_name)

KT Anna Skirt-FT.png
KT Anna Skirt-FT.val
KT Anna Skirt-PKT.png
KT Anna Skirt-PKT.val
KT Anna Skirt-WB-X.png
KT Anna Skirt-WB-X.val
KT Anna Skirt-WB.png
KT Anna Skirt-WB.val
KT Jojo Sweatshirt-BK.png
KT Jojo Sweatshirt-BK.val
KT Jojo Sweatshirt-FT.png
KT Jojo Sweatshirt-FT.val
KT Jojo Sweatshirt-NKBND.png
KT Jojo Sweatshirt-NKBND.val
KT Jojo Sweatshirt-PKT.png
KT Jojo Sweatshirt-PKT.val
KT Jojo Sweatshirt-SLV.png
KT Jojo Sweatshirt-SLV.val
KT Olivia Skirt-BK.png
KT Olivia Skirt-BK.val
KT Olivia Skirt-FT.png
KT Olivia Skirt-FT.val
KT Rumi Sweatshirt-Cuff.png
KT Rumi Sweatshirt-Cuff.val
KT Rumi Sweatshirt-NKBDG.png
KT Rumi Sweatshirt-NKBDG.val
KT Rumi Sweatshirt-NKBND.png
KT Rumi Sweatshirt-NKBND.val
KT Rumi Sweatshirt-SLV.png
KT Rumi Sweatshirt-SLV.val
KT Rumi Sweatshirt-WB.png
KT Rumi Sweatshirt-WB.val
KT longsleeve crew neck-FT.png
KT longsleeve crew neck-FT.val
KT longsleeve crew neck-SLV.png
KT longsleeve crew neck-SLV.val


In [14]:
# Nominal cap of 100 tokens per input sequence.
# NOTE(review): max_caption_len is not referenced by any other cell shown here;
# the lengths actually used are max_sequence / max_length — confirm it is still needed.
max_caption_len = 100
# Tokenizer that builds the vocabulary: no characters are filtered out, text is
# split on single spaces, and the original casing is kept
tokenizer = Tokenizer(filters='', split=" ", lower=False)

# Read a document and return a string
def load_doc(filename):
    """Return the entire contents of a text file as one string.

    Note: this re-defines (and shadows) the identical helper declared in an
    earlier cell of this notebook.

    Args:
        filename: Path of the file to read (opened in text mode, 'r').

    Returns:
        The file's full text.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises
    with open(filename, 'r') as file:
        return file.read()

# Load the raw text of every markup file in the XML directory, in sorted file-name order.
# NOTE: X is rebound further down to the array returned by preprocess_data.
X = []
# Filter out hidden entries (e.g. .DS_Store, .ipynb_checkpoints) instead of calling
# list.remove(), which raises ValueError when the entry does not exist
all_filenames = sorted(
    name for name in listdir(XML_dir_name) if not name.startswith('.')
)
for filename in all_filenames:
    X.append(load_doc(XML_dir_name+filename))

# Create the vocabulary from the loaded files
tokenizer.fit_on_texts(X)

# Add +1 to leave space for empty words
vocab_size = len(tokenizer.word_index) + 1
# Translate each word in text file to the matching vocabulary index
train_sequences = tokenizer.texts_to_sequences(X)
# Length of the longest token sequence; inputs are padded to this length
max_sequence = max(len(s) for s in train_sequences)
# Sequence length used for the model's language input. The original line read the
# undefined name `sequences` (NameError on a fresh kernel); it is the same
# quantity as max_sequence, so reuse it.
max_length = max_sequence


def preprocess_data(sequences, features):
    """Expand token sequences into next-token prediction samples.

    For each sequence, one sample is produced per position ``i >= 1``: the
    input is the prefix ``seq[:i]`` padded by ``pad_sequences`` to
    ``max_sequence`` entries, the target is ``seq[i]`` one-hot encoded over
    ``vocab_size`` classes, and the image is ``features[img_no]`` for the
    sequence's index.

    Relies on the module-level ``max_sequence`` and ``vocab_size``.

    Args:
        sequences: Iterable of integer token sequences.
        features: Indexable collection of images; ``features[k]`` must belong
            to the k-th sequence.

    Returns:
        A tuple ``(X, y, image_data)`` of NumPy arrays holding the padded
        input prefixes, the one-hot targets and the per-sample images.
    """
    X, y, image_data = list(), list(), list()
    for img_no, seq in enumerate(sequences):
        for i in range(1, len(seq)):
            # Input is everything before position i; the target is the token at i
            in_seq, out_seq = seq[:i], seq[i]
            # Pad the input prefix to exactly max_sequence tokens
            in_seq = pad_sequences([in_seq], maxlen=max_sequence)[0]
            # Turn the output into one-hot encoding
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # Repeat the sequence's image for every sample derived from it
            image_data.append(features[img_no])
            # pad_sequences already fixed the length to max_sequence, so no further
            # slicing is needed (the former hard-coded [-806:] only matched one dataset)
            X.append(in_seq)
            y.append(out_seq)
    return np.array(X), np.array(y), np.array(image_data)

# Build the training arrays. NOTE: this rebinds X from the list of raw file
# strings loaded above to the array of padded input sequences.
X, y, image_data = preprocess_data(train_sequences, train_features)

In [16]:
# Show the dimensions of the image, input-sequence and target arrays, in that order
for training_array in (image_data, X, y):
    print(training_array.shape)

(6089, 256, 256, 3)
(6089, 806)
(6089, 682)


In [17]:
# Image encoder: a convolutional stack, two dense layers with dropout, and the
# resulting vector repeated once per language time step
image_model = Sequential([
    Conv2D(16, (3, 3), padding='valid', activation='relu', input_shape=(256, 256, 3,)),
    Conv2D(16, (3, 3), activation='relu', padding='same', strides=2),
    Conv2D(32, (3, 3), activation='relu', padding='same'),
    Conv2D(32, (3, 3), activation='relu', padding='same', strides=2),
    Conv2D(64, (3, 3), activation='relu', padding='same'),
    Conv2D(64, (3, 3), activation='relu', padding='same', strides=2),
    Conv2D(128, (3, 3), activation='relu', padding='same'),
    Flatten(),
    Dense(1024, activation='relu'),
    Dropout(0.3),
    Dense(1024, activation='relu'),
    Dropout(0.3),
    RepeatVector(max_length),
])

visual_input = Input(shape=(256, 256, 3,))
encoded_image = image_model(visual_input)

# Language encoder: token embedding followed by two sequence-returning LSTMs
language_input = Input(shape=(max_length,))
token_embedding = Embedding(vocab_size, 50, input_length=max_length, mask_zero=True)(language_input)
language_hidden = LSTM(128, return_sequences=True)(token_embedding)
language_model = LSTM(128, return_sequences=True)(language_hidden)

# Decoder: join image and language features per time step, then predict one token
merged_features = concatenate([encoded_image, language_model])
decoder_hidden = LSTM(512, return_sequences=True)(merged_features)
decoder_state = LSTM(512, return_sequences=False)(decoder_hidden)
decoder = Dense(vocab_size, activation='softmax')(decoder_state)

# Assemble and compile the two-input model
model = Model(inputs=[visual_input, language_input], outputs=decoder)
optimizer = RMSprop(lr=0.0001, clipvalue=1.0)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 806)          0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 806, 50)      34100       input_8[0][0]                    
__________________________________________________________________________________________________
input_7 (InputLayer)            (None, 256, 256, 3)  0                                            
__________________________________________________________________________________________________
lstm_13 (LSTM)                  (None, 806, 128)     91648       embedding_4[0][0]                
__________________________________________________________________________________________________
sequential

In [18]:
# Save the model weights every 20th epoch (period=20); the file name records
# the epoch number, the validation loss and the training loss
filepath="org-weights-epoch-{epoch:04d}--val_loss-{val_loss:.4f}--loss-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_weights_only=True, period=20)
callbacks_list = [checkpoint]

In [None]:
# Train the model for 50 epochs on (image, token prefix) -> next-token samples.
# shuffle=False keeps the samples in the order preprocess_data produced them, and
# Keras takes the validation_split fraction from the end of the arrays, so the
# held-out 10% are the last samples generated — NOTE(review): confirm this is intended.
model.fit([image_data, X], y, batch_size=64, shuffle=False, validation_split=0.1, callbacks=callbacks_list, verbose=1, epochs=50)

Train on 5480 samples, validate on 609 samples
Epoch 1/50
