In [1]:
import numpy as np
import pandas as pd
import pickle
import csv
from os import listdir
import keras as K
from keras.models import Model
from keras.models import Sequential
from keras.layers import Input, LSTM, GRU, Embedding, TimeDistributed,BatchNormalization, Dense,Dropout, RepeatVector, Activation, Flatten
from keras.preprocessing import image, sequence
from keras.layers.wrappers import Bidirectional
from keras.applications import VGG16
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model
from keras.optimizers import Adam
import scipy
%pylab inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from keras.callbacks import ModelCheckpoint
from matplotlib.pyplot import imshow
from keras.preprocessing.text import Tokenizer
from keras.applications.imagenet_utils import preprocess_input

Using TensorFlow backend.


Populating the interactive namespace from numpy and matplotlib


In [2]:
import json
json1_file = open('captions.json')
json1_str = json1_file.read()
caption_data = json.loads(json1_str)

In [None]:
print(caption_data)

In [3]:
embedding_dim = 128

In [None]:
"""total_string = "<sss> " + row[4] + " . " + row[5] + " " + row[6]"""

In [4]:
all_caption_list = []

with open('preprocess.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    count = 0
    for row in csv_reader:
        if count == 0:
            count = count+1
            continue
        row_insert = []
        row_insert.append(row[0])
        total_string = ""
        check = 1
        try :
            total_string = "<sss> " + caption_data[row[1]+".png"] + " <eee>"
        except:
            try:
                print(row[1] ,end='..', flush=True)
                total_string = "<sss> " + caption_data[row[2]+".png"] + " <eee>"
            except:
                print(" and .." + row[2],end='\n', flush=True)
                check = 0
        if check == 1:
            row_insert.append(total_string)
            all_caption_list.append(row_insert)

with open("train_caption_final.csv", "w", newline='') as output:
    writer = csv.writer(output)
    writer.writerows(all_caption_list)

CXR1137_IM-0093-12012.. and ..CXR1137_IM-0093-4004
CXR1142_IM-0096-1001.. and ..CXR1142_IM-0096-2001
CXR1297_IM-0195-1001.. and ..CXR1297_IM-0195-4004
CXR16_IM-0389-1001.. and ..CXR16_IM-0389-2001
CXR1690_IM-0452-1001-0001.. and ..CXR1690_IM-0452-1001-0002
CXR1778_IM-0509-1001.. and ..CXR1778_IM-0509-2001
CXR2115_IM-0744-1001.. and ..CXR2115_IM-0744-2001
CXR2765_IM-1210-1001.. and ..CXR2765_IM-1210-2001
CXR3367_IM-1619-3001.. and ..CXR3367_IM-1619-4001
CXR3376_IM-1625-0001-0001.. and ..CXR3376_IM-1625-0001-0002
CXR3434_IM-1662-1001.. and ..CXR3434_IM-1662-2001
CXR614_IM-2200-1001.. and ..CXR614_IM-2200-4001
CXR894_IM-2404-0001-0001.. and ..CXR894_IM-2404-0001-0002


In [5]:
encoded_images = pickle.load( open( "visual_features.pickle", "rb" ) )

In [6]:
#find vocab_size
#find total_sample
def initialize():
        df = pd.read_csv('train_caption_final.csv', delimiter=',')
        nb_samples = df.shape[0]
        iteration = df.iterrows()
        caps = []
        for i in range(nb_samples):
            x = iteration.__next__()
            caps.append(x[1][1])

        total_samples=0
        for text in caps:
            total_samples+=len(text.split())-1
        print("Total samples : "+str(total_samples))
        
        words = [txt.split() for txt in caps]
        unique = []
        for word in words:
            unique.extend(word)
        unique = list(set(unique))
        vocab_size = len(unique)
        word_index = {}
        index_word = {}
        for i, word in enumerate(unique):
            word_index[word]=i
            index_word[i]=word

        max_len = 0
        for caption in caps:
            if(len(caption.split()) > max_len):
                max_len = len(caption.split())
        max_cap_len = max_len
        print("Vocabulary size: "+str(vocab_size))
        print("Maximum caption length: "+str(max_cap_len))
        
        return max_cap_len, vocab_size, total_samples, word_index, index_word


In [7]:
max_capt_len, vocab_size, total_samples, word_to_index, index_to_word = initialize()

Total samples : 120904
Vocabulary size: 2556
Maximum caption length: 178


In [8]:
state_size = max_capt_len
embedding_size = 256
image_feature_size = 2048
num_words = 10000

In [9]:
image_feature_input = Input(shape=(image_feature_size,),
                              name='image_feature_input')

In [10]:
decoder_transfer_map = Dense(state_size,
                             activation='tanh',
                             name='decoder_transfer_map')

In [11]:
report_decoder_input = Input(shape=(max_capt_len, ), name='decoder_input')

In [12]:
decoder_embedding = Embedding(input_dim=vocab_size,
                              output_dim=embedding_size,
                              name='decoder_embedding')

In [13]:
decoder_gru1 = GRU(state_size, name='decoder_gru1',
                   return_sequences=True)
decoder_gru2 = GRU(state_size, name='decoder_gru2',
                   return_sequences=True)
decoder_gru3 = GRU(state_size, name='decoder_gru3',
                   return_sequences=True)

In [14]:
decoder_dense = Dense(vocab_size,
                      activation='linear',
                      name='decoder_output')

In [15]:
def connect_decoder(transfer_values):
    
    initial_state = decoder_transfer_map(transfer_values)
    
    net = report_decoder_input
    
    net = decoder_embedding(net)
    
    net = decoder_gru1(net, initial_state=initial_state)
    net = decoder_gru2(net, initial_state=initial_state)
    net = decoder_gru3(net, initial_state=initial_state)
    
    net = Flatten()(net)
    decoder_output = decoder_dense(net)
    
    return decoder_output

In [16]:
decoder_output = connect_decoder(transfer_values=image_feature_input)

decoder_model = Model(inputs=[image_feature_input, report_decoder_input],
                      outputs=[decoder_output])
decoder_model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])

In [None]:
#Old Model
"""def create_model(return_model = False):
        
        image_decod_input = Input(shape=(2048,))
        image_decod = Dense(embedding_dim, input_dim=2048, activation='relu')(image_decod_input)
        image_decod = RepeatVector(max_capt_len)(image_decod)
        
        
        capt_decod_input = Input(shape=(max_capt_len,))
        capt_decod = Embedding(vocab_size, 256, input_length=max_capt_len)(capt_decod_input)
        capt_decod = LSTM(256,return_sequences=True)(capt_decod)
        capt_decod = TimeDistributed(Dense(embedding_dim))(capt_decod)
        
        model = K.layers.concatenate([image_decod, capt_decod])
        model = LSTM(1000,return_sequences=False)(model)
        model = Dense(vocab_size)(model)
        prediction = Activation('softmax')(model)
        
        model = Model(inputs=[image_decod_input, capt_decod_input], outputs=prediction)

        if(return_model==True):
            return model

        model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
        return model"""

In [18]:
def generate_a_batch(batch_size = 32):
        partial_caps = []
        next_words = []
        images = []
        gen_count = 0
        df = pd.read_csv('train_caption_final.csv', delimiter=',')
        nb_samples = df.shape[0]
        iter = df.iterrows()
        caps = []
        imgs = []
        for i in range(nb_samples):
            x = iter.__next__()
            caps.append(x[1][1])
            imgs.append(x[1][0])


        total_count = 0
        while True:
            image_counter = -1
            for text in caps:
                image_counter+=1
                current_image = encoded_images[imgs[image_counter]][0]
                #print(current_image.shape)
                for i in range(len(text.split())-1):
                    total_count+=1
                    partial = [word_to_index[txt] for txt in text.split()[:i+1]]
                    partial_caps.append(partial)
                    next = np.zeros(vocab_size)
                    next[word_to_index[text.split()[i+1]]] = 1
                    next_words.append(next)
                    images.append(current_image)
                    #print(images[0].shape)

                    if total_count>=batch_size:
                        next_words = np.asarray(next_words)
                        images = np.asarray(images)
                        partial_caps = sequence.pad_sequences(partial_caps, maxlen=max_capt_len, padding='post')
                        total_count = 0
                        gen_count+=1
                        if gen_count%10 == 0:
                            print("batch count: "+str(gen_count))
                        yield [[images, partial_caps], next_words]
                        partial_caps = []
                        next_words = []
                        images = []

In [19]:
batch_size = 16
epochs = 10

In [None]:
#image_caption_model = create_model()
file_name = 'saved_weights_epoch_{epoch:02d}.hdf5'
checkpoint = ModelCheckpoint(file_name, monitor='loss', verbose=1, save_best_only=True, mode='min')
checkpoints_list = [checkpoint]
decoder_model.fit_generator(generate_a_batch(batch_size=batch_size), steps_per_epoch=total_samples/batch_size, epochs=epochs, verbose=2, callbacks=checkpoints_list)

Epoch 1/10
batch count: 10
