## Image Caption Generator

In [1]:
from pycocotools.coco import COCO
import pandas as pd
import numpy as np
import cv2
from keras.preprocessing import image, sequence

Using TensorFlow backend.


In [3]:
annFile_train = "annotations/captions_train2014.json"
annFile_val = "annotations/captions_val2014.json"

coco_train = COCO(annFile_train)
coco_val = COCO(annFile_val)

loading annotations into memory...
Done (t=1.93s)
creating index...
index created!
loading annotations into memory...
Done (t=0.93s)
creating index...
index created!


In [4]:
imgIds_train = coco_train.getImgIds()
imgIds_val = coco_val.getImgIds()

annIds_train = coco_train.getAnnIds(imgIds_train)
annIds_val = coco_val.getAnnIds(imgIds_val)
anns_train = coco_train.loadAnns(annIds_train)
anns_val = coco_val.loadAnns(annIds_val)

imgs_train = coco_train.loadImgs(imgIds_train)
imgs_val = coco_val.loadImgs(imgIds_val)

In [5]:
imgs_train_df = pd.DataFrame(imgs_train)
imgs_val_df = pd.DataFrame(imgs_val)

anns_train_df = pd.DataFrame(anns_train)
anns_val_df = pd.DataFrame(anns_val)

In [5]:
imgs_train_df.head()

Unnamed: 0,coco_url,date_captured,file_name,flickr_url,height,id,license,width
0,http://mscoco.org/images/262145,2013-11-20 02:07:55,COCO_train2014_000000262145.jpg,http://farm8.staticflickr.com/7187/6967031859_...,427,262145,2,640
1,http://mscoco.org/images/131074,2013-11-21 01:03:06,COCO_train2014_000000131074.jpg,http://farm9.staticflickr.com/8308/7908210548_...,428,131074,1,640
2,http://mscoco.org/images/131075,2013-11-24 01:06:02,COCO_train2014_000000131075.jpg,http://farm8.staticflickr.com/7252/7104000401_...,640,131075,5,478
3,http://mscoco.org/images/393221,2013-11-16 12:47:32,COCO_train2014_000000393221.jpg,http://farm6.staticflickr.com/5340/8991584979_...,640,393221,3,427
4,http://mscoco.org/images/393223,2013-11-21 20:08:57,COCO_train2014_000000393223.jpg,http://farm1.staticflickr.com/28/45521803_c5cb...,480,393223,3,640


In [6]:
imgs_val_df.head()

Unnamed: 0,coco_url,date_captured,file_name,flickr_url,height,id,license,width
0,http://mscoco.org/images/262148,2013-11-20 05:50:03,COCO_val2014_000000262148.jpg,http://farm5.staticflickr.com/4028/4549977479_...,512,262148,1,640
1,http://mscoco.org/images/393225,2013-11-20 03:17:27,COCO_val2014_000000393225.jpg,http://farm9.staticflickr.com/8052/8445121630_...,428,393225,3,640
2,http://mscoco.org/images/393226,2013-11-17 20:36:07,COCO_val2014_000000393226.jpg,http://farm9.staticflickr.com/8003/7321339838_...,480,393226,4,640
3,http://mscoco.org/images/109229,2013-11-18 17:27:34,COCO_val2014_000000109229.jpg,http://farm3.staticflickr.com/2342/1794334693_...,640,109229,1,514
4,http://mscoco.org/images/131089,2013-11-22 22:23:05,COCO_val2014_000000131089.jpg,http://farm5.staticflickr.com/4117/4866736626_...,427,131089,1,640


In [7]:
anns_train_df.head()

Unnamed: 0,caption,id,image_id
0,People shopping in an open market for vegetables.,694,262145
1,An open market full of people and piles of veg...,1054,262145
2,People are shopping at an open air produce mar...,1456,262145
3,Large piles of carrots and potatoes at a crowd...,5248,262145
4,People shop for vegetables like carrots and po...,5254,262145


In [8]:
anns_val_df.head()

Unnamed: 0,caption,id,image_id
0,The skateboarder is putting on a show using th...,284571,262148
1,A skateboarder pulling tricks on top of a picn...,286347,262148
2,A man riding on a skateboard on top of a table.,286899,262148
3,A skate boarder doing a trick on a picnic table.,287571,262148
4,A person is riding a skateboard on a picnic ta...,288021,262148


In [6]:
img_train_new = imgs_train_df.drop(['coco_url','date_captured','flickr_url','height','width','license'], axis=1)
img_val_new = imgs_val_df.drop(['coco_url','date_captured','flickr_url','height','width','license'], axis=1)

img_train_new.rename(columns={'id': 'image_id'}, inplace=True)
img_val_new.rename(columns={'id': 'image_id'}, inplace=True)

anns_train_df['caption'] = anns_train_df['caption'].apply(lambda x: '<start>' + x + '<end>')
anns_val_df['caption'] = anns_val_df['caption'].apply(lambda x: '<start>' + x + '<end>')

train_df = pd.merge(anns_train_df, img_train_new, on='image_id', how='left')
val_df = pd.merge(anns_val_df, img_val_new, on='image_id', how='left')

In [10]:
train_df.head()

Unnamed: 0,caption,id,image_id,file_name
0,<start>People shopping in an open market for v...,694,262145,COCO_train2014_000000262145.jpg
1,<start>An open market full of people and piles...,1054,262145,COCO_train2014_000000262145.jpg
2,<start>People are shopping at an open air prod...,1456,262145,COCO_train2014_000000262145.jpg
3,<start>Large piles of carrots and potatoes at ...,5248,262145,COCO_train2014_000000262145.jpg
4,<start>People shop for vegetables like carrots...,5254,262145,COCO_train2014_000000262145.jpg


In [11]:
val_df.head()

Unnamed: 0,caption,id,image_id,file_name
0,<start>The skateboarder is putting on a show u...,284571,262148,COCO_val2014_000000262148.jpg
1,<start>A skateboarder pulling tricks on top of...,286347,262148,COCO_val2014_000000262148.jpg
2,<start>A man riding on a skateboard on top of ...,286899,262148,COCO_val2014_000000262148.jpg
3,<start>A skate boarder doing a trick on a picn...,287571,262148,COCO_val2014_000000262148.jpg
4,<start>A person is riding a skateboard on a pi...,288021,262148,COCO_val2014_000000262148.jpg


In [7]:
train_df.drop(['id'], axis=1, inplace=True)
val_df.drop(['id'], axis=1, inplace=True)

In [8]:
train_imgs = set(train_df['file_name'])
len(train_imgs)*5

413915

Processing images and save array

In [8]:
img_dir = "images/"
images = []
for img_file in train_imgs:
    image = cv2.imread(img_dir + img_file)
    image.resize((3,224,224))
    for i in range(0,5):
        images.append(image)
    


KeyboardInterrupt: 

In [12]:
len(images)

413915

Creating vocabulary

In [9]:
caps = train_df['caption']
total_samples=0
for text in caps:
    total_samples+=len(text.split())-1
print "Total samples : "+str(total_samples)

In [10]:
words = [txt.split() for txt in caps]
unique = []
for word in words:
    unique.extend(word)

unique = list(set(unique))
vocab_size = len(unique)
word_index = {}
index_word = {}
for i, word in enumerate(unique):
    word_index[word]=i
    index_word[i]=word
    
max_len = 0
for caption in caps:
    if(len(caption.split()) > max_len):
        max_len = len(caption.split())
max_cap_len = max_len
print "Vocabulary size: "+str(vocab_size)
print "Maximum caption length: "+str(max_cap_len)

Vocabulary size: 54028
Maximum caption length: 50


Creating partial captions and next words

The amount of items on array is too big, so the kernel die while executing the loop below. I will build a generator to use on model.

In [None]:
partial_caps = []
next_words = []
for text in caps:
    for i in range(len(text.split())-1):
        partial = [word_index[txt] for txt in text.split()[:i+1]]
        partial_caps.append(partial)
        next = np.zeros(vocab_size)
        next[word_index[text.split()[i+1]]] = 1
        next_words.append(next)
        


In [None]:
next_words.npasarray(next_words)
partial_caps = sequence.pad_sequences(partial_caps, maxlen=max_cap_len, padding='post')