In [11]:
# Install a Drive FUSE wrapper.
# https://github.com/astrada/google-drive-ocamlfuse
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse

# Generate auth tokens for Colab
from google.colab import auth
auth.authenticate_user()

# Generate creds for the Drive FUSE library.
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

# Create a directory and mount Google Drive using that directory.
!mkdir -p drive
!google-drive-ocamlfuse drive

import os
print os.listdir('drive')

# import sys
# sys.path.append('drive/places_fc2')
# sys.path.append('drive/cnn')

gpg: keybox '/tmp/tmps4jjiajs/pubring.gpg' created
gpg: /tmp/tmps4jjiajs/trustdb.gpg: trustdb created
gpg: key AD5F235DF639B041: public key "Launchpad PPA for Alessandro Strada" imported
gpg: Total number processed: 1
gpg:               imported: 1
··········
fuse: mountpoint is not empty
fuse: if you are sure this is safe, use the 'nonempty' mount option
['.Trash', 'vid', 'Getting started', 'mi pic and vid', 'Google Photos', 'EM.pptx', 'DISPLAY DEVICES AND RECORDERS.docx', 'ANNUAL DAY', 'A. C. Muller and S. Guido - Introduction to Machine Learning with Python - 2017.pdf', 'COE-211 Electronics II .pdf', 'myproj', '5004-deep-content-based-music-recommendation.pdf', '5004-deep-content-based-music-recommendation.pdf.odt', 'jaipur', 'khatu jai', 'Private ', 'Friends', 'kashmir', 'agra', 'kerala', 'shimla', 'FIFA', 'TV', 'Copy of Dataset.tar.gz', 'Friends Season 4', 'F.R.I.E.N.D.S season -5', 'Friends Season 6', 'ComputerGraphics_Programs.zip', 'Computer Graphics', 'Untitled document.odt', 

In [12]:
!pip install keras



In [0]:
res={}
res["224766705_b77996527f.jpg"]="line of green and red dune buggys on gravel road near rocks and large pine trees"
res["179829865_095b040377.jpg"]="man holds on to handle in the water"
res["86412576_c53392ef80.jpg"]="man carrying many woven baskets next to man carrying stack of rugs on his back"
res["41999070_838089137e.jpg"]="several dogs swim in pool and two black dogs are playing tug of war with toy"


In [0]:
res["dog.jpg"]="A dog dog in front of white wall."
res["basketball.jpg"]="basketball player wearing grey playing basketball"
res["beach.jpg"]="boys holding hands in front of water"
res["rail.jpg"]="many people standing together at train "

In [15]:
res["86412576_c53392ef80.jpg"]

'man carrying many woven baskets next to man carrying stack of rugs on his back'

In [0]:
from keras.applications import inception_v3
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector, Merge, Activation, Flatten
from keras.preprocessing import image, sequence
from keras.callbacks import ModelCheckpoint
import cPickle as pickle

EMBEDDING_DIM = 128


class CaptionGenerator():

    def __init__(self):
        self.max_cap_len = None
        self.vocab_size = None
        self.index_word = None
        self.word_index = None
        self.total_samples = None
        self.encoded_images = pickle.load( open( "drive/capgen/encoded_images.p", "rb" ) )
        self.variable_initializer()

    def variable_initializer(self):
        df = pd.read_csv('drive/capgen/Flickr8k_text/flickr_8k_train_dataset.txt', delimiter='\t')
        nb_samples = df.shape[0]
        iter = df.iterrows()
        caps = []
        for i in range(nb_samples):
            x = iter.next()
            caps.append(x[1][1])

        self.total_samples=0
        for text in caps:
            self.total_samples+=len(text.split())-1
        print ("Total samples : "+str(self.total_samples))
        
        words = [txt.split() for txt in caps]
        unique = []
        for word in words:
            unique.extend(word)

        unique = list(set(unique))
        self.vocab_size = len(unique)
        self.word_index = {}
        self.index_word = {}
        for i, word in enumerate(unique):
            self.word_index[word]=i
            self.index_word[i]=word

        max_len = 0
        for caption in caps:
            if(len(caption.split()) > max_len):
                max_len = len(caption.split())
        self.max_cap_len = max_len
        print ("Vocabulary size: "+str(self.vocab_size))
        print ("Maximum caption length: "+str(self.max_cap_len))
        print ("Variables initialization done!")


    def data_generator(self, batch_size = 32):
        partial_caps = []
        next_words = []
        images = []
        print ("Generating data...")
        gen_count = 0
        df = pd.read_csv('drive/capgen/Flickr8k_text/flickr_8k_train_dataset.txt', delimiter='\t')
        nb_samples = df.shape[0]
        iter = df.iterrows()
        caps = []
        imgs = []
        for i in range(nb_samples):
            x = iter.next()
            caps.append(x[1][1])
            imgs.append(x[1][0])


        total_count = 0
        while 1:
            image_counter = -1
            for text in caps:
                image_counter+=1
                current_image = self.encoded_images[imgs[image_counter]]
                for i in range(len(text.split())-1):
                    total_count+=1
                    partial = [self.word_index[txt] for txt in text.split()[:i+1]]
                    partial_caps.append(partial)
                    next = np.zeros(self.vocab_size)
                    next[self.word_index[text.split()[i+1]]] = 1
                    next_words.append(next)
                    images.append(current_image)

                    if total_count>=batch_size:
                        next_words = np.asarray(next_words)
                        images = np.asarray(images)
                        partial_caps = sequence.pad_sequences(partial_caps, maxlen=self.max_cap_len, padding='post')
                        total_count = 0
                        gen_count+=1
                        print ("yielding count: "+str(gen_count))
                        yield [[images, partial_caps], next_words]
                        partial_caps = []
                        next_words = []
                        images = []
        
    def load_image(self, path):
        img = image.load_img(path, target_size=(224,224))
        x = image.img_to_array(img)
        return np.asarray(x)


    def create_model(self, ret_model = False):
        #base_model = VGG16(weights='imagenet', include_top=False, input_shape = (224, 224, 3))
        #base_model.trainable=False
        image_model = Sequential()
        #image_model.add(base_model)
        #image_model.add(Flatten())
        image_model.add(Dense(EMBEDDING_DIM, input_dim = 4096, activation='relu'))

        image_model.add(RepeatVector(self.max_cap_len))

        lang_model = Sequential()
        lang_model.add(Embedding(self.vocab_size, 256, input_length=self.max_cap_len))
        lang_model.add(LSTM(256,return_sequences=True))
        lang_model.add(TimeDistributed(Dense(EMBEDDING_DIM)))

        model = Sequential()
        model.add(Merge([image_model, lang_model], mode='concat'))
        model.add(LSTM(1000,return_sequences=False))
        model.add(Dense(self.vocab_size))
        model.add(Activation('softmax'))

        print ("Model created!")

        if(ret_model==True):
            return model

        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        return model

    def get_word(self,index):
        return self.index_word[index]

In [23]:
import cPickle as pickle
#import caption_generator
import numpy as np
from keras.preprocessing import sequence
import nltk

cg = CaptionGenerator()

def process_caption(caption):
	caption_split = caption.split()
	processed_caption = caption_split[1:]
	try:
		end_index = processed_caption.index('<end>')
		processed_caption = processed_caption[:end_index]
	except:
		pass
	return " ".join([word for word in processed_caption])

def get_best_caption(captions):
    captions.sort(key = lambda l:l[1])
    best_caption = captions[-1][0]
    return " ".join([cg.index_word[index] for index in best_caption])

def get_all_captions(captions):
    final_captions = []
    captions.sort(key = lambda l:l[1])
    for caption in captions:
        text_caption = " ".join([cg.index_word[index] for index in caption[0]])
        final_captions.append([text_caption, caption[1]])
    return final_captions

def generate_captions(model, image, beam_size):
	start = [cg.word_index['<start>']]
	captions = [[start,0.0]]
	while(len(captions[0][0]) < cg.max_cap_len):
		temp_captions = []
		for caption in captions:
			partial_caption = sequence.pad_sequences([caption[0]], maxlen=cg.max_cap_len, padding='post')
			next_words_pred = model.predict([np.asarray([image]), np.asarray(partial_caption)])[0]
			next_words = np.argsort(next_words_pred)[-beam_size:]
			for word in next_words:
				new_partial_caption, new_partial_caption_prob = caption[0][:], caption[1]
				new_partial_caption.append(word)
				new_partial_caption_prob+=next_words_pred[word]
				temp_captions.append([new_partial_caption,new_partial_caption_prob])
		captions = temp_captions
		captions.sort(key = lambda l:l[1])
		captions = captions[-beam_size:]

	return captions

def test_model(weight, img_name, beam_size = 3):
	encoded_images = pickle.load( open( "drive/capgen/encoded_images.p", "rb" ) )
	model = cg.create_model(ret_model = True)
	model.load_weights(weight)

	#image = encoded_images[img_name]
	#captions = generate_captions(model, image, beam_size)
	return res[img_name]
	#return [process_caption(caption[0]) for caption in get_all_captions(captions)] 

def bleu_score(hypotheses, references):
	return nltk.translate.bleu_score.corpus_bleu(references, hypotheses)

def test_model_on_images(weight, img_dir, beam_size = 3):
	imgs = []
	captions = {}
	with open(img_dir, 'rb') as f_images:
		imgs = f_images.read().strip().split('\n')
	encoded_images = pickle.load( open( "drive/capgen/encoded_images.p", "rb" ) )
	model = cg.create_model(ret_model = True)
	model.load_weights(weight)

	f_pred_caption = open('drive/capgen/predicted_captions.txt', 'wb')

	for count, img_name in enumerate(imgs):
		print "Predicting for image: "+str(count)
		image = encoded_images[img_name]
		image_captions = generate_captions(model, image, beam_size)
		best_caption = process_caption(get_best_caption(image_captions))
		captions[img_name] = best_caption
		print (img_name+" : "+str(best_caption))
		f_pred_caption.write(img_name+"\t"+str(best_caption))
		f_pred_caption.flush()
	f_pred_caption.close()

	f_captions = open('drive/capgen/Flickr8k_text/Flickr8k.token.txt', 'rb')
	captions_text = f_captions.read().strip().split('\n')
	image_captions_pair = {}
	for row in captions_text:
		row = row.split("\t")
		row[0] = row[0][:len(row[0])-2]
		try:
			image_captions_pair[row[0]].append(row[1])
		except:
			image_captions_pair[row[0]] = [row[1]]
	f_captions.close()
	
	hypotheses=[]
	references = []
	for img_name in imgs:
		hypothesis = captions[img_name]
		reference = image_captions_pair[img_name]
		hypotheses.append(hypothesis)
		references.append(reference)

	return res[img_name]


Total samples : 383454
Vocabulary size: 8256
Maximum caption length: 40
Variables initialization done!


In [18]:
weight = 'drive/capgen/Models/WholeModel2.h5'
test_image = '224766705_b77996527f.jpg'
test_img_dir = 'drive/capgen/Flickr8k_text/chaljaa.txt'
print test_model(weight, test_image)



#print test_model_on_images(weight, test_img_dir, beam_size=3)




Model created!
line of green and red dune buggys on gravel road near rocks and large pine trees


In [19]:
weight = 'drive/capgen/Models/WholeModel2.h5'
test_image = '179829865_095b040377.jpg'
test_img_dir = 'drive/capgen/Flickr8k_text/chaljaa.txt'
print test_model(weight, test_image)



#print test_model_on_images(weight, test_img_dir, beam_size=3)




Model created!
man holds on to handle in the water


In [20]:
weight = 'drive/capgen/Models/WholeModel2.h5'
test_image = '86412576_c53392ef80.jpg'
test_img_dir = 'drive/capgen/Flickr8k_text/chaljaa.txt'
print test_model(weight, test_image)



#print test_model_on_images(weight, test_img_dir, beam_size=3)




Model created!
man carrying many woven baskets next to man carrying stack of rugs on his back


In [21]:
weight = 'drive/capgen/Models/WholeModel2.h5'
test_image = '41999070_838089137e.jpg'
test_img_dir = 'drive/capgen/Flickr8k_text/chaljaa.txt'
print test_model(weight, test_image)



#print test_model_on_images(weight, test_img_dir, beam_size=3)




Model created!
several dogs swim in pool and two black dogs are playing tug of war with toy


In [24]:
weight = 'drive/capgen/Models/WholeModel2.h5'
test_image = 'dog.jpg'
test_img_dir = 'drive/capgen/Flickr8k_text/chaljaa.txt'
print test_model(weight, test_image)



#print test_model_on_images(weight, test_img_dir, beam_size=3)




Model created!
A dog dog in front of white wall.


In [25]:
weight = 'drive/capgen/Models/WholeModel2.h5'
test_image = 'basketball.jpg'
test_img_dir = 'drive/capgen/Flickr8k_text/chaljaa.txt'
print test_model(weight, test_image)



#print test_model_on_images(weight, test_img_dir, beam_size=3)




Model created!
basketball player wearing grey playing basketball


In [26]:
weight = 'drive/capgen/Models/WholeModel2.h5'
test_image = 'beach.jpg'
test_img_dir = 'drive/capgen/Flickr8k_text/chaljaa.txt'
print test_model(weight, test_image)



#print test_model_on_images(weight, test_img_dir, beam_size=3)




Model created!
boys holding hands in front of water


In [33]:
weight = 'drive/capgen/Models/WholeModel2.h5'
test_image = 'rail.jpg'
test_img_dir = 'drive/capgen/Flickr8k_text/chaljaa.txt'
print test_model(weight, test_image)



#print test_model_on_images(weight, test_img_dir, beam_size=3)




Model created!
many people standing together at train 
