In [None]:
import string
from tensorflow.keras.utils import plot_model
from os import listdir
from tensorflow.keras.layers import Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.layers import LSTM
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Add
from pickle import load,dump
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.image import load_img
from numpy import array, argmax
from nltk.translate.bleu_score import corpus_bleu
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.text import Tokenizer


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
 def feature_extraction(dir):
	model=VGG16()
	model=Model(inputs=model.Input,outputs=model.layers[-2].output)
	print(model.summary())
	feat=dict()
	for nm in listdir(dir):
		f_name=dir + '/' + nm
		pict=load_img(f_name, target_size=(224, 224))
		pict=img_to_array(pict)
		pict=pict.reshape((1,pict.shape[0],pict.shape[1],pict.shape[2]))
		pict=preprocess_input(pict)
		featr=model.predict(pict,verbose=0)
		pict_id=nm.split('.')[0]
		feat[pict_id]=featr
		print('>%s'%nm)
	return feat

In [None]:
'''
This code section is used to generate features. Generated features are already saved into features.pkl
dir = '/content/drive/MyDrive/Colab_Notebooks/archive/Images'
feat = feature_extraction(dir)
print('Extracted Features: %d'%len(feat))
'''

"\nThis code section is used to generate features. Generated features are already saved into features.pkl\ndir = '/content/drive/MyDrive/Colab_Notebooks/archive/Images'\nfeat = feature_extraction(dir)\nprint('Extracted Features: %d'%len(feat))\n"

In [None]:
#dump(feat, open('/content/drive/MyDrive/Colab_Notebooks/archive/features.pkl', 'wb'))

In [None]:
def document_loading(f_name):
	"""Read a text file and return its entire contents as a single string.

	Args:
		f_name: Path of the file to read (opened in text mode 'r').

	Returns:
		The full text of the file.
	"""
	# The context manager closes the handle even if read() raises.
	with open(f_name,'r') as file:
		return file.read()

In [None]:
def decs_load(doc):
	"""Parse caption text into a mapping of image id -> list of descriptions.

	Every line is split on whitespace: the first token is the image
	identifier (truncated at its first '.'), the remaining tokens are
	re-joined with single spaces to form the description.

	Args:
		doc: Full text of the caption file, one caption per line.

	Returns:
		dict mapping each image id to the list of its descriptions, in the
		order they appear in ``doc``.
	"""
	map_desc=dict()
	for ln in doc.split('\n'):
		tkn=ln.split()
		# Skip lines shorter than two characters, and whitespace-only lines:
		# the latter produce no tokens, so tkn[0] would raise IndexError.
		if len(ln)<2 or not tkn:
			continue
		pict_id,pict_desc=tkn[0],tkn[1:]
		pict_id=pict_id.split('.')[0]
		map_desc.setdefault(pict_id,list()).append(' '.join(pict_desc))
	return map_desc

In [None]:
def desc_cleaning(descs):
	"""Normalise every description in ``descs`` in place.

	Each word is lower-cased and stripped of ``string.punctuation``
	characters; words that are then one character or shorter, or that are not
	purely alphabetic, are dropped. The surviving words are re-joined with
	single spaces and written back into the same list slot.

	Args:
		descs: dict mapping an image id to a list of description strings.
			The lists are modified in place.

	Returns:
		None.
	"""
	strip_punct=str.maketrans('', '', string.punctuation)
	for captions in descs.values():
		for pos,caption in enumerate(captions):
			kept=[]
			for raw in caption.split():
				word=raw.lower().translate(strip_punct)
				if len(word)>1 and word.isalpha():
					kept.append(word)
			captions[pos]=' '.join(kept)

In [None]:
def vocab_conv(descs):
	"""Collect the set of distinct whitespace-separated words in all descriptions.

	Args:
		descs: dict mapping an image id to a list of description strings.

	Returns:
		set of every word that occurs in any description.
	"""
	return {
		word
		for captions in descs.values()
		for caption in captions
		for word in caption.split()
	}

In [None]:
def desc_saving(descs,f_name):
	"""Write all descriptions to a text file, one '<image id> <description>' per line.

	Lines are joined with '\\n'; no trailing newline is written.

	Args:
		descs: dict mapping an image id to a list of description strings.
		f_name: Path of the file to create or overwrite.

	Returns:
		None.
	"""
	lns=[k+' '+desc for k,desc_list in descs.items() for desc in desc_list]
	# The context manager flushes and closes the file even if write() raises.
	with open(f_name,'w') as file:
		file.write('\n'.join(lns))

In [None]:
# Read the raw caption file into memory as one string.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
f_name='C:/Users/Kunj/Downloads/recommender/Flickr8k.token.txt'
doc = document_loading(f_name)

In [None]:
# Parse the raw text into {image id: [descriptions]} and report how many ids were found.
descs=decs_load(doc)
print('Loaded: %d '%len(descs))
# Normalises the descriptions in place (desc_cleaning returns nothing).
desc_cleaning(descs)

In [None]:
# Set of distinct words remaining after cleaning.
vblry=vocab_conv(descs)
print('Vocabulary Size: %d'%len(vblry))

In [None]:
# Persist the cleaned descriptions; the relative path resolves against the
# kernel's current working directory.
desc_saving(descs,'descriptions.txt')

In [None]:
def dset_load(f_name):
	"""Load a list-of-images file and return the set of image identifiers.

	Every non-empty line contributes the part of the line before its first '.'.

	Args:
		f_name: Path of the text file, one image file name per line.

	Returns:
		set of identifiers.
	"""
	text=document_loading(f_name)
	return {row.split('.')[0] for row in text.split('\n') if len(row)>=1}

In [None]:
def cleaned_desc_load(f_name,dset):
	"""Load cleaned descriptions for the image ids in ``dset``.

	Each line of the file is '<image id> <word> <word> ...'. Descriptions of
	ids contained in ``dset`` are wrapped as
	'startingsequence <description> endingsequence'.

	Args:
		f_name: Path of the cleaned descriptions file.
		dset: Collection of image ids to keep (tested with ``in``).

	Returns:
		dict mapping each kept image id to its list of wrapped descriptions.
	"""
	doc=document_loading(f_name)
	descs=dict()
	for ln in doc.split('\n'):
		tkn=ln.split()
		# Blank or whitespace-only lines (e.g. after a trailing newline) have
		# no tokens; tkn[0] would raise IndexError on them.
		if not tkn:
			continue
		pict_id,pict_desc=tkn[0],tkn[1:]
		if pict_id in dset:
			desc='startingsequence ' + ' '.join(pict_desc) + ' endingsequence'
			descs.setdefault(pict_id,list()).append(desc)
	return descs

In [None]:
def img_features_load(f_name,dset):
	"""Load pickled image features and keep only the entries for ``dset``.

	Args:
		f_name: Path of a pickle file holding a mapping of image id -> features.
		dset: Iterable of image ids to select.

	Returns:
		dict with one entry per id in ``dset``.

	Raises:
		KeyError: if an id in ``dset`` is missing from the pickled mapping.
	"""
	# NOTE: unpickling can execute arbitrary code — only load trusted files.
	# The context manager makes sure the file handle is closed.
	with open(f_name,'rb') as file:
		allfeatr=load(file)
	ftrs={k: allfeatr[k] for k in dset}
	return ftrs

In [None]:
def into_lns(descs):
	"""Flatten a dict of description lists into one list of descriptions.

	Args:
		descs: dict mapping an image id to a list of description strings.

	Returns:
		list of all descriptions, in dict iteration order.
	"""
	return [caption for captions in descs.values() for caption in captions]

In [None]:
def generate_Tokenizer(descs):
	"""Create a Keras ``Tokenizer`` and fit it on every description in ``descs``.

	Args:
		descs: dict mapping an image id to a list of description strings.

	Returns:
		The fitted ``Tokenizer``.
	"""
	fitted=Tokenizer()
	fitted.fit_on_texts(into_lns(descs))
	return fitted

In [None]:
def most_lngth(descs):
	"""Return the word count of the longest description in ``descs``.

	Args:
		descs: dict mapping an image id to a list of description strings.

	Returns:
		The largest number of whitespace-separated words in any description.

	Raises:
		ValueError: if ``descs`` contains no descriptions at all.
	"""
	word_counts=[len(caption.split()) for caption in into_lns(descs)]
	return max(word_counts)

In [None]:
def build_seq(tokenizer,most_lngth,desc_list,photo,vocb_size):
	"""Expand the descriptions of one image into training samples.

	Each description is encoded with ``tokenizer``. For every position i from
	1 to the end of the encoded sequence, one sample is produced: the first i
	tokens (padded to ``most_lngth`` with ``pad_sequences``) as input and
	token i, one-hot encoded over ``vocb_size`` classes, as target. The same
	``photo`` is attached to every sample.

	Args:
		tokenizer: Object providing ``texts_to_sequences``.
		most_lngth: Length the input sequences are padded to.
		desc_list: Descriptions of the image.
		photo: Feature array of the image.
		vocb_size: Number of classes for the one-hot targets.

	Returns:
		Tuple of three arrays: photo features, padded input sequences, targets.
	"""
	images,prefixes,targets=[],[],[]
	for caption in desc_list:
		encoded=tokenizer.texts_to_sequences([caption])[0]
		for split_at in range(1,len(encoded)):
			prefix=pad_sequences([encoded[:split_at]],maxlen=most_lngth)[0]
			target=to_categorical([encoded[split_at]],num_classes=vocb_size)[0]
			images.append(photo)
			prefixes.append(prefix)
			targets.append(target)
	return array(images),array(prefixes),array(targets)

In [None]:
def defn_modl(vocb_size,most_lngth):
	"""Build and compile the two-input captioning model.

	One branch takes a 4096-element feature vector (dropout 0.5, then a
	256-unit ReLU dense layer). The other takes a token sequence of length
	``most_lngth`` (256-dim embedding with zero masking, dropout 0.5, 256-unit
	LSTM). The branches are summed, passed through a 256-unit ReLU dense
	layer and a softmax over ``vocb_size`` classes.

	The model is compiled with categorical cross-entropy and Adam, its
	summary is printed, and a diagram is written to 'model.png'.

	Args:
		vocb_size: Size of the embedding input and of the softmax output.
		most_lngth: Length of the token-sequence input.

	Returns:
		The compiled Keras ``Model``.
	"""
	# Image-feature branch.
	image_in=Input(shape=(4096,))
	image_drop=Dropout(0.5)(image_in)
	image_vec=Dense(256,activation='relu')(image_drop)
	# Caption-prefix branch.
	text_in=Input(shape=(most_lngth,))
	text_emb=Embedding(vocb_size,256,mask_zero=True)(text_in)
	text_drop=Dropout(0.5)(text_emb)
	text_vec=LSTM(256)(text_drop)
	# Merge both branches and predict the next word.
	merged=Add()([image_vec,text_vec])
	hidden=Dense(256,activation='relu')(merged)
	next_word=Dense(vocb_size,activation='softmax')(hidden)
	model=Model(inputs=[image_in, text_in],outputs=next_word)
	model.compile(loss='categorical_crossentropy',optimizer='adam')
	model.summary()
	plot_model(model,to_file='model.png',show_shapes=True)
	return model

In [None]:
def data_creator(descs,photos,tokenizer,most_lngth,vocb_size):
	"""Endlessly yield one batch of training samples per image.

	Cycles over ``descs`` forever; for each image id it builds the samples
	with ``build_seq`` from that image's descriptions and the first row of
	its entry in ``photos``.

	Args:
		descs: dict mapping an image id to its list of descriptions.
		photos: Mapping of image id -> feature array (row 0 is used).
		tokenizer: Passed through to ``build_seq``.
		most_lngth: Passed through to ``build_seq``.
		vocb_size: Passed through to ``build_seq``.

	Yields:
		``[image_features, input_sequences], targets`` for one image.
	"""
	while True:
		for img_id in descs:
			img_batch,seq_batch,word_batch=build_seq(
				tokenizer,most_lngth,descs[img_id],photos[img_id][0],vocb_size)
			yield [img_batch,seq_batch],word_batch

In [None]:
# Load the identifiers of the training images.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
f_name='C:/Users/Kunj/Downloads/recommender/Flickr_8k.trainImages.txt'
train=dset_load(f_name)
print('Dataset: %d'%len(train))

In [None]:
# Cleaned descriptions of the training images, wrapped in start/end markers.
train_desc=cleaned_desc_load('descriptions.txt',train)
print('Descriptions: train=%d'%len(train_desc))

In [None]:
# Pre-computed image features for the training images, read from the pickle file.
train_feat=img_features_load('C:/Users/Kunj/Downloads/recommender/features.pkl',train)
print('Photos: train=%d'%len(train_feat))

In [None]:
# Fit the tokenizer on the training descriptions.
tokenizer=generate_Tokenizer(train_desc)
# One more than the number of indexed words.
vocb_size=len(tokenizer.word_index)+1
print('Size of Vocabulary=%d'%vocb_size)

In [None]:
# Word count of the longest training description.
# This cell rebinds the name `most_lngth` from the helper function to an int,
# so the value is computed here without calling the helper by name: calling it
# would raise "TypeError: 'int' object is not callable" when the cell is re-run.
most_lngth=max(len(d.split()) for d in into_lns(train_desc))
print('Description Length: %d'%most_lngth)

In [None]:
# Build and compile the captioning model (also prints its summary and writes model.png).
model=defn_modl(vocb_size,most_lngth)

In [None]:
# epochs=20
# stps=len(train_desc)
# for i in range(epochs):
# 	generator=data_creator(train_desc,train_feat,tokenizer,most_lngth,vocb_size)
# 	model.fit_generator(generator,epochs=1,steps_per_epoch=stps,verbose=1)
# 	model.save('model_'+str(i)+'.h5')

In [None]:
def word_for_id(itgr,tokenizer):
	"""Look up the word whose index in ``tokenizer.word_index`` equals ``itgr``.

	Args:
		itgr: Integer word index to look up.
		tokenizer: Object exposing a ``word_index`` mapping of word -> index.

	Returns:
		The matching word, or None when no word has that index.
	"""
	for word,index in tokenizer.word_index.items():
		if index==itgr:
			return word
	return None

def generate_desc(model,tkzz,photo,maxim_lgth):
	"""Generate a description for one image, one word at a time.

	Starting from 'startingsequence', the text so far is encoded and padded
	to ``maxim_lgth``, the model predicts the next word from the photo and
	that sequence, and the arg-max word is appended. Generation stops after
	``maxim_lgth`` steps, when the predicted index maps to no word, or once
	'endingsequence' has been appended.

	Args:
		model: Model whose ``predict`` takes ``[photo, sequence]``.
		tkzz: Tokenizer used for encoding and for index -> word lookup.
		photo: Feature array of the image.
		maxim_lgth: Padding length and maximum number of generation steps.

	Returns:
		The generated text, including the 'startingsequence' prefix.
	"""
	caption='startingsequence'
	for _ in range(maxim_lgth):
		encoded=tkzz.texts_to_sequences([caption])[0]
		padded=pad_sequences([encoded],maxlen=maxim_lgth)
		scores=model.predict([photo,padded],verbose=0)
		next_word=word_for_id(argmax(scores),tkzz)
		if next_word is None:
			break
		caption=caption+' '+next_word
		if next_word=='endingsequence':
			break
	return caption

In [None]:
def model_evaluation(model,descriptions,photos,tokenizer,maxim_lgth):
	"""Generate a description for every image and print corpus BLEU-1..4.

	Args:
		model: Captioning model passed to ``generate_desc``.
		descriptions: dict mapping an image id to its reference descriptions.
		photos: Mapping of image id -> feature array.
		tokenizer: Tokenizer passed to ``generate_desc``.
		maxim_lgth: Maximum sequence length passed to ``generate_desc``.

	Returns:
		None; the four scores are printed.
	"""
	actual=[]
	predicted=[]
	for key in descriptions:
		caption=generate_desc(model,tokenizer,photos[key],maxim_lgth)
		# References and hypothesis are both compared as word lists.
		actual.append([reference.split() for reference in descriptions[key]])
		predicted.append(caption.split())
	# NOTE(review): the BLEU-3 weights below sum to 0.9, not 1.0 — confirm this is intended.
	print('BLEU-1:%f'%corpus_bleu(actual,predicted,weights=(1.0,0,0,0)))
	print('BLEU-2:%f'%corpus_bleu(actual,predicted,weights=(0.5,0.5,0,0)))
	print('BLEU-3:%f'%corpus_bleu(actual,predicted,weights=(0.3,0.3,0.3,0)))
	print('BLEU-4:%f'%corpus_bleu(actual,predicted,weights=(0.25,0.25,0.25,0.25)))

In [None]:
# Evaluate a saved model on the test split.
# NOTE(review): all paths below are absolute and machine-specific.
name_of_file='C:/Users/Kunj/Downloads/recommender/Flickr_8k.testImages.txt'
test=dset_load(name_of_file)
print('Dataset: %d'%len(test))
# NOTE(review): the descriptions file was written earlier with the relative path
# 'descriptions.txt'; this absolute path only matches if the working directory
# is that folder — confirm.
test_descriptions=cleaned_desc_load('C:/Users/Kunj/Downloads/recommender/descriptions.txt',test)
print('Descriptions: test=%d'%len(test_descriptions))
test_features=img_features_load('C:/Users/Kunj/Downloads/recommender/features.pkl',test)
print('Photos: test=%d'%len(test_features))
# name_of_file is reused here for the model file; this replaces the global `model`.
name_of_file='C:/Users/Kunj/Downloads/recommender/model/model_19.h5'
model=load_model(name_of_file)
print("evaluation start")
# Relies on `tokenizer` and `most_lngth` defined by earlier cells.
model_evaluation(model,test_descriptions,test_features,tokenizer,most_lngth)

In [None]:
# Rebuild the tokenizer from the training descriptions and pickle it to disk.
# NOTE(review): all paths below are absolute and machine-specific.
name_of_file='C:/Users/Kunj/Downloads/recommender/Flickr_8k.trainImages.txt'
train=dset_load(name_of_file)
print('Dataset: %d'%len(train))
train_descriptions=cleaned_desc_load('C:/Users/Kunj/Downloads/recommender/descriptions.txt',train)
print('Descriptions: train=%d'%len(train_descriptions))
tokenizer=generate_Tokenizer(train_descriptions)
# Use a context manager so the pickle is flushed and the handle closed
# before another cell reads the file back.
with open('C:/Users/Kunj/Downloads/recommender/tokenizer.pkl','wb') as tokenizer_file:
	dump(tokenizer,tokenizer_file)

In [None]:
# Reload the pickled tokenizer.
# NOTE: unpickling can execute arbitrary code — only load trusted files.
# The context manager closes the file handle once loading is done.
with open('C:/Users/Kunj/Downloads/recommender/tokenizer.pkl','rb') as tokenizer_file:
	tokenizer=load(tokenizer_file)
# NOTE(review): presumably the maximum description length found during
# training (see `most_lngth`) — confirm it matches the loaded model.
maxim_lgth=34

In [None]:
# Load the saved captioning model from disk (absolute, machine-specific path).
model=load_model('C:/Users/Kunj/Downloads/recommender/model/model_19.h5')

In [None]:
def extract_features(name_of_file):
	"""Compute the VGG16 feature array for a single image file.

	The image is loaded at 224x224, converted to an array, given a leading
	batch axis of size 1, passed through the VGG16 ``preprocess_input`` and
	then through VGG16 truncated at its second-to-last layer.

	Args:
		name_of_file: Path of the image file.

	Returns:
		The array returned by ``model.predict`` for the single image.
	"""
	model=VGG16()
	# Re-wire the network so its output is the second-to-last layer.
	model=Model(inputs=model.inputs,outputs=model.layers[-2].output)
	image=load_img(name_of_file,target_size=(224,224))
	image=img_to_array(image)
	# Add the batch axis expected by predict().
	image=image.reshape((1,image.shape[0],image.shape[1],image.shape[2]))
	image=preprocess_input(image)
	feature=model.predict(image,verbose=0)
	return feature
# Caption a single example image with the loaded model and tokenizer.
photo=extract_features('C:/Users/Kunj/Downloads/recommender/example.jpg')
# NOTE(review): padding_words is not used anywhere in the visible code;
# presumably meant for stripping the start/end markers from the output — confirm.
padding_words=['startingsequence','endingsequence']
description=generate_desc(model,tokenizer,photo,maxim_lgth)
print(description)