<a href="https://colab.research.google.com/github/rahulkasaudhan99/Image_Captioning/blob/main/Image_Captioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from os import listdir
from pickle import dump
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model

In [2]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text
 
filename = "/content/drive/MyDrive/Colab Notebooks/Dataset/Flickr8k_text/Flickr8k.token.txt"
# load descriptions
doc = load_doc(filename)

In [3]:
# extract descriptions for images
def load_descriptions(doc):
	mapping = dict()
	# process lines
	for line in doc.split('\n'):
		# split line by white space
		tokens = line.split()
		if len(line) < 2:
			continue
		# take the first token as the image id, the rest as the description
		image_id, image_desc = tokens[0], tokens[1:]
		# remove filename from image id
		image_id = image_id.split('.')[0]
		# convert description tokens back to string
		image_desc = ' '.join(image_desc)
		# create the list if needed
		if image_id not in mapping:
			mapping[image_id] = list()
		# store description
		mapping[image_id].append(image_desc)
	return mapping
 
# parse descriptions
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))

Loaded: 8092 


In [4]:
print(type(descriptions))

<class 'dict'>


In [5]:
dict_items = descriptions.items()

first_two = list(dict_items)[:2]
#print(type(first_two))
for x in first_two:
  for i in x[1]:
    print(x[0],i)
  print()

1000268201_693b08cb0e A child in a pink dress is climbing up a set of stairs in an entry way .
1000268201_693b08cb0e A girl going into a wooden building .
1000268201_693b08cb0e A little girl climbing into a wooden playhouse .
1000268201_693b08cb0e A little girl climbing the stairs to her playhouse .
1000268201_693b08cb0e A little girl in a pink dress going into a wooden cabin .

1001773457_577c3a7d70 A black dog and a spotted dog are fighting
1001773457_577c3a7d70 A black dog and a tri-colored dog playing with each other on the road .
1001773457_577c3a7d70 A black dog and a white dog with brown spots are staring at each other in the street .
1001773457_577c3a7d70 Two dogs of different breeds looking at each other on the road .
1001773457_577c3a7d70 Two dogs on pavement moving toward each other .



In [None]:
import string
 
def clean_descriptions(descriptions):
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for key, desc_list in descriptions.items():
		for i in range(len(desc_list)):
			desc = desc_list[i]
			# tokenize
			desc = desc.split()
			# convert to lower case
			desc = [word.lower() for word in desc]
			# remove punctuation from each token
			desc = [w.translate(table) for w in desc]
			# remove hanging 's' and 'a'
			desc = [word for word in desc if len(word)>1]
			# remove tokens with numbers in them
			desc = [word for word in desc if word.isalpha()]
			# store as string
			desc_list[i] =  ' '.join(desc)
 
# clean descriptions
clean_descriptions(descriptions)

In [None]:
print(type(descriptions))

<class 'dict'>


In [6]:
dict_items = descriptions.items()

first_two = list(dict_items)[:2]
#print(type(first_two))
for x in first_two:
  for i in x[1]:
    print(x[0],i)
  print()

1000268201_693b08cb0e A child in a pink dress is climbing up a set of stairs in an entry way .
1000268201_693b08cb0e A girl going into a wooden building .
1000268201_693b08cb0e A little girl climbing into a wooden playhouse .
1000268201_693b08cb0e A little girl climbing the stairs to her playhouse .
1000268201_693b08cb0e A little girl in a pink dress going into a wooden cabin .

1001773457_577c3a7d70 A black dog and a spotted dog are fighting
1001773457_577c3a7d70 A black dog and a tri-colored dog playing with each other on the road .
1001773457_577c3a7d70 A black dog and a white dog with brown spots are staring at each other in the street .
1001773457_577c3a7d70 Two dogs of different breeds looking at each other on the road .
1001773457_577c3a7d70 Two dogs on pavement moving toward each other .



In [7]:
# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
	# build a list of all description strings
	all_desc = set()
	for key in descriptions.keys():
		[all_desc.update(d.split()) for d in descriptions[key]]
	return all_desc
 
# summarize vocabulary
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))


Vocabulary Size: 9630


In [8]:
print(type(vocabulary))
for x in vocabulary:
  print(x)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
tank
balconies
fleeing
stay
Hall
ripped
sum
possessions
sips
smaller
bulldozer
baseball
moss-covered
poling
bananas
circles
hotel
Mission
plantains
reading
menacingly
wade
Martial
Gothic
Rollerblades
piggyback
shook
segway
wrestle
church
chase
operators
dirt
hello
Middle
teases
drive
garner
int
pooping
union
Eleven
Twome
graffiti-covered
fair
squeak
lifeboat
beckons
Crowded
billboard
brown
jeep
Mets
washed
gravity
non-working
clips
queue
pinball
short-haired
unzipping
pain
grinds
wood-stacked
environment
unner
sprawled
warming
hilly
wakeboarding
formula
Texas
expressions
fadora
electricity
large-boned
Mexico
safety
roadway
Mani
sidwalk
blurs
pouch
attampts
vendor
blossoms
assists
rounded
mishap
Row
lollipop
descends
dolly
Lean
grazing
apart
shrubbery
groceries
from
signer
Street
pulls
Youth
tucking
rubs
overlooking
catches
hairclips
flowers
spiritual
throwing
bonfire
HOMELESS
buildings
downpour
usual
equpitment
vision
bor

In [9]:
# save descriptions to file, one per line
def save_descriptions(descriptions, filename):
	lines = list()
	for key, desc_list in descriptions.items():
		for desc in desc_list:
			lines.append(key + ' ' + desc)
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()
 
# save descriptions
save_descriptions(descriptions, 'descriptions.txt')

In [10]:
#print(type())

In [11]:
model = VGG16()
# re-structure the model
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
# summarize
print(model.summary())

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     14758

In [None]:
# extract features from each photo in the directory
def extract_features(directory):
	# load the model
	model = VGG16()
	# re-structure the model
	model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
	# summarize
	print(model.summary())
	# extract features from each photo
	features = dict()
	for name in listdir(directory):
		# load an image from file
		filename = directory + '/' + name
		image = load_img(filename, target_size=(224, 224))
		# convert the image pixels to a numpy array
		image = img_to_array(image)
		# reshape data for the model
		image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
		# prepare the image for the VGG model
		image = preprocess_input(image)
		# get features
		feature = model.predict(image, verbose=0)
		# get image id
		image_id = name.split('.')[0]
		# store feature
		features[image_id] = feature
		print('>%s' % name)
	return features

# extract features from all images
directory = '/content/drive/MyDrive/Colab Notebooks/Dataset/Flickr8k_Dataset/Flicker8k_Dataset'
features = extract_features(directory)
print('Extracted Features: %d' % len(features))
# save to file
dump(features, open('features.pkl', 'wb'))

>399679638_d3036da331.jpg
>385835044_4aa11f6990.jpg
>396360611_941e5849a3.jpg
>384577800_fc325af410.jpg
>391723162_3bdeb7ea33.jpg
>388386075_9ac3a89ada.jpg
>394463341_5311c53783.jpg
>391579205_c8373b5411.jpg
>393987665_91d28f0ed0.jpg
>389643437_9a9830a3ba.jpg
>399246804_b4b5dc70e1.jpg
>390671130_09fdccd52f.jpg
>385186343_464f5fc186.jpg
>396763804_3b7f1e12a8.jpg
>392976422_c8d0514bc3.jpg
>387974450_bcd205daac.jpg
>390987167_2d5905b459.jpg
>394161692_2576920777.jpg
>397815951_3b02090324.jpg
>386470686_1ae9242878.jpg
>390360326_26f5936189.jpg
>397286183_745abbf40d.jpg
>390986651_c801db91a0.jpg
>390992388_d74daee638.jpg
>400562847_e15aba0aac.jpg
>399212516_d68046b277.jpg
>392467282_00bb22e201.jpg
>397451339_76a84bd310.jpg
>387830531_e89c192b92.jpg
>386656845_4e77c3e3da.jpg
>393284934_d38e1cd6fe.jpg
>396179143_e1511336e1.jpg
>393810324_1c33760a95.jpg
>387078972_514a38dc33.jpg
>397547349_1fd14b95af.jpg
>391106734_d374bc3080.jpg
>391020801_aaaae1e42b.jpg
>397982550_cf9f5cdb74.jpg
>391324644_d

KeyboardInterrupt: ignored