### Prepare text

##### Load the file with all the descriptions in the format -> image name description. 
##### Eg - 2252123185_487f21e336.jpg Stadium full of people watch game. People are cheering.

In [1]:
# load doc into memory
def load_doc(filename):
    """Return the entire contents of a text file as a single string.

    Args:
        filename: Path of the file to read.

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises,
    # which the previous manual open()/close() pair did not guarantee.
    with open(filename, 'r') as file:
        return file.read()

In [None]:
# Path of the caption file, relative to the current working directory.
filename = 'Flickr8k_text/Flickr8k.token.txt'
# load descriptions: read the whole file into memory as one string
doc = load_doc(filename)

##### The load_descriptions() function takes the loaded document text and returns a dictionary that maps photo identifiers to descriptions. E.g. the key is `2252123185_487f21e336` and the value stored under this key is `['Stadium full of people watching game.', 'People are cheering.']`, i.e. each value is a list of strings.

In [None]:
# extract descriptions for images
def load_descriptions(doc):
    """Map each image identifier to the list of its descriptions.

    Every line of ``doc`` is split on whitespace. The first token is the
    image name and the remaining tokens form the description. The
    identifier is the part of the image name before its first ``.``, so a
    name such as ``123_abc.jpg#0`` yields the key ``123_abc``.

    Args:
        doc: Full text of the descriptions file, one description per line.

    Returns:
        A dict whose keys are image identifiers and whose values are lists
        of description strings, in the order they appear in ``doc``.
        Runs of whitespace inside a description are collapsed to one space.
    """
    mapping = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # Test the token count rather than the raw line length: a line made
        # only of whitespace has no tokens and would raise IndexError below,
        # and a line holding just an image name carries no description.
        if len(tokens) < 2:
            continue
        # take the first token as the image id, the rest as the description
        image_name, desc_tokens = tokens[0], tokens[1:]
        # remove the file extension (and anything after it) from the image id
        image_id = image_name.split('.')[0]
        # create the list on first sight of the id, then store the description
        mapping.setdefault(image_id, []).append(' '.join(desc_tokens))
    return mapping

In [None]:
# parse descriptions: group the descriptions by image identifier
descriptions = load_descriptions(doc)
# len() of the dict counts distinct image identifiers, not descriptions
print('Loaded: %d ' % len(descriptions))

##### Convert all words to lowercase.
##### Remove all punctuation.
##### Remove all words that are one character or less in length (e.g. ‘a’).
##### Remove all words with numbers in them.

In [None]:
import string
 
def clean_descriptions(descriptions):
    """Normalise every description in ``descriptions`` in place.

    Each description is split on whitespace. Every token is lower-cased
    and stripped of ASCII punctuation, then kept only if it is longer than
    one character and purely alphabetic. The surviving tokens are joined
    with single spaces and written back over the original string.

    Args:
        descriptions: Dict mapping an image identifier to a list of
            description strings. The lists are modified in place.

    Returns:
        None.
    """
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)
    for captions in descriptions.values():
        for idx, caption in enumerate(captions):
            kept = []
            for token in caption.split():
                token = token.lower().translate(strip_punct)
                # drop one-character leftovers (a hanging 's' or 'a') and
                # any token that still contains a non-letter such as a digit
                if len(token) > 1 and token.isalpha():
                    kept.append(token)
            # store as string
            captions[idx] = ' '.join(kept)

In [None]:
# clean descriptions: the lists inside `descriptions` are rewritten in place
# and the function returns nothing, so there is no result to assign
clean_descriptions(descriptions)

#### Building Vocabulary of the sentences
##### Split every description stored under every key of the dictionary above into words and add those words to a set, so that each distinct word is kept exactly once.

In [None]:
# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
    """Collect the set of distinct words used across all descriptions.

    Args:
        descriptions: Dict mapping an image identifier to a list of
            description strings.

    Returns:
        A set containing every whitespace-separated word that occurs in
        any description.
    """
    vocabulary = set()
    # A plain loop replaces the earlier list comprehension, which was run
    # only for its side effect and built a throwaway list of None values.
    for desc_list in descriptions.values():
        for desc in desc_list:
            vocabulary.update(desc.split())
    return vocabulary

In [None]:
# summarize vocabulary: `vocabulary` is the set of distinct words found in
# the descriptions, so its length is the number of unique words
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))

##### Save the dictionary of image identifiers and descriptions to a new file named descriptions.txt, with one image identifier and description per line.
##### descriptions.txt contains the processed text and will look like this:
###### 2252123185_487f21e336 bunch on people are seated in stadium
###### 2252123185_487f21e336 crowded stadium is full of people watching an event
###### 2252123185_487f21e336 crowd of people fill up packed stadium
###### 2252123185_487f21e336 crowd sitting in an indoor stadium
###### 2252123185_487f21e336 stadium full of people watch game

In [None]:
# save descriptions to file, one per line
def save_descriptions(descriptions, filename):
    """Write all descriptions to a text file, one per line.

    Each line has the form ``<image id> <description>``. Lines are joined
    with ``\\n`` and no trailing newline is written.

    Args:
        descriptions: Dict mapping an image identifier to a list of
            description strings.
        filename: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    lines = []
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + ' ' + desc)
    data = '\n'.join(lines)
    # The context manager closes (and flushes) the handle even when write()
    # raises, which the previous manual open()/close() pair did not guarantee.
    with open(filename, 'w') as file:
        file.write(data)

In [None]:
# save descriptions: writes 'descriptions.txt' relative to the current
# working directory, replacing any existing file of that name
save_descriptions(descriptions, 'descriptions.txt')