## Prepare training text

`descriptions` is a dict that maps each image id to a list of its cleaned/processed descriptions: `descriptions[image_id] = [description, ...]` <br>
Cleans the given text descriptions and saves them to 'descriptions.txt'.

In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import string
from pickle import load

In [None]:
def load_doc(filename):
  """Read a text file and return its whole content as a single string.

  Args:
    filename: Path of the text file to read.

  Returns:
    The complete text of the file.

  Raises:
    OSError: If the file cannot be opened or read.
  """
  # The context manager closes the handle even when read() raises.
  with open(filename, 'r') as file:
    return file.read()

def load_descriptions(doc):
  """Group raw caption lines by image id.

  Every line of `doc` is split on whitespace. The first token is taken as the
  image file name and everything before its first '.' becomes the image id;
  the remaining tokens are re-joined with single spaces to form the
  description.

  Lines that do not contain both an image name and at least one description
  word (blank lines, whitespace-only lines, a lone image name) are skipped.

  Args:
    doc: Full text of the captions file, one caption per line.

  Returns:
    dict mapping image id -> list of description strings, in file order.
  """
  all_descriptions_dict = dict()
  for line in doc.split('\n'):
    tokens = line.split()
    # Check the token count rather than the raw line length: a line made only
    # of whitespace can be long yet yield no tokens, and tokens[0] would fail.
    if len(tokens) < 2:
      continue
    image_name, desc_tokens = tokens[0], tokens[1:]
    # Drop the file extension (and anything after it), keeping just the id.
    image_id = image_name.split('.')[0]
    all_descriptions_dict.setdefault(image_id, []).append(' '.join(desc_tokens))
  return all_descriptions_dict

def clean_descriptions(descriptions):
  """Normalise every description in place.

  Each description is split on whitespace; every word is lower-cased and has
  its punctuation stripped. Words that end up one character or shorter, or
  that contain non-alphabetic characters, are dropped. The surviving words
  are joined with single spaces and written back to the same list slot.

  Args:
    descriptions: dict mapping image id -> list of description strings.
      The lists are modified in place; nothing is returned.
  """
  strip_punctuation = str.maketrans('', '', string.punctuation)
  for captions in descriptions.values():
    for idx, caption in enumerate(captions):
      # lower-case first, then strip punctuation, same order for every word
      words = (raw.lower().translate(strip_punctuation) for raw in caption.split())
      # keep only multi-character, purely alphabetic words
      captions[idx] = ' '.join(w for w in words if len(w) > 1 and w.isalpha())

def save_descriptions(descriptions, filename):
  """Write all descriptions to a text file, one description per line.

  Each line has the form '<image id> <description>'. Lines are separated by
  '\\n' with no trailing newline.

  Args:
    descriptions: dict mapping image id -> list of description strings.
    filename: Path of the output file; it is created or overwritten.

  Raises:
    OSError: If the file cannot be opened or written.
  """
  lines = [
    image_id + ' ' + desc
    for image_id, desc_list in descriptions.items()
    for desc in desc_list
  ]
  # Write exactly once, after every line has been collected. Writing inside
  # the per-image loop would rewrite the file for each image and would never
  # create it when `descriptions` is empty.
  with open(filename, 'w') as file:
    file.write('\n'.join(lines))

In [None]:
def cleanImageCaptionsTrainingData(filename, output_filename='descriptions.txt'):
  """Load, clean and save the image captions found in `filename`.

  Runs the pipeline: load_doc -> load_descriptions -> clean_descriptions ->
  save_descriptions.

  Args:
    filename: Path of the raw captions file.
    output_filename: Path the cleaned descriptions are written to.
      Defaults to 'descriptions.txt'.
  """
  doc = load_doc(filename) # load unprocessed descriptions
  # load_descriptions must be called with the loaded text; binding the
  # function object itself would fail later on `.items()`.
  descriptions = load_descriptions(doc) # image id -> list of raw descriptions
  clean_descriptions(descriptions) # cleans the lists in place
  save_descriptions(descriptions, output_filename)

if __name__ == "__main__":
  import sys
  # No module-level `filename` exists, so the captions path is taken from the
  # command line: exactly one argument, the captions file.
  if len(sys.argv) == 2:
    cleanImageCaptionsTrainingData(sys.argv[1])
  else:
    # Any other argument count means no usable path was given; report usage
    # instead of failing.
    print('usage: python <script> <captions_file>', file=sys.stderr)

In [None]:
# Run the cleaning pipeline on the Flickr8k token file located under the mounted Drive path.
cleanImageCaptionsTrainingData('/content/drive/My Drive/ImageCaptioningProject/Flickr8k_text/Flickr8k.token.txt')