In [0]:
# For CS6240 paper
# This is proof-of-concept (PoC) implementation for the extension of "Incorporating a Generative answer model in GQA" in the paper
# Done by Group 3 (Ma Xiaoping, Stephanie Lew)

# This python notebook has been tested in Google Colab only.


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [3]:
%tensorflow_version 1.x
# specify tensorflow version

TensorFlow 1.x selected.


In [4]:
import tensorflow as tf
print(tf.__version__)

1.15.2


In [0]:
# global variables

# maximum number of words, which determines the size of the vector representation of a word
MAX_NUM_WORDS = 3200

# For words that are not in the top MAX_NUM_WORDS
NULLTOKEN = "NULLTOKEN"

MAX_QUESTION_LENGTH=25
MAX_ANSWER_LENGTH=10

# VQA V2 existing features (downloaded from https://github.com/MILVLG/mcan-vqa)
MEDIA_FEATURE_DIMENSION = 2048
MEDIA_MAX_LENGTH = 70

# Question type
QUESTION_TYPE_DIMENSION = 65
MAX_QUESTION_TYPE_LENGTH = 1


# a buffer to store images to save loading time
# MAX_IMAGE_BUFFER_SIZE = 10000
MAX_IMAGE_BUFFER_SIZE = 20000

try:
  print (len(image_data_buffer))
except NameError as e:
  # init the buffer only if it is not init before
  image_data_buffer = {}

# num of candidate answers to be generated
NUM_CANDIDATE_ANSWERS = 50

In [6]:
# prepare data
# vaq v2, the original questions and answers can be downloaded from: https://visualqa.org/download.html

# follow https://machinelearningmastery.com/prepare-text-data-deep-learning-keras/ to preprocess text data

import json
import os
import pprint
import pandas as pd
import re

main_path = "/content/gdrive/My Drive/cs6240_project_codes/"
output_path = os.path.join(main_path, "outputs")

data_main_path = os.path.join(main_path, "data/vqa_v2/")

question_train_file = os.path.join(data_main_path, "v2_OpenEnded_mscoco_train2014_questions.json")
# load questions to a dictionary

with open(question_train_file) as f:
  questions = json.load(f)

qlist = questions["questions"]
qindex_example = 9
print ("qlist[{}] = {}".format(qindex_example, qlist[qindex_example]))
qid_example = qlist[9]['question_id']
# one example in qlist: qlist[9] = {'image_id': 524291, 'question': 'Is the dog looking at a tennis ball or frisbee?', 'question_id': 524291002}

print ("len(qlist) = {}".format(len(qlist)))

answer_train_file = os.path.join(data_main_path, "v2_mscoco_train2014_annotations.json")
# load answers to a dictionary, need to extract the answers and free it to save memory

with open(answer_train_file) as f:
  answers = json.load(f)

alist = answers["annotations"]

for i in range(len(alist)):
  if alist[i]['question_id'] == qid_example:
    pprint.pprint(alist[i])
    aindex = i
    break

for ans in alist:
  del ans['answers']

# remove 'answers' and keep 'multiple_choice_answer'
print ("alist[aindex] = {}".format(alist[aindex]))

def get_text_len(s):
	# the count includes punctuation marks
	return len(re.findall(r"[\w']+|[.,!?;]", s))
 
qdict = {}
question_lenths = []
for question in qlist:
	qdict[question['question_id']] = question
	question_lenths.append(get_text_len(question['question']))
print ("len(qlist) = {}, len(qdict) = {}".format(len(qlist), len(qdict)))

question_lenths.sort()
print ("the maximum question length is ", question_lenths[-1])
for p in [0.7, 0.8, 0.9, 0.95, 0.99, 0.999]:
	print ("the {} percentile question length is {}".format( p*100, question_lenths[ int(len(question_lenths) * p)]))
question_lenths = None

adict = {}
answer_lenths = []
for ans in alist:
	adict[ans['question_id']] = ans
	answer_lenths.append(get_text_len(ans['multiple_choice_answer']))
print ("len(alist) = {}, len(adict) = {}".format(len(alist), len(adict)))
answer_lenths.sort()
print ("the maximum answer length is ", answer_lenths[-1])
for p in [0.7, 0.8, 0.9, 0.95, 0.99, 0.999]:
	print ("the {} percentile answer length is {}".format( p*100, answer_lenths[ int(len(answer_lenths) * p)]))
answer_lenths = None

#def append_startword_endword_to_sentence(s):
#  return "{} {} {}".format(SENTENCE_START, s, SENTENCE_END)

qa_dict = {'question_id':[], 'question':[], 'answer':[], 'image_id':[], 'question_type':[], 'answer_type':[]}
for k in qdict.keys():
	# qa_dict[k] = {'question': qdict[k], 'answer': adict[k]}
	assert(k == qdict[k]['question_id'])
	assert(k == adict[k]['question_id'])
	qa_dict['question_id'].append(k)
	#qa_dict['question'].append(append_startword_endword_to_sentence(qdict[k]['question']))
	#qa_dict['answer'].append(append_startword_endword_to_sentence(adict[k]['multiple_choice_answer']))
	qa_dict['question'].append(qdict[k]['question'])
	qa_dict['answer'].append(adict[k]['multiple_choice_answer'])
	qa_dict['image_id'].append(qdict[k]['image_id'])
	qa_dict['question_type'].append(adict[k]['question_type'])
	qa_dict['answer_type'].append(adict[k]['answer_type'])

print ("len(qa_dict['question_id']) = {}".format(len(qa_dict['question_id'])))

qa_df = pd.DataFrame.from_dict(qa_dict)
print ("qa_df.shape = {}".format(qa_df.shape))
print ("qa_df.iloc[0] =\n{}".format(qa_df.iloc[0]))


qlist[9] = {'image_id': 524291, 'question': 'Is the dog looking at a tennis ball or frisbee?', 'question_id': 524291002}
len(qlist) = 443757
{'answer_type': 'other',
 'answers': [{'answer': 'frisbee', 'answer_confidence': 'yes', 'answer_id': 1},
             {'answer': 'frisbee', 'answer_confidence': 'yes', 'answer_id': 2},
             {'answer': 'frisbee', 'answer_confidence': 'yes', 'answer_id': 3},
             {'answer': 'frisbee', 'answer_confidence': 'yes', 'answer_id': 4},
             {'answer': 'frisbee', 'answer_confidence': 'yes', 'answer_id': 5},
             {'answer': 'frisbee', 'answer_confidence': 'yes', 'answer_id': 6},
             {'answer': 'frisbee', 'answer_confidence': 'yes', 'answer_id': 7},
             {'answer': 'frisbee', 'answer_confidence': 'yes', 'answer_id': 8},
             {'answer': 'frisbee', 'answer_confidence': 'yes', 'answer_id': 9},
             {'answer': 'frisbee',
              'answer_confidence': 'yes',
              'answer_id': 10}],
 'im

In [7]:
import pickle
import os

file_qtypes_saved_before = os.path.join(data_main_path, "question_types_saved.pickle")

with open(file_qtypes_saved_before, 'rb') as f:
  qt_dict = pickle.load(f)

# create a reversed dictionary for convienence for converting an index back to qtype
qt_reversed_dict = {}
for t, i in qt_dict.items():
  qt_reversed_dict[i] = t
assert (len(qt_reversed_dict) == len(qt_dict))

qtypes = qt_dict.keys()
print ("number of unique question types = {}".format(len(qtypes)))
print ("qtypes = ", qtypes)
print ("qt_dict = ", qt_dict)
assert (len(qt_dict) == QUESTION_TYPE_DIMENSION)

# checking only
qtypes_cur = list(set(qa_df["question_type"]))
print ("len(qtypes_cur) = {}".format(len(qtypes_cur)))

# make sure every qtype from qtypes_cur is in qtypes
for t in qtypes_cur:
  assert(t in qt_dict)

qtypes_cur = None # free it in case it is accidentally used


def get_one_hot_representation_qtype(qtype_df):
  qtypes_onehot = []
  ntypes = len(qt_dict)

  v0 = [0] * ntypes
  n = len(qtype_df)

  for qt in qtype_df:
    v = list(v0)
    qt_id = qt_dict[qt]
    v[qt_id] = 1
    # to be consistent with other seq data, put v in [] to increase 1 dimension
    qtypes_onehot.append([v])

  return qtypes_onehot

def list_matrices_to_list_question_types(qtypes_onehot):
  types = []
  for qt in qtypes_onehot:
    v = list(qt[0])
    tid = v.index(1)
    types.append( qt_reversed_dict[tid] )
  return types

# --------------------------------
# the following codes show how the original question_types_saved.pickle was created.
# the qtypes are saved in the beginning to create an common order for future reference

# file_qtypes_saved_before = os.path.join(data_main_path, "question_types_saved.pickle")
# qtypes = list(set(qa_df["question_type"]))
# qtypes.sort()
# qt_dict = {}
# for i in range(len(qtypes)):
#  qt_dict[ qtypes[i] ] = i

## save the dictionary for future lookup
# with open(file_qtypes_saved_before, 'wb') as f:
#  # store the qt_dict as binary data stream
#  pickle.dump(qt_dict, f)

number of unique question types = 65
qtypes =  dict_keys(['are', 'are the', 'are there', 'are there any', 'are these', 'are they', 'can you', 'could', 'do', 'do you', 'does the', 'does this', 'has', 'how', 'how many', 'how many people are', 'how many people are in', 'is', 'is he', 'is it', 'is that a', 'is the', 'is the man', 'is the person', 'is the woman', 'is there', 'is there a', 'is this', 'is this a', 'is this an', 'is this person', 'none of the above', 'was', 'what', 'what animal is', 'what are', 'what are the', 'what brand', 'what color', 'what color are the', 'what color is', 'what color is the', 'what does the', 'what is', 'what is in the', 'what is on the', 'what is the', 'what is the color of the', 'what is the man', 'what is the name', 'what is the person', 'what is the woman', 'what is this', 'what kind of', 'what number is', 'what room is', 'what sport is', 'what time', 'what type of', 'where are the', 'where is the', 'which', 'who is', 'why', 'why is the'])
qt_dict =  {

In [8]:
import os
import datetime

# The image feature data downloaded from https://github.com/MILVLG/mcan-vqa
data_path2 = os.path.join(main_path, "data/vqa_v2/MILVLG_mcan-vqa/")
#foldername = "image_features_examples_from_MILVLG_mcan-vqa"
foldername = "train2014"
filename = "{}.tar.gz".format(foldername)
tarfile =os.path.join(data_path2, filename)

image_path = os.path.join("./", foldername) # updated image path

# copy the tarfile to disk and extract it there
!ls ./

# firstly, check if the extracted folder is there
if os.path.isdir(foldername):
  print ("folder {} exists. no need to copy and extract".format(foldername))
else:
  # if not then copy over, and later extract
  if os.path.isfile(filename):
    print ("file {} exists".format(filename))
  else:
    print ("start copying the file to home folder")
    print ("start time: ", datetime.datetime.now())
    !cp "{tarfile}" .
    print ("end time: ", datetime.datetime.now())  

  # now extract the tar file
  print ("start extracting the tar file")
  print ("start time: ", datetime.datetime.now())
  !tar xf "{filename}"
  print ("end time: ", datetime.datetime.now())

  if os.path.isdir(foldername):
    print ("extraction is done. remove tar file")
    !rm "{filename}"
    
!ls ./

gdrive	sample_data
start copying the file to home folder
start time:  2020-04-24 03:12:56.539571
end time:  2020-04-24 03:21:08.947510
start extracting the tar file
start time:  2020-04-24 03:21:08.948089
end time:  2020-04-24 03:26:08.598201
extraction is done. remove tar file
gdrive	sample_data  train2014


In [0]:
# !rm -r "train2014"

In [10]:
import os
import numpy as np

# sometimes the mounting will encounter the "Google Drive timeout" error. have to try for a few times
if not os.path.isdir(image_path):
  print("image features folder {} does not exist".format(image_path))
else:
  print ("image feature folder {} exists.".format(image_path))

# test
filename = "COCO_train2014_000000458752.jpg.npz"
filepath = os.path.join(image_path, filename)
if not os.path.isfile(filepath):
  print ("file {} does not exist".format(filepath))
else:
  print ("file {} can be found".format(filepath))

image = np.load(filepath)
print ("image.keys() = {}".format(image.keys()))
print ("image.files = {}".format(image.files))
print ("image['x'].shape = {}".format(image['x'].shape))
print ("image['image_w'] = {}".format(image['image_w']))
print ("image['image_h'] = {}".format(image['image_h']))
print ("image['num_bbox'] = {}".format(image['num_bbox']))
print ("image['bbox'].shape = {}".format(image['bbox'].shape))

# some stats computed offline:
# the maximum image features length is  99
# the 70.0 percentile image features length is 37
# the 80.0 percentile image features length is 41
# the 90.0 percentile image features length is 46
# the 95.0 percentile image features length is 51
# the 99.0 percentile image features length is 59
# the 99.9 percentile image features length is 69

image feature folder ./train2014 exists.
file ./train2014/COCO_train2014_000000458752.jpg.npz can be found
image.keys() = KeysView(<numpy.lib.npyio.NpzFile object at 0x7ffb00f87b70>)
image.files = ['x', 'image_w', 'bbox', 'num_bbox', 'image_h']
image['x'].shape = (2048, 34)
image['image_w'] = 640
image['image_h'] = 480
image['num_bbox'] = 34
image['bbox'].shape = (34, 4)


In [0]:
# print ("qa_df[\"image_id\"][0:100] = ", list(qa_df["image_id"][0:100]) )

In [0]:
## old codes

# get all the questions in one list
# questions_list = []
# for question in qlist:
#  questions_list.append(question['question'])
# tq.fit_on_texts(questions_list)

# tq = Tokenizer(num_words=1000, split=' ', char_level=False)
# tq.fit_on_texts(qa_df["question"])

# num_words_more_than_50_occurences = len( list( filter(lambda elem: elem[1] > 50, tq.word_counts.items()) ) )
# print ("num_words_more_than_50_occurences = {}".format(num_words_more_than_50_occurences))
# based on original questions in training data
# num_words_more_than_5_occurences = 6105
# num_words_more_than_10_occurences = 4495
# num_words_more_than_15_occurences = 3712
# num_words_more_than_20_occurences = 3194
# num_words_more_than_30_occurences = 2614
# num_words_more_than_50_occurences = 2039

# summary:
# print("tq.word_counts = {}".format(len(tq.word_counts)) )
# print("tq.document_count = {}".format(tq.document_count) )
# print("tq.word_index = {}".format(tq.word_index) )
# print("tq.word_docs = {}".format(tq.word_docs) )

# ta = Tokenizer(split=' ', char_level=False)
# ta.fit_on_texts(qa_df["answer"])
# print("ta.word_counts = {}".format(len(ta.word_counts)) )
# print("ta.document_count = {}".format(ta.document_count) )

In [0]:
# How the tokenizer is trained in the first place

# from keras.preprocessing.text import Tokenizer
# import pickle

# print ("qa_df.iloc[0] =\n{}".format(qa_df.iloc[0]))

# tqa = Tokenizer(num_words=MAX_NUM_WORDS, split=' ', char_level=False, oov_token=NULLTOKEN)
# tqa.fit_on_texts(qa_df["question"] + qa_df["answer"])
# print("tqa.word_counts = {}".format(len(tqa.word_counts)) )
# print("tqa.document_count = {}".format(tqa.document_count) )

# print ("tqa.word_index[\"the\"] = ", tqa.word_index["the"])

# print ("word index for the nulltoken = ", tqa.word_index[NULLTOKEN])

# save trained tokenizer to file
# filename = "tokenizer.pickle"
# filepath = os.path.join(output_path, filename)
# pickle.dump(tqa, open(filepath, 'wb'))
 

# The words kept by Tokenizer do not depend on freq.. just the most common words.
# Therefore the self-created SENTENCE_START and SENTENCE_END words are not there.

# sentence_start_index = tqa.word_index[SENTENCE_START]
# sentence_end_index = tqa.word_index[SENTENCE_END]
# print ("sentence_start_index = ", sentence_start_index)
# print ("sentence_end_index = ", sentence_end_index)

In [14]:
# load trained tokenizer from file
import pickle
import datetime

filename = "tokenizer.pickle"
filepath = os.path.join(output_path, filename)

print ("start time for loading tokenizer: ", datetime.datetime.now())
tqa = pickle.load(open(filepath, 'rb'))
print ("end time for loading tokenizer: ", datetime.datetime.now())

print("tqa.word_counts = {}".format(len(tqa.word_counts)) )
print("tqa.document_count = {}".format(tqa.document_count) )

print ("tqa.word_index[\"the\"] = ", tqa.word_index["the"])

# check what is the number for the nulltoken
NULLTOKEN_INDEX = tqa.word_index[NULLTOKEN]
print ("word index for the nulltoken = ", NULLTOKEN_INDEX)


start time for loading tokenizer:  2020-04-24 03:26:13.859463
end time for loading tokenizer:  2020-04-24 03:26:19.068605
tqa.word_counts = 19710
tqa.document_count = 443757
tqa.word_index["the"] =  2
word index for the nulltoken =  1


In [15]:
import time
import numpy as np
import os
import random

# image features from https://github.com/MILVLG/mcan-vqa
# every image["x"] contains 2048 features for each bounding box;
# so the shape of image["x"] is always 2048*num_bbox

# image_path =  = os.path.join(data_main_path, "MILVLG_mcan-vqa")

def reduce_buffer_if_needed():
	n = len(image_data_buffer) 
	if n <= MAX_IMAGE_BUFFER_SIZE:
		print ("buffer ({}) not filled up.".format(n))
		return

	n_to_remove = n - MAX_IMAGE_BUFFER_SIZE
	print ("To remove {} items from buffer with size {}".format(n_to_remove, n))

	time_and_image_ids = []
	for key, item in image_data_buffer.items():
		time_and_image_ids.append((item["time"], key))

	# time_and_image_ids.sort() # deleted based on last accessed time
	random.shuffle(time_and_image_ids) # randomly delete

	for i in range(n_to_remove):
		print ("remove image (last accessed time, image id) = {}".format(time_and_image_ids[i]))
		del image_data_buffer[time_and_image_ids[i][1]]

	assert(n - n_to_remove == len(image_data_buffer))
	print ("After removing old items, len(image_data_buffer) = ", len(image_data_buffer))

def get_image_data_based_on_image_ids(list_image_ids):
	image_data = []
	list_found = []
	last_id = -1
	for image_id in list_image_ids:
		# print ("image_id = ", image_id)
		if image_id in image_data_buffer:
			image_data.append(image_data_buffer[image_id]["image"]["x"])
			list_found.append(True)
		else:
			image_id2 = str(image_id).zfill(12)
			filename = "COCO_train2014_{}.jpg.npz".format(image_id2)
			# print ("\nfile: {}".format(filename))

			filepath = os.path.join(image_path, filename)
   
			# for .npz file, a dictionary-like object is returned
			try:
				image = np.load(filepath)
			except:
				if image_id != last_id:
					print ("for image id {}, cannot find file {}. continue for testing".format(image_id, filepath))
				last_id = image_id
				list_found.append(False)
				continue

			image = dict(image)
			# print ("image.keys() = ", image.keys())
			# reshape x:
			image['x'] = image['x'].reshape((-1,2048))
			image_data_buffer[image_id] = {"image": image, "time": time.time()}
			image_data.append(image_data_buffer[image_id]["image"]["x"])
			list_found.append(True)

		# print ("image.keys() = {}".format(image.keys()))
		# print ("image['x'].shape = {}".format(image['x'].shape))

	print ("len(list_found) = {}, len(list_image_ids) = {}".format(len(list_found), len(list_image_ids)))
	assert(len(list_found) == len(list_image_ids))
	assert(sum(list_found) == len(image_data)) 
	reduce_buffer_if_needed()

	return image_data, list_found

# test
list_image_ids = qa_df["image_id"][0:100]
image_data, list_found = get_image_data_based_on_image_ids(list_image_ids)
print ("len(list_image_ids) = {}, len(image_data) = {}".format(len(list_image_ids), len(image_data)))
if len(image_data) > 0:
  print ("image_data[0].shape = ", image_data[0].shape)
# assert(len(list_image_ids) == len(image_data))

len(list_found) = 100, len(list_image_ids) = 100
buffer (17) not filled up.
len(list_image_ids) = 100, len(image_data) = 100
image_data[0].shape =  (34, 2048)


In [16]:
from keras.preprocessing import sequence
import numpy as np

def list_texts_to_encoded_sequence(df, tokenizer):
  # list_texts = [ x.split() for x in df ]
  # convert a list of strings to a list of sequences (one sequence is a list of integers)
  # the words that are dropped by the Tokenizer will be dropped too..
  seqs = tokenizer.texts_to_sequences(df)
  # encoded = tokenizer_model.sequences_to_matrix(encoded) 
  return seqs

def one_seq_to_one_matrix(s):
	n = len(s)
	matrix = []
	v0 = [0] * MAX_NUM_WORDS
	for i in range(n):
		v = list(v0)
		v[s[i]] = 1
		matrix.append(v)
	return matrix

def sequences_to_list_onehot_matrices(seqs):
	lmatrices = []
	for s in seqs:
		lmatrices.append(one_seq_to_one_matrix(s))
	return lmatrices

def list_texts_to_list_onehot_matrices(df, tokenizer):
  return sequences_to_list_onehot_matrices(list_texts_to_encoded_sequence(df, tokenizer))

# converting a list of matrices back to a list of texts
def list_matrices_to_list_seqs(lmatrices):
  ls = []
  for m in lmatrices:
    s = []
    for row in m:
      row = list(row)
      index = row.index(max(row))
      s.append(index)
    ls.append(s)
  return ls


def list_matrices_to_list_texts(lmatrices):
  return tqa.sequences_to_texts(list_matrices_to_list_seqs(lmatrices))


def list_sequences_to_texts(lseq):
  return tqa.sequences_to_texts(lseq)

Using TensorFlow backend.


In [17]:
def format_answer_texts(adf):
  adata = list_texts_to_list_onehot_matrices(adf, tqa)

  adata = sequence.pad_sequences(adata, maxlen=MAX_ANSWER_LENGTH, padding='post', truncating='post')
  # adata = np.expand_dims(adata, axis=3) 
  print ("adata.shape = {}".format(adata.shape))
  return adata

# return associated questions and answers
def get_train_data_qa(start, end):
  qdf = qa_df["question"][start:end]

  qdata = list_texts_to_list_onehot_matrices(qdf, tqa)

  qdata = sequence.pad_sequences(qdata, maxlen=MAX_QUESTION_LENGTH, padding='post', truncating='post')
  # qdata = np.expand_dims(qdata, axis=3)
  print ("qdata.shape = {}".format(qdata.shape))

  adf = qa_df["answer"][start:end]
  adata = format_answer_texts(adf)

  #adata = list_texts_to_list_onehot_matrices(adf, tqa)

  #adata = sequence.pad_sequences(adata, maxlen=MAX_ANSWER_LENGTH, padding='post', truncating='post')
  # adata = np.expand_dims(adata, axis=3) 
  #print ("adata.shape = {}".format(adata.shape))

  return qdata, adata

# return associated questions, answers, media, and question types
def get_train_data(start, end):
  qdf = qa_df["question"][start:end]

  qdata = list_texts_to_list_onehot_matrices(qdf, tqa)

  qdata = sequence.pad_sequences(qdata, maxlen=MAX_QUESTION_LENGTH, padding='post', truncating='post')
  # qdata = np.expand_dims(qdata, axis=3)
  print ("qdata.shape = {}".format(qdata.shape))

  adf = qa_df["answer"][start:end]
  adata = list_texts_to_list_onehot_matrices(adf, tqa)

  adata = sequence.pad_sequences(adata, maxlen=MAX_ANSWER_LENGTH, padding='post', truncating='post')
  # adata = np.expand_dims(adata, axis=3) 
  print ("adata.shape = {}".format(adata.shape))

  qtype_df = qa_df["question_type"][start:end]
  qtype_data = get_one_hot_representation_qtype(qtype_df)
  qtype_data = sequence.pad_sequences(qtype_data, maxlen=MAX_QUESTION_TYPE_LENGTH, padding='post', truncating='post')
  print ("qtype_data.shape = {}".format(qtype_data.shape))

  image_ids = qa_df["image_id"][start:end]
  image_data, list_found = get_image_data_based_on_image_ids(image_ids)
  image_data = sequence.pad_sequences(image_data, maxlen=MEDIA_MAX_LENGTH, padding='post', truncating='post')
  print ("image_data.shape = {}".format(image_data.shape))
  qdata = qdata[list_found]
  adata = adata[list_found]
  qtype_data = qtype_data[list_found]

  print ("qdata.shape = {}".format(qdata.shape))
  print ("adata.shape = {}".format(adata.shape))
  print ("qtype_data.shape = {}".format(qtype_data.shape))

  return {'question_data': qdata,
          'answer_data': adata,
          'media_data': image_data,
          'questionType_data': qtype_data,
          'question_texts': qdf,
          'answer_texts': adf,
          'questionType_texts': qtype_df,
          'media_ids': image_ids,
          'question_seqs': list_texts_to_encoded_sequence(qdf, tqa),
          'answer_seqs': list_texts_to_encoded_sequence(adf, tqa)}

def get_train_data_qam(start, end):
  data = get_train_data(start, end)
  # return qdata, adata, image_data
  return data['question_data'], data['answer_data'], data['media_data']

# test
print ("\ntest get_train_data_qa")
qdata, adata = get_train_data_qa(0, 100)

print ("\ntest get_train_data")
data = get_train_data(0, 99)

print ("\ntest get_train_data_qam")
qdata, adata, image_data = get_train_data_qam(0, 100)


test get_train_data_qa
qdata.shape = (100, 25, 3200)
adata.shape = (100, 10, 3200)

test get_train_data
qdata.shape = (99, 25, 3200)
adata.shape = (99, 10, 3200)
qtype_data.shape = (99, 1, 65)
len(list_found) = 99, len(list_image_ids) = 99
buffer (17) not filled up.
image_data.shape = (99, 70, 2048)
qdata.shape = (99, 25, 3200)
adata.shape = (99, 10, 3200)
qtype_data.shape = (99, 1, 65)

test get_train_data_qam
qdata.shape = (100, 25, 3200)
adata.shape = (100, 10, 3200)
qtype_data.shape = (100, 1, 65)
len(list_found) = 100, len(list_image_ids) = 100
buffer (17) not filled up.
image_data.shape = (100, 70, 2048)
qdata.shape = (100, 25, 3200)
adata.shape = (100, 10, 3200)
qtype_data.shape = (100, 1, 65)


In [18]:
# seq2seq with 2 inputs. one is text, the other is a list of representation from an image/vidoe. 
# reference: https://medium.com/softmax/multi-input-seq2seq-generation-with-keras-and-talos-84d8bdec2d46

# when this is used for question+image -> answer, the 1st input is the question, and the 2nd input is the image.
# note that the image is trained offline. The input here is just a list of vectors, each of which could be a flattened vector of a m*m*c region.

from keras.layers import Dense, TimeDistributed, CuDNNLSTM, LSTM, RepeatVector, Input, concatenate, Flatten
from keras.models import Model
from keras.optimizers import Adam

# The simplest model that can take 2 seq as input and produce 1 seq as output;
# There is slightly improved model in the next cell
def set_up_simple_model(params):
    print("parameters:", params)
    # colab sometimes limits usage of GPU; to be safe, train using the normal CPU so the saved model can be used
    if params["GPU"] == True:
      LSTM__ = CuDNNLSTM
    else:
      LSTM__ = LSTM
      print ("Use normal LSTM")

    optim = Adam(lr=params["lr"])

    # text input seq
    text_input = Input(shape = params["text_shape"], name='text')

    # media input seq
    media_input = Input(shape = params["media_shape"], name='media')

    # text input goes into one LSTM
    lstm_text_repr = LSTM__(params["layer_1_text_input_neuron"])(text_input)

    # media input goes into another LSTM
    lstm_media_repr = LSTM__(params["layer_1_media_input_neuron"])(media_input)

    # concatenate the 2 outputs from the LSTM cells
    concatenated = concatenate([lstm_text_repr, lstm_media_repr], axis=-1)

    # repeat the concatenated represention for OUTPUT_SEQ_LENGTH
    concatenated_repeated = RepeatVector(params["output_shape1"])(concatenated)

    # feed the seq into another LSTM, which will lead to OUTPUT_SEQ_LENGTH steps
    concatenated_lstm = LSTM__(params["concatenated_layer_neuron"], return_sequences=True)(concatenated_repeated)

    # apply a Dense on each step's output from the last LSTM
    timedistributed = TimeDistributed(Dense(params["output_shape2"], activation='softmax'))(concatenated_lstm)

    model = Model([text_input, media_input], timedistributed)
    model.compile(optimizer=optim, loss='categorical_crossentropy', metrics=['acc'])
    return model

# test
params={
    "GPU": False, # if False, CPU will be used; to use GPU instead, set it to True and also configure Runtime -> change runtime type
    "lr":0.001,
    "text_shape":(MAX_QUESTION_LENGTH, MAX_NUM_WORDS),
    "media_shape":(MEDIA_MAX_LENGTH, MEDIA_FEATURE_DIMENSION),
    "output_shape1": MAX_ANSWER_LENGTH, # output length
    "output_shape2": MAX_NUM_WORDS, # number of possible words
    "layer_1_text_input_neuron": 100,
    "layer_1_media_input_neuron": 150,
    "concatenated_layer_neuron": 200,
}


model_ag_test = set_up_simple_model(params)
print(model_ag_test.summary())  

parameters: {'GPU': False, 'lr': 0.001, 'text_shape': (25, 3200), 'media_shape': (70, 2048), 'output_shape1': 10, 'output_shape2': 3200, 'layer_1_text_input_neuron': 100, 'layer_1_media_input_neuron': 150, 'concatenated_layer_neuron': 200}
Use normal LSTM
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               (None, 25, 3200)     0                                            
__________________________________________________________________________________________________
media (InputLayer)              (None, 70, 2048)     0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 100)         

In [19]:
# seq2seq with 2 inputs. one is text, the other is a list of representation from an image/vidoe. 

from keras.layers import Dense, TimeDistributed, CuDNNLSTM, LSTM, RepeatVector, Input, concatenate, Flatten
from keras.models import Model
from keras.optimizers import Adam

# Improve the the simple model by adding a condensed repr of media to the last lstm
def set_up_model2(params):
    print("parameters:", params)
    # colab sometimes limits usage of GPU; to be safe, train using the normal CPU so the saved model can be used
    if params["GPU"] == True:
      LSTM__ = CuDNNLSTM
    else:
      LSTM__ = LSTM
      print ("Use normal LSTM")

    optim = Adam(lr=params["lr"])

    # text input seq
    text_input = Input(shape = params["text_shape"], name='text')

    # media input seq
    media_input = Input(shape = params["media_shape"], name='media')

    # text input goes into one LSTM
    lstm_text_repr = LSTM__(params["layer_1_text_input_neuron"])(text_input)

    # media input goes into another LSTM
    lstm_media_repr = LSTM__(params["layer_1_media_input_neuron"])(media_input)

    # one more smaller lstm to get the return seq for each component in the media
    condensed_media = LSTM__(20, return_sequences=True)(media_input)

    # flatten it to 1d dim
    condensed_media = Flatten()(condensed_media)

    # concatenate the 2 outputs from the LSTM cells
    # compared to the simple model, the condensed version of media is also an input to every single step of the last LSTM
    concatenated = concatenate([lstm_text_repr, lstm_media_repr, condensed_media], axis=-1)

    # repeat the concatenated represention for OUTPUT_SEQ_LENGTH
    concatenated_repeated = RepeatVector(params["output_shape1"])(concatenated)

    # feed the seq into another LSTM, which will lead to OUTPUT_SEQ_LENGTH steps
    concatenated_lstm = LSTM__(params["concatenated_layer_neuron"], return_sequences=True)(concatenated_repeated)

    # apply a Dense on each step's output from the last LSTM
    timedistributed = TimeDistributed(Dense(params["output_shape2"], activation='softmax'))(concatenated_lstm)

    model = Model([text_input, media_input], timedistributed)
    model.compile(optimizer=optim, loss='categorical_crossentropy', metrics=['acc'])
    return model
    


# test
params={
    "GPU": False, # if False, CPU will be used; to use GPU instead, set it to True and also configure Runtime -> change runtime type
    "lr":0.001,
    "text_shape":(MAX_QUESTION_LENGTH, MAX_NUM_WORDS),
    "media_shape":(MEDIA_MAX_LENGTH, MEDIA_FEATURE_DIMENSION),
    "output_shape1": MAX_ANSWER_LENGTH, # output length
    "output_shape2": MAX_NUM_WORDS, # number of possible words
    "layer_1_text_input_neuron": 100,
    "layer_1_media_input_neuron": 150,
    "concatenated_layer_neuron": 200,
}


model_ag_test = set_up_model2(params)
print(model_ag_test.summary())    

parameters: {'GPU': False, 'lr': 0.001, 'text_shape': (25, 3200), 'media_shape': (70, 2048), 'output_shape1': 10, 'output_shape2': 3200, 'layer_1_text_input_neuron': 100, 'layer_1_media_input_neuron': 150, 'concatenated_layer_neuron': 200}
Use normal LSTM
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
media (InputLayer)              (None, 70, 2048)     0                                            
__________________________________________________________________________________________________
text (InputLayer)               (None, 25, 3200)     0                                            
__________________________________________________________________________________________________
lstm_6 (LSTM)                   (None, 70, 20)       165520      media[0][0]                      
__________________________________

In [0]:
# choose the type of model
# set_up_model = set_up_simple_model
set_up_model = set_up_model2

In [0]:
# construct answer generator model with QUESTION TEXT as input

# Using VQA V2 existing features (downloaded from https://github.com/MILVLG/mcan-vqa)

from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from keras.models import load_model
import pandas as pd
import os

# Note that the inputs are question and media, and output is answer
params={
    "GPU": False, # if False, CPU will be used; to use GPU instead, set it to True and also configure Runtime -> change runtime type
    "lr":0.001,
    "text_shape":(MAX_QUESTION_LENGTH, MAX_NUM_WORDS),
    "media_shape":(MEDIA_MAX_LENGTH, MEDIA_FEATURE_DIMENSION),
    "output_shape1": MAX_ANSWER_LENGTH, # output length
    "output_shape2": MAX_NUM_WORDS, # number of possible words
    "layer_1_text_input_neuron": 100,
    "layer_1_media_input_neuron": 150,
    "concatenated_layer_neuron": 200,
}

# get data and train

file_saved_model = os.path.join(output_path, "ag_question_as_input_100_150_200_cpu.model2")
if os.path.isfile(file_saved_model):
  model_ag1 = load_model(file_saved_model) # continue from last checkpoint
  print("load existing model from ", file_saved_model)
else:
  print ("Existing model is not found. Start with a new model.")
  model_ag1 = set_up_model(params)

print(model_ag1.summary())

# save every n epoches
checkpoint = ModelCheckpoint(file_saved_model, monitor='acc', verbose=1, save_best_only=False, mode='max', period=10)

ndata = len(qa_df)

# size = 500 # for testing
size = 2000
start = 157*size

n_rounds_for_1_pass = int(ndata/size) + 1
print ("n_rounds_for_1_pass = ", n_rounds_for_1_pass)

# fetch data section by section so that it won't use up all the ram
for i in range( n_rounds_for_1_pass * 300 ):
  print ("running round ", i)

  # make sure validation data is the same after 1 pass of the whole data
  if start >= ndata:
    start = 0
  print ("start: {} (total data = {})".format(start, ndata))

  qdata, adata, media_data = get_train_data_qam(start, start+size)
  start = start + size

  if (len(qdata) != len(media_data)) or len(qdata) == 0:
    print( "len(qdata) ({}) != len(media_data) ({}). likely the part to get images has some temporary mounting issues. skip this section of data.".format( len(qdata), len(media_data) ) )
    continue

  # qdata_media = pd.DataFrame.from_dict( {"question": qdata, "media": media_data} ) # need to merge qdata and media in order to do the split
  # X_train, X_test, y_train, y_test = train_test_split(
  #    qdata_media, adata, test_size=0.2, random_state=50)

  # A simple split of data to training and testing data
  train_size = int(0.8 * len(qdata))
  qdata_train = qdata[0:train_size]
  qdata_test = qdata[train_size:]
  adata_train = adata[0:train_size]
  adata_test = adata[train_size:]
  media_data_train = media_data[0:train_size]
  media_data_test = media_data[train_size:]

  print ("qdata_train.shape = ", qdata_train.shape)
  print ("qdata_test.shape = ", qdata_test.shape)
  print ("adata_train.shape = ", adata_train.shape)
  print ("adata_test.shape = ", adata_test.shape)
  print ("media_data_train.shape = ", media_data_train.shape)
  print ("media_data_test.shape = ", media_data_test.shape)

  if i % 10 == 0 and os.path.isfile(file_saved_model):
    # reload the model from file once in a while, in case the .fit() function buffers too much data
    model_ag1 = None
    model_ag1 = load_model(file_saved_model)
    print ("reload model from file")

  out=model_ag1.fit({'text': qdata_train, 'media': media_data_train}, 
                adata_train, 
                epochs=20,
                batch_size=20, 
                validation_data=({'text': qdata_test, 'media': media_data_test}, adata_test), 
                verbose=1,
                callbacks=[checkpoint])

load existing model from  /content/gdrive/My Drive/cs6240_project_codes/outputs/ag_question_as_input_100_150_200_cpu.model2
Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
media (InputLayer)              (None, 70, 2048)     0                                            
__________________________________________________________________________________________________
text (InputLayer)               (None, 25, 3200)     0                                            
__________________________________________________________________________________________________
lstm_40 (LSTM)                  (None, 70, 20)       165520      media[0][0]                      
__________________________________________________________________________________________________
lstm_38 (LSTM)                  (None, 100)          1320400     te

In [21]:
# test the AG model with question text as input
# load the trained model and test

from keras.models import load_model

filename = "ag_question_as_input_100_150_200_cpu.model2"
file_saved_model = os.path.join(output_path, filename)
model_ag1_trained = load_model(file_saved_model)
print(model_ag1_trained.summary())

# get some data and make prediction
qdata, adata, media_data = get_train_data_qam(0, 30)
answers_predicted = model_ag1_trained.predict({'text': qdata, 'media': media_data})

print ("answers_predicted.shape = ", answers_predicted.shape)
print ("type(answers_predicted) = ", type(answers_predicted))

print ("answers_predicted[0, :] = ", answers_predicted[0, :])

answer_text = list_matrices_to_list_texts(answers_predicted)
qdata_text = list_matrices_to_list_texts(qdata)
adata_text = list_matrices_to_list_texts(adata)
for i in range(len(qdata)):
  print("\nquestion: ", qdata_text[i])
  print("given answer: ", adata_text[i])
  print ("predicted answer sequence = ", answer_text[i])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
media (InputLayer)              (None, 70, 2048)     0                                            
__________________________________________________________________________________________________
text (InputLayer)               (None, 25, 3200)     0                                            
__________________________________________________________________________________________________
lstm_40 (LSTM)                  (None, 70, 20)       165520      media[0][0]                      
__________________________________________________________________________________________________
lstm_38 (LSTM)                  (None, 100)          1320400     text[0][0]                     

In [0]:
# construct answer generator model with QUESTION TYPE as input
# (VQA V2 dataset provides question type, which saves our works to get the types from question texts)

# Using VQA V2 existing features (downloaded from https://github.com/MILVLG/mcan-vqa)

# Note that the inputs are question types and media, and output is answer
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from keras.models import load_model
import pandas as pd
import os

params={
    "GPU": False, # if False, CPU will be used; to use GPU instead, set it to True and also configure Runtime -> change runtime type
    "lr":0.001,
    "text_shape":(MAX_QUESTION_TYPE_LENGTH, QUESTION_TYPE_DIMENSION),
    "media_shape":(MEDIA_MAX_LENGTH, MEDIA_FEATURE_DIMENSION),
    "output_shape1": MAX_ANSWER_LENGTH, # output length
    "output_shape2": MAX_NUM_WORDS, # number of possible words
    "layer_1_text_input_neuron": 100,
    "layer_1_media_input_neuron": 150,
    "concatenated_layer_neuron": 200,
}

# get data and train

file_saved_model = os.path.join(output_path, "ag_questionTypes_as_input_100_150_200_cpu.model2")
if os.path.isfile(file_saved_model):
  print ("load model from file")
  model_ag2 = load_model(file_saved_model) # continue from last checkpoint
else:
  model_ag2 = set_up_model(params)

print(model_ag2.summary())

# save every n epoches
checkpoint = ModelCheckpoint(file_saved_model, monitor='acc', verbose=1, save_best_only=False, mode='max', period=10)

ndata = len(qa_df)
# start = 0
# size = 500 # for testing
size = 2000
start = 0*size
n_rounds_for_1_pass = int(ndata/size) + 1
print ("n_rounds_for_1_pass = ", n_rounds_for_1_pass)

# fetch data section by section so that it won't use up all the ram
for i in range( n_rounds_for_1_pass * 300 ):
  print ("running round ", i)

  # make sure validation data is the same after 1 pass of the whole data
  if start >= ndata:
    start = 0
  print ("start: {} (total data = {})".format(start, ndata))

  data = get_train_data(start, start+size)
  qdata = data["questionType_data"]
  adata = data["answer_data"]
  media_data = data["media_data"]

  start = start + size

  if (len(qdata) != len(media_data)) or len(qdata) == 0:
    print( "len(qdata) ({}) != len(media_data) ({}). likely the part to get images has some temporary mounting issues. skip this section of data.".format( len(qdata), len(media_data) ) )
    continue

  train_size = int(0.8 * len(qdata))
  qdata_train = qdata[0:train_size]
  qdata_test = qdata[train_size:]
  adata_train = adata[0:train_size]
  adata_test = adata[train_size:]
  media_data_train = media_data[0:train_size]
  media_data_test = media_data[train_size:]

  print ("qdata_train.shape = ", qdata_train.shape)
  print ("qdata_test.shape = ", qdata_test.shape)
  print ("adata_train.shape = ", adata_train.shape)
  print ("adata_test.shape = ", adata_test.shape)
  print ("media_data_train.shape = ", media_data_train.shape)
  print ("media_data_test.shape = ", media_data_test.shape)

  if i % 10 == 0 and os.path.isfile(file_saved_model):
    # reload the model from file once in a while, in case the .fit() function buffers too much data
    model_ag2 = None
    model_ag2 = load_model(file_saved_model)
    print ("reload model from file")

  out=model_ag2.fit({'text': qdata_train, 'media': media_data_train}, 
                adata_train, 
                epochs=20,
                batch_size=20, 
                validation_data=({'text': qdata_test, 'media': media_data_test}, adata_test), 
                verbose=1,
                callbacks=[checkpoint])

parameters: {'GPU': False, 'lr': 0.001, 'text_shape': (1, 65), 'media_shape': (70, 2048), 'output_shape1': 10, 'output_shape2': 3200, 'layer_1_text_input_neuron': 100, 'layer_1_media_input_neuron': 150, 'concatenated_layer_neuron': 200}
Use normal LSTM
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
media (InputLayer)              (None, 70, 2048)     0                                            
__________________________________________________________________________________________________
text (InputLayer)               (None, 1, 65)        0                                            
__________________________________________________________________________________________________
lstm_10 (LSTM)                  (None, 70, 20)       165520      media[0][0]                      
_____________________________________

In [0]:
# test the AG model with question types as input
# load the trained model and test

from keras.models import load_model

filename = "ag_questionTypes_as_input_100_150_200_cpu.model"
file_saved_model = os.path.join(output_path, filename)
model = load_model(file_saved_model)
print(model.summary())

# get some data and make prediction
data = get_train_data(0, 30)
qdata = data["questionType_data"]
adata = data["answer_data"]
media_data = data["media_data"]

answers_predicted = model.predict({'text': qdata, 'media': media_data})

print ("answers_predicted.shape = ", answers_predicted.shape)
print ("type(answers_predicted) = ", type(answers_predicted))

print ("answers_predicted[0, :] = ", answers_predicted[0, :])

answer_text = list_matrices_to_list_texts(answers_predicted)
qdata_text = list_matrices_to_list_question_types(qdata)
adata_text = list_matrices_to_list_texts(adata)
for i in range(len(qdata)):
  print("\nquestion type: ", qdata_text[i])
  print("given answer: ", adata_text[i])
  print ("predicted_answer = ", answer_text[i])

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               (None, 1, 65)        0                                            
__________________________________________________________________________________________________
media (InputLayer)              (None, 70, 2048)     0                                            
__________________________________________________________________________________________________
lstm_4 (LSTM)                   (None, 100)          66400       text[0][0]                       
__________________________________________________________________________________________________
lstm_5 (LSTM)                   (None, 150)          1319400     media[0][0]                      
____________________________________________________________________________________________

In [0]:
# construct question generator model
# which is basically the same as the AG model except input and output are different

from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from keras.models import load_model
import pandas as pd
import os

# Note that the inputs are answer and media, and output is question
params={
    "GPU": False,    
    "lr":0.001,
    "text_shape":(MAX_ANSWER_LENGTH, MAX_NUM_WORDS), # answer text
    "media_shape":(MEDIA_MAX_LENGTH, MEDIA_FEATURE_DIMENSION),
    "output_shape1": MAX_QUESTION_LENGTH, # output length
    "output_shape2": MAX_NUM_WORDS, # number of possible words
    "layer_1_text_input_neuron": 100,
    "layer_1_media_input_neuron": 150,
    "concatenated_layer_neuron": 200,
}

# get data and train

file_saved_model = os.path.join(output_path, "qg_100_150_200_cpu.model2")
if os.path.isfile(file_saved_model):
  print ("load model from ", file_saved_model)
  model_qg = load_model(file_saved_model) # continue from last checkpoint
else:
  model_qg = set_up_model(params)

print(model_qg.summary())

# save every n epoches
checkpoint = ModelCheckpoint(file_saved_model, monitor='acc', verbose=1, save_best_only=False, mode='max', period=10)

ndata = len(qa_df)

# size = 500 # for testing
size = 2000

# start from last round
start = size * 156

n_rounds_for_1_pass = int(ndata/size) + 1
print ("n_rounds_for_1_pass = ", n_rounds_for_1_pass)

# fetch data section by section so that it won't use up all the ram
for i in range( n_rounds_for_1_pass * 300 ):
  print ("running round ", i)

  # make sure validation data is the same after 1 pass of the whole data
  if start >= ndata:
    start = 0
  print ("start: {} (total data = {})".format(start, ndata))

  qdata, adata, media_data = get_train_data_qam(start, start+size)
  start = start + size

  if (len(qdata) != len(media_data)) or len(qdata) == 0:
    print( "len(qdata) ({}) != len(media_data) ({}). likely the part to get images has some issues. skip this section of data.".format( len(qdata), len(media_data) ) )
    continue

  train_size = int(0.8 * len(qdata))
  qdata_train = qdata[0:train_size]
  qdata_test = qdata[train_size:]
  adata_train = adata[0:train_size]
  adata_test = adata[train_size:]
  media_data_train = media_data[0:train_size]
  media_data_test = media_data[train_size:]

  print ("qdata_train.shape = ", qdata_train.shape)
  print ("qdata_test.shape = ", qdata_test.shape)
  print ("adata_train.shape = ", adata_train.shape)
  print ("adata_test.shape = ", adata_test.shape)
  print ("media_data_train.shape = ", media_data_train.shape)
  print ("media_data_test.shape = ", media_data_test.shape)

  if i % 10 == 0 and os.path.isfile(file_saved_model):
    # reload the model from file once in a while, in case the .fit() function buffers too much data
    model_qg = None
    model_qg = load_model(file_saved_model)
    print ("reload model from file")

  out=model_qg.fit({'text': adata_train, 'media': media_data_train}, 
                qdata_train, 
                epochs=20,
                batch_size=20, 
                validation_data=({'text': adata_test, 'media': media_data_test}, qdata_test), 
                verbose=1,
                callbacks=[checkpoint])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
remove image (last accessed time, image id) = (1587227298.1227336, 360129)
remove image (last accessed time, image id) = (1587226253.20252, 358915)
remove image (last accessed time, image id) = (1587228824.4239643, 100078)
remove image (last accessed time, image id) = (1587238894.2621481, 374018)
remove image (last accessed time, image id) = (1587250492.2032132, 125473)
remove image (last accessed time, image id) = (1587235664.4692457, 369886)
remove image (last accessed time, image id) = (1587242623.4730105, 222097)
remove image (last accessed time, image id) = (1587253147.8808484, 390287)
remove image (last accessed time, image id) = (1587227804.417472, 491826)
remove image (last accessed time, image id) = (1587236202.3611832, 108375)
remove image (last accessed time, image id) = (1587245779.0514624, 250929)
remove image (last accessed time, image id) = (1587238891.6080625, 373444)
remove image (last accessed time, imag

In [0]:
# test the QG model
# load the trained model and test

from keras.models import load_model

filename = "qg_100_150_200_cpu_24hrs.model"
file_saved_model = os.path.join(output_path, filename)
model = load_model(file_saved_model)

# get some data and make prediction
qdata, adata, media_data = get_train_data_qam(0, 30)
questions_predicted = model.predict({'text': adata, 'media': media_data})

print ("questions_predicted.shape = ", questions_predicted.shape)
print ("type(questions_predicted) = ", type(questions_predicted))

print ("questions_predicted[0, :] = ", questions_predicted[0, :])

questions_text = list_matrices_to_list_texts(questions_predicted)
qdata_text = list_matrices_to_list_texts(qdata)
adata_text = list_matrices_to_list_texts(adata)
for i in range(len(qdata)):
  print("\nanswer: ", adata_text[i])
  print("given question: ", qdata_text[i])
  print ("predicted_question = ", questions_text[i])

qdata.shape = (30, 25, 3200)
adata.shape = (30, 10, 3200)
qtype_data.shape = (30, 1, 65)
len(list_found) = 30, len(list_image_ids) = 30
buffer (385) not filled up.
image_data.shape = (30, 70, 2048)
qdata.shape = (30, 25, 3200)
adata.shape = (30, 10, 3200)
qtype_data.shape = (30, 1, 65)
questions_predicted.shape =  (30, 25, 3200)
type(questions_predicted) =  <class 'numpy.ndarray'>
questions_predicted[0, :] =  [[2.58950777e-10 1.84189318e-07 9.98575251e-06 ... 6.03744824e-14
  7.32403572e-14 4.49432700e-13]
 [1.30791944e-09 2.05657620e-04 1.38621044e-05 ... 8.29086660e-11
  9.69836780e-11 3.26113803e-10]
 [2.84407742e-09 4.67982190e-03 1.49858087e-01 ... 1.00914423e-08
  8.00579869e-09 7.93053800e-09]
 ...
 [1.98415471e-08 1.44950211e-01 4.00337763e-02 ... 2.75154353e-08
  1.18046195e-08 1.62848295e-08]
 [2.00473327e-08 1.55325279e-01 2.89985389e-02 ... 2.89739610e-08
  1.37513272e-08 1.67651333e-08]
 [2.05099777e-08 1.50445327e-01 2.44095735e-02 ... 3.04458645e-08
  1.28933602e-08 1.60

In [0]:
# beam search given a predicted text sequence
# reference: https://machinelearningmastery.com/beam-search-decoder-natural-language-processing/

from math import log
 
 # get the k best answer sequences regardless of lengths
def beam_search(one_seq, k):

  # sequences is to keep the best k candidates
	# sequences = [[list(), 1.0]]
	sequences = [[list(), 0]]

	# keep track of the impossible candidates to speed up the selection
	impossible_candidates = {}

	# do this for the length of one_seq so it won't take too long
	for l in range(len(one_seq)):
		all_candidates = {}
		# for each best candidate so far, add each word and compute the new combined log probabilities;
    # if there are 10 candidates (i.e. k=10) and 50 possible words, then there will be 10*50 = 500 candidates (before further reducing to k candidates)
		for i in range(len(sequences)):
			seq, score = sequences[i]
			# add the last selected candidates to candidates
			if seq:
				all_candidates[str(seq)] = sequences[i]

			# the next row for each surviving seq is different
			next_row_id = len(seq)
			next_row = one_seq[next_row_id]
			for j in range(len(next_row)):
        # construct a new candidate, and the log probability
				# candidate = [seq + [j], score + log(next_row[j])]
				# to avoid duplications

				# if NULLTOKEN is at the first word, then dont keep the sequence
				if next_row_id == 0 and j == NULLTOKEN_INDEX:
					continue

				c = seq + [j]
				key = str(c)
				if key in impossible_candidates:
					continue

				# to compute the score: log(a*b) = log(a) + log(b)
				if next_row[j] == 0:
					# if the probability is 0, then set it to a very small number
					score_added = log(0.0000000000000000001)
				else:
					score_added = log(next_row[j])
		 
				candidate = [c, score + score_added]
				if not key in all_candidates:
					all_candidates[key] = candidate

    
		# print ("len(all_candidates) = ", len(all_candidates))
		# if l == 0:
		#	print ("all_candidates = ", all_candidates)	
	 
	  # just need the values
		all_candidates_list = all_candidates.values()

		# in each step, order all candidates by score
		ordered = sorted(all_candidates_list, reverse = True, key=lambda element: element[1])
  
		# in each step, select the currently best k candidates
		sequences = ordered[:k]
		# the current kth candidates onward and their childrens should not be considered
		for c in ordered[k:]:
			impossible_candidates[str(c[0])] = 0
		# print ("len(impossible_candidates) = ", len(impossible_candidates))
	
	return sequences


 # After beam search, maybe some customized but reasonable rules can be used to further select answers:

 # 1. given a selected 10-word answer from the beam search,
 # it can be futher cut to 1-word, 2-word, .. and 10-word answers (all starting from the first word), each of which will have its own log probability.
 # the rationale is that the answer is expected to be short, so shorter answers should be sampled from a 10-word answer candidate.

 # 2. in addition, if a word is found repeated, then the answer can be chopped at the word.
 
 # 3. if the REMOVEDTOKEN is encoutered for 2 consecutive times, the answer can be chopped too.


In [17]:
# load the trained models
from keras.models import load_model
import numpy as np

filename = "ag_question_as_input_100_150_200_cpu_30hrs.model"
file_saved_model = os.path.join(output_path, filename)
model_ag1 = load_model(file_saved_model)
print ("model summary for AG when question text is input:")
print(model_ag1.summary())


filename = "ag_questionTypes_as_input_100_150_200_cpu_32hrs.model"
file_saved_model = os.path.join(output_path, filename)
model_ag2 = load_model(file_saved_model)
print ("model summary for AG when questionType is input:")
print(model_ag2.summary())

filename = "qg_100_150_200_cpu_32hrs.model"
file_saved_model = os.path.join(output_path, filename)
model_qg = load_model(file_saved_model)
print("model summary for QG:")
print(model_qg.summary())

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

model summary for AG when question text is input:
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               (None, 25, 3200)     0                                            
__________________________________________________________________________________________________
media (InputLayer)              (None, 70, 2048)     0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 100)          1320400     text[0][0]                       
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 150)   

In [0]:
## use the AG and QG models to obtain the combined score for each answer candidate

import numpy as np
from math import log

def get_scores_from_qg(qseq, questions_each_answer_candidate):
  scores = []
  for q in questions_each_answer_candidate:
    score = 0
    for i in range(len(qseq)):
      word_index = qseq[i]
      row = q[i]
      if row[word_index] == 0:
        score_added = log(0.0000000000000000001)
      else:
        score_added = log( row[word_index] )
      score += score_added

    scores.append(score)
  return scores

# combine with QG to get the final scores
def get_overall_scores_for_1_question(model_qg, answers_from_beam_search, media, question_ground_truth_seq, qtext_eg, SILENT):

  ans_candidates_seqs = []
  for a, a_score in answers_from_beam_search:
    ans_candidates_seqs.append(a)

  ans_candidates_texts = list_sequences_to_texts(ans_candidates_seqs)
  if not SILENT:
    print ("answers from beam search in texts:")
    for i in range(len(ans_candidates_texts)):
      print ("score: {}, answer:{}".format(answers_from_beam_search[i][1], ans_candidates_texts[i]))

  answers_scored_by_qg = []

  answer_candidates_onehot = format_answer_texts(ans_candidates_texts)

  media_data_repeat = np.array([media] * len(answer_candidates_onehot))

  questions_each_answer_candidate = model_qg.predict({'text': answer_candidates_onehot, 'media': media_data_repeat})

  if not SILENT:
    print ("question_ground_truth_seq = ", question_ground_truth_seq)

  questions_each_answer_texts = list_matrices_to_list_texts(questions_each_answer_candidate)
  # for i in range(len(questions_each_answer_texts)):
  #  print ("for answer candidate: '{}', the best question generated is: '{}'".format(ans_candidates_texts[i], questions_each_answer_texts[i]))

  if not SILENT:
    print ("\nquestion text: ", qtext_eg)
    print ("question seq: ", question_ground_truth_seq)
    print ("When this question text is the output of the model, the scores from qg for each answer candidate are:".format(qtext_eg))

  scores_from_qg = get_scores_from_qg(question_ground_truth_seq, questions_each_answer_candidate)
  if not SILENT:
    print ("scores_from_qg = ", scores_from_qg)

  answer_candidates_and_scores = []
  for i in range( len(ans_candidates_texts) ):
    score_ag = answers_from_beam_search[i][1]
    score_qg = scores_from_qg[i]
    score_combined = score_ag + score_qg
    answer_candidates_and_scores.append( (score_combined, score_ag, score_qg, ans_candidates_texts[i], ans_candidates_seqs[i]) )

  answer_candidates_and_scores.sort(reverse=True)

  return answer_candidates_and_scores

def get_answer_candidates_and_scores_for_1_question(model_ag, model_qg, data, answers_predicted, id_eg, SILENT):

  media_data = data['media_data']

  atexts_df = data['answer_texts']
  qtexts_df = data['question_texts']
  qseqs = data['question_seqs']

  # get candidate answers
  answers_from_beam_search = beam_search(answers_predicted[id_eg], NUM_CANDIDATE_ANSWERS)
  if not SILENT:
    print ("answers_from_beam_search = \n", answers_from_beam_search)

  ## now use the QG model to obtain the combined score for each answer candidate
  media = media_data[id_eg]
  question_ground_truth_seq = qseqs[id_eg]
  qtext_eg = qtexts_df.iloc[id_eg]
  answer_candidates_and_scores = get_overall_scores_for_1_question(model_qg, answers_from_beam_search, media, question_ground_truth_seq, qtext_eg, SILENT)

  if not SILENT:
    print ("given raw answer = ", atexts_df.iloc[id_eg])
    print ("\nThe final ranking of answer candidates:")
    for a in answer_candidates_and_scores:
      print (a)

  return answer_candidates_and_scores

def get_stats_1question(answer_candidates_and_scores, ground_truth_ans_seq):
  top1=False
  top5=False
  top10=False
  top20=False
  
  ground_truth = list(ground_truth_ans_seq)
  n = min(20, len(answer_candidates_and_scores))

  for i in range(n):
    ans_seq_candidate = answer_candidates_and_scores[i][4]
    if ans_seq_candidate == ground_truth:
      if i == 0:
        top1=top5=top10=top20=True
      elif i<5:
        top5=top10=top20=True
      elif i<10:
        top10=top20=True
      elif i<20:
        top20=True        
      # if the right answer is found, immediately break
      break

  return [top1, top5, top10, top20]

def get_prediction_accuracy(model_ag, model_qg, data, text_data, SILENT=True):

  adata = data['answer_data']
  media_data = data['media_data']
  qdata = data['question_data']

  atexts_df = data['answer_texts']
  qtexts_df = data['question_texts']
  qseqs = data['question_seqs']
  aseqs = data['answer_seqs']
  image_ids_df = data['media_ids']

  # get the answers predicted from AG model
  print ("len(text_data) = ", len(text_data))
  print ("len(media_data) = ", len(media_data))
  print ("len(qtexts_df) = ", len(qtexts_df))
  answers_predicted = model_ag.predict({'text': text_data, 'media': media_data})

  print ("answers_predicted.shape = ", answers_predicted.shape)

  qdata_text = list_matrices_to_list_texts(qdata)
  adata_text = list_matrices_to_list_texts(adata)

  n_questions = 0
  n_correct_top1 = 0
  n_correct_top5 = 0
  n_correct_top10 = 0
  n_correct_top20 = 0

  detailed = []

  for id_eg in range(len(text_data)):
    if not SILENT:
      print ("\nid_eg = ", id_eg)
      print ("qdata_text = ", qdata_text[id_eg])
      print ("qdata raw text = ", qtexts_df.iloc[id_eg])
      print ("given answer = ", adata_text[id_eg])
      print ("given raw answer = ", atexts_df.iloc[id_eg])
      print ("image id = ", image_ids_df.iloc[id_eg])

    answer_candidates_and_scores = get_answer_candidates_and_scores_for_1_question(model_ag, model_qg, data, answers_predicted, id_eg, SILENT)
    intop1, intop5, intop10, intop20 = get_stats_1question(answer_candidates_and_scores, aseqs[id_eg])

    n_questions +=1
    n_correct_top1 += intop1
    n_correct_top5 += intop5
    n_correct_top10 += intop10
    n_correct_top20 += intop20
    detailed.append([int(intop1), int(intop5), int(intop10), int(intop20)])

  return [n_questions, n_correct_top1, n_correct_top5, n_correct_top10, n_correct_top20, detailed]

In [0]:
# Compute some metrics to evaluate the model
import os
import datetime

# choose the AG model
text_input = 'qtext'
#text_input = 'qtype'

file_results_summary = os.path.join( output_path , "{}_results_summary.csv".format(text_input) )
file_results_detailed = os.path.join( output_path ,"{}_results_detailed.csv".format(text_input) )

fsummary = open(file_results_summary, 'a')
fdetailed = open(file_results_detailed, 'a')

ndata = len(qa_df)

size = 2000

n_rounds_for_1_pass = int(ndata/size)
print ("n_rounds_for_1_pass = ", n_rounds_for_1_pass)

train_data_ratio = 0.8
test_data_start = int(size * train_data_ratio)

n_questions_total = ntop1_total = ntop5_total = ntop10_total = ntop20_total = 0

# control from which round to start
start_round = 3

# fetch data section by section so that it won't use up all the ram
for i in range(start_round, n_rounds_for_1_pass):
  print ("running round ", i)

  start = i * size

  # make sure validation data is the same after 1 pass of the whole data
  assert(start < ndata)

  print ("start: {} (total data = {})".format(start, ndata))

  # start_v = 0
  # end_v = 10
  # data = get_train_data(start_v, end_v)

  # only get test data
  start_v = start + test_data_start
  end_v = start+size
  data = get_train_data(start_v, end_v)
  len_data = len(data['question_data'])
  if (len_data != end_v - start_v ):
    print ("Length of data not correct. Better ignore this section of data")
    continue

  qtype_text_df = data['questionType_texts']

  # data = get_train_data(start + test_data_start, start + test_data_start+5)

  start = start + size


  print ("len(data['question_data']) = ", len(data['question_data']))

  if text_input == 'qtext':
    text_data = data['question_data']
    model_ag = model_ag1
  elif text_input == 'qtype':
    text_data = data['questionType_data']
    model_ag = model_ag2

  # run the function
  print ("time start for this round: ", datetime.datetime.now())
  n_questions, ntop1, ntop5, ntop10, ntop20, detailed = get_prediction_accuracy(model_ag, model_qg, data, text_data)
  print ("time end for this round: ", datetime.datetime.now())

  print ("n_questions = ", n_questions)
  print ("ntop1 = ", ntop1)
  print ("ntop5 = ", ntop5)
  print ("ntop10 = ", ntop10)
  print ("ntop20 = ", ntop20)

  assert(n_questions == len(data['question_data']))

  n_questions_total+=n_questions
  ntop1_total+=ntop1
  ntop5_total+=ntop5
  ntop10_total+=ntop10
  ntop20_total+=ntop20

  print ("n_questions_total = ", n_questions_total)
  print ("ntop1_total = ", ntop1_total)
  print ("ntop5_total = ", ntop5_total)
  print ("ntop10_total = ", ntop10_total)
  print ("ntop20_total = ", ntop20_total)

  fsummary.write("{},{},{},{},{}\n".format(n_questions_total, ntop1_total, ntop5_total, ntop10_total, ntop20_total))
  fsummary.flush()

  for i in range(start_v, end_v):
    # row id in the data, question_type, top1, top5, top10, top20
    j = i-start_v
    fdetailed.write("{},{},{},{},{},{}\n".format(i, qtype_text_df.iloc[j], detailed[j][0], detailed[j][1], detailed[j][2], detailed[j][3]))

  fdetailed.flush()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.shape = (50, 10, 3200)
adata.s

In [19]:
# demo with some data

# choose the AG model
#text_input = 'qtext'
text_input = 'qtype'

# get some data and make prediction 
data = get_train_data(5693, 5700)
print ("len(data['question_data']) = ", len(data['question_data']))

if text_input == 'qtext':
  text_data = data['question_data']
  model_ag = model_ag1
elif text_input == 'qtype':
  text_data = data['questionType_data']
  model_ag = model_ag2

# run the function
n_questions, ntop1, ntop5, ntop10, ntop20, detailed = get_prediction_accuracy(model_ag, model_qg, data, text_data, False)

print ("n_questions = ", n_questions)
print ("ntop1 = ", ntop1)
print ("ntop5 = ", ntop5)
print ("ntop10 = ", ntop10)
print ("ntop20 = ", ntop20)

qdata.shape = (7, 25, 3200)
adata.shape = (7, 10, 3200)
qtype_data.shape = (7, 1, 65)
len(list_found) = 7, len(list_image_ids) = 7
buffer (19) not filled up.
image_data.shape = (7, 70, 2048)
qdata.shape = (7, 25, 3200)
adata.shape = (7, 10, 3200)
qtype_data.shape = (7, 1, 65)
len(data['question_data']) =  7
len(text_data) =  7
len(media_data) =  7
len(qtexts_df) =  7
answers_predicted.shape =  (7, 10, 3200)

id_eg =  0
qdata_text =  what sport is being played NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN
qdata raw text =  What sport is being played?
given answer =  tennis NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN NULLTOKEN
given raw answer =  tennis
image id =  524559
answers_from_beam_search = 
 [[[100], -1.0818179824521976], [[100, 452], -1.0967492382101245], [[217], -1.3356021098256092], [[217,