In [1]:
# Mount Google Drive inside the Colab VM so project files are reachable
# under /content/drive (this prompts for an authorization code).
from google.colab import drive
drive.mount('/content/drive')

import sys
# Make the project directory importable (presumably where the `config`
# module used by later cells lives -- verify against the Drive layout).
project_dir = '/content/drive/My Drive/ML-Project1/image-generator/'
# Guard the append so re-running this cell does not keep adding the same
# entry to sys.path.
if project_dir not in sys.path:
	sys.path.append(project_dir)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
# prepare word vectors for captioning model
 
from numpy import asarray
from pickle import dump
from gensim.models import Word2Vec
from config import config
 
# load doc into memory
def load_doc(filename):
	"""Read a whole text file into memory.

	Args:
		filename: path of the file to read (opened in text mode, read only).

	Returns:
		The complete contents of the file as a single string.
	"""
	# the context manager closes the handle even if read() raises,
	# which the manual open()/close() pair did not guarantee
	with open(filename, 'r') as file:
		return file.read()
 
# load a pre-defined list of photo identifiers
def load_set(filename):
	"""Build the set of identifiers listed in a file.

	The file is read through load_doc and split on newlines. Every
	non-empty line contributes the text that precedes its first '.'.

	Args:
		filename: path of the list file, one entry per line.

	Returns:
		A set holding one identifier per distinct non-empty line prefix.
	"""
	entries = load_doc(filename).split('\n')
	# keep only non-empty lines; the identifier is the part before the first '.'
	return {entry.split('.')[0] for entry in entries if entry}
 

 
# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
	"""Load descriptions for the requested image ids.

	Each line of the file is split on whitespace: the first token is the
	image id and the remaining tokens form the description. Descriptions
	are wrapped as 'startseq ... endseq'.

	Args:
		filename: path of the descriptions file, one "<id> <words...>"
			entry per line.
		dataset: container of image ids to keep (membership is tested
			with `in`).

	Returns:
		A dict mapping image id to its wrapped description string. If an
		id occurs on several lines, the last occurrence overwrites the
		earlier ones.
	"""
	# load document
	doc = load_doc(filename)
	descriptions = dict()
	for line in doc.split('\n'):
		# split line by white space
		tokens = line.split()
		# blank or whitespace-only lines (e.g. the one produced by a
		# trailing newline) have no id; indexing tokens[0] would raise
		# IndexError, so skip them
		if not tokens:
			continue
		# split id from description
		image_id, image_desc = tokens[0], tokens[1:]
		# skip images not in the set
		if image_id in dataset:
			# store
			descriptions[image_id] = 'startseq ' + ' '.join(image_desc) + ' endseq'
	return descriptions
 
# load dev set
dataset = load_set(config['train_data_path'])
print('Dataset: %d' % len(dataset))

train_descriptions = load_clean_descriptions('/content/drive/My Drive/ML-Project1/image-generator/descriptions.txt', dataset)
print('Descriptions: train=%d' % len(train_descriptions))
 
# train word2vec model
lines = [s.split() for s in train_descriptions.values()]
model = Word2Vec(lines, size=100, window=5, workers=8, min_count=1)
# summarize vocabulary size in model
words = list(model.wv.vocab)
print('Vocabulary size: %d' % len(words))
 
# save model in ASCII (word2vec) format
filename ='/content/drive/My Drive/ML-Project1/image-generator/model_data/custom_embedding.txt'
model.wv.save_word2vec_format(filename, binary=False)
 
# load the whole embedding into memory
embedding = dict()
file = open('/content/drive/My Drive/ML-Project1/image-generator/model_data/custom_embedding.txt')
for line in file:
	values = line.split()
	word = values[0]
	coefs = asarray(values[1:], dtype='float32')
	embedding[word] = coefs
file.close()
print('Embedding Size: %d' % len(embedding))
 
# summarize vocabulary
all_tokens = ' '.join(train_descriptions.values()).split()
vocabulary = set(all_tokens)
print('Vocabulary Size: %d' % len(vocabulary))
 
# get the vectors for words in our vocab
cust_embedding = dict()
for word in vocabulary:
	# check if word in embedding
	if word not in embedding:
		continue
	cust_embedding[word] = embedding[word]
print('Custom Embedding %d' % len(cust_embedding))
 
# save
dump(cust_embedding, open('/content/drive/My Drive/ML-Project1/image-generator/model_data/word2vec_embedding.pkl', 'wb'))
print('Saved Embedding')

Dataset: 6000
Descriptions: train=6000
Vocabulary size: 3837


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Embedding Size: 3838
Vocabulary Size: 3837
Custom Embedding 3837
Saved Embedding
