# A. Generating Word Embeddings

**1. Generate the Word2Vec Embeddings**

In [0]:
from gensim.models import Word2Vec
# Toy training corpus: every string below is split on whitespace into one
# tokenised sentence, giving a list of token lists.
sentences = [
    line.split()
    for line in (
        'this is the first sentence for word2vec',
        'this is the second sentence',
        'yet another sentence',
        'one more sentence',
        'and the final sentence',
    )
]


In [0]:
# Train a Word2Vec model on the toy corpus.
# min_count=1 keeps every word, even those that occur only once.
# A fixed seed plus a single worker thread makes training repeatable from run
# to run; with several worker threads the resulting vectors depend on OS thread
# scheduling. (Across separate interpreter launches PYTHONHASHSEED must also
# be fixed for fully identical vectors.)
model = Word2Vec(sentences, min_count=1, seed=1, workers=1)
# summarize the trained model
print(model)


Word2Vec(vocab=14, size=100, alpha=0.025)


In [0]:
# Collect every word known to the trained model, in the iteration order of
# the vocabulary mapping, and show the resulting list.
words = [token for token in model.wv.vocab]
print(words)

['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec', 'second', 'yet', 'another', 'one', 'more', 'and', 'final']


In [0]:
# Look up the embedding of a single word.
# Vectors live on the model's KeyedVectors (`model.wv`); indexing the Word2Vec
# object directly is deprecated and was removed in gensim 4.0.
print(model.wv['sentence'])

In [0]:
# Round-trip the trained model through disk: write it out, read it back into
# a fresh object, and print the reloaded model's summary.
model_path = 'model.bin'
model.save(model_path)
new_model = Word2Vec.load(model_path)
print(new_model)

Word2Vec(vocab=14, size=100, alpha=0.025)


**2. Visualize the Embeddings**

In [0]:
# Retrieve the embeddings of every vocabulary word from the trained model.
# Indexing `model.wv` with a list of words stacks one vector per word, in the
# order of that list. Indexing the Word2Vec object itself (`model[...]`) is
# deprecated and was removed in gensim 4.0.
X = model.wv[list(model.wv.vocab)]

In [0]:
# Fit a PCA projection model using the scikit-learn implementation of PCA
from sklearn.decomposition import PCA

# n_components=2: keep two components so each embedding can be drawn on a 2-D plot
pca = PCA(n_components=2)
# fit the projection on X and return the projected X in a single call
result = pca.fit_transform(X)

In [0]:
# Plot the 2-D PCA projection with matplotlib, labelling each point with its word.
from matplotlib import pyplot

# Use an explicit figure/axes pair instead of the implicit pyplot state machine.
fig, ax = pyplot.subplots()
ax.scatter(result[:, 0], result[:, 1])

# The vocabulary is iterated in the same order that was used to build X, so
# row i of `result` is the projection of words[i].
words = list(model.wv.vocab)
for word, (x, y) in zip(words, result[:, :2]):
    ax.annotate(word, xy=(x, y))

# Title and axis labels so the figure can be read on its own.
ax.set(
    title='Word2Vec embeddings projected with PCA',
    xlabel='Principal component 1',
    ylabel='Principal component 2',
)
pyplot.show()

# B. Loading Pre-trained Embeddings

**1. Stanford’s GloVe Embeddings**

In [0]:
from gensim.models import KeyedVectors

# Load pre-trained vectors stored in the word2vec binary format (binary=True).
# NOTE(review): this cell sits under the GloVe heading, but the file name
# suggests Google News word2vec vectors; `model` is reassigned by the GloVe
# cell further down before any visible use - confirm this load is needed.
filename = 'GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(fname=filename, binary=True)

In [0]:
from gensim.scripts.glove2word2vec import glove2word2vec

# Convert a GloVe text file into the word2vec text format so that it can be
# read with KeyedVectors.load_word2vec_format.
# NOTE(review): the input name says '6.300d' while the output name says
# '6B.100d' - confirm which file and dimensionality are intended.
glove_input_file = 'glove.6.300d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
# Left as the last expression so the notebook displays the call's return value.
glove2word2vec(
    glove_input_file=glove_input_file,
    word2vec_output_file=word2vec_output_file,
)

(6, 300)

In [0]:
from gensim.models import KeyedVectors
# Read the converted GloVe vectors (word2vec text format, hence binary=False).
filename = 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(fname=filename, binary=False)

# Analogy query: (king - man) + woman = ?  Only the single best match is kept.
result = model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=1,
)
print(result)

In [0]:
# Show the raw embedding stored for the word 'queen' in the loaded vectors.
queen_vector = model['queen']
print(queen_vector)