# Word 2 vector (word embedding)

data used is paragraphs (lines) from one of my papers, after removing all the headers, figures, and references.<br>
https://doi.org/10.1073/pnas.1810316115

In [159]:
import numpy as np
import string
import re

### Process data

In [160]:
# open file
raw_data = open("sample.txt", "r")
data = []
# read line by line
for l in raw_data:
    # split paragraph into discrete sentences
    sentences = re.split('\. |\? |! ',l)
    for s in sentences:
        if (s[0].islower()):
            # this is a wrong split (i.e. / e.g. etc), add to the end of the previous sentence
            if (len(data)>0):
                s = data.pop()+' '+s
        # make lower case and remove all punctuation (except hyphen) and numbers
        s = s.lower()
        s = s.translate(str.maketrans('','',(string.punctuation+string.digits+'\n'+'\t').replace('-','')))
        # add to data sentences that have at least 2 words
        if (len(s.split())>1):
            data.append(s)

# note - there are some specific issues with Fig. S# and words that are always capitalized, such as ATP
# but overall, this should be good enough

In [161]:
# make list of unique words
tokens = set()
for s in data:
    tokens.update(s.split())
# transform to list
tokens = list(tokens)
# generate dictionary to convert tokens to ID's
token_to_id = {token:i for (i,token) in enumerate(tokens)}

In [162]:
# generate pairs of target,context
window = 5
pairs = []
for l in data:
    s = l.split()
    for i in range(len(s)):
        target = s[i]
        for j in range(i-window,i+window+1):
            if (j>=0 and j!=i and j<len(s)):
                pairs.append((token_to_id[target],token_to_id[s[j]]))

In [163]:
# transform to one-hot-encoded
n_words = len(tokens)
X = np.zeros(shape=(len(pairs),n_words))
y = np.zeros(shape=(len(pairs),n_words))
for i in range(len(X)):
    X[i,pairs[i][0]] = 1
    y[i,pairs[i][1]] = 1

In [164]:
from keras.models import Model
from keras.layers import Input, Dense

In [165]:
# size of word vector
embed = 20

input_layer = Input(shape=(n_words,))
embedding = Dense(embed,use_bias=False)(input_layer)
output = Dense(n_words,activation='softmax')(embedding)

model = Model(inputs=input_layer,outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [166]:
model.fit(X,y,batch_size=25,epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa5ff6fe748>

In [217]:
# check one word at random
word = np.random.randint(len(tokens))
print(tokens[word])

# get word vector
word_vec = model.layers[1].get_weights()[0][word]
# precalculate length
b = np.sqrt(word_vec.dot(word_vec))

# find the closest vector to this one
cos = -float('inf')
idx = -1
i = 0
for w in model.layers[1].get_weights()[0]:
    current = (w.dot(word_vec)) / (np.sqrt(w.dot(w))*b)
    if (current>cos):
        if (i != word):
            cos = current
            idx = i
    i+=1

tokens[idx]

enzyme


'catalysis'

In [218]:
# try some arithmetric
w1, w2 = token_to_id['energy'], token_to_id['simulation']
vec = model.layers[1].get_weights()[0][w1] + model.layers[1].get_weights()[0][w2]
# precalculate length
b = np.sqrt(vec.dot(vec))

# find the closest vector to this one
cos = -float('inf')
idx = -1
i = 0
for w in model.layers[1].get_weights()[0]:
    current = (w.dot(vec)) / (np.sqrt(w.dot(w))*b)
    if (current>cos):
        if (i != w1 and i!= w2):
            cos = current
            idx = i
    i+=1

tokens[idx]

'performs'

In [231]:
# try some arithmetric
w1, w2 = token_to_id['protein'], token_to_id['energy']
vec = model.layers[1].get_weights()[0][w1] + model.layers[1].get_weights()[0][w2]
# precalculate length
b = np.sqrt(vec.dot(vec))

# find the closest vector to this one
cos = -float('inf')
idx = []
i = 0
for w in model.layers[1].get_weights()[0]:
    current = (w.dot(vec)) / (np.sqrt(w.dot(w))*b)
    if (current>cos):
        if (i != w1 and i!= w2):
            cos = current
            idx.append((i,cos))
    i+=1

[tokens[i[0]] for i in idx]

['gpcr-mediated',
 'look',
 'conditions',
 'selected',
 'us',
 'estimated',
 'projection',
 'performs',
 'fairly']