# Word 2 vector (word embedding)

In [1]:
import requests
import numpy as np
import string
import re
from collections import defaultdict

### Fetch data

Abstracts of PubMed papers about GPCRs (G protein-coupled receptors), fetched through the NCBI E-utilities API

In [2]:
# get all pubmed IDs: ask esearch for up to `max_ids` papers that match `keyword`
keyword = 'gpcr'
max_ids = 1000

url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&'
url+= f'term={keyword}&retmax={max_ids}'
response = requests.get(url)
# stop here on an HTTP error instead of silently parsing an error page into an empty id list
response.raise_for_status()
xml = response.text
xml = xml.split('\n')
# every id sits on its own '<Id>...</Id>' line of the response
# (raw string: '\d' is an invalid escape sequence in a normal string literal)
ids = [int(re.match(r'.*<Id>(\d*)<',x)[1]) for x in xml if '<Id>' in x]

In [3]:
# download the abstracts, `size` papers per request
size = 10
data = []
for start in range(0, len(ids), size):
    id_list = ','.join(map(str, ids[start:start+size]))
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&'
    url += 'id=' + id_list + '&retmode=text&rettype=abstract'
    text = requests.get(url).text
    # papers are separated by two empty lines, the segments of one paper by one empty line
    for record in text.split('\n\n\n'):
        segments = record.split('\n\n')
        # the abstract is expected at index 4 (not guaranteed);
        # records with 5 or fewer segments are skipped
        if len(segments) > 5:
            data.append(segments[4])
len(data)

973

### Process data

In [4]:
# split every abstract into cleaned, lower-case sentences
data_processed = []
# a sentence ends with '.', '?' or '!' followed by a space
# (raw string: the backslash escapes used here are invalid in a normal string literal)
sentence_end = re.compile(r'\. |\? |! ')
# deletes all punctuation (except hyphen), digits, newlines and tabs;
# built once instead of once per sentence
strip_chars = str.maketrans('','',(string.punctuation+string.digits+'\n'+'\t').replace('-',''))
for l in data:
    # position of this abstract's first sentence: fragments are only merged
    # with sentences of the same abstract, never with the previous abstract
    first = len(data_processed)
    # split paragraph into discrete sentences
    for s in sentence_end.split(l):
        # consecutive delimiters (or an empty abstract) produce empty fragments,
        # on which s[0] would raise an IndexError
        if (not s):
            continue
        if (s[0].islower() and len(data_processed)>first):
            # this is a wrong split (i.e. / e.g. etc), add to the end of the previous sentence
            s = data_processed.pop()+' '+s
        # make lower case and remove all punctuation (except hyphen) and numbers
        s = s.lower()
        s = s.translate(strip_chars)
        # add to data sentences that have at least 2 words
        if (len(s.split())>1):
            data_processed.append(s)

In [5]:
# count how often every word occurs
word_counts = defaultdict(int)
for s in data_processed:
    for w in s.split():
        word_counts[w] += 1
# prune out rare words: keep only words that occur more than 4 times
tokens = {w for (w,count) in word_counts.items() if count>4}
# generate dictionaries to convert tokens to IDs and back.
# The ids are assigned in sorted order: the iteration order of a set of strings
# changes between kernel restarts (string hash randomisation), which would make
# the ids differ from run to run
vocabulary = sorted(tokens)
id_to_token = dict(enumerate(vocabulary))
token_to_id = {token:i for (i,token) in enumerate(vocabulary)}

In [6]:
# build (target id, context id) pairs: every known word is paired with each
# known word that lies at most `window` positions away in the same sentence
window = 5
pairs = []
for sentence in data_processed:
    words = sentence.split()
    for pos, target in enumerate(words):
        if target not in token_to_id:
            continue
        # clip the context window to the sentence boundaries
        first = max(0, pos - window)
        last = min(len(words), pos + window + 1)
        for ctx_pos in range(first, last):
            if ctx_pos == pos:
                continue
            context = words[ctx_pos]
            if context in token_to_id:
                pairs.append((token_to_id[target], token_to_id[context]))

In [7]:
# a one-hot-encoded matrix of all the pairs would be too big, so batches are encoded on demand
def one_hot_generator(pairs, batch_size, n_words, max_iterations=None):
    """Yield one-hot encoded (X, y) batches built from (target id, context id) pairs.

    pairs          -- sequence of (target id, context id) tuples
    batch_size     -- number of pairs per batch (the last batch of a pass may be smaller)
    n_words        -- width of the one-hot vectors
    max_iterations -- number of passes over `pairs`; None loops forever

    Every batch is a tuple (X, y) of arrays with shape (len(batch), n_words):
    row k of X has a 1 in the column of the k-th target id, row k of y has a 1
    in the column of the k-th context id.
    """
    # None means: never stop
    limit = float('inf') if max_iterations is None else max_iterations
    passes_done = 0
    start = 0
    while passes_done < limit:
        chunk = pairs[start:start + batch_size]
        X = np.zeros(shape=(len(chunk), n_words))
        y = np.zeros(shape=(len(chunk), n_words))
        for row, pair in enumerate(chunk):
            X[row, pair[0]] = 1
            y[row, pair[1]] = 1
        # advance before yielding so the position is ready for the next batch
        start += batch_size
        yield X, y
        # past the end of the data: start over and count the finished pass
        if start >= len(pairs):
            start = 0
            passes_done += 1

In [8]:
from keras.models import Model
from keras.layers import Input, Dense

Using TensorFlow backend.


In [9]:
# dimensionality of the learned word vectors
embed = 100

# one-hot word in -> bias-free Dense layer of width `embed` (its weights are the
# word vectors) -> softmax over the whole vocabulary
n_words = len(tokens)
input_layer = Input(shape=(n_words,))
embedding = Dense(embed, use_bias=False)(input_layer)
output = Dense(n_words, activation='softmax')(embedding)

model = Model(inputs=input_layer, outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam')

Instructions for updating:
Colocations handled automatically by placer.


In [10]:
batch_size = 100
# batches needed to see every pair exactly once per epoch (ceiling division, at least 1).
# len(pairs)//batch_size+1 asked for one batch too many whenever len(pairs) was an
# exact multiple of batch_size; the generator then wrapped around and every epoch
# started one batch later than the previous one
steps = max(1,(len(pairs)+batch_size-1)//batch_size)
model.fit_generator(one_hot_generator(pairs,batch_size,len(tokens)),steps_per_epoch=steps,epochs=5)

Instructions for updating:
Use tf.cast instead.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb88f5c1cf8>

In [30]:
# check one word at random
word = np.random.randint(len(tokens))
print(id_to_token[word])

# weights of the embedding layer: row i is the vector of the word with id i
embeddings = model.layers[1].get_weights()[0]
# get word vector
word_vec = embeddings[word]
# precalculate length
b = np.sqrt(word_vec.dot(word_vec))

find = 5
# cosine similarity between the chosen word and every word, computed in one pass
lengths = np.sqrt((embeddings*embeddings).sum(axis=1))
with np.errstate(divide='ignore',invalid='ignore'):
    similarity = embeddings.dot(word_vec) / (lengths*b)
# a zero-length vector has no direction (nan): never report it as a neighbour
similarity[np.isnan(similarity)] = -np.inf
# the word itself is not its own neighbour
similarity[word] = -np.inf

# the `find` closest vectors as (similarity, id), closest last.
# Entries left at -inf are dropped, so a vocabulary with fewer than `find`
# other words gives a shorter list instead of a lookup of a placeholder id
nearest = np.argsort(similarity)[-find:]
close = [(similarity[i],int(i)) for i in nearest if similarity[i]>-np.inf]

[id_to_token[w[1]] for w in close]

belong


['comprising', 'coupledreceptors', 'relaxin', 'belonging', 'belongs']

Fairly accurate.<br>Note: the abstracts use scientific language, which is not particularly regular or natural.