# Word2vec implementation

Read `Data.xlsx`, extract the abstracts that belong to the medical category to train the word embedding, and clean the training dataset.

In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer
# Load the corpus and keep the first 7000 abstracts labelled as medical.
df = pd.read_excel('Data.xlsx')
# NOTE(review): the label is matched with its trailing space ('Medical ');
# presumably that is how the spreadsheet stores it -- confirm against Data.xlsx.
Medical = df[df['Domain'] == 'Medical ']
Medical_subset = Medical.head(7000)
x_Medical = Medical_subset['Abstract'].tolist()

# Fetch the NLTK resources used for stop-word removal and lemmatization.
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Remove every character that is not an ASCII letter, a hyphen or whitespace.
# Line breaks are turned into spaces (not deleted) so that the last word of one
# line is not glued to the first word of the next line.
x_clean = [
    re.sub(r'[^a-zA-Z-\s]', u'', sentence.replace('\n', ' '), flags=re.UNICODE)
    for sentence in x_Medical
]

# Lower-case, split on whitespace and lemmatize each abstract, then drop stop
# words and tokens of one or two characters.
a = [
    [
        w for w in map(lemmatizer.lemmatize, sentence.lower().split())
        if w not in stop_words and len(w) > 2
    ]
    for sentence in x_clean
]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Remove tokens with fewer than 5 occurrences.

In [None]:
# Count how often every token occurs across all cleaned abstracts, then keep
# only the tokens that occur at least 5 times.
import collections
token_counter = collections.Counter(token for sentence in a for token in sentence)
token_counter = {token: count for token, count in token_counter.items() if count >= 5}
print(len(token_counter))

14435


In [None]:
# Give every retained token an integer id: its position in token_array.
token_array = list(token_counter)
t_to_i = dict(zip(token_array, range(len(token_array))))
# Re-express each abstract as a list of token ids; tokens that were filtered
# out of the vocabulary are skipped.
data = [
    [t_to_i[word] for word in doc if word in t_to_i]
    for doc in a
]
# Total number of encoded tokens across all abstracts.
tokens_num = sum(map(len, data))
print("tokens before subsampling ", tokens_num)

tokens before subsampling  917944


Some high-frequency words appear very often in text data, such as "the", "a" and "in" in English. Typically, within a context window, a word (such as "cell") co-occurring with a lower-frequency word (such as "thrombocytopenia") is more beneficial for training the word embedding model than the same word co-occurring with a higher-frequency word (such as "the"). Therefore, the words can be subsampled when training the word embedding model. Specifically, each indexed word $w_{i}$ in the dataset is discarded with a certain probability:
$P\left(w_{i}\right)=\max \left(1-\sqrt{\frac{t}{f\left(w_{i}\right)}}, 0\right)$
where $f(w_{i})$ is the ratio of the number of occurrences of the word $w_{i}$ to the total number of words in the dataset.
The constant $t$ is a hyperparameter (set to $10^{-4}$ in the experiment).
A word $w_{i}$ can be discarded only if $f(w_{i}) > t$, and the higher the frequency of the word, the more likely it is to be discarded.






In [None]:
import math
import random
# Subsample frequent tokens: every occurrence of a token w is discarded with
# probability max(1 - sqrt(t / f(w)), 0), where f(w) = count(w) / tokens_num
# is the token's relative frequency and t is the threshold below.
SUBSAMPLE_THRESHOLD = 1e-4
# The discard probability depends only on the token id, so it is computed once
# per vocabulary entry instead of once per occurrence in the corpus.
discard_prob = [
    1 - math.sqrt(SUBSAMPLE_THRESHOLD / token_counter[token] * tokens_num)
    for token in token_array
]
dataset_subsample = []
for sentence in data:
    # A token is kept when its discard probability is below a fresh uniform
    # draw; a negative probability is therefore always kept, which plays the
    # role of the max(..., 0) in the formula.
    dataset = [token for token in sentence if discard_prob[token] < random.uniform(0, 1)]
    dataset_subsample.append(dataset)
print("tokens after subsampling ",sum([len(sentence) for sentence in dataset_subsample]))

tokens after subsampling  525554


After subsampling, the number of occurrences of high-frequency words decreases, while the number of occurrences of low-frequency words remains unchanged.

In [None]:
def compare(t):
    """Print how often token ``t`` occurs before and after subsampling.

    Occurrences are counted by token id in the module-level ``data``
    (before) and ``dataset_subsample`` (after) lists.
    """
    sum_before = sum(sentence.count(t_to_i[t]) for sentence in data)
    sum_after = sum(sentence.count(t_to_i[t]) for sentence in dataset_subsample)
    print('%s: before=%d, after=%d' % (t, sum_before, sum_after))


compare('cell')
compare('thrombocytopenia')

cell: before=3147, after=551
thrombocytopenia: before=23, after=23


To improve training accuracy and efficiency, a window size is set to limit the distance from the center word to its context words. The center word is the word whose surrounding context words the skip-gram model predicts. Context words are the words close to the center word, so they tend to be semantically related to it. For each center word, we randomly sample an integer between 1 and the maximum context window size and use it as the context window size. Words that fall outside the window are not used as context words.


In [None]:
def get_centers_and_contexts(dataset, max_window_size):
    """Pair every token with the tokens inside a randomly sized window.

    Args:
        dataset: list of sentences, each a list of token ids.
        max_window_size: upper bound (inclusive) for the window radius that
            is drawn independently for every center token.

    Returns:
        A ``(centers, contexts)`` tuple of equal-length lists, where
        ``contexts[k]`` holds the tokens surrounding ``centers[k]``.
        Sentences with fewer than two tokens contribute nothing.
    """
    centers, contexts = [], []
    for sentence in dataset:
        # A sentence needs at least two tokens to form a center/context pair.
        if len(sentence) < 2:
            continue
        for position, center in enumerate(sentence):
            radius = random.randint(1, max_window_size)
            left = sentence[max(0, position - radius):position]
            # Slicing stops at the end of the sentence on its own, so the
            # upper bound needs no explicit clamping.
            right = sentence[position + 1:position + radius + 1]
            centers.append(center)
            contexts.append(left + right)
    return centers, contexts

In [None]:
#centers, contexts = get_centers_and_contexts(dataset_subsample, 5)

For each pair of center word and context word, we use negative sampling to randomly select K noise words (negative samples). The sampling probability P(w) of a noise word w is proportional to the frequency of w raised to the power of 0.75. This means that more frequent words are more likely to be selected, but their selection probability grows sub-linearly with their frequency rather than linearly. This setting balances the sampling probabilities of high-frequency and low-frequency words, ensuring a reasonable mix of noise words during negative sampling.

In [None]:
def get_negatives(contexts, weight, K):
    """Draw ``K`` noise-token ids for every context token.

    Args:
        contexts: list of context lists (token ids), one per center word.
        weight: sequence of non-negative sampling weights; position ``j`` is
            the (unnormalized) probability of drawing token id ``j``.
        K: number of noise tokens to draw per context token.

    Returns:
        A list parallel to ``contexts``; entry ``k`` holds
        ``len(contexts[k]) * K`` token ids, none of which occurs in
        ``contexts[k]``.

    Note:
        The loop only terminates for a non-empty context if some token with a
        positive weight lies outside that context.
    """
    # Candidate ids are the same for every refill, so build them once.
    population = list(range(len(weight)))
    negatives_array = []
    neg_samples = []
    i = 0
    for context in contexts:
        # Build the exclusion set once per context rather than once per
        # candidate: noise words must not be context words.
        excluded = set(context)
        wanted = len(context) * K
        negatives = []
        while len(negatives) < wanted:
            if i == len(neg_samples):
                # Draw candidates in large batches; one call per candidate
                # would be far slower.
                neg_samples = random.choices(population, weight, k=int(1e5))
                i = 0
            candidate = neg_samples[i]
            i += 1
            if candidate not in excluded:
                negatives.append(candidate)
        negatives_array.append(negatives)
    return negatives_array

# Unnormalized noise-sampling weights: each token's count raised to the power
# 0.75, listed in token_array order so that position j belongs to token id j.
weight = [pow(token_counter[token], 0.75) for token in token_array]
#negatives_array = get_negatives(contexts, weight, 5)


In [None]:
import torch
from torch import nn
import torch.utils.data as Data
class Dataset(torch.utils.data.Dataset):
    """Indexable collection of (center, contexts, negatives) training triples.

    The three sequences passed to the constructor are stored as given and are
    indexed in parallel; the length is taken from the contexts sequence.
    """

    def __init__(self, centers, backgrounds, negative_samples):
        self.centers, self.backgrounds, self.negatives = (
            centers, backgrounds, negative_samples)

    def __len__(self):
        return len(self.backgrounds)

    def __getitem__(self, index):
        center = self.centers[index]
        background = self.backgrounds[index]
        negative = self.negatives[index]
        return (center, background, negative)
def batches(data):
    """Collate (center, context, negative) triples into padded tensors.

    Args:
        data: a re-iterable collection of ``(center, context, negative)``
            triples, where ``context`` and ``negative`` are lists of ids.

    Returns:
        A 4-tuple of tensors:
          * centers, reshaped to one column;
          * each row's context ids followed by its negative ids, right-padded
            with 0 to the longest row in the batch;
          * a mask that is 1 on real (context or negative) positions and 0 on
            padding;
          * a mask that is 1 on context positions only.
    """
    longest = max(len(context) + len(negative) for _, context, negative in data)
    centers = []
    contexts_negatives = []
    context_negative_masks = []
    context_masks = []
    for center, context, negative in data:
        n_context = len(context)
        n_real = n_context + len(negative)
        n_pad = longest - n_real
        centers.append(center)
        contexts_negatives.append(context + negative + [0] * n_pad)
        context_negative_masks.append([1] * n_real + [0] * n_pad)
        context_masks.append([1] * n_context + [0] * (longest - n_context))
    return (
        torch.tensor(centers).view(-1, 1),
        torch.tensor(contexts_negatives),
        torch.tensor(context_negative_masks),
        torch.tensor(context_masks),
    )

# Disabled smoke test, parked inside a bare string literal so it is not run:
# it would build the DataLoader and print the shape of each tensor in the
# first batch. As the last expression of the cell, the string itself is echoed.
'''dataset = Data.DataLoader(Dataset(centers, contexts, negatives_array), 512, shuffle=True,
                            collate_fn=batches,
                            num_workers=4)
for batch in dataset:
    for n, data in zip(['centers', 'contexts_negatives', 'context_negative_masks',
                           'context_masks'], batch):
        print(n, 'of shape:', data.shape)
    break
'''

"dataset = Data.DataLoader(Dataset(centers, contexts, negatives_array), 512, shuffle=True,\n                            collate_fn=batches,\n                            num_workers=4)\nfor batch in dataset:\n    for n, data in zip(['centers', 'contexts_negatives', 'context_negative_masks',\n                           'context_masks'], batch):\n        print(n, 'of shape:', data.shape)\n    break\n"


When the mask is 1, the predicted value and label of the corresponding position will participate in the calculation of the loss function; when the mask is 0, the predicted value and label of the corresponding position will not participate in the calculation of the loss function.

In [None]:
class SigmoidLoss(nn.Module):
    """Element-wise binary cross-entropy on logits, averaged over dim 1.

    An optional mask weights every element; positions whose mask is 0
    contribute nothing to the loss.
    """

    def __init__(self):
        super().__init__()

    def forward(self, inputs, targets, mask=None):
        """Return the per-row loss.

        Args:
            inputs: raw (pre-sigmoid) scores.
            targets: labels with the same shape as ``inputs``.
            mask: optional element weights with the same shape as ``inputs``;
                ``None`` (the default) weights every element equally.

        Returns:
            A tensor holding the mean of the weighted element-wise losses
            along dimension 1. The mean divides by the full row length,
            including masked-out positions.
        """
        # The signature advertises mask=None, so it must not be dereferenced
        # unconditionally; no weight means an unweighted loss.
        weight = None if mask is None else mask.float()
        losses = nn.functional.binary_cross_entropy_with_logits(
            inputs.float(), targets.float(), weight=weight, reduction="none")
        return losses.mean(dim=1)


sigmoid_loss = SigmoidLoss()


The `skip_gram` model's forward computation takes as input the center word indices (center) and the concatenated context and negative-sample indices (contexts_and_negatives). The contexts_and_negatives variable has the shape (batch size, max_len) and the center variable has the shape (batch size, 1). Both variables are first converted from word indices to word vectors by an embedding layer and are then combined with a batch matrix multiplication to produce an output with the shape (batch size, 1, max_len). Each element of the output is the inner product of the center word vector and a context word vector or negative-sample vector.

In [None]:
def skip_gram(center, contexts_and_negatives, embed_i, embed_j):
    """Score every context/negative token against its center token.

    Args:
        center: index tensor looked up with ``embed_i``.
        contexts_and_negatives: index tensor looked up with ``embed_j``.
        embed_i: callable mapping center indices to vectors.
        embed_j: callable mapping context/negative indices to vectors.

    Returns:
        The batched matrix product of the center vectors with the transposed
        context/negative vectors, i.e. one inner product per
        (center, context-or-negative) pair.
    """
    center_vectors = embed_i(center)
    other_vectors = embed_j(contexts_and_negatives)
    # Swap the last two axes so that bmm contracts over the embedding axis.
    return torch.bmm(center_vectors, other_vectors.transpose(1, 2))

Train the word embedding. Training will be faster if a GPU is used.

In [None]:
import time
def train(lr, epochs, dimension, window_size, negatives_num, batch_size=512, num_workers=4):
    """Train skip-gram embeddings with negative sampling.

    Builds the training triples from the module-level ``dataset_subsample``
    and ``weight``, prints the tensor shapes of one batch, then optimizes two
    embedding tables with Adam.

    Args:
        lr: learning rate for Adam.
        epochs: number of passes over the training data.
        dimension: embedding dimension.
        window_size: maximum context window size.
        negatives_num: number of noise words drawn per context word.
        batch_size: mini-batch size of the DataLoader.
        num_workers: number of DataLoader worker processes.

    Returns:
        ``(embedding, loss_array)``: ``embedding`` is an ``nn.Sequential``
        holding the center-word table at index 0 and the context-word table
        at index 1; ``loss_array`` is a list with one float per epoch, the
        mean batch loss of that epoch (the value that is also printed).
    """
    centers, contexts = get_centers_and_contexts(dataset_subsample, window_size)
    negatives_array = get_negatives(contexts, weight, negatives_num)
    dataset = Data.DataLoader(Dataset(centers, contexts, negatives_array), batch_size,
                              shuffle=True, collate_fn=batches, num_workers=num_workers)
    # Sanity check: show the tensor shapes of the first batch only.
    for batch in dataset:
        for n, tensor in zip(['centers', 'contexts_negatives', 'context_negative_masks',
                              'context_masks'], batch):
            print(n, 'of shape:', tensor.shape)
        break
    # nn.Sequential is used purely as a container for the two tables;
    # it is never called as a pipeline.
    embedding = nn.Sequential(
        nn.Embedding(num_embeddings=len(token_array), embedding_dim=dimension),
        nn.Embedding(num_embeddings=len(token_array), embedding_dim=dimension)
    )
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    embedding = embedding.to(device)
    optimizer = torch.optim.Adam(embedding.parameters(), lr=lr)
    loss_array = []
    for epoch in range(epochs):
        start = time.time()
        loss_sum = 0.0
        num = 0
        for batch in dataset:
            center, context_negative, context_negative_masks, context_masks = [d.to(device) for d in batch]

            pred = skip_gram(center, context_negative, embedding[0], embedding[1])
            # context_masks doubles as the label: 1 for context words,
            # 0 for noise words and padding.
            l = sigmoid_loss(pred.view(context_masks.shape), context_masks, context_negative_masks)
            # sigmoid_loss averages over the padded row length; rescale so
            # that each row is averaged over its non-padding positions only.
            loss = (l * context_negative_masks.shape[1] / context_negative_masks.float().sum(dim=1)).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_sum += loss.item()
            num += 1
        epoch_loss = loss_sum / num
        print('epoch %d, loss %.2f, time %.2fs'
              % (epoch + 1, epoch_loss, time.time() - start))
        # Record the epoch mean as a plain float; appending the last batch's
        # loss tensor would keep its autograd graph and device memory alive
        # and would not match the printed value.
        loss_array.append(epoch_loss)
    return embedding, loss_array


Set the learning rate to 0.01, the number of epochs to 10, the embedding dimension to 100, the maximum context window size to 5, and the number of negative samples per context word to 5.

In [None]:
# Positional arguments, in order: lr, epochs, dimension, window_size, negatives_num.
embedding, loss_array1 = train(0.01, 10, 100, 5, 5)



centers of shape: torch.Size([512, 1])
contexts_negatives of shape: torch.Size([512, 60])
context_negative_masks of shape: torch.Size([512, 60])
context_masks of shape: torch.Size([512, 60])
epoch 1, loss 1.78, time 73.94s
epoch 2, loss 0.53, time 72.73s
epoch 3, loss 0.41, time 61.10s
epoch 4, loss 0.37, time 65.51s
epoch 5, loss 0.35, time 78.47s
epoch 6, loss 0.34, time 76.10s
epoch 7, loss 0.33, time 75.54s
epoch 8, loss 0.32, time 67.55s
epoch 9, loss 0.32, time 60.48s
epoch 10, loss 0.31, time 62.08s


Find the most similar words using cosine similarity.

In [None]:
def get_similar_tokens(token, n, embedding):
    """Print the ``n`` tokens whose vectors are closest to ``token``.

    Closeness is the cosine similarity between rows of
    ``embedding.weight``. The best match is skipped on the assumption that
    it is the query token itself.

    Args:
        token: query token; must be a key of the module-level ``t_to_i``.
        n: number of neighbours to print.
        embedding: module exposing a ``weight`` matrix, one row per token id.
    """
    vectors = embedding.weight.data
    query = vectors[t_to_i[token]]
    # Product of the squared norms of every row and of the query; the small
    # constant keeps the denominator away from zero.
    squared_norms = torch.sum(vectors * vectors, dim=1) * torch.sum(query * query)
    cos_similarity = torch.matmul(vectors, query) / (squared_norms + 1e-9).sqrt()
    # Ask for one extra result because the first hit is dropped below.
    _, top_k_token = torch.topk(cos_similarity, k=n+1)
    for index in top_k_token[1:]:
        print('cosine similarity=%.3f: %s' % (cos_similarity[index], (token_array[index])))


get_similar_tokens('protein', 5, embedding[0])

cosine similarity=0.594: transcription
cosine similarity=0.552: intracellular
cosine similarity=0.544: expression
cosine similarity=0.526: ap-
cosine similarity=0.525: synthase


In [None]:
# Print the 5 nearest neighbours of 'injury' in the center-word table (embedding[0]).
get_similar_tokens('injury', 5, embedding[0])

cosine similarity=0.731: sport
cosine similarity=0.659: sprain
cosine similarity=0.627: athlete
cosine similarity=0.580: hnis
cosine similarity=0.573: competition


In [None]:
# Print the 5 nearest neighbours of 'depression' in the center-word table (embedding[0]).
get_similar_tokens('depression', 5, embedding[0])

cosine similarity=0.636: depressive
cosine similarity=0.586: anxiety
cosine similarity=0.540: psychiatric
cosine similarity=0.531: self-esteem
cosine similarity=0.527: membership


In [None]:
# Print the 5 nearest neighbours of 'nerve' in the center-word table (embedding[0]).
get_similar_tokens('nerve', 5, embedding[0])

cosine similarity=0.491: stimulation
cosine similarity=0.480: spinal
cosine similarity=0.477: ipsilateral
cosine similarity=0.474: bglum
cosine similarity=0.464: parieto-occipital


In [None]:
# Print the 5 nearest neighbours of 'weight' in the center-word table (embedding[0]).
get_similar_tokens('weight', 5, embedding[0])

cosine similarity=0.503: loss
cosine similarity=0.477: bmi
cosine similarity=0.461: body
cosine similarity=0.460: normoglycemic
cosine similarity=0.459: waist


In [None]:
# Print the 5 nearest neighbours of 'cancer' in the center-word table (embedding[0]).
get_similar_tokens('cancer', 5, embedding[0])

cosine similarity=0.713: breast
cosine similarity=0.502: radiotherapy
cosine similarity=0.487: squamous
cosine similarity=0.478: prostate
cosine similarity=0.477: non-inflammatory


In [None]:
# Print the 5 nearest neighbours of 'headache' in the center-word table (embedding[0]).
get_similar_tokens('headache', 5, embedding[0])

cosine similarity=0.607: vertigo
cosine similarity=0.591: migraine
cosine similarity=0.554: epilepsy
cosine similarity=0.498: tth
cosine similarity=0.493: attack


In [None]:
# Print the 5 nearest neighbours of 'diabetes' in the center-word table (embedding[0]).
get_similar_tokens('diabetes', 5, embedding[0])

cosine similarity=0.601: mellitus
cosine similarity=0.573: hypertension
cosine similarity=0.506: cardiovascular
cosine similarity=0.479: lipoprotein
cosine similarity=0.478: diabetic


In [None]:
# Print the 5 nearest neighbours of 'brain' in the center-word table (embedding[0]).
get_similar_tokens('brain', 5, embedding[0])

cosine similarity=0.584: neuron
cosine similarity=0.564: pathology
cosine similarity=0.543: cerebral
cosine similarity=0.534: synaptic
cosine similarity=0.531: nucleus


In [None]:
# Print the 5 nearest neighbours of 'heart' in the center-word table (embedding[0]).
get_similar_tokens('heart', 5, embedding[0])

cosine similarity=0.519: valvular
cosine similarity=0.512: cardiac
cosine similarity=0.510: fibrillation
cosine similarity=0.508: arrhythmia
cosine similarity=0.498: ecg


In [None]:
# Print the 5 nearest neighbours of 'autism' in the center-word table (embedding[0]).
get_similar_tokens('autism', 5, embedding[0])

cosine similarity=0.553: asd
cosine similarity=0.523: developmental
cosine similarity=0.515: social
cosine similarity=0.512: dsed
cosine similarity=0.494: irritability
