In [1]:
from datasets import load_dataset
import os

# Worker-process count passed as `num_proc=` to the dataset calls below.
# Note: os.cpu_count() returns None when the CPU count cannot be determined.
max_threads = os.cpu_count()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "bookcorpus" dataset; `num_proc` presumably parallelises the
# download/preparation step across `max_threads` processes (confirm in the
# `datasets` docs).
dataset = load_dataset("bookcorpus", num_proc=max_threads)

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 74004228
    })
})

In [4]:
dataset['train'][0]

{'text': 'usually , he would be tearing around the living room , playing with his toys .'}

In [5]:
from nltk.tokenize import WordPunctTokenizer
import pickle
import torch

# NLTK tokenizer that emits punctuation marks as separate tokens
# (see the printed sample below).
tokenizer = WordPunctTokenizer()

# Sanity check: tokenize the first training sentence.
print(tokenizer.tokenize(dataset["train"][0]['text']))

['usually', ',', 'he', 'would', 'be', 'tearing', 'around', 'the', 'living', 'room', ',', 'playing', 'with', 'his', 'toys', '.']


In [6]:
def preprocess_data(samples):
    """Tokenize a batch of sentences and keep only purely alphabetic tokens.

    Args:
        samples: batch dict whose "text" entry is a list of raw sentences
            (the shape produced by ``Dataset.map(..., batched=True)``).

    Returns:
        A dict with a single key, "processed_text": for each input sentence,
        the list of tokens in which every character lowercases into the
        range 'a'..'z'. Punctuation, digits and mixed tokens are dropped.
    """
    def is_plain_word(token):
        # A token is rejected as soon as one character falls outside a-z.
        return not any(not ('a' <= ch.lower() <= 'z') for ch in token)

    cleaned = []
    for sentence in samples["text"]:
        words = [tok for tok in tokenizer.tokenize(sentence) if is_plain_word(tok)]
        cleaned.append(words)
    return {"processed_text": cleaned}

# Run the filter over the whole train split in batches; this adds a
# "processed_text" column next to the original "text" column.
tokenized_dataset = dataset["train"].map(preprocess_data, batched=True, num_proc=max_threads)

In [7]:
tokenized_dataset[0]

{'text': 'usually , he would be tearing around the living room , playing with his toys .',
 'processed_text': ['usually',
  'he',
  'would',
  'be',
  'tearing',
  'around',
  'the',
  'living',
  'room',
  'playing',
  'with',
  'his',
  'toys']}

# Transform text to token ids

In [8]:
import pickle

In [9]:
# Load the word -> token-id mapping used by `tokenize_texts`.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('word2tokenId.pickle', 'rb') as handle:
    word2tokenId = pickle.load(handle)

In [10]:
def tokenize_texts(samples):
    """Convert every word of every pre-processed sentence to its token id.

    Args:
        samples: batch dict whose 'processed_text' entry is a list of
            sentences, each sentence being a list of words.

    Returns:
        A dict with a single key, "tokenized_text": one list of ids per
        sentence, in the same order as the input words.

    Raises:
        KeyError: if a word has no entry in ``word2tokenId`` (assuming it is
            a plain dict).
    """
    encoded = []
    for words in samples['processed_text']:
        ids = []
        for word in words:
            ids.append(word2tokenId[word])
        encoded.append(ids)
    return {"tokenized_text": encoded}

In [11]:
# Add a "tokenized_text" column holding the token ids of "processed_text".
tokenized_dataset = tokenized_dataset.map(tokenize_texts, batched=True, num_proc=max_threads)

In [12]:
tokenized_dataset[0]

{'text': 'usually , he would be tearing around the living room , playing with his toys .',
 'processed_text': ['usually',
  'he',
  'would',
  'be',
  'tearing',
  'around',
  'the',
  'living',
  'room',
  'playing',
  'with',
  'his',
  'toys'],
 'tokenized_text': [9424,
  82531,
  230181,
  318116,
  431394,
  340779,
  418995,
  479022,
  431560,
  510338,
  382838,
  593799,
  405334]}

# Word2Vec Class (PyTorch)

<img src='img1.jpg' width=900 height=600>

<img src='img2.jpg' width=900 height=600>

<img src='img3.jpg' width=900 height=600>

<img src='img4.jpg' width=900 height=600>

In [13]:
from tqdm import trange, tqdm
import numpy as np
from tqdm.contrib.concurrent import process_map
from multiprocessing import cpu_count
import multiprocessing as mp

In [14]:
class Word2Vec:
    """Skip-gram style word embeddings trained with sampled negatives.

    Two embedding tables are kept, both of shape (vocab_size, embeds_size):
    ``central_embeddings`` is read when a token is the centre of a window,
    ``context_embeddings`` when it is a neighbour or a negative sample.
    """

    def __init__(self,
                 embeds_size,
                 ctx_window_size,
                 negative_samples_count=10,
                 path_to_vocab='tokenId2word.pickle',
                 device=None
                ):
        """Load the vocabulary and randomly initialise both embedding tables.

        Args:
            embeds_size: dimensionality of every embedding vector.
            ctx_window_size: number of neighbours considered on EACH side of
                the centre token.
            negative_samples_count: number of token ids drawn uniformly at
                random for every (centre, context) pair.
            path_to_vocab: path to a pickled vocabulary; only its length is
                used here (as the number of embedding rows).
            device: torch device (or device string) for the tables. When
                None, 'cuda:0' is used if CUDA is available, otherwise 'cpu'.
        """
        # NOTE: pickle.load can execute arbitrary code; only use trusted files.
        with open(path_to_vocab, 'rb') as handle:
            self.vocab = pickle.load(handle)
        self.vocab_size = len(self.vocab)
        self.ctx_window_size = ctx_window_size
        self.negative_samples_count = negative_samples_count
        if device is None:
            # Keep the GPU as the default, but stay usable on CPU-only hosts.
            device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)
        table_shape = (self.vocab_size, embeds_size)
        self.central_embeddings = torch.normal(mean=0.0, std=1.0, size=table_shape, device=self.device, requires_grad=True)
        self.context_embeddings = torch.normal(mean=0.0, std=1.0, size=table_shape, device=self.device, requires_grad=True)

    def _context_ids(self, token_ids, center_pos):
        """Return the ids within ``ctx_window_size`` of ``center_pos``, centre excluded."""
        start = max(0, center_pos - self.ctx_window_size)
        stop = min(len(token_ids), center_pos + self.ctx_window_size + 1)
        return list(token_ids[start:center_pos]) + list(token_ids[center_pos + 1:stop])

    def _sgd_step(self, center_id, context_id, lr):
        """Apply one SGD update for a single (centre, context) pair.

        The loss is ``J = -u.v + log(sum_k exp(u_k.v))`` where ``v`` is the
        centre vector, ``u`` the context vector and ``u_k`` the vectors of
        the freshly drawn negative samples.
        """
        negative_ids = np.random.randint(0, self.vocab_size, size=self.negative_samples_count)
        # Row 0 is the true context token, the remaining rows are negatives.
        row_ids = torch.as_tensor(
            np.concatenate(([context_id], negative_ids)),
            dtype=torch.long,
            device=self.context_embeddings.device,
        )

        # Differentiate w.r.t. copies of the touched rows only, so no dense
        # (vocab_size x embeds_size) gradient has to be built and zeroed.
        center_vec = self.central_embeddings[center_id].detach().clone().requires_grad_(True)
        rows = self.context_embeddings[row_ids].detach().clone().requires_grad_(True)

        # logsumexp is the overflow-safe form of log(sum(exp(.))).
        loss = -torch.dot(rows[0], center_vec) + torch.logsumexp(rows[1:] @ center_vec, dim=0)
        grad_center, grad_rows = torch.autograd.grad(loss, (center_vec, rows))

        with torch.no_grad():
            self.central_embeddings[center_id] -= lr * grad_center
            # index_add_ sums the contributions of repeated ids, so a row
            # that occurs several times still receives exactly one step
            # along its total gradient.
            self.context_embeddings.index_add_(0, row_ids, -lr * grad_rows)

    def fit(self, tokenized_dataset, lr=0.01):
        """Run one pass of per-pair SGD over ``tokenized_dataset``.

        Args:
            tokenized_dataset: iterable of rows; ``row['tokenized_text']``
                must be a sequence of integer token ids.
            lr: learning rate of the plain SGD update.
        """
        for text in tqdm(tokenized_dataset):
            token_ids = text['tokenized_text']
            for i, center_id in enumerate(token_ids):
                for context_id in self._context_ids(token_ids, i):
                    self._sgd_step(center_id, context_id, lr)

In [None]:
# 50-dimensional embeddings, window parameter 5, default 10 negative samples.
w2v = Word2Vec(embeds_size=50, ctx_window_size=5)

# One pass over the whole tokenized dataset with the default learning rate.
w2v.fit(tokenized_dataset)