# Load dataset from hugging face

In [1]:
!python3 --version

Python 3.12.2


In [2]:
from datasets import load_dataset
import os

max_threads = os.cpu_count()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dataset = load_dataset("bookcorpus", num_proc=max_threads)

In [4]:
dataset["train"][0]

{'text': 'usually , he would be tearing around the living room , playing with his toys .'}

# Tokenize dataset (with lemmatization) and create vocab

In [5]:
from nltk.tokenize import WordPunctTokenizer
import pickle
import torch

In [6]:
tokenizer = WordPunctTokenizer()

print(tokenizer.tokenize(dataset["train"][0]['text']))

['usually', ',', 'he', 'would', 'be', 'tearing', 'around', 'the', 'living', 'room', ',', 'playing', 'with', 'his', 'toys', '.']


In [7]:
def preprocess_data(samples):
  return {
      "processed_text": [
       list(filter(lambda tok: all('a' <= x.lower() <= 'z' for x in tok), tokenizer.tokenize(text)))
       for text in samples["text"]
      ]
  }

In [8]:
tokenized_dataset = dataset["train"].map(preprocess_data, batched=True, num_proc=max_threads)

In [9]:
tokenized_dataset[0]

{'text': 'usually , he would be tearing around the living room , playing with his toys .',
 'processed_text': ['usually',
  'he',
  'would',
  'be',
  'tearing',
  'around',
  'the',
  'living',
  'room',
  'playing',
  'with',
  'his',
  'toys']}

### Build vocabulary for dataset

In [10]:
# vocab = set()

# def build_vocab(sample):
#     global vocab
#     vocab.update(set(sample["processed_text"]))

In [11]:
# tokenized_dataset.map(build_vocab)

# word2tokenId = {word: idx for idx, word in enumerate(vocab)}
# tokenId2word = {idx: word for idx, word in enumerate(vocab)}

In [12]:
# with open('word2tokenId.pickle', 'wb') as handle:
#     pickle.dump(word2tokenId, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('tokenId2word.pickle', 'wb') as handle:
#     pickle.dump(tokenId2word, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Transform text to token ids

In [20]:
with open('word2tokenId.pickle', 'rb') as handle:
    word2tokenId = pickle.load(handle)

In [21]:
def tokenize_texts(samples):
  return {
      "tokenized_text": [
                [
                    word2tokenId[word] for word in text
                ] 
                for text in samples['processed_text']
      ]
  }

In [22]:
tokenized_dataset = tokenized_dataset.map(tokenize_texts, batched=True, num_proc=max_threads)

In [23]:
tokenized_dataset

Dataset({
    features: ['text', 'processed_text', 'tokenized_text'],
    num_rows: 74004228
})

# Word2Vec Class (Pytorch)

<img src='img1.jpg' width=900, heigth=600>

<img src='img2.jpg' width=900, heigth=600>

<img src='img3.jpg' width=900, heigth=600>

<img src='img4.jpg' width=900, heigth=600>

In [24]:
from tqdm import trange, tqdm
import numpy as np
from tqdm.contrib.concurrent import process_map
from multiprocessing import cpu_count
import multiprocessing as mp

In [25]:
# распараллелить подсчёт лосса для батча предложений
# аккумулировать лосс

In [26]:
def batch_generator(dataset, batch_size):
    for i in trange(len(dataset)//batch_size + 1):
        yield dataset[i*batch_size:(i+1)*batch_size]['tokenized_text']


In [27]:
class Word2Vec:
    def __init__(self, 
                 embeds_size, 
                 ctx_window_size, 
                 negative_samples_count=10, 
                 path_to_vocab='tokenId2word.pickle'
                ):
        with open(path_to_vocab, 'rb') as handle:
            self.vocab = pickle.load(handle)
        self.vocab_size = len(self.vocab)
        self.ctx_window_size = ctx_window_size
        self.negative_samples_count = negative_samples_count
        self.central_embeddings = torch.normal(mean=0, std=1, size=(self.vocab_size, embeds_size), device=torch.device('cuda:0'), requires_grad=True)
        self.context_embeddings = torch.normal(mean=0, std=1, size=(self.vocab_size, embeds_size), device=torch.device('cuda:0'), requires_grad=True)

    def sentence_process(self, sentence, lr=0.01):
        sentence = sentence['tokenized_text']
        for c in range(len(sentence)):
            for o in range(max(c - self.ctx_window_size//2, 0), min(c + self.ctx_window_size//2, len(sentence)-1)):
                context_embeddings.grad.zero_()
                central_embeddings.grad.zero_()

                negative_samples = np.random.randint(0, high=self.vocab_size, size=self.negative_samples_count)
                loss = -(self.context_embeddings[sentence[o]].T@self.central_embeddings[sentence[c]]) + torch.log(torch.sum(torch.exp(self.context_embeddings[negative_samples]@self.central_embeddings[sentence[c]])))
                
                loss.backward()
                
                with torch.no_grad():
                    self.context_embeddings[sentence[o]] -= lr*self.context_embeddings.grad[sentence[o]]
                    self.central_embeddings[sentence[c]] -= lr*self.central_embeddings.grad[sentence[c]]
                    for w in negative_samples:
                        self.context_embeddings[w] -= lr*self.context_embeddings.grad[w]
    
    def fit(self, tokenized_dataset, lr=0.01):
        # process_map(self.sentence_process, tokenized_dataset, max_workers=max_threads, chunksize=1)
        for sentence in tqdm(tokenized_dataset):
            sentence = sentence['tokenized_text']
            for c in range(len(sentence)):
                for o in range(max(c - self.ctx_window_size//2, 0), min(c + self.ctx_window_size//2, len(sentence)-1)):
                    negative_samples = np.random.randint(0, high=self.vocab_size, size=self.negative_samples_count)
                    loss = -(self.context_embeddings[sentence[o]].T@self.central_embeddings[sentence[c]]) + torch.log(torch.sum(torch.exp(self.context_embeddings[negative_samples]@self.central_embeddings[sentence[c]])))
                    
                    loss.backward()
                    
                    with torch.no_grad():
                        self.context_embeddings[sentence[o]] -= lr*self.context_embeddings.grad[sentence[o]]
                        self.central_embeddings[sentence[c]] -= lr*self.central_embeddings.grad[sentence[c]]
                        for w in negative_samples:
                            self.context_embeddings[w] -= lr*self.context_embeddings.grad[w]

                    self.context_embeddings.grad.zero_()
                    self.central_embeddings.grad.zero_()
                

In [28]:
word2vec = Word2Vec(100, ctx_window_size=5)

In [29]:
tokenized_dataset

Dataset({
    features: ['text', 'processed_text', 'tokenized_text'],
    num_rows: 74004228
})

In [30]:
word2vec.fit(tokenized_dataset)

  loss = -(self.context_embeddings[sentence[o]].T@self.central_embeddings[sentence[c]]) + torch.log(torch.sum(torch.exp(self.context_embeddings[negative_samples]@self.central_embeddings[sentence[c]])))
  0%|                                                                         | 58/74004228 [00:55<19689:35:15,  1.04it/s]


KeyboardInterrupt: 

In [None]:
# def fit(self, tokenized_dataset, batch_size, lr=0.01):
#     for batch in batch_generator(tokenized_dataset, batch_size=batch_size):
#         for sentence in batch:
#             for c in range(len(sentence)):
#                 for o in range(max(c - ctx_window_size//2, 0), min(c + ctx_window_size//2, len(sentence)-1)):
#                     context_embeddings.grad.zero_()
#                     central_embeddings.grad.zero_()

#                     negative_samples = np.random.randint(0, high=self.vocab_size, size=negative_samples_count)
#                     loss = -(self.context_embeddings[sentence[o]].T@self.central_embeddings[sentence[c]]) + torch.log(torch.sum(torch.exp(self.context_embeddings[negative_samples]@self.central_embeddings[sentence[c]])))
                    
#                     loss.backward()
                    
#                     self.context_embeddings[sentence[o]] -= lr*self.context_embeddings.grad[sentence[o]]
#                     self.central_embeddings[sentence[c]] -= lr*self.central_embeddings.grad[sentence[c]]
#                     for w in negative_samples:
#                         self.context_embeddings[w] -= lr*self.context_embeddings.grad[w]