# **Environment setup**

In [4]:
!pip install gensim



In [5]:
!pip install datasets



# **Import**

In [15]:
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch.utils.data import TensorDataset, random_split, DataLoader
import torch.nn as nn
from torch import optim

from gensim.models import Word2Vec

from datasets import load_dataset
from tokenizers import Tokenizer
import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

# **Dataset**

In [2]:
ds = load_dataset("arshiaafshani/persian-natural-fluently")
ds

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/729 [00:00<?, ?B/s]

fluently-fa.json:   0%|          | 0.00/437k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/733 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 733
    })
})

In [3]:
n = 1000
ds['train']['output']

['خورشید در واقع حرکت نمی\u200cکنه، بلکه این زمینه که دور خورشید می\u200cچرخه. زمین هر ۲۴ ساعت یک بار دور خودش می\u200cچرخه و این باعث می\u200cشه که ما احساس کنیم خورشید در آسمون حرکت می\u200cکنه. وقتی زمین می\u200cچرخه، قسمت\u200cهای مختلفش رو به سمت خورشید می\u200cگیره و اینطوری روز و شب ایجاد می\u200cشه. پس این حرکت ظاهری خورشید به خاطر چرخش زمینه.',
 'ستاره\u200cها به خاطر جو زمین چشمک می\u200cزنن. وقتی نور ستاره\u200cها از جو زمین رد می\u200cشه، با لایه\u200cهای مختلف هوا برخورد می\u200cکنه. این لایه\u200cها دما و چگالی متفاوتی دارن و باعث می\u200cشن نور ستاره شکسته بشه. این شکست نور باعث می\u200cشه که ما ستاره\u200cها رو به صورت چشمک\u200cزن ببینیم. اگر از جو زمین خارج بشیم، ستاره\u200cها ثابت به نظر می\u200cرسن.',
 'برف سفیده چون نور خورشید رو به طور کامل منعکس می\u200cکنه. هر دانه برف از کریستال\u200cهای یخ تشکیل شده که سطحشون صاف و شفافه. وقتی نور به این کریستال\u200cها برخورد می\u200cکنه، در همه جهات پخش می\u200cشه و به چشم ما بازتابیده می\u200cشه. این بازتاب کامل نور باعث می

# **Tokenizer**

In [4]:
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
tokenizer

Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[], normalizer=None, pre_tokenizer=None, post_processor=None, decoder=None, model=BPE(dropout=None, unk_token="[UNK]", continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={}, merges=[]))

In [5]:
trainer = trainers.BpeTrainer(
    vocab_size=500,  # Set the vocabulary size
    special_tokens=["[UNK]"],
    min_frequency=2,  # Set the minimum frequency of tokens
)


In [6]:
tokenizer.train_from_iterator(ds['train'][:]['output'], trainer)


In [7]:
tokenizer.save("bpe_tokenizer.json")
tokenizer.encode("کره زمین میچرخه دور خورشیدت").tokens

['ک',
 'ر',
 'ه ',
 'زم',
 'ی',
 'ن',
 ' می',
 'چ',
 'ر',
 'خ',
 'ه ',
 'دو',
 'ر ',
 'خ',
 'ور',
 'شی',
 'د',
 'ت']

In [8]:
vocab = tokenizer.get_vocab()
sorted_vocab = dict(sorted(vocab.items(), key=lambda item: item[1]))  # Sort by token IDs
print(sorted_vocab)

{'[UNK]': 0, '\n': 1, ' ': 2, '!': 3, '"': 4, '#': 5, '$': 6, '%': 7, '&': 8, "'": 9, '(': 10, ')': 11, '*': 12, '+': 13, ',': 14, '-': 15, '.': 16, '/': 17, '0': 18, '1': 19, '2': 20, '3': 21, '4': 22, '5': 23, '6': 24, '7': 25, '8': 26, '9': 27, ':': 28, ';': 29, '<': 30, '=': 31, '>': 32, '?': 33, '@': 34, 'A': 35, 'B': 36, 'C': 37, 'D': 38, 'E': 39, 'F': 40, 'G': 41, 'H': 42, 'I': 43, 'J': 44, 'K': 45, 'L': 46, 'M': 47, 'N': 48, 'O': 49, 'P': 50, 'Q': 51, 'R': 52, 'S': 53, 'T': 54, 'U': 55, 'V': 56, 'W': 57, 'X': 58, 'Y': 59, '[': 60, '\\': 61, ']': 62, '^': 63, '_': 64, '`': 65, 'a': 66, 'b': 67, 'c': 68, 'd': 69, 'e': 70, 'f': 71, 'g': 72, 'h': 73, 'i': 74, 'j': 75, 'k': 76, 'l': 77, 'm': 78, 'n': 79, 'o': 80, 'p': 81, 'q': 82, 'r': 83, 's': 84, 't': 85, 'u': 86, 'v': 87, 'w': 88, 'x': 89, 'y': 90, 'z': 91, '{': 92, '|': 93, '}': 94, '~': 95, '«': 96, '°': 97, '±': 98, '²': 99, '³': 100, '¹': 101, '»': 102, '×': 103, '÷': 104, 'Δ': 105, 'Θ': 106, 'Σ': 107, 'Φ': 108, 'α': 109, 'θ'

# **Prepare data**

In [9]:
data = ds['train'][:]['output']

tokenized_data = [tokenizer.encode(sample).ids for sample in data]
print(tokenized_data[0])

[128, 310, 417, 274, 312, 343, 142, 373, 127, 131, 156, 264, 145, 144, 254, 259, 146, 115, 247, 143, 315, 304, 348, 158, 145, 384, 355, 258, 128, 310, 417, 129, 261, 154, 131, 128, 146, 267, 348, 158, 250, 146, 258, 161, 163, 2, 364, 139, 264, 291, 299, 258, 355, 258, 391, 134, 261, 154, 131, 128, 248, 256, 251, 145, 299, 139, 125, 397, 156, 146, 246, 257, 122, 127, 133, 303, 433, 158, 276, 128, 310, 417, 274, 312, 119, 133, 144, 288, 127, 131, 156, 124, 297, 146, 267, 431, 348, 158, 145, 261, 154, 131, 128, 409, 142, 133, 144, 124, 322, 246, 383, 143, 141, 342, 290, 272, 133, 144, 264, 128, 310, 417, 129, 261, 418, 248, 256, 251, 145, 137, 310, 249, 290, 132, 316, 134, 357, 251, 126, 265, 338, 267, 472, 304, 127, 131, 156, 264, 138, 420, 131, 249, 128, 310, 417, 129, 272, 128, 122, 137, 258, 154, 131, 128, 342, 348, 158, 365, 16]


In [10]:
window_size = 2  # Number of tokens to the left/right to consider as context

# Generate Skip-Gram pairs
skip_gram_pairs = []
for tokens in tokenized_data:
    for i, target_token in enumerate(tokens):
        for j in range(max(0, i - window_size), min(len(tokens), i + window_size + 1)):
            if i != j:  # Avoid pairing the target token with itself
                skip_gram_pairs.append((target_token, tokens[j]))

# Convert pairs to PyTorch tensors
skip_gram_pairs = torch.tensor(skip_gram_pairs, dtype=torch.long)
print(skip_gram_pairs.shape)

torch.Size([379906, 2])


In [11]:
skip_gram_pairs[:2]
tokenizer.id_to_token(2445), tokenizer.id_to_token(1117), tokenizer.id_to_token(4707)

(None, None, None)

In [12]:
train_set = TensorDataset(skip_gram_pairs[:, 0], skip_gram_pairs[:, 1])#central word, contex word
train_set, valid_set = random_split(train_set, [0.8, 0.2])

len(train_set), len(valid_set)

(303925, 75981)

In [13]:
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=128, shuffle=False)

len(train_loader), len(valid_loader)

(2375, 594)

In [14]:
next(iter(train_loader))

[tensor([344,   1, 274,  78,  81,  66, 410, 250,  32, 333, 250, 169, 270, 134,
         169, 405,   2, 164,  73, 126, 153, 329,  79, 462, 158, 324, 311, 304,
           2, 405, 124, 127,  19, 335, 248, 145, 447, 302, 248, 122, 160, 271,
         158, 124,  10, 447, 263,  22, 144, 144, 374, 129, 270, 122,  11, 197,
         247, 160,  15, 142, 129, 169, 292, 275, 271, 273, 146, 144, 248, 440,
         183, 366, 311, 129, 169,  10, 182, 275, 146, 278,   2,  77,  43, 294,
         251, 157, 260, 134, 308, 401,  57,  14, 366, 226, 369, 253, 246, 126,
         184, 156,  80, 271, 251,  15, 279, 301, 255, 301,  64,   1, 259,   2,
           2,  76, 466,  85, 485,  71, 385, 141,  35, 135, 321,  84,   3, 146,
          25, 253]),
 tensor([312, 293, 129, 255,  85,  67, 156, 290,  66, 154, 348, 354, 146, 133,
         350,  84, 251, 160, 302, 132, 388, 250,  92,  42, 295, 449, 128, 403,
         424, 302, 290, 137,  89, 403, 293, 156, 260,  84, 259, 124, 451, 273,
         266, 494,  84, 145, 16

# **Word2Vec model**

In [16]:
class SkipGram(nn.Module):

    def __init__(self, vocab_size, embedding_dim):
        super(SkipGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target_word):
        embedded = self.embeddings(target_word)
        output = self.linear(embedded)
        return output

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Hyperparameters
embedding_dim = 100  # Size of the word embeddings
learning_rate = 0.01
num_epochs = 10
vocab_size = tokenizer.get_vocab_size()

# Initialize model, loss, and optimizer
model = SkipGram(vocab_size, embedding_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=0.001)

sum([param.numel() for param in model.parameters()]) / 1e6

0.1005

In [None]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [None]:

def train_one_epoch(model, train_loader, optimizer, loss_fn=criterion, epoch=None):
  model.train()
  loss_train = AverageMeter()

  with tqdm.tqdm(train_loader, unit='batch') as tepoch:
    for inputs, targets in tepoch:
      if epoch:
        tepoch.set_description(f'Epoch {epoch}')

      inputs = inputs.to(device)
      targets = targets.to(device)

      outputs = model(inputs)

      loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), targets.flatten())

      loss.backward()

      optimizer.step()
      optimizer.zero_grad()

      loss_train.update(loss.item(), n=len(targets))


      tepoch.set_postfix(loss=loss_train.avg)
  return model, loss_train.avg

In [None]:
epochs = 50

for epoch in range(epochs):
  model, loss_ = train_one_epoch(model, train_loader, optimizer, criterion, epoch)


100%|██████████| 2375/2375 [00:07<00:00, 299.76batch/s, loss=6.2]
Epoch 1: 100%|██████████| 2375/2375 [00:10<00:00, 216.90batch/s, loss=5.91]
Epoch 2: 100%|██████████| 2375/2375 [00:11<00:00, 215.50batch/s, loss=5.71]
Epoch 3: 100%|██████████| 2375/2375 [00:11<00:00, 207.43batch/s, loss=5.57]
Epoch 4: 100%|██████████| 2375/2375 [00:10<00:00, 222.55batch/s, loss=5.47]
Epoch 5: 100%|██████████| 2375/2375 [00:10<00:00, 216.40batch/s, loss=5.39]
Epoch 6: 100%|██████████| 2375/2375 [00:11<00:00, 215.45batch/s, loss=5.33]
Epoch 7: 100%|██████████| 2375/2375 [00:11<00:00, 201.55batch/s, loss=5.28]
Epoch 8: 100%|██████████| 2375/2375 [00:10<00:00, 222.37batch/s, loss=5.24]
Epoch 9: 100%|██████████| 2375/2375 [00:10<00:00, 218.27batch/s, loss=5.21]
Epoch 10: 100%|██████████| 2375/2375 [00:11<00:00, 212.87batch/s, loss=5.18]
Epoch 11: 100%|██████████| 2375/2375 [00:11<00:00, 212.87batch/s, loss=5.16]
Epoch 12: 100%|██████████| 2375/2375 [00:10<00:00, 221.01batch/s, loss=5.14]
Epoch 13: 100%|████

In [None]:
# Get the embeddings for all tokens
embeddings = model.embeddings.weight.data

# Example: Get the embedding for a specific token
token = "خورشید"
if token in vocab:
    token_id = vocab[token]
    token_embedding = embeddings[token_id]
    print(f"Embedding for '{token}': {token_embedding}")
else:
    print(f"'{token}' not in vocabulary")

'خورشید' not in vocabulary


In [None]:
torch.save(model, 'model.pt')