In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

In [2]:
# Check if CUDA (GPU) is available

device = torch.device("mps")
device

device(type='mps')

# Dataset

In [3]:
!pip install -q datasets

In [4]:
from datasets import load_dataset

data = load_dataset("CohleM/english-to-nepali")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
data

DatasetDict({
    train: Dataset({
        features: ['en', 'ne'],
        num_rows: 177334
    })
})

In [6]:
data['train']['en'][101], data['train']['ne'][101]

('5. A further step under current study is the advancement to university level of the School of Engineering discussed in the following chapter on training.',
 'यस कृषि स्कूलमा क्रमशः चाहिने उपयुक्त शाखाहरु जस्तो कृषि, रसायन, अनाज र पशुको रोगसम्बन्धी अनुसन्धान केन्द्र र प्रयोगशाला इत्यादि समावेश गरिनेछन्।')

In [7]:
eng_data = data['train']['en'][:50000]
nep_data = data['train']['ne'][:50000]

eng_corpus = " ".join(eng_data)
nep_corpus = " ".join(nep_data)



In [8]:
len(eng_corpus)
len(nep_corpus)

7202948

# Tokenizer

In [9]:
class BPETokenizer:

    def __init__(self, text, vocab_size = 300):
        tokens = text.encode("utf-8")
        tokens = list(map(int, tokens))
        self.merges = self.create_merges(tokens,vocab_size)
        self.vocab = {idx: bytes([idx]) for idx in range(256)}
        for (p0, p1), idx in self.merges.items():
            self.vocab[idx] = self.vocab[p0] + self.vocab[p1]

    def get_stats(self, ids):
        counts = {}
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def merge(self, ids, pair, idx):
        newids = []
        i = 0
        while i < len(ids):
          if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
            newids.append(idx)
            i += 2
          else:
            newids.append(ids[i])
            i += 1
        return newids

    def create_merges(self, ids, vocab_size):
        num_merges = vocab_size - 256
        merges = {}
        for i in range(num_merges):
          stats = self.get_stats(ids)
          pair = max(stats, key=stats.get)
          idx = 256 + i
          print(f"merging {pair} into a new token {idx}")
          ids = self.merge(ids, pair, idx)
          merges[pair] = idx
        return merges


    def encode(self, text):
        tokens = list(text.encode("utf-8"))
        while len(tokens) >= 2:
          stats = self.get_stats(tokens)
          pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
          if pair not in self.merges:
            break # nothing else can be merged
          idx = self.merges[pair]
          tokens = self.merge(tokens, pair, idx)
        return tokens

    def decode(self, ids):
        # given ids (list of integers), return Python string
        tokens = b"".join(self.vocab[idx] for idx in ids)
        text = tokens.decode("utf-8", errors="replace")
        return text

In [None]:
eng_tokenizer = BPETokenizer(eng_corpus, 1500)
nep_tokenizer = BPETokenizer(nep_corpus, 1500)

In [14]:
import pickle

In [15]:
# Saving object as pickle
with open("Saved/eng_tokenizer_50k.pkl", "wb") as file:
    pickle.dump(eng_tokenizer, file)

with open("Saved/nep_tokenizer_50k.pkl", "wb") as file:
    pickle.dump(nep_tokenizer, file)

In [None]:
vocab = nep_tokenizer.vocab

string_dict = {key: value.decode('utf-8', errors='replace') for key, value in vocab.items()}

# Print the resulting dictionary
for key, val in string_dict.items():
  print(key, val)