In [1]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers
from tokenizers.normalizers import Sequence, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer


In [2]:
tokenizer = Tokenizer(models.BPE())

In [3]:
tokenizer.normalizer = Sequence([ NFD()])

In [4]:
tokenizer.pre_tokenizer = Whitespace()

In [5]:
trainer = BpeTrainer( vocab_size=8000, min_frequency=2, show_progress=True, special_tokens=["<pad>", "<unk>", "<s>", "</s>", "<mask>"] )

In [1]:
files = ["wikipedia_articles_cleaned_10MB.txt"]

In [7]:
tokenizer.train(files, trainer)

In [8]:
tokenizer.save("bpe_tokenizer.json")

In [10]:
from transformers import GPT2TokenizerFast
from torch.utils.data import Dataset, DataLoader
import torch

# Load tokenizer
tokenizer = GPT2TokenizerFast(tokenizer_file="bpe_tokenizer.json")

# Load your text
with open("hindi_wikipedia_corpus_cleaned.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Encode the full text
tokens = tokenizer.encode(text)
print(f"Total tokens: {len(tokens)}")

# Create a simple dataset that chunks the tokenized text
class TextDataset(Dataset):
    def __init__(self, tokens, block_size):
        self.tokens = tokens
        self.block_size = block_size

    def __len__(self):
        return len(self.tokens) // self.block_size

    def __getitem__(self, idx):
        start = idx * self.block_size
        end = start + self.block_size
        x = torch.tensor(self.tokens[start:end], dtype=torch.long)
        y = torch.tensor(self.tokens[start + 1:end + 1], dtype=torch.long)
        return x, y


Total tokens: 311545


In [11]:
from transformers import GPT2LMHeadModel, GPT2Config

# Build a GPT2 model from scratch
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=128,
    n_ctx=128,
    n_embd=256,
    n_layer=4,
    n_head=4
)
model = GPT2LMHeadModel(config)


In [12]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

# Prepare dataset and dataloader
dataset = TextDataset(tokens, block_size=128)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-4)

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# Train manually
epochs = 20
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    for x, y in tqdm(dataloader):
        x, y = x.to(device), y.to(device)

        outputs = model(x, labels=y)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f"Loss: {loss.item():.4f}")


Epoch 1/20


  0%|          | 0/609 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
100%|██████████| 609/609 [00:09<00:00, 64.94it/s]


Loss: 7.0643
Epoch 2/20


100%|██████████| 609/609 [00:06<00:00, 87.28it/s]


Loss: 7.0157
Epoch 3/20


100%|██████████| 609/609 [00:07<00:00, 84.17it/s]


Loss: 7.1682
Epoch 4/20


100%|██████████| 609/609 [00:06<00:00, 90.35it/s]


Loss: 6.3951
Epoch 5/20


100%|██████████| 609/609 [00:07<00:00, 84.13it/s]


Loss: 5.9823
Epoch 6/20


100%|██████████| 609/609 [00:06<00:00, 88.87it/s]


Loss: 6.3657
Epoch 7/20


100%|██████████| 609/609 [00:07<00:00, 84.84it/s]


Loss: 6.1810
Epoch 8/20


100%|██████████| 609/609 [00:07<00:00, 80.00it/s]


Loss: 6.2789
Epoch 9/20


100%|██████████| 609/609 [00:06<00:00, 88.95it/s]


Loss: 6.2003
Epoch 10/20


100%|██████████| 609/609 [00:07<00:00, 78.41it/s]


Loss: 5.5586
Epoch 11/20


100%|██████████| 609/609 [00:07<00:00, 86.82it/s]


Loss: 5.8348
Epoch 12/20


100%|██████████| 609/609 [00:07<00:00, 84.93it/s]


Loss: 5.7599
Epoch 13/20


100%|██████████| 609/609 [00:07<00:00, 82.22it/s]


Loss: 5.5904
Epoch 14/20


100%|██████████| 609/609 [00:06<00:00, 88.98it/s]


Loss: 5.7515
Epoch 15/20


100%|██████████| 609/609 [00:07<00:00, 83.14it/s]


Loss: 5.7084
Epoch 16/20


100%|██████████| 609/609 [00:06<00:00, 87.45it/s]


Loss: 5.4794
Epoch 17/20


100%|██████████| 609/609 [00:07<00:00, 84.63it/s]


Loss: 5.4455
Epoch 18/20


100%|██████████| 609/609 [00:07<00:00, 83.21it/s]


Loss: 5.0566
Epoch 19/20


100%|██████████| 609/609 [00:07<00:00, 86.58it/s]


Loss: 5.5968
Epoch 20/20


100%|██████████| 609/609 [00:11<00:00, 54.91it/s]

Loss: 4.8086





In [15]:
model.save_pretrained("gpt2_morph_manual")
tokenizer.save_pretrained("gpt2_morph_manual")


('gpt2_morph_manual/tokenizer_config.json',
 'gpt2_morph_manual/special_tokens_map.json',
 'gpt2_morph_manual/vocab.json',
 'gpt2_morph_manual/merges.txt',
 'gpt2_morph_manual/added_tokens.json',
 'gpt2_morph_manual/tokenizer.json')

In [14]:
input_text =  "अब्राहम लिंकन द्वारा अमरीकी गृह युद्ध के बीच में गुलामों:"
inputs = tokenizer(input_text, return_tensors="pt")
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Set pad_token_id to eos_token_id (or define your own pad_token_id if you have one)
output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=50,
    do_sample=True,
    top_p=0.9,
    temperature=0.9,
    pad_token_id=tokenizer.eos_token_id,
    no_repeat_ngram_size=1
)

print(tokenizer.decode(output[0]))




RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)