In [1]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers
from tokenizers.normalizers import Sequence, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer


In [2]:
tokenizer = Tokenizer(models.BPE())

In [3]:
tokenizer.normalizer = Sequence([ NFD()])

In [4]:
tokenizer.pre_tokenizer = Whitespace()

In [5]:
trainer = BpeTrainer( vocab_size=12000, 
                     min_frequency=2, 
                     show_progress=True, 
                     special_tokens=["<pad>", "<unk>", "<s>", "</s>", "<mask>"], 
                     limit_alphabet=1000,
                     continuing_subword_prefix="##", 
                    )

In [6]:
files = ["wikipedia_articles_cleaned_100MB.txt"]

In [7]:
tokenizer.train(files, trainer)

In [8]:
tokenizer.save("bpe_tokenizer_100MB.json")

In [9]:
from transformers import GPT2TokenizerFast
from torch.utils.data import Dataset, DataLoader
import torch

# Load tokenizer
tokenizer = GPT2TokenizerFast(tokenizer_file="bpe_tokenizer_100MB.json")

# Load your text
with open(files[0], "r", encoding="utf-8") as f:
    text = f.read()

# Encode the full text
tokens = tokenizer.encode(text)
print(f"Total tokens: {len(tokens)}")

# Create a simple dataset that chunks the tokenized text
class TextDataset(Dataset):
    def __init__(self, tokens, block_size):
        self.tokens = tokens
        self.block_size = block_size

    def __len__(self):
        return len(self.tokens) // self.block_size

    def __getitem__(self, idx):
        start = idx * self.block_size
        end = start + self.block_size
        x = torch.tensor(self.tokens[start:end], dtype=torch.long)
        y = torch.tensor(self.tokens[start + 1:end + 1], dtype=torch.long)
        return x, y


Total tokens: 9598690


In [10]:
from transformers import GPT2LMHeadModel, GPT2Config

# Build a GPT2 model from scratch
config = GPT2Config(
    vocab_size=12_000,
    n_positions=256,
    n_ctx=256,
    n_embd=384,
    n_layer=8,
    n_head=6,
    resid_pdrop=0.1,             # Dropout for residual connections
    attn_pdrop=0.1,              # Dropout for attention layers
    embd_pdrop=0.1,              # Dropout for embeddings
    # Optimization
    activation_function="gelu",
    use_cache=False,             
)
model = GPT2LMHeadModel(config)


In [11]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

# Prepare dataset and dataloader
dataset = TextDataset(tokens, block_size=256)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-4, weight_decay=0.01)

# Gradient clipping to avoid explosions
max_grad_norm = 1.0

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# Train manually
epochs = 20
for epoch in range(epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
    
    for x, y in progress_bar:
        x, y = x.to(device), y.to(device)
        
        outputs = model(x, labels=y)
        loss = outputs.loss
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Clip gradients
        optimizer.step()
        optimizer.zero_grad()
        
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    
    avg_train_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f}")



Epoch 1/20:   0%|                                                                             | 0/1171 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Epoch 1/20: 100%|███████████████████████████████████████████████████████| 1171/1171 [02:30<00:00,  7.76it/s, loss=6.49]


Epoch 1 | Train Loss: 6.7915


Epoch 2/20: 100%|███████████████████████████████████████████████████████| 1171/1171 [02:32<00:00,  7.68it/s, loss=6.21]


Epoch 2 | Train Loss: 6.3241


Epoch 3/20: 100%|███████████████████████████████████████████████████████| 1171/1171 [02:32<00:00,  7.69it/s, loss=5.85]


Epoch 3 | Train Loss: 6.1110


Epoch 4/20: 100%|███████████████████████████████████████████████████████| 1171/1171 [02:32<00:00,  7.67it/s, loss=5.89]


Epoch 4 | Train Loss: 5.9533


Epoch 5/20: 100%|███████████████████████████████████████████████████████| 1171/1171 [02:32<00:00,  7.68it/s, loss=5.81]


Epoch 5 | Train Loss: 5.8283


Epoch 6/20: 100%|███████████████████████████████████████████████████████| 1171/1171 [02:32<00:00,  7.68it/s, loss=5.65]


Epoch 6 | Train Loss: 5.7240


Epoch 7/20: 100%|███████████████████████████████████████████████████████| 1171/1171 [02:32<00:00,  7.68it/s, loss=5.61]


Epoch 7 | Train Loss: 5.6356


Epoch 8/20: 100%|███████████████████████████████████████████████████████| 1171/1171 [02:32<00:00,  7.67it/s, loss=5.78]


Epoch 8 | Train Loss: 5.5592


Epoch 9/20: 100%|███████████████████████████████████████████████████████| 1171/1171 [02:32<00:00,  7.68it/s, loss=5.42]


Epoch 9 | Train Loss: 5.4920


Epoch 10/20: 100%|██████████████████████████████████████████████████████| 1171/1171 [02:32<00:00,  7.67it/s, loss=5.53]


Epoch 10 | Train Loss: 5.4325


Epoch 11/20: 100%|███████████████████████████████████████████████████████| 1171/1171 [02:33<00:00,  7.65it/s, loss=5.5]


Epoch 11 | Train Loss: 5.3795


Epoch 12/20: 100%|██████████████████████████████████████████████████████| 1171/1171 [02:32<00:00,  7.66it/s, loss=5.38]


Epoch 12 | Train Loss: 5.3311


Epoch 13/20: 100%|██████████████████████████████████████████████████████| 1171/1171 [02:32<00:00,  7.66it/s, loss=5.25]


Epoch 13 | Train Loss: 5.2881


Epoch 14/20: 100%|██████████████████████████████████████████████████████| 1171/1171 [02:32<00:00,  7.66it/s, loss=5.35]


Epoch 14 | Train Loss: 5.2483


Epoch 15/20: 100%|██████████████████████████████████████████████████████| 1171/1171 [02:32<00:00,  7.67it/s, loss=5.29]


Epoch 15 | Train Loss: 5.2125


Epoch 16/20: 100%|██████████████████████████████████████████████████████| 1171/1171 [02:32<00:00,  7.67it/s, loss=5.45]


Epoch 16 | Train Loss: 5.1785


Epoch 17/20: 100%|██████████████████████████████████████████████████████| 1171/1171 [02:32<00:00,  7.69it/s, loss=5.14]


Epoch 17 | Train Loss: 5.1483


Epoch 18/20: 100%|██████████████████████████████████████████████████████| 1171/1171 [02:32<00:00,  7.69it/s, loss=5.21]


Epoch 18 | Train Loss: 5.1203


Epoch 19/20: 100%|██████████████████████████████████████████████████████| 1171/1171 [02:32<00:00,  7.68it/s, loss=5.33]


Epoch 19 | Train Loss: 5.0939


Epoch 20/20: 100%|██████████████████████████████████████████████████████| 1171/1171 [02:32<00:00,  7.70it/s, loss=5.04]

Epoch 20 | Train Loss: 5.0695





In [12]:
model.save_pretrained("gpt2_morph_manual_100MB")
tokenizer.save_pretrained("gpt2_morph_manual_100MB")

('gpt2_morph_manual_100MB\\tokenizer_config.json',
 'gpt2_morph_manual_100MB\\special_tokens_map.json',
 'gpt2_morph_manual_100MB\\vocab.json',
 'gpt2_morph_manual_100MB\\merges.txt',
 'gpt2_morph_manual_100MB\\added_tokens.json',
 'gpt2_morph_manual_100MB\\tokenizer.json')

In [13]:
input_text = "अब्राहम लिंकन द्वारा अमरीकी गृह युद्ध के बीच में गुलामों:"

# Tokenize and move tensors to the same device as the model
inputs = tokenizer(input_text, return_tensors="pt").to(device)  # Add `.to(device)`

# Generate text
output = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=50,
    do_sample=True,
    top_p=0.9,
    temperature=0.9,
    pad_token_id=tokenizer.eos_token_id,  # Ensure pad_token_id is set
    no_repeat_ngram_size=1
)

print(tokenizer.decode(output[0], skip_special_tokens=True))

अब ##्राह ##म लिंक ##न द्वारा अमरीकी गृह युद्ध के बीच में गुलाम ##ों आक्रमण समय की से रही और रूप कई ##ियों संगठित गया अमेरिका एक पैमाने आतंकवाद अंत ##ष्ट का करने है इसे पर ##ाया था बाद इस ने दिया । तक दक्षिण एशिया मध्य शांति लिए ##वाद
