In [None]:
!pip install torch
!pip install pymupdf
!pip install transformers
!pip install peft
!pip install bitsandbytes



In [None]:
import math
import re
import warnings
from collections import Counter

import fitz
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim as option  # legacy alias kept for compatibility; code below uses `optim`
from peft import LoraConfig, get_peft_model, TaskType
from transformers import BitsAndBytesConfig

# Custom Tokenizer

In [None]:
class CustomTokenizer:
    """Word-level tokenizer with a frequency-capped vocabulary.

    Text is lower-cased and split into runs of word characters plus the
    punctuation marks ``. , ! ? ;``. Ids 0-3 are reserved for the special
    tokens ``<pad>``, ``<unk>``, ``<bos>`` and ``<eos>``.

    Attributes:
        vocab_size: Upper bound on the vocabulary size, special tokens included.
        vocab: Mapping from token string to integer id.
        rev_vocab: Mapping from integer id back to token string.
    """

    # Reserved tokens and their fixed ids.
    SPECIAL_TOKENS = {"<pad>": 0, "<unk>": 1, "<bos>": 2, "<eos>": 3}
    # Single source of truth so build_vocab() and encode() always split text identically.
    TOKEN_PATTERN = r'\w+|[.,!?;]'

    def __init__(self, vocab_size=8000):
        self.vocab_size = vocab_size
        self.vocab = dict(self.SPECIAL_TOKENS)
        # Built eagerly so decode() is usable before build_vocab() has been called.
        self.rev_vocab = {idx: token for token, idx in self.vocab.items()}

    def _tokenize(self, text):
        """Lower-case ``text`` and split it into word / punctuation tokens."""
        return re.findall(self.TOKEN_PATTERN, text.lower())

    def build_vocab(self, text):
        """Rebuild the vocabulary from ``text``.

        The most frequent tokens receive ids 4, 5, ... in descending frequency
        order until ``vocab_size`` entries (special tokens included) exist.
        Each call starts again from the special tokens, so calling it more than
        once never yields two tokens sharing one id.
        """
        counts = Counter(self._tokenize(text))
        budget = max(self.vocab_size - len(self.SPECIAL_TOKENS), 0)
        vocab = dict(self.SPECIAL_TOKENS)
        for token, _ in counts.most_common(budget):
            vocab[token] = len(vocab)
        self.vocab = vocab
        self.rev_vocab = {idx: token for token, idx in vocab.items()}

    def encode(self, text):
        """Return the list of token ids for ``text``; unknown tokens map to ``<unk>``."""
        unk_id = self.vocab["<unk>"]
        return [self.vocab.get(token, unk_id) for token in self._tokenize(text)]

    def decode(self, indices):
        """Join the tokens for ``indices`` with single spaces; unknown ids become ``<unk>``."""
        return ' '.join(self.rev_vocab.get(idx, "<unk>") for idx in indices)

# Load Data and process

In [None]:
def load_pdf_text(pdf_path):
    """Return the plain text of every page in the document at ``pdf_path``.

    The document is opened with PyMuPDF (``fitz``), each page is extracted in
    ``"text"`` mode, and the page texts are joined with a single space.

    Args:
        pdf_path: Path of the document to open.

    Returns:
        One string containing the text of all pages.
    """
    # The context manager closes the document handle even if extraction fails.
    with fitz.open(pdf_path) as doc:
        return " ".join(page.get_text("text") for page in doc)

In [None]:
####for both small and big data

# class RelativePositionalEncoding(nn.Module):
#     def __init__(self, d_model, max_len):
#         super().__init__()
#         self.relative_positions = nn.Parameter(torch.randn(max_len, d_model))

#     def forward(self, positions):
#         return self.relative_positions[positions]

# rpe = RelativePositionalEncoding(d_model=256, max_len=100)
# positions = torch.arange(0, 10)
# encoded_positions = rpe(positions)
# print(encoded_positions.shape)

In [None]:
#### for medium type data

# def rotary_embedding(x, theta=10000):
#     seq_len, dim = x.shape
#     freqs = torch.pow(theta, -torch.arange(0, dim, 2) / dim)
#     angles = torch.arange(seq_len).unsqueeze(1) * freqs.unsqueeze(0)
#     x_rotated = torch.cat([x[:, 0::2] * torch.cos(angles) - x[:, 1::2] * torch.sin(angles),
#                            x[:, 0::2] * torch.sin(angles) + x[:, 1::2] * torch.cos(angles)], dim=-1)
#     return x_rotated

# x = torch.randn(10, 512)
# x_rope = rotary_embedding(x)
# print(x_rope.shape)

In [None]:
##### for small data

class LearnedPositionalEncoding(nn.Module):
    """Trainable lookup table that maps position indices to ``d_model``-sized vectors."""

    def __init__(self, max_len, d_model):
        super().__init__()
        # One learnable row per position index in [0, max_len).
        self.embedding = nn.Embedding(num_embeddings=max_len, embedding_dim=d_model)

    def forward(self, positions):
        """Look up the embedding for each entry of ``positions``.

        ``positions`` is cast to an integer tensor first, so float-valued
        indices are accepted.
        """
        position_ids = positions.long()
        return self.embedding(position_ids)

# Transformer

RNNs are a good fit for real-time and small-data processing.

The rest of the notebook uses a causal Transformer.

# RNN-based Model (Using GRU Instead of Transformer)

In [None]:
# class RNNLanguageModel(nn.Module):
#     def __init__(self, vocab_size, embed_dim=256, hidden_dim=512, num_layers=2):
#         super().__init__()
#         self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
#         self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True)
#         self.fc = nn.Linear(hidden_dim, vocab_size)

#     def forward(self, x):
#         x = self.embedding(x)
#         x, _ = self.gru(x)  # Using GRU
#
#         return self.fc(x)

# RNN-based Model (Using LSTM Instead of Transformer)

In [None]:
# class RNNLanguageModel(nn.Module):
#     def __init__(self, vocab_size, embed_dim=256, hidden_dim=512, num_layers=2):
#         super().__init__()
#         self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
#         self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True)
#         self.fc = nn.Linear(hidden_dim, vocab_size)

#     def forward(self, x):
#         x = self.embedding(x)
#         x, _ = self.lstm(x)

#         return self.fc(x)

# Causal Transformer

In [None]:
class CausalTransformer(nn.Module):
    """Language model built from ``nn.TransformerEncoder`` layers with a causal mask.

    Token embeddings (id 0 is the padding index) are summed with learned
    positional embeddings, passed through the encoder stack under an
    upper-triangular attention mask, and projected to vocabulary logits.
    """

    def __init__(self, vocab_size, embed_dim=256, num_heads=8, hidden_dim=512, num_layers=6, max_len=1000):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # Position table has max_len rows; longer inputs fall outside it.
        self.pos_encoder = LearnedPositionalEncoding(max_len, embed_dim)
        block = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(block, num_layers=num_layers)
        self.lm_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Map a batch-first tensor of token ids to per-position vocabulary logits."""
        seq_len = x.size(1)
        device = x.device

        # Positions 0..seq_len-1, with a leading axis so they broadcast over the batch.
        positions = torch.arange(0, seq_len, device=device).unsqueeze(0)
        hidden = self.embedding(x) + self.pos_encoder(positions)

        # True strictly above the diagonal: a position cannot attend to later ones.
        causal_mask = torch.triu(torch.ones(seq_len, seq_len, device=device), diagonal=1).bool()
        hidden = self.transformer(hidden, mask=causal_mask)
        return self.lm_head(hidden)

# Applying LoRA

In [None]:
def apply_lora(model):
    """Wrap ``model`` with LoRA adapters on its ``lm_head`` module.

    Uses rank 8, alpha 16 and dropout 0.05.

    Args:
        model: The module to adapt; it must contain a submodule named ``lm_head``.

    Returns:
        The PEFT-wrapped model returned by ``get_peft_model``.
    """
    # PEFT's causal-LM wrapper relies on the Hugging Face generation interface
    # (`prepare_inputs_for_generation`). A plain nn.Module such as this
    # notebook's CausalTransformer does not provide it, so fall back to the
    # generic wrapper (task_type=None), which forwards calls unchanged.
    # NOTE(review): confirm against the installed PEFT version.
    is_hf_causal_lm = hasattr(model, "prepare_inputs_for_generation")
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM if is_hf_causal_lm else None,
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        target_modules=["lm_head"]
    )
    return get_peft_model(model, lora_config)

# Applying QLoRA

In [None]:
def apply_qlora(model):
    """Move ``model`` to the available device and attach LoRA adapters to ``lm_head``.

    The previous version passed a ``BitsAndBytesConfig`` to ``get_peft_model``.
    That function expects a PEFT config (for example ``LoraConfig``), so the
    call could not succeed. A ``BitsAndBytesConfig`` is a load-time quantization
    setting and is not something ``get_peft_model`` can apply to an existing
    module. This function therefore applies only the LoRA part and warns that
    the weights are left unquantized.

    Args:
        model: The module to adapt; it must contain a submodule named ``lm_head``.

    Returns:
        The PEFT-wrapped model returned by ``get_peft_model``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    warnings.warn(
        "apply_qlora: no 4-bit quantization is performed on the given model; "
        "only LoRA adapters are attached.",
        stacklevel=2,
    )

    # task_type is left unset so the generic PEFT wrapper is used, which works
    # for plain nn.Module models as well.
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        target_modules=["lm_head"]
    )
    return get_peft_model(model, lora_config)

# Training Function with Accuracy

In [None]:
def train_model(model, dataset, epochs=500, batch_size=4, lr=3e-5, max_len=100):
    """Train ``model`` for next-token prediction on ``dataset`` using full-batch steps.

    Every epoch runs one optimizer step over the whole ``dataset``: inputs are
    all columns but the last, targets are all columns but the first. Token id 0
    is treated as padding and excluded from both the loss and the accuracy.

    Args:
        model: Module mapping a ``(batch, seq)`` tensor of token ids to
            ``(batch, seq, vocab)`` logits.
        dataset: 2-D integer tensor of token ids with at least two columns.
        epochs: Number of optimizer steps.
        batch_size: Accepted for interface compatibility; currently unused
            (the whole dataset is one batch).
        lr: Initial AdamW learning rate; multiplied by 0.9 every 10 epochs.
        max_len: Accepted for interface compatibility; currently unused.

    Returns:
        A list with one dict per epoch holding ``"loss"``, ``"perplexity"`` and
        ``"accuracy"`` (a fraction in [0, 1]). The same values are printed.

    Raises:
        ValueError: If ``dataset`` is not 2-D with at least two columns, or if
            every target token is padding (the loss would be NaN).
    """
    if dataset.dim() != 2 or dataset.size(1) < 2:
        raise ValueError("dataset must be a 2-D tensor with at least two columns")

    # Keep the data on the same device as the model so GPU models work as-is.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        dataset = dataset.to(first_param.device)

    # Loop-invariant: the same shifted views are used in every epoch.
    inputs = dataset[:, :-1]
    targets = dataset[:, 1:]
    valid = targets != 0
    n_valid = int(valid.sum().item())
    if n_valid == 0:
        raise ValueError("dataset contains no non-padding target tokens")

    model.train()
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.9)
    criterion = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.1)

    history = []
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.reshape(-1, outputs.size(-1)), targets.reshape(-1))

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        # Metrics are computed from the pre-step logits and need no gradient.
        with torch.no_grad():
            predictions = torch.argmax(outputs, dim=-1)
            n_correct = ((predictions == targets) & valid).sum().item()
        accuracy = n_correct / n_valid

        # Perplexity of the (label-smoothed) training loss; guard against overflow.
        loss_value = loss.item()
        perplexity = math.exp(loss_value) if loss_value < 300 else float('inf')

        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss_value:.4f}, Perplexity: {perplexity:.4f}, Accuracy: {accuracy * 100:.2f}%")
        history.append({"loss": loss_value, "perplexity": perplexity, "accuracy": accuracy})

    return history

# Generate Response

In [None]:
def generate_response(model, tokenizer, prompt, max_len=50, temperature=0.8, top_k=10):
    """Sample a continuation of ``prompt`` from ``model`` with top-k sampling.

    The prompt is encoded, ``<bos>`` is appended, and tokens are sampled one at
    a time from the ``top_k`` most probable candidates. Generation stops after
    ``max_len`` tokens or as soon as ``<eos>`` is sampled.

    Args:
        model: Module mapping ``(1, seq)`` token ids to ``(1, seq, vocab)`` logits.
        tokenizer: Object exposing ``vocab`` (with ``<bos>`` / ``<eos>``),
            ``encode(text)`` and ``decode(ids)``.
        prompt: Text to condition on.
        max_len: Maximum number of tokens to generate.
        temperature: Softmax temperature; must be greater than zero.
        top_k: Number of candidates to sample from; capped at the vocabulary size.

    Returns:
        The decoded generated tokens (prompt and ``<eos>`` excluded).

    Raises:
        ValueError: If ``temperature`` is not greater than zero.
    """
    if temperature <= 0:
        raise ValueError("temperature must be greater than zero")

    model.eval()

    # Build inputs on the model's device so generation also works on GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    eos_id = tokenizer.vocab["<eos>"]
    tokens = tokenizer.encode(prompt) + [tokenizer.vocab["<bos>"]]
    input_tensor = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)

    response = []
    with torch.no_grad():
        for _ in range(max_len):
            last_logits = model(input_tensor)[:, -1, :]
            probs = torch.nn.functional.softmax(last_logits / temperature, dim=-1)

            # torch.topk raises if k exceeds the vocabulary size, so cap it.
            k = max(1, min(top_k, probs.size(-1)))
            top_k_probs, top_k_indices = torch.topk(probs, k)

            # multinomial normalizes the weights, so no renormalization is needed.
            choice = torch.multinomial(top_k_probs, num_samples=1)
            next_token = top_k_indices[0, choice[0, 0]].item()
            if next_token == eos_id:
                break

            response.append(next_token)
            next_tensor = torch.tensor([[next_token]], dtype=torch.long, device=device)
            input_tensor = torch.cat([input_tensor, next_tensor], dim=1)
    return tokenizer.decode(response)

In [None]:
# Source document and the tokenizer fitted on its text.
pdf_path = "/content/drive/MyDrive/Bnagla.txt"
pdf_text = load_pdf_text(pdf_path)

tokenizer = CustomTokenizer()
tokenizer.build_vocab(pdf_text)
vocab_size = len(tokenizer.vocab)  # special tokens included

# Model hyperparameters, passed positionally to CausalTransformer further down.
embed_dim, num_heads, hidden_dim, num_layers = 256, 8, 512, 6
# Training-sequence length; also the size of the positional-embedding table.
max_len = 100

# Apply LoRA or QLoRA

In [None]:
# model = CausalTransformer(vocab_size, embed_dim, num_heads, hidden_dim, num_layers, max_len)

# use_qlora = True
# if use_qlora:
#     model = apply_qlora(model)
# else:
#     model = apply_lora(model)

# dataset = torch.tensor([tokenizer.encode(pdf_text)[:max_len] + [0] * (max_len - len(tokenizer.encode(pdf_text)[:max_len]))], dtype=torch.long)

# train_model(model, dataset)

# while True:
#     user_input = input("You: ")
#     if user_input.lower() in ["exit", "quit"]:
#         break
#     response = generate_response(model, tokenizer, user_input)
#     print("Bot:", response)

In [None]:
model = CausalTransformer(vocab_size, embed_dim, num_heads, hidden_dim, num_layers, max_len)

# Encode the corpus once (the original encoded the full text twice), keep the
# first max_len tokens, and right-pad with <pad> up to max_len.
token_ids = tokenizer.encode(pdf_text)[:max_len]
pad_id = tokenizer.vocab["<pad>"]
token_ids = token_ids + [pad_id] * (max_len - len(token_ids))
# A single training sequence, shaped (1, max_len).
dataset = torch.tensor([token_ids], dtype=torch.long)

train_model(model, dataset)

# Simple chat loop; type "exit" or "quit" to stop.
while True:
    user_input = input("You: ")
    if user_input.lower() in ["exit", "quit"]:
        break
    response = generate_response(model, tokenizer, user_input)
    print("Bot:", response)

Epoch 1/500, Loss: 6.9726, Perplexity: 1067.0274, Accuracy: 0.00%
Epoch 2/500, Loss: 6.8762, Perplexity: 968.9681, Accuracy: 0.00%
Epoch 3/500, Loss: 6.8447, Perplexity: 938.9026, Accuracy: 0.00%
Epoch 4/500, Loss: 6.7766, Perplexity: 877.0911, Accuracy: 0.00%
Epoch 5/500, Loss: 6.6941, Perplexity: 807.6463, Accuracy: 0.00%
Epoch 6/500, Loss: 6.6331, Perplexity: 759.8420, Accuracy: 3.03%
Epoch 7/500, Loss: 6.5684, Perplexity: 712.2239, Accuracy: 3.03%
Epoch 8/500, Loss: 6.5222, Perplexity: 680.0444, Accuracy: 3.03%
Epoch 9/500, Loss: 6.4527, Perplexity: 634.4254, Accuracy: 6.06%
Epoch 10/500, Loss: 6.4081, Perplexity: 606.7500, Accuracy: 8.08%
Epoch 11/500, Loss: 6.3421, Perplexity: 567.9719, Accuracy: 11.11%
Epoch 12/500, Loss: 6.3027, Perplexity: 546.0463, Accuracy: 11.11%
Epoch 13/500, Loss: 6.2508, Perplexity: 518.4042, Accuracy: 12.12%
Epoch 14/500, Loss: 6.2085, Perplexity: 496.9784, Accuracy: 12.12%
Epoch 15/500, Loss: 6.1483, Perplexity: 467.9385, Accuracy: 12.12%
Epoch 16/500,