In [1]:
!pip install datasets



In [2]:
from datasets import load_dataset
dataset = load_dataset("squad")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pandas as pd
import numpy as np
from collections import Counter
import re

In [29]:
train_data = dataset["train"]
val_data = dataset["validation"]

In [30]:
# Print the first sample
sample = train_data[0]
for key, value in sample.items():
    print(f"{key.upper()}:\n{value}\n")



ID:
5733be284776f41900661182

TITLE:
University_of_Notre_Dame

CONTEXT:
Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.

QUESTION:
To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?

ANSWERS:
{'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}



In [31]:
context = sample["context"]
answer_text = sample["answers"]["text"][0]
answer_start = sample["answers"]["answer_start"][0]

# Extract substring from context and compare
extracted_text = context[answer_start : answer_start + len(answer_text)]

print("ANSWER TEXT:", answer_text)
print("EXTRACTED:", extracted_text)
print("Match:", answer_text == extracted_text)


ANSWER TEXT: Saint Bernadette Soubirous
EXTRACTED: Saint Bernadette Soubirous
Match: True


In [32]:
unique_titles = set([sample["title"] for sample in train_data])
print(f"Unique topics in training set: {len(unique_titles)}")


Unique topics in training set: 442


In [33]:


answer_counts = Counter([len(sample["answers"]["text"]) for sample in train_data])
print(answer_counts)


Counter({1: 87599})


In [34]:
#Basic Tokenizor
def simple_tokenizer(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    return text.split()


In [35]:
def build_vocab(data, min_freq=2):
    counter = Counter()
    for sample in data:
        tokens = simple_tokenizer(sample["context"]) + simple_tokenizer(sample["question"])
        counter.update(tokens)

    vocab = {"[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3}
    index = 4
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = index
            index += 1
    return vocab

vocab = build_vocab(train_data)


In [36]:
def tokens_to_ids(tokens, vocab):
    return [vocab.get(token, vocab["[UNK]"]) for token in tokens]


In [37]:
def encode_sample(sample, vocab, max_len=384):
    question = sample["question"]
    context = sample["context"]
    answer_text = sample["answers"]["text"][0]
    answer_start_char = sample["answers"]["answer_start"][0]

    question_tokens = simple_tokenizer(question)
    context_tokens = simple_tokenizer(context)

    # Combine and add special tokens
    input_tokens = ["[CLS]"] + question_tokens + ["[SEP]"] + context_tokens + ["[SEP]"]
    token_ids = tokens_to_ids(input_tokens, vocab)
    attention_mask = [1] * len(token_ids)

    # Padding
    pad_len = max_len - len(token_ids)
    if pad_len > 0:
        token_ids += [vocab["[PAD]"]] * pad_len
        attention_mask += [0] * pad_len
    else:
        token_ids = token_ids[:max_len]
        attention_mask = attention_mask[:max_len]

    # Approximate start/end positions in token space
    answer_start_words = len(simple_tokenizer(context[:answer_start_char]))
    answer_end_words = len(simple_tokenizer(context[:answer_start_char + len(answer_text)])) - 1

    context_start_idx = len(question_tokens) + 2
    start_pos = min(context_start_idx + answer_start_words, max_len - 1)
    end_pos = min(context_start_idx + answer_end_words, max_len - 1)

    return {
        "input_ids": token_ids,
        "attention_mask": attention_mask,
        "start_position": start_pos,
        "end_position": end_pos,
        "tokens": input_tokens
    }


In [38]:
# encoded_train = [encode_sample(sample, vocab) for sample in train_data.select(range(1000))]
encoded_train = [encode_sample(sample, vocab) for sample in train_data]


In [39]:
sample = encoded_train[0]
print("Tokens:", sample["tokens"][:40])
print("Start Pos:", sample["start_position"])
print("End Pos:", sample["end_position"])
print("Answer Tokens:", sample["tokens"][sample["start_position"]:sample["end_position"]+1])


Tokens: ['[CLS]', 'to', 'whom', 'did', 'the', 'virgin', 'mary', 'allegedly', 'appear', 'in', '1858', 'in', 'lourdes', 'france', '[SEP]', 'architecturally', 'the', 'school', 'has', 'a', 'catholic', 'character', 'atop', 'the', 'main', 'buildings', 'gold', 'dome', 'is', 'a', 'golden', 'statue', 'of', 'the', 'virgin', 'mary', 'immediately', 'in', 'front', 'of']
Start Pos: 105
End Pos: 107
Answer Tokens: ['saint', 'bernadette', 'soubirous']


In [40]:
#Hello

#Transformer Buildings and Embeddings

In [41]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


In [42]:
class PositionalEncoding(nn.Module):
    def __init__(self, max_len, d_model):
        super().__init__()
        self.pos_embed = nn.Embedding(max_len, d_model)

    def forward(self, x):
        batch_size, seq_len = x.size(0), x.size(1)
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0).expand(batch_size, seq_len)
        return self.pos_embed(positions)


In [43]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        self.qkv_proj = nn.Linear(d_model, d_model * 3)
        self.out_proj = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        batch_size, seq_len, d_model = x.size()
        qkv = self.qkv_proj(x)
        q, k, v = qkv.chunk(3, dim=-1)

        # Reshape for heads
        def reshape(x):  # (B, S, D) → (B, H, S, D//H)
            return x.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        q = reshape(q)
        k = reshape(k)
        v = reshape(v)

        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None:
            scores = scores.masked_fill(mask[:, None, None, :] == 0, float('-inf'))

        attn = torch.softmax(scores, dim=-1)
        out = torch.matmul(attn, v)  # (B, H, S, D//H)
        out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)
        return self.out_proj(out)


In [59]:
class FeedForward(nn.Module):
    def __init__(self, d_model, ff_dim):
        super().__init__()
        self.linear1 = nn.Linear(d_model, ff_dim)
        self.linear2 = nn.Linear(ff_dim, d_model)

    def forward(self, x):
        return self.linear2(F.relu(self.linear1(x)))


In [58]:
class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, ff_dim)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_out = self.attn(x, mask)
        x = self.norm1(x + self.dropout(attn_out))
        ff_out = self.ff(x)
        x = self.norm2(x + self.dropout(ff_out))
        return x


In [60]:
class LearnGPT(nn.Module):
    def __init__(self, vocab_size, d_model=768, num_heads=8, ff_dim=512, num_layers=4, max_len=384):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = PositionalEncoding(max_len, d_model)

        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(d_model, num_heads, ff_dim) for _ in range(num_layers)
        ])

        self.qa_outputs = nn.Linear(d_model, 2)  # for start and end logits

    def forward(self, input_ids, attention_mask=None):
        x = self.token_embed(input_ids) + self.pos_embed(input_ids)

        for block in self.transformer_blocks:
            x = block(x, attention_mask)

        logits = self.qa_outputs(x)  # (B, S, 2)
        start_logits, end_logits = logits.split(1, dim=-1)
        return start_logits.squeeze(-1), end_logits.squeeze(-1)


In [61]:
def compute_loss(start_logits, end_logits, start_pos, end_pos):
    loss_fct = nn.CrossEntropyLoss()
    loss_start = loss_fct(start_logits, start_pos)
    loss_end = loss_fct(end_logits, end_pos)
    return (loss_start + loss_end) / 2


In [62]:
vocab_size = len(vocab)
model = LearnGPT(vocab_size)

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


In [None]:
#PHASE 4 TRAINING

In [63]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=1e-5)


In [64]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    return {
        "input_ids": torch.tensor([x["input_ids"] for x in batch]),
        "attention_mask": torch.tensor([x["attention_mask"] for x in batch]),
        "start_positions": torch.tensor([x["start_position"] for x in batch]),
        "end_positions": torch.tensor([x["end_position"] for x in batch])
    }

train_loader = DataLoader(encoded_train, batch_size=8, shuffle=True, collate_fn=collate_fn)


In [66]:
from transformers import get_scheduler

num_epochs = 10
num_training_steps = len(train_loader) * num_epochs

lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


In [67]:
best_val_loss = float('inf')  # Initialize with a very high value
patience = 3  # Number of epochs to wait before stopping
epochs_without_improvement = 0

In [57]:
from tqdm import tqdm

model.train()

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    total_loss = 0

    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_pos = batch['start_positions'].to(device)
        end_pos = batch['end_positions'].to(device)

        optimizer.zero_grad()

        start_logits, end_logits = model(input_ids, attention_mask)

        loss = compute_loss(start_logits, end_logits, start_pos, end_pos)
        loss.backward()
        optimizer.step()

        if 'lr_scheduler' in locals():
            lr_scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} Loss: {avg_loss:.4f}")


Epoch 1/10


100%|██████████| 10950/10950 [01:14<00:00, 147.37it/s]


Epoch 1 Loss: 4.3537
Epoch 2/10


100%|██████████| 10950/10950 [01:13<00:00, 148.81it/s]


Epoch 2 Loss: 4.2335
Epoch 3/10


 38%|███▊      | 4208/10950 [00:26<00:42, 158.20it/s]


KeyboardInterrupt: 

In [68]:
def create_val_loader(val_data, vocab, batch_size=8):
  encoded_val = [encode_sample(sample, vocab) for sample in val_data]
  val_loader = DataLoader(encoded_val, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
  return val_loader



In [69]:
val_loader = create_val_loader(val_data, vocab)

In [70]:
print(val_loader)

<torch.utils.data.dataloader.DataLoader object at 0x00000193B4124EE0>


In [71]:
from tqdm import tqdm

model.train()

# Early stopping parameters
best_val_loss = float('inf')
patience = 3
epochs_without_improvement = 0

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    total_loss = 0

    # Training loop
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_pos = batch['start_positions'].to(device)
        end_pos = batch['end_positions'].to(device)

        optimizer.zero_grad()

        start_logits, end_logits = model(input_ids, attention_mask)

        loss = compute_loss(start_logits, end_logits, start_pos, end_pos)
        loss.backward()
        optimizer.step()

        if 'lr_scheduler' in locals():
            lr_scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} Loss: {avg_loss:.4f}")

    # Validation step
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_pos = batch['start_positions'].to(device)
            end_pos = batch['end_positions'].to(device)

            start_logits, end_logits = model(input_ids, attention_mask)
            loss = compute_loss(start_logits, end_logits, start_pos, end_pos)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1} Validation Loss: {avg_val_loss:.4f}")

    # Early stopping check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print("Early stopping triggered.")
            break

Epoch 1/10


100%|██████████| 10950/10950 [07:38<00:00, 23.86it/s]


Epoch 1 Loss: 4.3781


100%|██████████| 1322/1322 [00:12<00:00, 102.92it/s]


Epoch 1 Validation Loss: 4.3330
Epoch 2/10


100%|██████████| 10950/10950 [07:39<00:00, 23.81it/s]


Epoch 2 Loss: 4.1779


100%|██████████| 1322/1322 [00:12<00:00, 103.15it/s]


Epoch 2 Validation Loss: 4.2278
Epoch 3/10


100%|██████████| 10950/10950 [07:37<00:00, 23.91it/s]


Epoch 3 Loss: 3.9813


100%|██████████| 1322/1322 [00:12<00:00, 103.47it/s]


Epoch 3 Validation Loss: 4.1231
Epoch 4/10


100%|██████████| 10950/10950 [07:42<00:00, 23.67it/s]


Epoch 4 Loss: 3.8042


100%|██████████| 1322/1322 [00:12<00:00, 103.46it/s]


Epoch 4 Validation Loss: 4.1058
Epoch 5/10


100%|██████████| 10950/10950 [07:36<00:00, 23.97it/s]


Epoch 5 Loss: 3.6560


100%|██████████| 1322/1322 [00:12<00:00, 102.57it/s]


Epoch 5 Validation Loss: 4.1489
Epoch 6/10


100%|██████████| 10950/10950 [07:38<00:00, 23.88it/s]


Epoch 6 Loss: 3.5210


100%|██████████| 1322/1322 [00:12<00:00, 102.78it/s]


Epoch 6 Validation Loss: 4.1631
Epoch 7/10


100%|██████████| 10950/10950 [07:38<00:00, 23.86it/s]


Epoch 7 Loss: 3.4002


100%|██████████| 1322/1322 [00:12<00:00, 103.38it/s]

Epoch 7 Validation Loss: 4.2845
Early stopping triggered.





In [None]:
#Evaluation + Testing

In [113]:
sample = train_data[99]  # any sample you like #3046 #2047 #9067 #3400 #3500 #3800 #4000 #5000
encoded = encode_sample(sample, vocab)

input_ids = torch.tensor([encoded["input_ids"]]).to(device)
attention_mask = torch.tensor([encoded["attention_mask"]]).to(device)


In [114]:
model.eval()
with torch.no_grad():
    start_logits, end_logits = model(input_ids, attention_mask=attention_mask)

start_idx = torch.argmax(start_logits, dim=1).item()
end_idx = torch.argmax(end_logits, dim=1).item()


In [115]:
tokens = encoded["tokens"]  # This is your `[CLS] question [SEP] context [SEP]` token list

# Fix case where model predicts end < start    1078  99  32
if end_idx < start_idx:
    end_idx = start_idx

predicted_tokens = tokens[start_idx:end_idx+1]
predicted_answer = " ".join(predicted_tokens).replace(" ##", "")

print("Question:", sample["question"])
print("True Answer:", sample["answers"]["text"][0])
print("Predicted Answer:", predicted_answer)


Question: What was the amount of wins Knute Rockne attained at Notre Dame while head coach?
True Answer: 105
Predicted Answer: 105


In [78]:
import torch

# Save model
model_path = "LGM.pt"
torch.save(model.state_dict(), model_path)


In [79]:
from google.colab import files
files.download("LGM.pt")


ModuleNotFoundError: No module named 'google'

In [None]:
import torch
from transformers import BertModel, BertTokenizerFast

# Load tokenizer (ensure it's the same as used during training)
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Define the model architecture (must match the saved model)
class LearnGPT(torch.nn.Module):
    def __init__(self, vocab_size):
        super(LearnGPT, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.qa_outputs = torch.nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        return start_logits.squeeze(-1), end_logits.squeeze(-1)

# Load the saved model
vocab_size = tokenizer.vocab_size  # Ensure vocab_size matches training
model = LearnGPT(vocab_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

model.load_state_dict(torch.load("best_model.pt", map_location=device)) # Load the model's state dictionary
model.eval() # Set the model to evaluation mode

print("Model loaded successfully!")

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Training Loss', marker='o')
plt.plot(val_losses, label='Validation Loss', marker='o')
plt.title("Training vs Validation Loss")
plt.xlabel("Epoch")
plt.xticks(range(len(train_losses)), range(1, len(train_losses) + 1))  # Set x-axis ticks to represent epochs
plt.ylabel("Loss")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
from sklearn.metrics import f1_score

def exact_match(predicted, true):
    # Exact match = if they are exactly equal
    return int(predicted == true)

def f1(predicted, true):
    # Convert answers to token sets for F1 calculation
    pred_tokens = set(predicted.split())
    true_tokens = set(true.split())

    # Calculate overlap
    intersection = len(pred_tokens.intersection(true_tokens))
    if intersection == 0:
        return 0
    precision = intersection / len(pred_tokens)
    recall = intersection / len(true_tokens)
    return 2 * (precision * recall) / (precision + recall)


In [None]:
def evaluate_model(model, data_loader, device):
    model.eval()
    total_em = 0
    total_f1 = 0
    total_samples = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            start_positions = batch["start_positions"].to(device)
            end_positions = batch["end_positions"].to(device)

            start_logits, end_logits = model(input_ids, attention_mask=attention_mask)

            # Get predictions
            start_idx = torch.argmax(start_logits, dim=1)
            end_idx = torch.argmax(end_logits, dim=1)

            # Convert token indices back to string answers
            for i in range(len(start_idx)):
                start = start_idx[i].item()
                end = end_idx[i].item()

                # Handle case where end < start
                if end < start:
                    end = start

                predicted_tokens = batch["tokens"][i][start:end+1]
                predicted_answer = " ".join(predicted_tokens).replace(" ##", "")
                true_answer = batch["answers"]["text"][i]

                # Calculate Exact Match and F1
                total_em += exact_match(predicted_answer, true_answer)
                total_f1 += f1(predicted_answer, true_answer)
                total_samples += 1

    # Compute average scores
    em_score = total_em / total_samples
    f1_score_avg = total_f1 / total_samples

    return em_score, f1_score_avg


In [None]:
from torch.utils.data import DataLoader, Dataset

# Assuming you've already defined `encode_sample` and `train_data` above.

# Step 1: Define a custom dataset class for the validation set
class QADataset(Dataset):
    def __init__(self, data, vocab, max_len=384):
        self.data = data
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return encode_sample(sample, self.vocab, self.max_len)

# Step 2: Prepare validation data (using the validation split of the dataset)
val_data = dataset["validation"]  # Assuming dataset is already loaded with train/validation splits

# Step 3: Tokenize the validation dataset (using encode_sample)
encoded_val = [encode_sample(sample, vocab) for sample in val_data]

# Step 4: Convert to PyTorch Dataset and DataLoader
val_dataset = QADataset(encoded_val, vocab)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Step 5: Now `val_loader` is ready for use during evaluation


In [None]:
# Print one sample from val_data
print(val_data[0])


{'id': '56be4db0acb8001400a502ec', 'title': 'Super_Bowl_50', 'context': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.', 'question': 'Which NFL team represented the AFC at Super Bowl 50?', 'answers': {'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'], 'ans

In [None]:
# Trim train_data to match the size of val_data
train_data_trimmed = train_data.select(range(len(val_data)))

# Add the 'question' column to val_data
val_data = val_data.add_column('question', [sample['question'] for sample in train_data_trimmed])


ValueError: The table can't have duplicated columns but columns ['question'] are duplicated.

In [None]:
# Ensure that train_data and val_data are the same length
assert len(train_data) == len(val_data), "Training and validation data lengths do not match!"

# Add the 'question' column from train_data to val_data
val_data = val_data.add_column('question', [sample['question'] for sample in train_data])

# Now you should be able to access 'question' in val_data as well


AssertionError: Training and validation data lengths do not match!

In [None]:
# Assuming `val_loader` is your validation data loader
em, f1_avg = evaluate_model(model, val_loader, device)

print(f"Exact Match (EM): {em:.4f}")
print(f"F1 Score: {f1_avg:.4f}")


AttributeError: 'Dataset' object has no attribute 'eval'

### visualization

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

In [None]:
def calculate_f1(start_preds, end_preds, start_positions, end_positions):
    """Calculates F1 score for question answering."""
    # Convert predictions and targets to lists
    start_preds = start_preds.tolist()
    end_preds = end_preds.tolist()
    start_positions = start_positions.tolist()
    end_positions = end_positions.tolist()

    f1_scores = []
    for i in range(len(start_preds)):
        pred_start = start_preds[i]
        pred_end = end_preds[i]
        true_start = start_positions[i]
        true_end = end_positions[i]

        # Handle cases where pred_end < pred_start
        if pred_end < pred_start:
            pred_end = pred_start

        # Calculate F1 score for this example
        pred_span = set(range(pred_start, pred_end + 1))
        true_span = set(range(true_start, true_end + 1))
        overlap = len(pred_span.intersection(true_span))
        if overlap == 0:
            f1_scores.append(0.0)
        else:
            precision = overlap / len(pred_span)
            recall = overlap / len(true_span)
            f1 = (2 * precision * recall) / (precision + recall)
            f1_scores.append(f1)

    # Return average F1 score
    return sum(f1_scores) / len(f1_scores)


def visualize_loss(losses):
    """Visualizes training loss over epochs."""
    plt.plot(losses)
    plt.title("Training Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()

In [None]:
# ... (your existing training loop) ...

epoch_losses = []
epoch_f1_scores = []

for epoch in range(num_epochs):
    # ... (rest of your training loop) ...

    # Evaluation at the end of each epoch
    model.eval()
    total_loss = 0
    all_start_preds = []
    all_end_preds = []
    all_start_positions = []
    all_end_positions = []

    with torch.no_grad():
        for batch in tqdm(train_loader):  # or use a separate validation loader
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_pos = batch['start_positions'].to(device)
            end_pos = batch['end_positions'].to(device)

            start_logits, end_logits = model(input_ids, attention_mask)
            loss = compute_loss(start_logits, end_logits, start_pos, end_pos)
            total_loss += loss.item()

            # Get predictions
            start_preds = torch.argmax(start_logits, dim=1)
            end_preds = torch.argmax(end_logits, dim=1)

            # Store predictions and targets for F1 calculation
            all_start_preds.extend(start_preds)
            all_end_preds.extend(end_preds)
            all_start_positions.extend(start_pos)
            all_end_positions.extend(end_pos)

    avg_loss = total_loss / len(train_loader)
    epoch_losses.append(avg_loss)

    # Calculate F1 score
    f1 = calculate_f1(
        all_start_preds, all_end_preds, all_start_positions, all_end_positions
    )
    epoch_f1_scores.append(f1)

    print(f"Epoch {epoch+1} Loss: {avg_loss:.4f}, F1: {f1:.4f}")

# Visualize loss
visualize_loss(epoch_losses)

# Print or visualize F1 scores if needed
print("F1 scores over epochs:", epoch_f1_scores)

100%|██████████| 10950/10950 [00:52<00:00, 208.03it/s]


AttributeError: 'list' object has no attribute 'tolist'

#Advanced Tokenization with BERT


##Starting here

In [9]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


In [10]:
def encode_sample_fast(sample, tokenizer, max_len=384):
    question = sample["question"]
    context = sample["context"]
    answer_text = sample["answers"]["text"][0]
    answer_start = sample["answers"]["answer_start"][0]

    # Use tokenizer with offset mapping
    inputs = tokenizer(
        question,
        context,
        max_length=max_len,
        truncation=True,
        padding="max_length",
        return_offsets_mapping=True,
        return_tensors=None  # returns dict of lists (not tensors)
    )

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    offset_mapping = inputs["offset_mapping"]

    # Find start and end positions
    start_char = answer_start
    end_char = answer_start + len(answer_text)

    start_token = end_token = 0
    for idx, (start, end) in enumerate(offset_mapping):
        if start <= start_char < end:
            start_token = idx
        if start < end_char <= end:
            end_token = idx
            break

    # Handle edge case if token not found
    if start_token == 0 and end_token == 0:
        start_token = end_token = 0

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "start_position": start_token,
        "end_position": end_token
    }


In [11]:
encoded_train = [encode_sample_fast(sample, tokenizer) for sample in train_data]


In [12]:
encoded_val = [encode_sample_fast(sample, tokenizer) for sample in val_data]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import math
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

MAX_LEN = 384
VOCAB_SIZE = tokenizer.vocab_size
EMBED_DIM = 128
NUM_HEADS = 8
FF_DIM = 512
NUM_LAYERS = 8
BATCH_SIZE = 16
EPOCHS = 10


In [27]:
class SquadDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        item = self.encodings[idx]
        return {
            "input_ids": torch.tensor(item["input_ids"]),
            "attention_mask": torch.tensor(item["attention_mask"]),
            "start_positions": torch.tensor(item["start_position"]),
            "end_positions": torch.tensor(item["end_position"]),
        }

train_dataset = SquadDataset(encoded_train)
val_dataset = SquadDataset(encoded_val)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)


In [15]:
class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)].to(x.device)
        return x


In [16]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.qkv = nn.Linear(embed_dim, embed_dim * 3)
        self.fc_out = nn.Linear(embed_dim, embed_dim)

    def forward(self, x, mask=None):
        B, T, C = x.shape
        qkv = self.qkv(x)
        qkv = qkv.reshape(B, T, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-inf"))
        attn = F.softmax(scores, dim=-1)

        out = (attn @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.fc_out(out)


In [17]:
class FeedForward(nn.Module):
    def __init__(self, embed_dim, ff_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim)
        )

    def forward(self, x):
        return self.net(x)


In [18]:
class EncoderLayer(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim):
        super().__init__()
        self.attn = MultiHeadAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.ff = FeedForward(embed_dim, ff_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x, mask):
        x = self.norm1(x + self.attn(x, mask))
        x = self.norm2(x + self.ff(x))
        return x


In [19]:
class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, ff_dim, num_layers, max_len=512):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.pos_enc = PositionalEncoding(embed_dim, max_len)
        self.layers = nn.ModuleList([
            EncoderLayer(embed_dim, num_heads, ff_dim)
            for _ in range(num_layers)
        ])
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask):
        x = self.embed(input_ids)
        x = self.pos_enc(x)
        for layer in self.layers:
            x = layer(x, attention_mask.unsqueeze(1).unsqueeze(2))  # (B, 1, 1, T)
        return x


In [20]:
class QAModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, ff_dim, num_layers):
        super().__init__()
        self.encoder = TransformerEncoder(vocab_size, embed_dim, num_heads, ff_dim, num_layers)
        self.qa_outputs = nn.Linear(embed_dim, 2)  # for start and end

    def forward(self, input_ids, attention_mask):
        enc_output = self.encoder(input_ids, attention_mask)  # (B, T, D)
        logits = self.qa_outputs(enc_output)  # (B, T, 2)
        start_logits, end_logits = logits.split(1, dim=-1)
        return start_logits.squeeze(-1), end_logits.squeeze(-1)


In [28]:
import time

model = QAModel(VOCAB_SIZE, EMBED_DIM, NUM_HEADS, FF_DIM, NUM_LAYERS).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
loss_fn = nn.CrossEntropyLoss()

epoch_times = []

for epoch in range(EPOCHS):
    start_time = time.time()

    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        start_positions = batch["start_positions"].to(device)
        end_positions = batch["end_positions"].to(device)

        optimizer.zero_grad()
        start_logits, end_logits = model(input_ids, attention_mask)

        loss = loss_fn(start_logits, start_positions) + loss_fn(end_logits, end_positions)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    end_time = time.time()
    epoch_duration = end_time - start_time
    epoch_times.append(epoch_duration)

    avg_epoch_time = sum(epoch_times) / len(epoch_times)
    remaining_epochs = EPOCHS - (epoch + 1)
    eta = avg_epoch_time * remaining_epochs

    # Convert ETA to min:sec format
    eta_min = int(eta // 60)
    eta_sec = int(eta % 60)

    print(f"Epoch {epoch+1} - Loss: {total_loss/len(train_loader):.4f} - Time: {epoch_duration:.2f}s - ETA: {eta_min}m {eta_sec}s")


Epoch 1 - Loss: 8.9202 - Time: 265.21s - ETA: 17m 40s
Epoch 2 - Loss: 8.0496 - Time: 261.55s - ETA: 13m 10s
Epoch 3 - Loss: 7.6936 - Time: 260.74s - ETA: 8m 45s
Epoch 4 - Loss: 7.4471 - Time: 264.34s - ETA: 4m 22s
Epoch 5 - Loss: 7.2138 - Time: 260.84s - ETA: 0m 0s


In [24]:
!python -V

Python 3.11.12


In [25]:
import torch
import torch.nn.functional as F

# Pick a validation sample (or train sample for overfitting test)
sample = val_data[0]  # Or train_data[0] to test overfitting

# Encode the sample
encoded = encode_sample_fast(sample, tokenizer)

# Convert to tensors and move to device
input_ids = torch.tensor(encoded["input_ids"]).unsqueeze(0).to(device)
attention_mask = torch.tensor(encoded["attention_mask"]).unsqueeze(0).to(device)

# Run the model in evaluation mode
model.eval()
with torch.no_grad():
    start_logits, end_logits = model(input_ids, attention_mask)

# Get predicted start/end positions
start_idx = torch.argmax(start_logits, dim=1).item()
end_idx = torch.argmax(end_logits, dim=1).item()

# Fix edge case: if end < start, set end = start
if end_idx < start_idx:
    end_idx = start_idx

# Decode predicted tokens
decoded_tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"][start_idx:end_idx + 1])
predicted_answer = tokenizer.convert_tokens_to_string(decoded_tokens)

# Print results
print("📌 Question:", sample["question"])
print("✅ True Answer:", sample["answers"]["text"][0])
print("🤖 Predicted Answer:", predicted_answer)


📌 Question: Which NFL team represented the AFC at Super Bowl 50?
✅ True Answer: Denver Broncos
🤖 Predicted Answer: league ( nfl ) for the 2015 season. the american football conference ( afc ) champion denver broncos defeated the national football conference ( nfc ) champion carolina panthers 24 – 10 to earn their third super bowl title. the game was played on february 7, 2016, at levi ' s stadium in the san francisco bay area at santa clara, california. as this was the 50th super bowl, the league emphasized the " golden anniversary " with various gold - themed initiatives, as well as temporarily suspending the tradition of naming each super bowl game with roman numerals ( under which the game would have been known as " super bowl l " ), so that the logo could prominently feature the arabic numerals 50. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [None]:
from torch.utils.data import DataLoader

train_dataset = QADataset(encoded_train)
val_dataset = QADataset(encoded_val)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
