In [1]:
%%capture
!mkdir squad
!wget https://raw.githubusercontent.com/chiahsuan156/Spoken-SQuAD/master/spoken_train-v1.1.json  -O squad/train-v2.0.json
!wget https://raw.githubusercontent.com/chiahsuan156/Spoken-SQuAD/master/spoken_test-v1.1_WER54.json -O squad/dev-v2.0.json
!pip install transformers evaluate


In [1]:
import json
from pathlib import Path
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW
import time
import evaluate
import matplotlib.pyplot as plt


  torch.utils._pytree._register_pytree_node(


In [2]:
import json
from pathlib import Path
import random

# Load data from JSON files
def load_squad_data(path):
    with open(path, 'r') as f:
        squad_dict = json.load(f)
    texts, queries, answers = [], [], []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                # Check if the question is answerable (use default False if 'is_impossible' is not present)
                if qa.get('is_impossible', False):
                    continue  # Skip unanswerable questions for simplicity
                for answer in qa['answers']:
                    texts.append(context)
                    queries.append(question)
                    answers.append({
                        'answer_start': answer['answer_start'],
                        'text': answer['text']
                    })
    return texts, queries, answers

# Specify the paths for the SQuAD data files
train_path = Path('squad/train-v2.0.json')
val_path = Path('squad/dev-v2.0.json')

# Load train and validation data
train_texts, train_queries, train_answers = load_squad_data(train_path)
val_texts, val_queries, val_answers = load_squad_data(val_path)

# Display a random example from the training data
random_index = random.randint(0, len(train_texts) - 1)
print("Train Examples:", len(train_texts), "Validation Examples:", len(val_texts))
print("Sample Random Train Example:")
print("Text:", train_texts[random_index])
print("Query:", train_queries[random_index])
print("Answer:", train_answers[random_index])


Train Examples: 37111 Validation Examples: 17841
Sample Random Train Example:
Text: the final showdown with between justin carini one of the early favorites and kelly clarkson. clarkson was not initially thought of as a contender but impress the judges with some good performances in the final rounds such as her performance ever read the franklin natural woman and benny heightens stuff like that there and eventually won the crown on september fourth ten thousand two.
Query: What month did Kelly Clarkson win?
Answer: {'answer_start': 352, 'text': 'september'}


In [3]:
# Ensure each answer has an 'answer_end'
def set_end_positions(answers):
    for answer in answers:
        start_idx = answer['answer_start']
        answer['answer_end'] = start_idx + len(answer['text'])

set_end_positions(train_answers)
set_end_positions(val_answers)


In [4]:
from transformers import AutoTokenizer

# Load the tokenizer
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"  # or another suitable QA model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Now tokenize each example with `encode_plus`
train_encodings = {
    'input_ids': [],
    'attention_mask': [],
    'start_positions': [],
    'end_positions': []
}

for text, query in zip(train_texts, train_queries):
    encodings = tokenizer.encode_plus(
        text,
        query,
        truncation=True,
        padding="max_length",
        max_length=512,
        stride=128,
        return_tensors="pt",
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )
    train_encodings['input_ids'].append(encodings['input_ids'])
    train_encodings['attention_mask'].append(encodings['attention_mask'])

# Similar approach for validation data
val_encodings = {
    'input_ids': [],
    'attention_mask': []
}

for text, query in zip(val_texts, val_queries):
    encodings = tokenizer.encode_plus(
        text,
        query,
        truncation=True,
        padding="max_length",
        max_length=512,
        stride=128,
        return_tensors="pt",
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )
    val_encodings['input_ids'].append(encodings['input_ids'])
    val_encodings['attention_mask'].append(encodings['attention_mask'])


In [5]:
# Adjust the function to add start and end positions for each encoding
def add_token_positions(texts, queries, answers):
    encodings = {
        'input_ids': [],
        'attention_mask': [],
        'start_positions': [],
        'end_positions': []
    }

    for text, query, answer in zip(texts, queries, answers):
        # Encode text and query with overflow and stride
        encoding = tokenizer(
            text,
            query,
            truncation=True,
            padding="max_length",
            max_length=512,
            stride=128,
            return_tensors="pt",
            return_offsets_mapping=True,
            return_overflowing_tokens=True
        )

        # Find start and end positions within each encoding
        offset_mapping = encoding['offset_mapping'][0]
        start_char = answer['answer_start']
        end_char = answer['answer_end']
        
        # Initialize start and end token positions as None
        start_token = None
        end_token = None

        # Loop through offset mapping to find the token positions
        for idx, (start, end) in enumerate(offset_mapping):
            if start <= start_char < end:
                start_token = idx
            if start < end_char <= end:
                end_token = idx
                break

        # Handle cases where token positions are not found
        if start_token is None:
            start_token = tokenizer.model_max_length
        if end_token is None:
            end_token = tokenizer.model_max_length

        # Append the results for each instance
        encodings['input_ids'].append(encoding['input_ids'][0])
        encodings['attention_mask'].append(encoding['attention_mask'][0])
        encodings['start_positions'].append(start_token)
        encodings['end_positions'].append(end_token)

    # Convert lists to tensors for compatibility
    encodings['input_ids'] = torch.stack(encodings['input_ids'])
    encodings['attention_mask'] = torch.stack(encodings['attention_mask'])
    encodings['start_positions'] = torch.tensor(encodings['start_positions'])
    encodings['end_positions'] = torch.tensor(encodings['end_positions'])

    return encodings

# Apply the function to create encodings with token positions
train_encodings = add_token_positions(train_texts, train_queries, train_answers)
val_encodings = add_token_positions(val_texts, val_queries, val_answers)


In [6]:
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)


In [7]:
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)


In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad").to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)


  torch.utils._pytree._register_pytree_node(
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [9]:
import torch
from torch.amp import autocast, GradScaler  # Updated import for amp

epochs = 5
train_losses = []
val_losses = []
print_every = 1000
scaler = GradScaler("cuda")  # Updated GradScaler usage

for epoch in range(epochs):
    model.train()
    total_train_loss = 0

    # Training Loop
    for step, batch in enumerate(train_loader):
        optimizer.zero_grad()

        # Use mixed precision with autocast
        with autocast("cuda"):  # Updated autocast usage
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss
            total_train_loss += loss.item()

        # Scale loss for mixed precision
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Print loss every `print_every` steps
        if (step + 1) % print_every == 0:
            print(f"Step {step + 1}/{len(train_loader)}, Loss: {loss.item()}")

    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Validation (Evaluated every epoch in this case, can be adjusted)
    model.eval()
    total_val_loss = 0
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss
            total_val_loss += loss.item()

            start_logits, end_logits = outputs.start_logits, outputs.end_logits

            # Collect token indices instead of decoding to text for each step
            for i in range(len(input_ids)):
                start_idx = torch.argmax(start_logits[i])
                end_idx = torch.argmax(end_logits[i]) + 1
                val_preds.append((input_ids[i][start_idx:end_idx]).tolist())
                val_labels.append((input_ids[i][batch['start_positions'][i]:batch['end_positions'][i] + 1]).tolist())

    avg_val_loss = total_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    # Print training and validation loss
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Training Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}")


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step 1000/9278, Loss: 2.0218353271484375
Step 2000/9278, Loss: 1.7225341796875
Step 3000/9278, Loss: 1.4822540283203125
Step 4000/9278, Loss: 1.4004745483398438
Step 5000/9278, Loss: 1.221822738647461
Step 6000/9278, Loss: 0.5452346801757812
Step 7000/9278, Loss: 2.1209716796875
Step 8000/9278, Loss: 1.0061416625976562
Step 9000/9278, Loss: 1.23480224609375
Epoch 1/5
Training Loss: 1.5023252016050237, Validation Loss: nan
Step 1000/9278, Loss: 1.667999267578125
Step 2000/9278, Loss: 2.9754638671875
Step 3000/9278, Loss: 1.7955322265625
Step 4000/9278, Loss: 1.4657363891601562
Step 5000/9278, Loss: 1.257904052734375
Step 6000/9278, Loss: 2.203155517578125
Step 7000/9278, Loss: 1.1234588623046875
Step 8000/9278, Loss: 0.2133312225341797
Step 9000/9278, Loss: 1.7354736328125
Epoch 2/5
Training Loss: 1.271622144317005, Validation Loss: nan
Step 1000/9278, Loss: 0.5979766845703125
Step 2000/9278, Loss: 0.7570538520812988
Step 3000/9278, Loss: 0.5679855346679688
Step 4000/9278, Loss: 0.54984

In [14]:
def compute_f1(pred, true):
    pred_tokens = pred.split()
    true_tokens = true.split()
    common = set(pred_tokens) & set(true_tokens)
    
    if len(common) == 0:
        return 0.0
    
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(true_tokens)
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1


In [15]:
from collections import Counter

def compute_f1(predictions, labels):
    def f1_score(pred, label):
        pred_tokens = Counter(pred.split())
        label_tokens = Counter(label.split())
        
        # Find common tokens between prediction and label
        common_tokens = pred_tokens & label_tokens
        num_common = sum(common_tokens.values())
        
        if num_common == 0:
            return 0.0

        precision = num_common / sum(pred_tokens.values())
        recall = num_common / sum(label_tokens.values())
        f1 = 2 * (precision * recall) / (precision + recall)
        return f1

    # Calculate F1 for each prediction-label pair and average them
    f1_scores = [f1_score(pred, label) for pred, label in zip(predictions, labels)]
    return sum(f1_scores) / len(f1_scores)


In [16]:
def get_predictions(model, tokenizer, data_loader, device):
    model.eval()
    predictions, labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            # Forward pass to get logits
            outputs = model(input_ids, attention_mask=attention_mask)
            start_logits, end_logits = outputs.start_logits, outputs.end_logits

            # Find the best start and end positions for the answer
            for i in range(len(input_ids)):
                start_idx = torch.argmax(start_logits[i])
                end_idx = torch.argmax(end_logits[i]) + 1

                # Decode predicted answer and true answer
                pred_text = tokenizer.decode(input_ids[i][start_idx:end_idx], skip_special_tokens=True)
                true_text = tokenizer.decode(input_ids[i][start_positions[i]:end_positions[i] + 1], skip_special_tokens=True)

                predictions.append(pred_text)
                labels.append(true_text)

    return predictions, labels


In [None]:
# Assuming `val_loader` is your validation DataLoader and `device` is set to "cuda" or "cpu"
predictions, labels = get_predictions(model, tokenizer, val_loader, device)
f1 = compute_f1(predictions, labels)
print("F1 Score:", f1)


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [1]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load pretrained QA model and tokenizer
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Evaluation function as previously defined
def evaluate_model(model, tokenizer, test_loader, device):
    model.eval()
    predictions, labels = [], []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            # Forward pass to get logits
            outputs = model(input_ids, attention_mask=attention_mask)
            start_logits, end_logits = outputs.start_logits, outputs.end_logits

            for i in range(len(input_ids)):
                # Get the predicted start and end indices
                start_idx = torch.argmax(start_logits[i])
                end_idx = torch.argmax(end_logits[i]) + 1

                # Decode predicted answer and true answer
                pred_text = tokenizer.decode(input_ids[i][start_idx:end_idx], skip_special_tokens=True)
                true_text = tokenizer.decode(input_ids[i][batch['start_positions'][i]:batch['end_positions'][i]+1], skip_special_tokens=True)

                predictions.append(pred_text)
                labels.append(true_text)

    f1 = compute_f1(predictions, labels)
    print("F1 Score on test set:", f1)
    return f1

# Evaluate the pretrained model on your test loader
f1_score = evaluate_model(model, tokenizer, test_loader, device)


Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  torch.utils._pytree._register_pytree_node(


Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NameError: name 'test_loader' is not defined