In [1]:
%%capture
!mkdir squad
!wget https://raw.githubusercontent.com/chiahsuan156/Spoken-SQuAD/master/spoken_train-v1.1.json  -O squad/train-v2.0.json
!wget https://raw.githubusercontent.com/chiahsuan156/Spoken-SQuAD/master/spoken_test-v1.1_WER54.json -O squad/dev-v2.0.json
!pip install transformers evaluate


In [1]:
import json
from pathlib import Path
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW
import time
import evaluate
import matplotlib.pyplot as plt


  torch.utils._pytree._register_pytree_node(


In [2]:
import json
from pathlib import Path
import random

# Load data from JSON files
def load_squad_data(path):
    with open(path, 'r') as f:
        squad_dict = json.load(f)
    texts, queries, answers = [], [], []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                # Check if the question is answerable (use default False if 'is_impossible' is not present)
                if qa.get('is_impossible', False):
                    continue  # Skip unanswerable questions for simplicity
                for answer in qa['answers']:
                    texts.append(context)
                    queries.append(question)
                    answers.append({
                        'answer_start': answer['answer_start'],
                        'text': answer['text']
                    })
    return texts, queries, answers

# Specify the paths for the SQuAD data files
train_path = Path('squad/train-v2.0.json')
val_path = Path('squad/dev-v2.0.json')

# Load train and validation data
train_texts, train_queries, train_answers = load_squad_data(train_path)
val_texts, val_queries, val_answers = load_squad_data(val_path)

# Display a random example from the training data
random_index = random.randint(0, len(train_texts) - 1)
print("Train Examples:", len(train_texts), "Validation Examples:", len(val_texts))
print("Sample Random Train Example:")
print("Text:", train_texts[random_index])
print("Query:", train_queries[random_index])
print("Answer:", train_answers[random_index])


Train Examples: 37111 Validation Examples: 17841
Sample Random Train Example:
Text: the slowing sales growth has been attributed to the maturing pc market which constituted sixty six percent of del sales and analysts suggested that del needed to make inroads into nine p c business this segment such as storage services and servers. dells price advantage was tied to its ultra lean manufacturing for desktop p cs but this became less important as savings became harder to find inside the company supply chain and as competitors such as hewlett packard and these are made their p c manufacturing operations more efficient to match down weakening dells traditional price differentiation. throughout the entire pc industry declines in prices along with commensurate increases in performance meant that dell had fewer opportunities to accel to their customers lucrative strategy of encouraging buyers to upgrade the processor memory. as a result the company was selling a greater proportion of inexpensiv

In [3]:
# Ensure each answer has an 'answer_end'
def set_end_positions(answers):
    for answer in answers:
        start_idx = answer['answer_start']
        answer['answer_end'] = start_idx + len(answer['text'])

set_end_positions(train_answers)
set_end_positions(val_answers)


In [4]:
from transformers import AutoTokenizer

# Load the tokenizer
model_name = "distilbert-base-uncased"  # or another suitable QA model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Now tokenize each example with `encode_plus`
train_encodings = {
    'input_ids': [],
    'attention_mask': [],
    'start_positions': [],
    'end_positions': []
}

for text, query in zip(train_texts, train_queries):
    encodings = tokenizer.encode_plus(
        text,
        query,
        truncation=True,
        padding="max_length",
        max_length=512,
        stride=128,
        return_tensors="pt",
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )
    train_encodings['input_ids'].append(encodings['input_ids'])
    train_encodings['attention_mask'].append(encodings['attention_mask'])

# Similar approach for validation data
val_encodings = {
    'input_ids': [],
    'attention_mask': []
}

for text, query in zip(val_texts, val_queries):
    encodings = tokenizer.encode_plus(
        text,
        query,
        truncation=True,
        padding="max_length",
        max_length=512,
        stride=128,
        return_tensors="pt",
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )
    val_encodings['input_ids'].append(encodings['input_ids'])
    val_encodings['attention_mask'].append(encodings['attention_mask'])


In [5]:
# Adjust the function to add start and end positions for each encoding
def add_token_positions(texts, queries, answers):
    encodings = {
        'input_ids': [],
        'attention_mask': [],
        'start_positions': [],
        'end_positions': []
    }

    for text, query, answer in zip(texts, queries, answers):
        # Encode text and query with overflow and stride
        encoding = tokenizer(
            text,
            query,
            truncation=True,
            padding="max_length",
            max_length=512,
            stride=128,
            return_tensors="pt",
            return_offsets_mapping=True,
            return_overflowing_tokens=True
        )

        # Find start and end positions within each encoding
        offset_mapping = encoding['offset_mapping'][0]
        start_char = answer['answer_start']
        end_char = answer['answer_end']
        
        # Initialize start and end token positions as None
        start_token = None
        end_token = None

        # Loop through offset mapping to find the token positions
        for idx, (start, end) in enumerate(offset_mapping):
            if start <= start_char < end:
                start_token = idx
            if start < end_char <= end:
                end_token = idx
                break

        # Handle cases where token positions are not found
        if start_token is None:
            start_token = tokenizer.model_max_length
        if end_token is None:
            end_token = tokenizer.model_max_length

        # Append the results for each instance
        encodings['input_ids'].append(encoding['input_ids'][0])
        encodings['attention_mask'].append(encoding['attention_mask'][0])
        encodings['start_positions'].append(start_token)
        encodings['end_positions'].append(end_token)

    # Convert lists to tensors for compatibility
    encodings['input_ids'] = torch.stack(encodings['input_ids'])
    encodings['attention_mask'] = torch.stack(encodings['attention_mask'])
    encodings['start_positions'] = torch.tensor(encodings['start_positions'])
    encodings['end_positions'] = torch.tensor(encodings['end_positions'])

    return encodings

# Apply the function to create encodings with token positions
train_encodings = add_token_positions(train_texts, train_queries, train_answers)
val_encodings = add_token_positions(val_texts, val_queries, val_answers)


In [6]:
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)


In [7]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased").to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [9]:
import torch
from torch.amp import autocast, GradScaler  # Updated import for amp

epochs = 5
train_losses = []
val_losses = []
print_every = 1000
scaler = GradScaler("cuda")  # Updated GradScaler usage

for epoch in range(epochs):
    model.train()
    total_train_loss = 0

    # Training Loop
    for step, batch in enumerate(train_loader):
        optimizer.zero_grad()

        # Use mixed precision with autocast
        with autocast("cuda"):  # Updated autocast usage
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss
            total_train_loss += loss.item()

        # Scale loss for mixed precision
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Print loss every `print_every` steps
        if (step + 1) % print_every == 0:
            print(f"Step {step + 1}/{len(train_loader)}, Loss: {loss.item()}")

    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Validation (Evaluated every epoch in this case, can be adjusted)
    model.eval()
    total_val_loss = 0
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss
            total_val_loss += loss.item()

            start_logits, end_logits = outputs.start_logits, outputs.end_logits

            # Collect token indices instead of decoding to text for each step
            for i in range(len(input_ids)):
                start_idx = torch.argmax(start_logits[i])
                end_idx = torch.argmax(end_logits[i]) + 1
                val_preds.append((input_ids[i][start_idx:end_idx]).tolist())
                val_labels.append((input_ids[i][batch['start_positions'][i]:batch['end_positions'][i] + 1]).tolist())

    avg_val_loss = total_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    # Print training and validation loss
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Training Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}")


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step 1000/2320, Loss: 1.6515426635742188
Step 2000/2320, Loss: 1.1297216415405273
Epoch 1/5
Training Loss: 2.1570609541288737, Validation Loss: nan
Step 1000/2320, Loss: 1.6353263854980469
Step 2000/2320, Loss: 1.3644336462020874
Epoch 2/5
Training Loss: 1.3003990753329007, Validation Loss: nan
Step 1000/2320, Loss: 1.0834035873413086
Step 2000/2320, Loss: 0.533228874206543
Epoch 3/5
Training Loss: 0.9302236610124337, Validation Loss: nan
Step 1000/2320, Loss: 0.5495268106460571
Step 2000/2320, Loss: 0.703312337398529
Epoch 4/5
Training Loss: 0.6554086196204197, Validation Loss: nan
Step 1000/2320, Loss: 1.362839937210083
Step 2000/2320, Loss: 0.45770543813705444
Epoch 5/5
Training Loss: 0.4811129829782093, Validation Loss: nan


In [10]:
def compute_f1(pred, true):
    pred_tokens = pred.split()
    true_tokens = true.split()
    common = set(pred_tokens) & set(true_tokens)
    
    if len(common) == 0:
        return 0.0
    
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(true_tokens)
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1


In [11]:
from collections import Counter

def compute_f1(predictions, labels):
    def f1_score(pred, label):
        pred_tokens = Counter(pred.split())
        label_tokens = Counter(label.split())
        
        # Find common tokens between prediction and label
        common_tokens = pred_tokens & label_tokens
        num_common = sum(common_tokens.values())
        
        if num_common == 0:
            return 0.0

        precision = num_common / sum(pred_tokens.values())
        recall = num_common / sum(label_tokens.values())
        f1 = 2 * (precision * recall) / (precision + recall)
        return f1

    # Calculate F1 for each prediction-label pair and average them
    f1_scores = [f1_score(pred, label) for pred, label in zip(predictions, labels)]
    return sum(f1_scores) / len(f1_scores)


In [12]:
def get_predictions(model, tokenizer, data_loader, device):
    model.eval()
    predictions, labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            # Forward pass to get logits
            outputs = model(input_ids, attention_mask=attention_mask)
            start_logits, end_logits = outputs.start_logits, outputs.end_logits

            # Find the best start and end positions for the answer
            for i in range(len(input_ids)):
                start_idx = torch.argmax(start_logits[i])
                end_idx = torch.argmax(end_logits[i]) + 1

                # Decode predicted answer and true answer
                pred_text = tokenizer.decode(input_ids[i][start_idx:end_idx], skip_special_tokens=True)
                true_text = tokenizer.decode(input_ids[i][start_positions[i]:end_positions[i] + 1], skip_special_tokens=True)

                predictions.append(pred_text)
                labels.append(true_text)

    return predictions, labels


In [13]:
# Assuming `val_loader` is your validation DataLoader and `device` is set to "cuda" or "cpu"
predictions, labels = get_predictions(model, tokenizer, val_loader, device)
f1 = compute_f1(predictions, labels)
print("F1 Score:", f1)


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


F1 Score: 0.13961467801300054


In [16]:

from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def exact_match_score(prediction, ground_truth):
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    if len(scores_for_ground_truths)==0: return 0
    return max(scores_for_ground_truths)

def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
def evaluate(gold_answers, predictions):
    f1 = exact_match = total = 0

    for ground_truths, prediction in zip(gold_answers, predictions):
        total += 1
        exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(
          f1_score, prediction, [ground_truths])
    
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'f1': f1}

In [17]:
evaluate(references,answers)

NameError: name 'references' is not defined