In [1]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, default_data_collator, get_scheduler
from datasets import load_dataset
from accelerate import Accelerator, notebook_launcher
from huggingface_hub import Repository, get_full_repo_name, notebook_login
import evaluate
from tqdm.auto import tqdm
import numpy as np
import collections
import json
import matplotlib.pyplot as plt

# Sanity check: report whether PyTorch can see a CUDA device before any model work starts.
print(f"torch.cuda.is_available(): {torch.cuda.is_available()}")

torch.cuda.is_available(): True


In [2]:
# SpokenSQuAD dataset files
# The paths are relative, so they are resolved against the current working directory.
# Training split.
spoken_train = 'spoken_train-v1.1.json'
# Test split without added noise; it is loaded below as the 'validation' split.
spoken_test = 'spoken_test-v1.1.json'
# Noisier variants of the test split (loaded below as 'test_WER44' / 'test_WER54').
spoken_test_WER44 = 'spoken_test-v1.1_WER44.json'
spoken_test_WER54 = 'spoken_test-v1.1_WER54.json'

def reformat_json(json_file):
    """Flatten a SQuAD-style JSON file into one record per question.

    The input is expected to use the nested layout
    ``data -> paragraphs -> qas -> answers``. Every question becomes a flat
    record with the keys ``id``, ``title``, ``context``, ``question`` and
    ``answers`` (a dict holding the parallel lists ``answer_start`` and
    ``text``). The records are written under a top-level ``data`` key.

    Title, context and question are stripped of surrounding whitespace.
    Because ``answer_start`` values index into the *unstripped* context, they
    are shifted by the number of leading characters removed so that they keep
    pointing at the answer inside the stripped context.

    Args:
        json_file: Path of the nested input file.

    Returns:
        Path of the written file: the input's base name prefixed with
        ``out_``, located in the same directory as the input.
    """
    import os  # local import: only this function needs path manipulation

    with open(json_file, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    examples = []
    # iterate over 'data' list
    for elem in json_data['data']:
        title = elem['title'].strip()

        # iterate over paragraphs
        for paragraph in elem['paragraphs']:
            raw_context = paragraph['context']
            context = raw_context.strip()
            # Number of characters strip() removed from the front of the context.
            leading = len(raw_context) - len(raw_context.lstrip())

            # iterate over question-answers for this paragraph
            for qa in paragraph['qas']:
                examples.append({
                    'id': qa['id'],
                    'title': title,
                    'context': context,
                    'question': qa['question'].strip(),
                    'answers': {
                        # Re-base offsets onto the stripped context; clamp so an
                        # offset inside the removed whitespace cannot go negative.
                        'answer_start': [
                            max(answer['answer_start'] - leading, 0)
                            for answer in qa['answers']
                        ],
                        'text': [answer['text'] for answer in qa['answers']],
                    },
                })

    out_dict = {'data': examples}

    # Prefix only the file name, so inputs given with a directory part still
    # produce a valid path next to the original file.
    output_json_file = os.path.join(
        os.path.dirname(json_file), 'out_' + os.path.basename(json_file)
    )
    with open(output_json_file, 'w', encoding='utf-8') as f:
        json.dump(out_dict, f)

    return output_json_file


print("Loading SpokenSQuAD data...")

# reformat json data
# NOTE(review): each path variable is overwritten with the path returned by
# reformat_json. Re-running this cell without first re-running the path
# definitions would therefore pass already-reformatted files back into
# reformat_json.
spoken_train = reformat_json(spoken_train)
spoken_test = reformat_json(spoken_test)
spoken_test_WER44 = reformat_json(spoken_test_WER44)
spoken_test_WER54 = reformat_json(spoken_test_WER54)

# Load the four reformatted files as named splits of one dataset object.
# field='data' matches the top-level 'data' key written by reformat_json.
spoken_squad_dataset = load_dataset('json',
                                    data_files= { 'train': spoken_train,
                                                  'validation': spoken_test,         # NO NOISE: 22.73% WER
                                                  'test_WER44': spoken_test_WER44,   # NOISE V1: 44.22% WER
                                                  'test_WER54': spoken_test_WER54 }, # NOISE V2: 54.82% WER
                                    field = 'data')

Loading SpokenSQuAD data...


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test_WER44 split: 0 examples [00:00, ? examples/s]

Generating test_WER54 split: 0 examples [00:00, ? examples/s]

In [None]:
### Model fine-tuned from bert-base-uncased
# Identifier passed to from_pretrained below for both the model and the tokenizer.
model_checkpoint = "rein5/bert-base-uncased-finetuned-spoken-squad"

# Uncomment this to use bert-base-uncased instead (e.g. to fine-tune from scratch)
#model_checkpoint = "bert-base-uncased"

print("Instantiating Model and Tokenizer...")
print("Model: " + model_checkpoint)

# `model` and `tokenizer` are module-level objects used by the preprocessing,
# training and evaluation cells further down.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

print("\nModel Architecture: BERT (base, uncased) + Linear head (2 output logits):")
print(model)

Instantiating Model and Tokenizer...
Model: bert-base-uncased


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model Architecture: BERT (base, uncased) + Linear head (2 output logits):
BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_featur

In [4]:
# Passed to the tokenizer as `max_length`; with padding='max_length' every
# question+context feature is padded or truncated to this many tokens.
max_length = 384
# Passed to the tokenizer as `stride` together with return_overflowing_tokens=True;
# presumably the token overlap between consecutive windows of one context
# (see the tokenizer documentation to confirm).
stride = 64

""" 
    Function to preprocess the training examples. It performs the following operations:  
        - tokenize examples into question-context token sequences of the form: 
            [CLS] question [SEP] context [SEP]         
        - apply windowing with given stride
        - compute output labels (start_index, end_index)
            - if answer not fully within windowed context, set label to (0, 0)
"""

def preprocess_training_examples(examples):
    """Tokenize a batch of training examples and attach answer-span labels.

    Question/context pairs are encoded with the module-level ``tokenizer``
    using ``max_length`` and ``stride``; only the context is truncated, and
    overflowing tokens are returned as additional features. For each feature
    the character span of the first reference answer is converted into start
    and end token positions. If the answer is not completely inside the
    feature's context window, both labels are set to 0.

    Args:
        examples: Batch with ``question``, ``context`` and ``answers`` columns.

    Returns:
        The tokenizer output without ``offset_mapping`` and
        ``overflow_to_sample_mapping``, plus ``start_positions`` and
        ``end_positions`` (one entry per feature).
    """
    stripped_questions = [q.strip() for q in examples['question']]
    inputs = tokenizer(
        stripped_questions,
        examples['context'],
        max_length=max_length,
        truncation='only_second',
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length',
    )

    all_offsets = inputs.pop('offset_mapping')
    feature_to_example = inputs.pop('overflow_to_sample_mapping')
    answers = examples['answers']
    start_positions, end_positions = [], []

    for feature_idx, offsets in enumerate(all_offsets):
        # Only the first reference answer of the source example is used.
        answer = answers[feature_to_example[feature_idx]]
        answer_start_char = answer['answer_start'][0]
        answer_end_char = answer_start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(feature_idx)

        # First and last token whose sequence id is 1 (the context part).
        cursor = 0
        while sequence_ids[cursor] != 1:
            cursor += 1
        context_start = cursor
        while sequence_ids[cursor] == 1:
            cursor += 1
        context_end = cursor - 1

        answer_inside_window = (
            offsets[context_start][0] <= answer_start_char
            and offsets[context_end][1] >= answer_end_char
        )
        if not answer_inside_window:
            # Answer is cut off by this window: label the feature (0, 0).
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Walk right past every token starting at or before the answer start;
        # the token just before the stopping point is the start label.
        cursor = context_start
        while cursor <= context_end and offsets[cursor][0] <= answer_start_char:
            cursor += 1
        start_positions.append(cursor - 1)

        # Walk left past every token ending at or after the answer end;
        # the token just after the stopping point is the end label.
        cursor = context_end
        while cursor >= context_start and offsets[cursor][1] >= answer_end_char:
            cursor -= 1
        end_positions.append(cursor + 1)

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions
    return inputs


print("Preprocessing training data...")

# Apply preprocess_training_examples batch-wise. The original columns are
# removed via remove_columns, so only the keys returned by the preprocessing
# function remain in train_dataset.
train_dataset = spoken_squad_dataset['train'].map(
    preprocess_training_examples,
    batched = True,
    remove_columns=spoken_squad_dataset['train'].column_names
)

Preprocessing training data...


Map:   0%|          | 0/37111 [00:00<?, ? examples/s]

In [5]:
def process_validation_examples(examples):
    """Tokenize a batch of validation/test examples for answer decoding.

    Uses the same tokenizer settings as the training preprocessing (windowing
    with ``max_length`` / ``stride``), but keeps the offset mapping instead of
    computing labels. For every feature, the id of the example it came from is
    recorded, and offsets of tokens whose sequence id is not 1 (i.e. not part
    of the context) are replaced by ``None``.

    Args:
        examples: Batch with ``id``, ``question`` and ``context`` columns.

    Returns:
        The tokenizer output without ``overflow_to_sample_mapping``, with the
        masked ``offset_mapping`` and an added ``example_id`` list.
    """
    stripped_questions = [q.strip() for q in examples['question']]
    inputs = tokenizer(
        stripped_questions,
        examples['context'],
        max_length=max_length,
        truncation='only_second',
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length',
    )

    feature_to_example = inputs.pop('overflow_to_sample_mapping')
    example_ids = []

    for feature_idx in range(len(inputs['input_ids'])):
        example_ids.append(examples["id"][feature_to_example[feature_idx]])

        sequence_ids = inputs.sequence_ids(feature_idx)
        context_only_offsets = []
        for token_idx, span in enumerate(inputs['offset_mapping'][feature_idx]):
            # Keep the character span only for context tokens.
            if sequence_ids[token_idx] == 1:
                context_only_offsets.append(span)
            else:
                context_only_offsets.append(None)
        inputs["offset_mapping"][feature_idx] = context_only_offsets

    inputs['example_id'] = example_ids
    return inputs


def _tokenize_eval_split(split_name):
    """Apply process_validation_examples to one split, dropping its original columns."""
    raw_split = spoken_squad_dataset[split_name]
    return raw_split.map(
        process_validation_examples,
        batched=True,
        remove_columns=raw_split.column_names,
    )


print("Preprocessing test data (NO NOISE: 22.73% WER)...")
validation_dataset = _tokenize_eval_split('validation')

print("Preprocessing V1 noise test data (44.22% WER)...")
test_WER44_dataset = _tokenize_eval_split('test_WER44')

print("Preprocessing V2 noise test data (54.82% WER)...")
test_WER54_dataset = _tokenize_eval_split('test_WER54')


print(validation_dataset)

Preprocessing test data (NO NOISE: 22.73% WER)...


Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

Preprocessing V1 noise test data (44.22% WER)...


Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

Preprocessing V2 noise test data (54.82% WER)...


Map:   0%|          | 0/5351 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 5423
})


In [6]:
# Metric object used by compute_metrics; its result is read via the
# 'exact_match' and 'f1' keys elsewhere in this notebook.
metric = evaluate.load("squad")

# Number of highest-scoring start logits and end logits considered per feature
# in compute_metrics.
n_best = 20
# Longest candidate answer kept in compute_metrics, counted in tokens
# (end_index - start_index + 1).
max_answer_length = 30

def compute_metrics(start_logits, end_logits, features, examples):
    """Decode answer texts from start/end logits and score them with ``metric``.

    For each example, every feature cut from it contributes candidate spans
    built from its ``n_best`` highest start logits and ``n_best`` highest end
    logits. Candidates whose start or end token has no offset (``None``),
    whose end precedes the start, or that span more than ``max_answer_length``
    tokens are discarded. The candidate with the highest summed logit becomes
    the prediction; an example without any valid candidate gets ``""``.

    Args:
        start_logits: Per-feature start logits, indexable by feature position.
        end_logits: Per-feature end logits, indexable by feature position.
        features: Preprocessed features providing ``example_id`` and
            ``offset_mapping``.
        examples: Original examples providing ``id``, ``context`` and ``answers``.

    Returns:
        The result of ``metric.compute`` for the predictions and references.
    """
    # Group feature positions by the example they belong to.
    features_by_example = collections.defaultdict(list)
    for feature_idx, feature in enumerate(features):
        features_by_example[feature["example_id"]].append(feature_idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        candidates = []

        for feature_idx in features_by_example[example_id]:
            start_scores = start_logits[feature_idx]
            end_scores = end_logits[feature_idx]
            offsets = features[feature_idx]["offset_mapping"]

            # Indices of the n_best largest logits, best first.
            top_starts = np.argsort(start_scores)[-1: -n_best - 1: -1].tolist()
            top_ends = np.argsort(end_scores)[-1: -n_best - 1: -1].tolist()

            for start_index in top_starts:
                if offsets[start_index] is None:
                    # Start token is not part of the context: no end can pair with it.
                    continue
                for end_index in top_ends:
                    if offsets[end_index] is None:
                        continue
                    span_length = end_index - start_index + 1
                    if end_index < start_index or span_length > max_answer_length:
                        continue
                    candidates.append({
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_scores[start_index] + end_scores[end_index],
                    })

        # Highest combined score wins; fall back to an empty prediction.
        if candidates:
            prediction_text = max(candidates, key=lambda c: c["logit_score"])["text"]
        else:
            prediction_text = ""
        predicted_answers.append({"id": example_id, "prediction_text": prediction_text})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [8]:
def _model_inputs_only(features):
    """Return `features` without the decoding-only columns, formatted as torch tensors."""
    tensors = features.remove_columns(["example_id", "offset_mapping"])
    tensors.set_format("torch")
    return tensors


train_dataset.set_format("torch")
validation_set = _model_inputs_only(validation_dataset)
test_WER44_set = _model_inputs_only(test_WER44_dataset)
test_WER54_set = _model_inputs_only(test_WER54_dataset)

print("Creating train dataloader...")
# Only the training loader shuffles its data.
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=20,
)

print("Creating validation dataloader...")
eval_dataloader = DataLoader(
    validation_set,
    collate_fn=default_data_collator,
    batch_size=20,
)
print("Creating test V1 dataloader...")
test_WER44_dataloader = DataLoader(
    test_WER44_set,
    collate_fn=default_data_collator,
    batch_size=20,
)
print("Creating test V2 dataloader...")
test_WER54_dataloader = DataLoader(
    test_WER54_set,
    collate_fn=default_data_collator,
    batch_size=20,
)


Creating train dataloader...
Creating validation dataloader...
Creating test V1 dataloader...
Creating test V2 dataloader...


In [10]:
# Local directory that train_model passes to save_pretrained after each epoch
# (and to tokenizer.save_pretrained when uploading is enabled).
output_dir = "bert-base-uncased-finetuned-spoken-squad"

""" ### CODE USED TO UPLOAD THE FINETUNED MODEL TO 
    ###    huggingface.co/rein5/bert-base-uncased-finetuned-spoken-squad
    
notebook_login()

model_name = output_dir
repo_name = get_full_repo_name(model_name)
print("Trained model checkpoints uploaded to: " + repo_name)

print("Cloning model repo...")
repo = Repository(output_dir, clone_from=repo_name)

upload_to_hub = True
"""
# Read by train_model: when True (and on the main process) it saves the
# tokenizer and calls repo.push_to_hub after each epoch.
# NOTE(review): `repo` is only created inside the disabled string block in
# this cell, so that code must be re-enabled before setting this to True.
upload_to_hub = False

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Trained model checkpoints uploaded to: rein5/bert-base-uncased-finetuned-spoken-squad
Cloning model repo...


c:\Users\fran\Desktop\spoken-squad-language-model\bert-base-uncased-finetuned-spoken-squad is already a clone of https://huggingface.co/rein5/bert-base-uncased-finetuned-spoken-squad. Make sure you pull the latest changes with `repo.git_pull()`.


In [11]:
def evaluate_model(model, dataloader, dataset, dataset_before_preprocessing, accelerator=None):
    """Evaluate `model` on a given dataset and return its metrics.

    Args:
        model: Model called as ``model(**batch)``; its output must expose
            ``start_logits`` and ``end_logits``.
        dataloader: Yields the batches fed to the model.
        dataset: Preprocessed features; its length is used to trim the
            gathered logits, and it is passed to compute_metrics as `features`.
        dataset_before_preprocessing: Original examples, passed to
            compute_metrics as `examples`.
        accelerator: Accelerator used to gather logits. When falsy, a new
            fp16 Accelerator is created here and used to prepare `model` and
            `dataloader`; otherwise both are presumably already prepared by
            the caller (train_model does so).

    Returns:
        The value returned by compute_metrics.
    """
    if not accelerator:
        accelerator = Accelerator(mixed_precision='fp16')
        model, dataloader = accelerator.prepare(model, dataloader)

    model.eval()
    start_chunks, end_chunks = [], []
    for batch in tqdm(dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        start_chunks.append(accelerator.gather(outputs.start_logits).cpu().numpy())
        end_chunks.append(accelerator.gather(outputs.end_logits).cpu().numpy())

    # Keep only as many rows as there are features in `dataset`.
    feature_count = len(dataset)
    start_logits = np.concatenate(start_chunks)[:feature_count]
    end_logits = np.concatenate(end_chunks)[:feature_count]

    return compute_metrics(
        start_logits, end_logits, dataset, dataset_before_preprocessing
    )



### TRAINING LOOP
def train_model(model=model, train_dataloader=train_dataloader, eval_dataloader=eval_dataloader, epochs = 2):
    """Fine-tune ``model`` for question answering and track validation metrics.

    The model is evaluated once before training and again after every epoch
    on ``validation_dataset`` / ``spoken_squad_dataset['validation']``. After
    each epoch the unwrapped model is saved to ``output_dir``; when
    ``upload_to_hub`` is true, the main process also saves the tokenizer and
    pushes the module-level ``repo``.

    Args:
        model: Model to train. Defaults to the module-level ``model`` as bound
            when this function was defined.
        train_dataloader: Batches used for optimisation.
        eval_dataloader: Batches used for the validation passes.
        epochs: Number of passes over ``train_dataloader``.

    Returns:
        dict with ``train_losses`` (mean loss of each completed window of 100
        steps), ``val_exact_matches`` and ``val_f1_scores`` (one entry for the
        pre-training evaluation plus one per epoch).
    """
    # Lists to store metrics for plotting
    train_losses = []
    val_exact_matches = []
    val_f1_scores = []

    accelerator = Accelerator(mixed_precision='fp16')
    optimizer = AdamW(model.parameters(), lr = 5e-5)

    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Size the schedule from the *prepared* dataloader. When launched with
    # several processes the prepared dataloader yields only this process's
    # share of the batches, so counting steps before prepare() would
    # over-estimate them and the linear decay would never reach zero.
    training_steps = epochs * len(train_dataloader)

    # Evaluate before training
    print("Evaluating model before fine-tuning...")
    metrics = evaluate_model(model, eval_dataloader, validation_dataset, spoken_squad_dataset['validation'], accelerator)
    print("Initial metrics:", metrics)
    val_exact_matches.append(metrics['exact_match'])
    val_f1_scores.append(metrics['f1'])

    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=training_steps,
    )

    print_every = 100  # Print loss every 100 steps
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}...")
        # train for 1 epoch (evaluate_model leaves the model in eval mode)
        model.train()
        total_loss = 0
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()

            if (step + 1) % print_every == 0:
                avg_loss = total_loss / print_every
                print(f"Step {step + 1}/{len(train_dataloader)}, Average Loss: {avg_loss:.4f}")
                train_losses.append(avg_loss)
                total_loss = 0

            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        # evaluate after each epoch
        metrics = evaluate_model(model, eval_dataloader, validation_dataset, spoken_squad_dataset['validation'], accelerator)
        val_exact_matches.append(metrics['exact_match'])
        val_f1_scores.append(metrics['f1'])
        print(f"Epoch {epoch + 1} - metrics: {metrics}")

        # save and upload
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
        if upload_to_hub and accelerator.is_main_process:
            tokenizer.save_pretrained(output_dir)
            repo.push_to_hub(
                commit_message = f"Training in progress. Completed epoch {epoch + 1}", blocking=False
            )

    # Hand the collected history back so it can be plotted or inspected.
    return {
        'train_losses': train_losses,
        'val_exact_matches': val_exact_matches,
        'val_f1_scores': val_f1_scores,
    }

### Uncomment to fine-tune the model further
#notebook_launcher(train_model, num_processes=1) # change num_processes value for multi-gpu training
###

### EVALUATE FINETUNED MODEL
# No accelerator is passed, so evaluate_model creates its own for each call.
print("Evaluating model on Test Set...")
test_metrics = evaluate_model(
    model, eval_dataloader, validation_dataset, spoken_squad_dataset['validation']
)
print("Evaluating model on Test V1 Set...")
test_v1_metrics = evaluate_model(
    model, test_WER44_dataloader, test_WER44_dataset, spoken_squad_dataset['test_WER44']
)
print("Evaluating model on Test V2 Set...")
test_v2_metrics = evaluate_model(
    model, test_WER54_dataloader, test_WER54_dataset, spoken_squad_dataset['test_WER54']
)

print("============= RESULTS =============")
print(f"Test Set    (NO NOISE - 22.73% WER) - exact match: {test_metrics['exact_match']}, F1 score: {test_metrics['f1']}")
print(f"Test V1 Set (V1 NOISE - 44.22% WER) - exact match: {test_v1_metrics['exact_match']}, F1 score: {test_v1_metrics['f1']}")
print(f"Test V2 Set (V2 NOISE - 54.82% WER) - exact match: {test_v2_metrics['exact_match']}, F1 score: {test_v2_metrics['f1']}")
print("===================================")

Launching training on one GPU.
Evaluating model before fine-tuning...


  0%|          | 0/272 [00:00<?, ?it/s]

  0%|          | 0/5351 [00:00<?, ?it/s]

Initial metrics: {'exact_match': 0.16819286114744908, 'f1': 6.221580477399483}
Epoch 1/2...
Step 100/1866, Average Loss: 3.2164
Step 200/1866, Average Loss: 2.0450
Step 300/1866, Average Loss: 1.8930
Step 400/1866, Average Loss: 1.8262
Step 500/1866, Average Loss: 1.7346
Step 600/1866, Average Loss: 1.6516
Step 700/1866, Average Loss: 1.6209
Step 800/1866, Average Loss: 1.5943
Step 900/1866, Average Loss: 1.5286
Step 1000/1866, Average Loss: 1.5187
Step 1100/1866, Average Loss: 1.5113
Step 1200/1866, Average Loss: 1.4381
Step 1300/1866, Average Loss: 1.4730
Step 1400/1866, Average Loss: 1.5093
Step 1500/1866, Average Loss: 1.4255
Step 1600/1866, Average Loss: 1.3903
Step 1700/1866, Average Loss: 1.3534
Step 1800/1866, Average Loss: 1.3712


  0%|          | 0/272 [00:00<?, ?it/s]

  0%|          | 0/5351 [00:00<?, ?it/s]

Epoch 1 - metrics: {'exact_match': 63.16576340870865, 'f1': 73.80180229828255}
Epoch 2/2...
Step 100/1866, Average Loss: 0.8438
Step 200/1866, Average Loss: 0.8188
Step 300/1866, Average Loss: 0.8208
Step 400/1866, Average Loss: 0.8036
Step 500/1866, Average Loss: 0.8217
Step 600/1866, Average Loss: 0.8092
Step 700/1866, Average Loss: 0.8248
Step 800/1866, Average Loss: 0.7925
Step 900/1866, Average Loss: 0.8392
Step 1000/1866, Average Loss: 0.8520
Step 1100/1866, Average Loss: 0.7753
Step 1200/1866, Average Loss: 0.7682
Step 1300/1866, Average Loss: 0.8117
Step 1400/1866, Average Loss: 0.8091
Step 1500/1866, Average Loss: 0.8282
Step 1600/1866, Average Loss: 0.7594
Step 1700/1866, Average Loss: 0.8602
Step 1800/1866, Average Loss: 0.8455


  0%|          | 0/272 [00:00<?, ?it/s]

  0%|          | 0/5351 [00:00<?, ?it/s]

Epoch 2 - metrics: {'exact_match': 63.98803961876285, 'f1': 74.1485122086755}
Evaluating model on Test Set...


  0%|          | 0/272 [00:00<?, ?it/s]

  0%|          | 0/5351 [00:00<?, ?it/s]

Evaluating model on Test V1 Set...


  0%|          | 0/272 [00:00<?, ?it/s]

  0%|          | 0/5351 [00:00<?, ?it/s]

Evaluating model on Test V2 Set...


  0%|          | 0/272 [00:00<?, ?it/s]

  0%|          | 0/5351 [00:00<?, ?it/s]

Test Set    (NO NOISE - 22.73% WER) - exact match: 63.98803961876285, F1 score: 74.1485122086755
Test V1 Set (V1 NOISE - 44.22% WER) - exact match: 40.34759857970473, F1 score: 55.20098517888367
Test V2 Set (V2 NOISE - 54.82% WER) - exact match: 28.518034012334144, F1 score: 42.237515977572336
