In [4]:
import torch
import json
from tqdm.auto import tqdm

# Define the function to read the SQuAD dataset
def read_squad(path):
    # Open JSON file and load into dictionary
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    # Initialize lists for contexts, questions, and answers
    contexts = []
    questions = []
    answers = []

    # Iterate through all data in squad data
    for group in tqdm(squad_dict['data'], desc=f'Reading {path}'):
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                # Check if we need to be extracting from 'answers' or 'plausible_answers'
                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa[access]:
                    # Append data to lists
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    # Return formatted data lists
    return contexts, questions, answers

# Execute our read SQuAD function for training and validation sets
train_contexts, train_questions, train_answers = read_squad('dataset/spoken_train-v1.1.json')
val_contexts, val_questions, val_answers = read_squad('dataset/spoken_test-v1.1.json')

# Define the function to add end indices to answers
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        else:
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n

# Add end indices to answers for training and validation sets
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

# Load the DistilBert tokenizer
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Define the function to add token positions to encodings
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        shift = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
            shift += 1
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Tokenize the training and validation sets
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

# Add token positions to encodings for training and validation sets
for encodings, answers in zip([train_encodings, val_encodings], [train_answers, val_answers]):
    add_token_positions(encodings,


In [6]:
# function to add answer end index to the answer dictionary
def add_end_index(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)
        # check if the answer text matches the context text from start index to end index
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        else:
            # check if the answer text matches the context text with an offset of 1 or 2
            for offset in [1, 2]:
                if context[start_idx-offset:end_idx-offset] == gold_text:
                    answer['answer_start'] = start_idx - offset
                    answer['answer_end'] = end_idx - offset           

# call the function to add answer end index to the train and validation answer dictionaries
add_end_index(train_answers, train_contexts)
add_end_index(val_answers, val_contexts)

from transformers import DistilBertTokenizerFast
# initialize the DistilBertTokenizerFast from transformers library
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# encode the train and validation contexts and questions using the tokenizer with truncation and padding
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)


In [15]:
# function to add answer end index to answer dictionary
def add_end_index(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        else:
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n           

# add answer end index to train and validation answer dictionaries
add_end_index(train_answers, train_contexts)
add_end_index(val_answers, val_contexts)

# create DistilBERT tokenizer
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# tokenize the context and question pairs, and pad/truncate to the maximum length
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

# function to add start and end token positions to encodings dictionary
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        shift = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
            shift += 1
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# add start and end token positions to train and validation encodings dictionaries
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)


In [17]:
# Define a custom PyTorch dataset for the SQuAD dataset
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        # Initialize the encodings attribute with the input encodings
        self.encodings = encodings   
    def __getitem__(self, idx):
        # Get the tensor values for each encoding key and index
        tensor_values = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return tensor_values    
    def __len__(self):
        # Return the number of input IDs in the encodings
        return len(self.encodings.input_ids)
# Create custom PyTorch datasets for both the training and validation sets
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)
# Load the DistilBertForQuestionAnswering model from the Hugging Face Transformers library
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [None]:
##Data Preprocessing 

In [28]:
# Import required libraries
from transformers import BertForQuestionAnswering
from transformers import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm

# Detect device and move model to device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
# Set model to training mode
model.train()
# Initialize Adam optimizer with weight decay to reduce overfitting
optimizer = AdamW(model.parameters(), lr=2e-6)
# Initialize DataLoader for training data
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Train the model for 10 epochs
for epoch in range(10):
    # Set model to training mode
    model.train()
    # Initialize tqdm loop for training data
    loop = tqdm(train_loader, leave=True)
    # Loop through batches in training data
    for batch in loop:
        # Zero the gradients for this batch
        optimizer.zero_grad()
        # Load batch tensors onto device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)        
        # Pass batch through model and calculate loss
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        # Extract loss from model outputs
        loss = outputs[0]       
        # Backpropagate loss and update model parameters
        loss.backward()
        optimizer.step()        
        # Update tqdm loop with loss value
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 2320/2320 [14:21<00:00,  2.69it/s, loss=4.43]
Epoch 1: 100%|██████████| 2320/2320 [14:12<00:00,  2.72it/s, loss=2.29]
Epoch 2: 100%|██████████| 2320/2320 [14:19<00:00,  2.70it/s, loss=2.22]
Epoch 3: 100%|██████████| 2320/2320 [14:26<00:00,  2.68it/s, loss=1.78]
Epoch 4: 100%|██████████| 2320/2320 [14:32<00:00,  2.66it/s, loss=3.32] 
Epoch 5: 100%|██████████| 2320/2320 [14:37<00:00,  2.64it/s, loss=2.3]  
Epoch 6: 100%|██████████| 2320/2320 [14:30<00:00,  2.67it/s, loss=0.805]
Epoch 7: 100%|██████████| 2320/2320 [14:45<00:00,  2.62it/s, loss=1.37] 
Epoch 8: 100%|██████████| 2320/2320 [14:51<00:00,  2.60it/s, loss=1.74] 
Epoch 9: 100%|██████████| 2320/2320 [14:40<00:00,  2.64it/s, loss=1.3]  


In [32]:
# switch model out of training mode
# code to calculate accuracy, precision, recall, and F1 score
import numpy as np

model.eval()
val_loader = DataLoader(val_dataset, batch_size=16)
true_starts = []
true_ends = []
pred_starts = []
pred_ends = []

# loop through batches
for batch in tqdm(val_loader):
    # pull batched items from loader
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_true = batch['start_positions'].to(device)
    end_true = batch['end_positions'].to(device)
    # make predictions
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    # pull preds out
    start_pred = torch.argmax(outputs['start_logits'], dim=1)
    end_pred = torch.argmax(outputs['end_logits'], dim=1)
    # append predictions and true values to lists
    true_starts.append(start_true.cpu().numpy())
    true_ends.append(end_true.cpu().numpy())
    pred_starts.append(start_pred.cpu().numpy())
    pred_ends.append(end_pred.cpu().numpy())

# concatenate lists to numpy arrays
true_starts_np = np.concatenate(true_starts)
true_ends_np = np.concatenate(true_ends)
pred_starts_np = np.concatenate(pred_starts)
pred_ends_np = np.concatenate(pred_ends)

# calculate true positives, false positives, and false negatives
true_pos_starts = np.sum(np.logical_and(true_starts_np == pred_starts_np, true_starts_np != -1))
true_pos_ends = np.sum(np.logical_and(true_ends_np == pred_ends_np, true_ends_np != -1))
false_pos_starts = np.sum(np.logical_and(true_starts_np != pred_starts_np, pred_starts_np != -1))
false_pos_ends = np.sum(np.logical_and(true_ends_np != pred_ends_np, pred_ends_np != -1))
false_neg_starts = np.sum(np.logical_and(true_starts_np != pred_starts_np, true_starts_np != -1))
false_neg_ends = np.sum(np.logical_and(true_ends_np != pred_ends_np, true_ends_np != -1))

# calculate precision and recall
precision_starts = true_pos_starts / (true_pos_starts + false_pos_starts + 1e-9)
recall_starts = true_pos_starts / (true_pos_starts + false_neg_starts + 1e-9)
precision_ends = true_pos_ends / (true_pos_ends + false_pos_ends + 1e-9)
recall_ends = true_pos_ends / (true_pos_ends + false_neg_ends + 1e-9)

# calculate F1 score
f1_starts = 2 * (precision_starts * recall_starts) / (precision_starts + recall_starts + 1e-9)
f1_ends = 2 * (precision_ends * recall_ends) / (precision_ends + recall_ends + 1e-9)
f1 = (f1_starts + f1_ends) / 2

print("F1 score: {:.4f}".format(f1))

# print prediction, start and end positions
print("PREDICTION\tSTART\tEND\n")

for i in range(len(start_true)):
    print(f"{i}\t{start_true[i]}\t{end_true[i]}\n"
          f"{i}\t{start_pred[i]}\t{end_pred[i]}\n")




100%|██████████| 993/993 [02:03<00:00,  8.07it/s]

F1 score: 0.5378
PREDICTION	START	END

0	59	59
0	52	45

1	59	60
1	52	45

2	59	59
2	52	45






In [None]:
#pre processing ends

In [None]:
#Model finetuning

In [40]:
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from accelerate import Accelerator

# check if GPU is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# move model to device
model.to(device)
# set model to training mode
model.train()

# initialize optimizer
optim = AdamW(model.parameters(), lr=2e-6)

# initialize data loader for training data
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# calculate number of training steps
num_training_steps = len(train_loader) * 30

# initialize scheduler
scheduler = get_linear_schedule_with_warmup(
    optim, num_warmup_steps=0, num_training_steps=num_training_steps
)

# initialize accelerator to speed up training
accelerator = Accelerator()
# prepare model, optimizer, and scheduler for training
model, optimizer, training_dataloader, scheduler = accelerator.prepare(model, optim, train_loader, scheduler)

# loop over training data for multiple epochs
for epoch in range(5):
    # set model to training mode
    model.train()
    # setup loop (we use tqdm for the progress bar)
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # train model on batch and return outputs (incl. loss)
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        # extract loss
        loss = outputs[0]
        # backpropagate gradients using the accelerator
        accelerator.backward(loss)
        # update model parameters
        optim.step()
        # update learning rate using the scheduler
        scheduler.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item(), lr=optim.param_groups[0]['lr'])
        
        # print every 500 steps
        if (loop.n % 500 == 0):
            print(f"Batch {loop.n}/{len(train_loader)} - Loss: {loss.item()}, Learning Rate: {optim.param_groups[0]['lr']}")



Epoch 0:   0%|          | 1/2320 [00:00<22:37,  1.71it/s, loss=1.08, lr=2e-6]

Batch 0/2320 - Loss: 1.0806083679199219, Learning Rate: 1.999971264367816e-06


Epoch 0:  22%|██▏       | 501/2320 [03:11<11:38,  2.60it/s, loss=1.36, lr=1.99e-6] 

Batch 500/2320 - Loss: 1.3576281070709229, Learning Rate: 1.985603448275862e-06


Epoch 0:  43%|████▎     | 1001/2320 [06:23<08:01,  2.74it/s, loss=0.882, lr=1.97e-6]

Batch 1000/2320 - Loss: 0.8818328380584717, Learning Rate: 1.9712356321839077e-06


Epoch 0:  65%|██████▍   | 1501/2320 [09:27<04:59,  2.74it/s, loss=1.24, lr=1.96e-6] 

Batch 1500/2320 - Loss: 1.2447423934936523, Learning Rate: 1.956867816091954e-06


Epoch 0:  86%|████████▋ | 2001/2320 [12:39<01:58,  2.70it/s, loss=1.57, lr=1.94e-6] 

Batch 2000/2320 - Loss: 1.5740764141082764, Learning Rate: 1.9424999999999996e-06


Epoch 0: 100%|██████████| 2320/2320 [14:38<00:00,  2.64it/s, loss=0.61, lr=1.93e-6] 
Epoch 1:   0%|          | 1/2320 [00:00<14:05,  2.74it/s, loss=0.936, lr=1.93e-6]

Batch 0/2320 - Loss: 0.9358725547790527, Learning Rate: 1.933304597701149e-06


Epoch 1:  22%|██▏       | 501/2320 [03:10<12:20,  2.46it/s, loss=0.799, lr=1.92e-6]

Batch 500/2320 - Loss: 0.7986644506454468, Learning Rate: 1.9189367816091953e-06


Epoch 1:  43%|████▎     | 1001/2320 [06:20<08:09,  2.70it/s, loss=1.44, lr=1.9e-6] 

Batch 1000/2320 - Loss: 1.4412868022918701, Learning Rate: 1.9045689655172412e-06


Epoch 1:  65%|██████▍   | 1501/2320 [09:31<05:23,  2.53it/s, loss=1.21, lr=1.89e-6] 

Batch 1500/2320 - Loss: 1.2114384174346924, Learning Rate: 1.8902011494252872e-06


Epoch 1:  86%|████████▋ | 2001/2320 [12:40<01:56,  2.73it/s, loss=1.09, lr=1.88e-6] 

Batch 2000/2320 - Loss: 1.0933077335357666, Learning Rate: 1.875833333333333e-06


Epoch 1: 100%|██████████| 2320/2320 [14:41<00:00,  2.63it/s, loss=1.26, lr=1.87e-6] 
Epoch 2:   0%|          | 1/2320 [00:00<14:37,  2.64it/s, loss=0.977, lr=1.87e-6]

Batch 0/2320 - Loss: 0.9769284725189209, Learning Rate: 1.8666379310344826e-06


Epoch 2:  22%|██▏       | 501/2320 [03:10<13:18,  2.28it/s, loss=0.739, lr=1.85e-6]

Batch 500/2320 - Loss: 0.7390005588531494, Learning Rate: 1.8522701149425285e-06


Epoch 2:  43%|████▎     | 1001/2320 [06:19<08:08,  2.70it/s, loss=1.11, lr=1.84e-6] 

Batch 1000/2320 - Loss: 1.1057175397872925, Learning Rate: 1.8379022988505745e-06


Epoch 2:  65%|██████▍   | 1501/2320 [09:25<05:04,  2.69it/s, loss=1, lr=1.82e-6]    

Batch 1500/2320 - Loss: 1.003229022026062, Learning Rate: 1.8235344827586206e-06


Epoch 2:  86%|████████▋ | 2001/2320 [12:28<01:55,  2.75it/s, loss=1.2, lr=1.81e-6]  

Batch 2000/2320 - Loss: 1.2010018825531006, Learning Rate: 1.8091666666666666e-06


Epoch 2: 100%|██████████| 2320/2320 [14:24<00:00,  2.68it/s, loss=0.792, lr=1.8e-6] 
Epoch 3:   0%|          | 1/2320 [00:00<14:03,  2.75it/s, loss=0.936, lr=1.8e-6]

Batch 0/2320 - Loss: 0.9359486699104309, Learning Rate: 1.799971264367816e-06


Epoch 3:  22%|██▏       | 501/2320 [03:13<12:56,  2.34it/s, loss=0.382, lr=1.79e-6]

Batch 500/2320 - Loss: 0.38213270902633667, Learning Rate: 1.785603448275862e-06


Epoch 3:  43%|████▎     | 1001/2320 [06:21<07:59,  2.75it/s, loss=1.66, lr=1.77e-6]

Batch 1000/2320 - Loss: 1.6595628261566162, Learning Rate: 1.771235632183908e-06


Epoch 3:  65%|██████▍   | 1501/2320 [09:34<04:59,  2.73it/s, loss=0.722, lr=1.76e-6]

Batch 1500/2320 - Loss: 0.7220509052276611, Learning Rate: 1.756867816091954e-06


Epoch 3:  86%|████████▋ | 2001/2320 [12:44<02:04,  2.55it/s, loss=1.27, lr=1.74e-6] 

Batch 2000/2320 - Loss: 1.2746026515960693, Learning Rate: 1.7424999999999998e-06


Epoch 3: 100%|██████████| 2320/2320 [14:44<00:00,  2.62it/s, loss=0.936, lr=1.73e-6]
Epoch 4:   0%|          | 1/2320 [00:00<14:25,  2.68it/s, loss=0.998, lr=1.73e-6]

Batch 0/2320 - Loss: 0.9981088638305664, Learning Rate: 1.7333045977011494e-06


Epoch 4:  22%|██▏       | 501/2320 [03:04<11:23,  2.66it/s, loss=1.16, lr=1.72e-6] 

Batch 500/2320 - Loss: 1.1577684879302979, Learning Rate: 1.7189367816091953e-06


Epoch 4:  43%|████▎     | 1001/2320 [06:11<08:03,  2.73it/s, loss=0.847, lr=1.7e-6]

Batch 1000/2320 - Loss: 0.8470999598503113, Learning Rate: 1.7045689655172412e-06


Epoch 4:  65%|██████▍   | 1501/2320 [09:27<05:18,  2.57it/s, loss=1.02, lr=1.69e-6] 

Batch 1500/2320 - Loss: 1.018200159072876, Learning Rate: 1.6902011494252872e-06


Epoch 4:  86%|████████▋ | 2001/2320 [12:33<01:55,  2.76it/s, loss=1.13, lr=1.68e-6] 

Batch 2000/2320 - Loss: 1.134835124015808, Learning Rate: 1.6758333333333333e-06


Epoch 4: 100%|██████████| 2320/2320 [14:31<00:00,  2.66it/s, loss=1.11, lr=1.67e-6] 


In [69]:
from itertools import groupby
from collections import Counter

def normalize_text(text):
    """Lowercase text, remove articles, punctuation and extra whitespace."""
    def remove_articles(text):
        return " ".join([word for word in text.split() if word.lower() not in ["a", "an", "the"]])
    def remove_punctuation(text):
        return "".join([char for char in text if char.isalpha() or char.isspace()])
    def lowercase(text):
        return text.lower()
    def fix_whitespace(text):
        return " ".join(list(filter(None, ["".join(list(group)) for key, group in groupby(text.split())])))
    return fix_whitespace(remove_articles(remove_punctuation(lowercase(text))))

def exact_match(prediction, ground_truth):
    """Check if two strings are exactly the same after normalization."""
    return normalize_text(prediction) == normalize_text(ground_truth)

def max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Find the maximum metric score over multiple ground truth strings."""
    scores = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores.append(score)
    return max(scores) if scores else 0

def f1_score(prediction, ground_truth):
    """Calculate F1 score between two strings after normalization."""
    prediction_tokens = normalize_text(prediction).split()
    ground_truth_tokens = normalize_text(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def evaluate(answers, predictions):
    """Evaluate model performance using F1 score and exact match accuracy."""
    f1 = em = total = 0

    for ground_truths, prediction in zip(answers, predictions):
        total += 1
        f1 += max_over_ground_truths(
            f1_score, prediction, [ground_truths])
    
    f1 = 100.0 * f1 / total

    return {"F1_score": f1}

# Example usage:
answers = [["the cat in the hat", "a cat in a hat"], ["the quick brown fox jumps over the lazy dog"]]
predictions = ["a cat wearing a hat", "the quick brown fox jumps over the lazy dog"]

results = evaluate(answers, predictions)
print("F1 = ", results["F1_score"])


f1 = 0.5438
