In [1]:
import requests
import json
import torch
import torch.nn as nn
import os
from tqdm import tqdm
from transformers import BertModel, BertTokenizerFast, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ExponentialLR
import matplotlib.pyplot as plt

  torch.utils._pytree._register_pytree_node(


In [2]:
def get_data(path): 
    
    with open(path, 'rb') as f:
        raw_data = json.load(f)
    
    contexts_train = []
    questions_train = []
    answers_train = []
    num_q_train = 0
    num_pos_train = 0
    num_imp_train = 0
    
    for group in raw_data['data']:
        for paragraph in group['paragraphs']:
            context_train = paragraph['context']
            for qa in paragraph['qas']:
                question_train = qa['question']
                num_q_train += 1
                for answer in qa['answers']:
                    contexts_train.append(context_train.lower())
                    questions_train.append(question_train.lower())
                    answers_train.append(answer)
    
    return num_q_train, num_pos_train, num_imp_train, contexts_train, questions_train, answers_train

In [3]:
num_of_questions_train = 0
num_of_possible_train = 0
num_of_impossible_train = 0
num_q_train, num_pos_train, num_imp_train, train_contexts_train, train_questions_train, train_answers_train = get_data('spoken_train-v1.1.json')
num_of_questions_train = num_q_train
num_of_possible_train = num_pos_train
num_of_impossible_train = num_imp_train

num_q_valid, num_pos_valid, num_imp_valid, valid_contexts_valid, valid_questions_valid, valid_answers_valid = get_data('spoken_test-v1.1.json')

In [4]:
def add_answer_at_end(answers_train, contexts_train):
    for answer, context in zip(answers_train, contexts_train):
        answer['text'] = answer['text'].lower()
        answer['answer_end'] = answer['answer_start'] + len(answer['text'])

add_answer_at_end(train_answers_train, train_contexts_train)
add_answer_at_end(valid_answers_valid, valid_contexts_valid)


MAX_LENGTH = 250
MODEL_PATH = "bert-base-uncased"

tokenizerFast = BertTokenizerFast.from_pretrained(MODEL_PATH)
train_encodings_fast = tokenizerFast(train_questions_train, train_contexts_train, max_length=MAX_LENGTH, truncation=True, padding=True)
valid_encodings_fast = tokenizerFast(valid_questions_valid, valid_contexts_valid, max_length=MAX_LENGTH, truncation=True, padding=True)



In [5]:
def return_Answer_startandend_train(idx):
    return_start = 0
    return_end = 0
    answer_encoding_fast = tokenizerFast(train_answers_train[idx]['text'], max_length=MAX_LENGTH, truncation=True, padding=True)
    
    for a in range(len(train_encodings_fast['input_ids'][idx]) - len(answer_encoding_fast['input_ids'])): 
        match = True
        for i in range(1, len(answer_encoding_fast['input_ids']) - 1):
            if answer_encoding_fast['input_ids'][i] != train_encodings_fast['input_ids'][idx][a + i]:
                match = False
                break
            if match:
                return_start = a + 1
                return_end = a + i + 1
                break
    return (return_start, return_end)

In [6]:
start_positions_train = []
end_positions_train = []
counter_train = 0

for t in range(len(train_encodings_fast['input_ids'])):
    s, e = return_Answer_startandend_train(t)
    start_positions_train.append(s)
    end_positions_train.append(e)
    
    if s == 0:
        counter_train += 1

train_encodings_fast.update({'start_positions': start_positions_train, 'end_positions': end_positions_train})
print(counter_train)

478


In [7]:
def return_answer_startend_valid(idx):
    return_start = 0
    return_end = 0
    answer_encoding_fast = tokenizerFast(valid_answers_valid[idx]['text'], max_length=MAX_LENGTH, truncation=True, padding=True)
    
    for a in range(len(valid_encodings_fast['input_ids'][idx]) - len(answer_encoding_fast['input_ids'])):
        match = True
        for i in range(1, len(answer_encoding_fast['input_ids']) - 1):
            if (answer_encoding_fast['input_ids'][i] != valid_encodings_fast['input_ids'][idx][a + i]):
                match = False
                break
            if match:
                return_start = a + 1
                return_end = a + i + 1
                break
    return(return_start, return_end)

In [8]:
start_positions = []
end_positions = []
counter = 0

for h in range(len(valid_encodings_fast['input_ids'])):
    s, e = return_answer_startend_valid(h)
    start_positions.append(s)
    end_positions.append(e)
    
    if s == 0:
        counter += 1

valid_encodings_fast.update({'start_positions': start_positions, 'end_positions': end_positions})
print(counter)

236


In [9]:
from transformers import BertForQuestionAnswering, BertTokenizerFast
import torch
from torch.utils.data import Dataset, DataLoader

class InputDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, i):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][i]),
            'token_type_ids': torch.tensor(self.encodings['token_type_ids'][i]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][i]),
            'start_positions': torch.tensor(self.encodings['start_positions'][i]),
            'end_positions': torch.tensor(self.encodings['end_positions'][i])
        }

    def __len__(self):
        return len(self.encodings['input_ids'])

# Assuming 'train_encodings_fast' and 'valid_encodings_fast' are already prepared with start and end positions
train_dataset = InputDataset(train_encodings_fast)
valid_dataset = InputDataset(valid_encodings_fast)

train_data_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_data_loader = DataLoader(valid_dataset, batch_size=1)

# Load the pre-trained BERT model for question answering
bert_model = BertForQuestionAnswering.from_pretrained(MODEL_PATH)  # MODEL_PATH = "bert-base-uncased"

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
class QAModel(nn.Module):
    def __init__(self):
        super(QAModel, self).__init__()
        self.bert = bert_model  # Pretrained Bert model
        self.drop_out = nn.Dropout(0.1)
        self.l1 = nn.Linear(768 * 2, 768 * 2)
        self.l2 = nn.Linear(768 * 2, 2)
        self.linear_relu_stack = nn.Sequential(
            self.drop_out,
            self.l1,
            nn.LeakyReLU(),
            self.l2
        )
        
    def forward(self, input_ids, attention_mask, token_type_ids):
        model_output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=True)
        hidden_states = model_output.hidden_states  # Extract hidden states
        out = torch.cat((hidden_states[-1], hidden_states[-3]), dim=-1)  # Concatenate last and second-to-last hidden states
        logits = self.linear_relu_stack(out)
        start_logits, end_logits = logits.split(1, dim=-1)  # Split logits for start and end
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits
model = QAModel()

In [11]:
def compute_loss(start_logits, end_logits, start_positions, end_positions):
    loss_func = nn.CrossEntropyLoss()
    start_loss_val = loss_func(start_logits, start_positions)
    end_loss_val = loss_func(end_logits, end_positions)
    avg_loss = (start_loss_val + end_loss_val) / 2
    return avg_loss

def compute_focal_loss(start_logits, end_logits, start_positions, end_positions, gamma):
    softmax = nn.Softmax(dim=1)
    start_probs = softmax(start_logits)
    inv_start_probs = 1 - start_probs
    end_probs = softmax(end_logits)
    inv_end_probs = 1 - end_probs
    log_softmax = nn.LogSoftmax(dim=1)
    start_log_probs = log_softmax(start_logits)
    end_log_probs = log_softmax(end_logits)
    
    negative_log_likelihood = nn.NLLLoss()
    
    focal_loss_start = negative_log_likelihood(torch.pow(inv_start_probs, gamma) * start_log_probs, start_positions)
    focal_loss_end = negative_log_likelihood(torch.pow(inv_end_probs, gamma) * end_log_probs, end_positions)
    
    return (focal_loss_start + focal_loss_end) / 2

optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=2e-2)
lr_scheduler = ExponentialLR(optimizer, gamma=0.9)

total_accuracy = []
total_train_loss = []




In [19]:
def train_one_epoch(model, dataloader, epoch):
    model = model.train()
    loss_values = []
    accuracy_values = []
    batch_counter = 0
    for batch in tqdm(dataloader, desc=f'Training Epoch {epoch}'):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass
        start_logits, end_logits = model(input_ids=input_ids, 
                                          attention_mask=attention_mask,
                                          token_type_ids=token_type_ids)

        # Calculate loss
        loss = compute_focal_loss(start_logits, end_logits, start_positions, end_positions, gamma=1)
        loss_values.append(loss.item())
        
        # Backward pass
        loss.backward()
        optimizer.step()

        # Predictions and accuracy calculation
        start_preds = torch.argmax(start_logits, dim=1)
        end_preds = torch.argmax(end_logits, dim=1)

        accuracy_values.append(((start_preds == start_positions).sum() / len(start_preds)).item())
        accuracy_values.append(((end_preds == end_positions).sum() / len(end_preds)).item())

        batch_counter += 1
        if batch_counter == 250 and epoch == 1:
            avg_acc = sum(accuracy_values) / len(accuracy_values)
            total_accuracy.append(avg_acc)
            avg_loss = sum(loss_values) / len(loss_values)
            total_train_loss.append(avg_loss)
            batch_counter = 0
    
    lr_scheduler.step()
    avg_accuracy = sum(accuracy_values) / len(accuracy_values)
    avg_loss = sum(loss_values) / len(loss_values)

    return avg_accuracy, avg_loss

In [13]:
def evaluate_on_model(model, dataloader):
    model = model.eval()
    loss_values = []
    accuracy_values = []
    batch_counter = 0
    predicted_answers = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluating the Model'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            start_positions_true = batch['start_positions'].to(device)
            end_positions_true = batch['end_positions'].to(device)

            start_logits, end_logits = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            
            start_preds = torch.argmax(start_logits, dim=1)
            end_preds = torch.argmax(end_logits, dim=1)
            
            predicted_answer = tokenizerFast.convert_tokens_to_string(tokenizerFast.convert_ids_to_tokens(input_ids[0][start_preds:end_preds]))
            true_answer = tokenizerFast.convert_tokens_to_string(tokenizerFast.convert_ids_to_tokens(input_ids[0][start_positions_true[0]:end_positions_true[0]]))
            
            predicted_answers.append([predicted_answer, true_answer])
    
    return predicted_answers

In [14]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Defaulting to user installation because normal site-packages is not writeable


In [15]:
pip install --upgrade huggingface_hub

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [16]:
pip install --upgrade datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [17]:
!pip install jiwer

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Defaulting to user installation because normal site-packages is not writeable


In [21]:
from evaluate import load
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

wer_metric = load("wer")  # Renamed 'wer' to 'wer_metric'
EPOCH_COUNT = 1  # Renamed 'EPOCHS' to 'EPOCH_COUNT'
model.to(device)

wer_scores = []  # Renamed 'wer_list' to 'wer_scores'

for epoch_num in range(EPOCH_COUNT):  # Renamed 'epoch' to 'epoch_num'
    # Training the model
    train_accuracy, train_loss = train_one_epoch(model, train_data_loader, epoch_num + 1)  # Renamed variables
    print(f"Train Accuracy: {train_accuracy}      Train Loss: {train_loss}")
    
    # Evaluating the model
    answer_pairs = evaluate_on_model(model, valid_data_loader)  # Renamed 'answer_list' to 'answer_pairs'
    predicted_answers = []  # Renamed 'pred_answers' to 'predicted_answers'
    true_answers = []  # Renamed 'true_answers' to 'true_answers'

    for i in range(len(answer_pairs)):
        # Handling empty answers
        if len(answer_pairs[i][0]) == 0:
            answer_pairs[i][0] = "$"
        if len(answer_pairs[i][1]) == 0:
            answer_pairs[i][1] = "$"
        
        predicted_answers.append(answer_pairs[i][0])
        true_answers.append(answer_pairs[i][1])

    # Calculate WER
    wer_score = wer_metric.compute(predictions=predicted_answers, references=true_answers)  # Renamed 'wer' to 'wer_metric'
    wer_scores.append(wer_score)

# Print WER scores after the loop
print(wer_scores)

# Example Tokenization
tokens = tokenizerFast.tokenize("This is a sentence.")
print(tokens)

output = tokenizerFast.convert_tokens_to_string(tokens)
print(output)


Training Epoch 1: 100%|██████████| 2320/2320 [03:57<00:00,  9.77it/s]


Train Accuracy: 0.7350658097400747      Train Loss: 0.680211986546758


Evaluating the Model: 100%|██████████| 15875/15875 [00:59<00:00, 266.90it/s]


[2.756976377952756]
['this', 'is', 'a', 'sentence', '.']
this is a sentence.
