In [1]:
import json
import os
import torch
import torch.nn as nn
import requests
import torch
from torch import nn
from torch.cuda.amp import GradScaler, autocast
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.optim.lr_scheduler import ExponentialLR
from transformers import BertTokenizerFast, BertModel, AdamW
from torch.utils.data import Dataset, DataLoader

# Setting the device to GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


  torch.utils._pytree._register_pytree_node(


In [2]:
def load_data(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    
    contexts, questions, answers = [], [], []
    total_questions, possible_answers, impossible_answers = 0, 0, 0

    for entry in data['data']:
        for paragraph in entry['paragraphs']:
            text = paragraph['context']
            for qa_pair in paragraph['qas']:
                q_text = qa_pair['question']
                total_questions += 1
                for ans in qa_pair['answers']:
                    contexts.append(text.lower())
                    questions.append(q_text.lower())
                    answers.append(ans)
                    possible_answers += 1  # Assuming this refers to answer count

    return total_questions, possible_answers, impossible_answers, contexts, questions, answers


In [3]:
# Loading training data
total_train_q, train_pos_ans, train_imp_ans, train_contexts, train_questions, train_answers = load_data(
    '../spoken_train-v1.1.json'
)

# Loading validation data
total_val_q, val_pos_ans, val_imp_ans, val_contexts, val_questions, val_answers = load_data(
    '../spoken_test-v1.1.json'
)


In [4]:
def append_answer_end(answers, contexts):
    for ans, ctx in zip(answers, contexts):
        ans['text'] = ans['text'].lower()
        ans['answer_end'] = ans['answer_start'] + len(ans['text'])  # Using 'answer_start' as the starting position key

# Adding end positions to answers in training and validation data
append_answer_end(train_answers, train_contexts)
append_answer_end(val_answers, val_contexts)


In [5]:
# Set maximum sequence length and document stride for handling lengthy contexts
MAX_SEQ_LEN = 512
BERT_MODEL = "bert-base-uncased"
doc_stride_value = 128

# Initialize tokenizer
tokenizer = BertTokenizerFast.from_pretrained(BERT_MODEL)

# Tokenize and encode training data with stride and padding
train_encoded = tokenizer(
    train_questions,
    train_contexts,
    max_length=MAX_SEQ_LEN,
    truncation=True,
    stride=doc_stride_value,
    padding=True
)

# Tokenize and encode validation data with the same settings
val_encoded = tokenizer(
    val_questions,
    val_contexts,
    max_length=MAX_SEQ_LEN,
    truncation=True,
    stride=doc_stride_value,
    padding=True
)


In [6]:
def find_answer_positions_train(index):
    start_idx, end_idx = 0, 0
    answer_tokens = tokenizer(train_answers[index]['text'], max_length=MAX_SEQ_LEN, truncation=True, padding=True)

    for position in range(len(train_encoded['input_ids'][index]) - len(answer_tokens['input_ids'])):
        is_match = True
        for i in range(1, len(answer_tokens['input_ids']) - 1):
            if answer_tokens['input_ids'][i] != train_encoded['input_ids'][index][position + i]:
                is_match = False
                break
        if is_match:
            start_idx = position + 1
            end_idx = start_idx + len(answer_tokens['input_ids']) - 2
            break

    return start_idx, end_idx

# Ensure positions are on the GPU when used
start_positions_train = []
end_positions_train = []
for idx in range(len(train_encoded['input_ids'])):
    start_pos, end_pos = find_answer_positions_train(idx)
    start_positions_train.append(start_pos)
    end_positions_train.append(end_pos)


In [7]:
# Update the encoded training data with start and end positions
train_encoded.update({
    'start_positions': torch.tensor(start_positions_train).to(device),
    'end_positions': torch.tensor(end_positions_train).to(device)
})


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [8]:
def find_answer_positions_valid(index):
    start_idx, end_idx = 0, 0
    answer_tokens = tokenizer(val_answers[index]['text'], max_length=MAX_SEQ_LEN, truncation=True, padding=True)

    for position in range(len(val_encoded['input_ids'][index]) - len(answer_tokens['input_ids'])):
        is_match = True
        for i in range(1, len(answer_tokens['input_ids']) - 1):
            if answer_tokens['input_ids'][i] != val_encoded['input_ids'][index][position + i]:
                is_match = False
                break
        if is_match:
            start_idx = position + 1
            end_idx = start_idx + len(answer_tokens['input_ids']) - 2
            break

    return start_idx, end_idx

# Generate start and end positions for validation data, ensuring they are on the GPU
start_positions_valid = []
end_positions_valid = []
for idx in range(len(val_encoded['input_ids'])):
    start_pos, end_pos = find_answer_positions_valid(idx)
    start_positions_valid.append(start_pos)
    end_positions_valid.append(end_pos)


In [9]:
# Update the encoded validation data with start and end positions
val_encoded.update({
    'start_positions': torch.tensor(start_positions_valid).to(device),
    'end_positions': torch.tensor(end_positions_valid).to(device)
})


In [10]:
class QAInputDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]).to(device),
            'token_type_ids': torch.tensor(self.encodings['token_type_ids'][idx]).to(device),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]).to(device),
            'start_positions': torch.tensor(self.encodings['start_positions'][idx]).to(device),
            'end_positions': torch.tensor(self.encodings['end_positions'][idx]).to(device)
        }

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create datasets for training and validation
train_dataset = QAInputDataset(train_encoded)
val_dataset = QAInputDataset(val_encoded)


In [11]:
# Create data loaders for batching the datasets
train_data_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=1)


In [12]:
class QuestionAnsweringModel(nn.Module):
    def __init__(self):
        super(QuestionAnsweringModel, self).__init__()
        self.bert = BertModel.from_pretrained(BERT_MODEL)
        self.dropout_layer = nn.Dropout(0.1)
        self.fc1 = nn.Linear(768 * 2, 768 * 2)
        self.fc2 = nn.Linear(768 * 2, 2)
        self.model_layers = nn.Sequential(
            self.dropout_layer,
            self.fc1,
            nn.LeakyReLU(),
            self.fc2
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        bert_output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=True)
        hidden_states = bert_output[2]
        concatenated_outputs = torch.cat((hidden_states[-1], hidden_states[-3]), dim=-1)
        logits = self.model_layers(concatenated_outputs)

        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

# Initialize the model
model = QuestionAnsweringModel().to(device)


In [13]:
def focal_loss(start_logits, end_logits, start_positions, end_positions, gamma):
    softmax = nn.Softmax(dim=1)
    start_probs = softmax(start_logits)
    inv_start_probs = 1 - start_probs
    end_probs = softmax(end_logits)
    inv_end_probs = 1 - end_probs

    log_softmax = nn.LogSoftmax(dim=1)
    log_probs_start = log_softmax(start_logits)
    log_probs_end = log_softmax(end_logits)

    nll_loss = nn.NLLLoss()

    focal_loss_start = nll_loss(torch.pow(inv_start_probs, gamma) * log_probs_start, start_positions)
    focal_loss_end = nll_loss(torch.pow(inv_end_probs, gamma) * log_probs_end, end_positions)

    return (focal_loss_start + focal_loss_end) / 2


In [14]:
def postprocess_predictions(start_predictions, end_predictions):
    for i in range(len(start_predictions)):
        if end_predictions[i] < start_predictions[i]:
            end_predictions[i] = start_predictions[i]  # Ensure valid span
    return start_predictions, end_predictions


In [15]:
# Initialize optimizer
EPOCHS=6
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=2e-2)

# Set up the linear learning rate scheduler
scheduler = LinearLR(optimizer, start_factor=1.0, end_factor=0.0, total_iters=EPOCHS)




In [16]:
def train_model_epoch(model, dataloader, epoch_number, accumulation_steps=4):
    model.train()
    loss_values = []
    accuracy_values = []
    scaler = GradScaler()  # Initialize the GradScaler for AMP
    batch_counter = 0

    for batch_index, batch in enumerate(tqdm(dataloader, desc='Training')):  # Updated description
        optimizer.zero_grad()  # Reset gradients at the start of each batch
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        with autocast():  # Enable mixed precision
            start_output, end_output = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

            loss = focal_loss(start_output, end_output, start_positions, end_positions, gamma=1)
            loss_values.append(loss.item())
            scaler.scale(loss).backward()  # Backpropagate the scaled loss

        # Perform optimizer step only after accumulating gradients
        if (batch_index + 1) % accumulation_steps == 0:
            scaler.step(optimizer)  # Update weights
            scaler.update()  # Update the scaler for the next iteration
            optimizer.zero_grad()  # Reset gradients for the next accumulation

        start_predictions = torch.argmax(start_output, dim=1)
        end_predictions = torch.argmax(end_output, dim=1)

        accuracy_values.append(((start_predictions == start_positions).sum() / len(start_predictions)).item())
        accuracy_values.append(((end_predictions == end_positions).sum() / len(end_predictions)).item())

        batch_counter += 1
        if batch_counter == 250 and epoch_number == 1:
            avg_accuracy = sum(accuracy_values) / len(accuracy_values)
            print(f'Average Accuracy after {batch_counter} batches: {avg_accuracy}')

    # Final step in case there are remaining gradients not updated
    if len(dataloader) % accumulation_steps != 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()  # Ensure gradients are reset after the last step

    average_accuracy = sum(accuracy_values) / len(accuracy_values)
    average_loss = sum(loss_values) / len(loss_values)
    return average_accuracy, average_loss


In [17]:
def evaluate_model(model, dataloader):
    model.eval()  # Set the model to evaluation mode
    answer_pairs = []

    with torch.no_grad():  # Disable gradient calculation
        for batch in tqdm(dataloader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            start_true = batch['start_positions'].to(device)
            end_true = batch['end_positions'].to(device)

            # Forward pass
            start_output, end_output = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

            start_predictions = torch.argmax(start_output, dim=1)
            end_predictions = torch.argmax(end_output, dim=1)

            # Postprocess predictions to ensure valid spans
            for i in range(len(start_predictions)):
                if end_predictions[i] < start_predictions[i]:
                    end_predictions[i] = start_predictions[i]  # Ensure valid end index

            # Extract answers for each sample in the batch
            for i in range(input_ids.size(0)):  # Loop over each sample in the batch
                predicted_answer = tokenizer.convert_tokens_to_string(
                    tokenizer.convert_ids_to_tokens(input_ids[i][start_predictions[i]:end_predictions[i]])
                )
                true_answer = tokenizer.convert_tokens_to_string(
                    tokenizer.convert_ids_to_tokens(input_ids[i][start_true[i]:end_true[i]])
                )
                answer_pairs.append([predicted_answer, true_answer])

    return answer_pairs


In [18]:
# import torch.multiprocessing as mp

# # Set the multiprocessing start method to 'spawn'
# mp.set_start_method('spawn', force=True)
from sklearn.metrics import f1_score  # Import the f1_score function

def evaluate_model(model, dataloader):
    model.eval()  # Set the model to evaluation mode
    all_start_predictions = []
    all_end_predictions = []
    all_start_true = []
    all_end_true = []

    with torch.no_grad():  # Disable gradient calculation
        for batch in tqdm(dataloader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            start_true = batch['start_positions'].to(device)
            end_true = batch['end_positions'].to(device)

            # Forward pass
            start_output, end_output = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

            start_predictions = torch.argmax(start_output, dim=1)
            end_predictions = torch.argmax(end_output, dim=1)

            # Postprocess predictions to ensure valid spans
            for i in range(len(start_predictions)):
                if end_predictions[i] < start_predictions[i]:
                    end_predictions[i] = start_predictions[i]  # Ensure valid end index

            # Store predictions and true values for F1 calculation
            all_start_predictions.extend(start_predictions.cpu().numpy())
            all_end_predictions.extend(end_predictions.cpu().numpy())
            all_start_true.extend(start_true.cpu().numpy())
            all_end_true.extend(end_true.cpu().numpy())

    # Calculate F1 score
    f1_start = f1_score(all_start_true, all_start_predictions, average='weighted')
    f1_end = f1_score(all_end_true, all_end_predictions, average='weighted')

    print(f"F1 Score (Start): {f1_start}")
    print(f"F1 Score (End): {f1_end}")

    return all_start_predictions, all_end_predictions


In [20]:
from jiwer import wer 

def calculate_wer(predictions, references):
    """Calculate the Word Error Rate between predictions and references."""
    return wer(references, predictions)

# Assuming EPOCHS is already defined
EPOCHS = 6
model.to(device)  # Move model to the specified device
wer_scores = []

for epoch in range(EPOCHS):
    print(f'Epoch - {epoch + 1}')
    
    # Training phase
    train_accuracy, train_loss = train_model_epoch(model, train_data_loader, epoch + 1)
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Training Loss: {train_loss:.4f}")

    # Evaluation phase
    answer_list = evaluate_model(model, val_data_loader)
    predicted_answers = []
    true_answers = []

    for ans_pair in answer_list:
        # Safely check for the predicted and true answers
        predicted = ans_pair[0] if isinstance(ans_pair[0], str) and len(ans_pair[0]) > 0 else "$"
        true = ans_pair[1] if isinstance(ans_pair[1], str) and len(ans_pair[1]) > 0 else "$"

        # If the answer is an integer, convert it to a string or handle accordingly
        if isinstance(ans_pair[0], int):
            predicted = str(ans_pair[0])
        if isinstance(ans_pair[1], int):
            true = str(ans_pair[1])

        predicted_answers.append(predicted)
        true_answers.append(true)

    # Ensure all answers are non-empty
    predicted_answers = [ans if ans else "$" for ans in predicted_answers]
    true_answers = [ans if ans else "$" for ans in true_answers]

    # Calculate WER score
    wer_score = calculate_wer(predicted_answers, true_answers)
    print(f"WER Score for Epoch {epoch + 1}: {wer_score:.4f}")
    wer_scores.append(wer_score)

# Final output of WER scores
print('WER scores (after adding document stride):', wer_scores)


  scaler = GradScaler()  # Initialize the GradScaler for AMP


Epoch - 1


  'start_positions': torch.tensor(self.encodings['start_positions'][idx]).to(device),
  'end_positions': torch.tensor(self.encodings['end_positions'][idx]).to(device)
  with autocast():  # Enable mixed precision
Training:   5%|▌         | 250/4639 [00:25<07:30,  9.73it/s]

Average Accuracy after 250 batches: 0.552


Training: 100%|██████████| 4639/4639 [08:00<00:00,  9.65it/s]


Training Accuracy: 0.5601
Training Loss: 1.3632


Evaluating: 100%|██████████| 15875/15875 [03:39<00:00, 72.45it/s]
  scaler = GradScaler()  # Initialize the GradScaler for AMP


F1 Score (Start): 0.5320981055960244
F1 Score (End): 0.5582406252431552
WER Score for Epoch 1: 0.0000
Epoch - 2


  'start_positions': torch.tensor(self.encodings['start_positions'][idx]).to(device),
  'end_positions': torch.tensor(self.encodings['end_positions'][idx]).to(device)
  with autocast():  # Enable mixed precision
Training: 100%|██████████| 4639/4639 [08:00<00:00,  9.65it/s]


Training Accuracy: 0.6156
Training Loss: 1.1343


Evaluating: 100%|██████████| 15875/15875 [03:39<00:00, 72.44it/s]
  scaler = GradScaler()  # Initialize the GradScaler for AMP


F1 Score (Start): 0.5438432894585569
F1 Score (End): 0.5833630719043977
WER Score for Epoch 2: 0.0000
Epoch - 3


  'start_positions': torch.tensor(self.encodings['start_positions'][idx]).to(device),
  'end_positions': torch.tensor(self.encodings['end_positions'][idx]).to(device)
  with autocast():  # Enable mixed precision
Training: 100%|██████████| 4639/4639 [08:00<00:00,  9.65it/s]


Training Accuracy: 0.6538
Training Loss: 0.9874


Evaluating: 100%|██████████| 15875/15875 [03:39<00:00, 72.43it/s]
  scaler = GradScaler()  # Initialize the GradScaler for AMP


F1 Score (Start): 0.5550467734786914
F1 Score (End): 0.5896741842488588
WER Score for Epoch 3: 0.0000
Epoch - 4


  'start_positions': torch.tensor(self.encodings['start_positions'][idx]).to(device),
  'end_positions': torch.tensor(self.encodings['end_positions'][idx]).to(device)
  with autocast():  # Enable mixed precision
Training: 100%|██████████| 4639/4639 [08:00<00:00,  9.65it/s]


Training Accuracy: 0.6876
Training Loss: 0.8620


Evaluating: 100%|██████████| 15875/15875 [03:39<00:00, 72.44it/s]
  scaler = GradScaler()  # Initialize the GradScaler for AMP


F1 Score (Start): 0.5598853609099081
F1 Score (End): 0.5954711657107753
WER Score for Epoch 4: 0.0000
Epoch - 5


  'start_positions': torch.tensor(self.encodings['start_positions'][idx]).to(device),
  'end_positions': torch.tensor(self.encodings['end_positions'][idx]).to(device)
  with autocast():  # Enable mixed precision
Training: 100%|██████████| 4639/4639 [08:00<00:00,  9.65it/s]


Training Accuracy: 0.7147
Training Loss: 0.7655


Evaluating: 100%|██████████| 15875/15875 [03:39<00:00, 72.45it/s]
  scaler = GradScaler()  # Initialize the GradScaler for AMP


F1 Score (Start): 0.5536433336802372
F1 Score (End): 0.5871261848539607
WER Score for Epoch 5: 0.0000
Epoch - 6


  'start_positions': torch.tensor(self.encodings['start_positions'][idx]).to(device),
  'end_positions': torch.tensor(self.encodings['end_positions'][idx]).to(device)
  with autocast():  # Enable mixed precision
Training: 100%|██████████| 4639/4639 [08:00<00:00,  9.65it/s]


Training Accuracy: 0.7406
Training Loss: 0.6763


Evaluating: 100%|██████████| 15875/15875 [03:39<00:00, 72.45it/s]


F1 Score (Start): 0.5435304243619612
F1 Score (End): 0.591615137902125
WER Score for Epoch 6: 0.0000
WER scores (after adding document stride): [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
