In [None]:
#Base model

In [2]:
import torch
import json
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import jiwer
import logging
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Configure logging for detailed output
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Determine device for computation: GPU if available, otherwise CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Function to load data from a JSON file
def load_data_file(path):
    """
    Load and parse the dataset from a JSON file.
    Extract contexts, questions, and answers from the dataset.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            raw_data = json.load(f)
    except FileNotFoundError:
        logger.error(f"File not found: {path}")
        raise

    contexts, questions, answers = [], [], []
    num_questions, num_possible, num_impossible = 0, 0, 0

    # Extract data from the nested structure
    for group in raw_data["data"]:
        for paragraph in group["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                num_questions += 1
                if "is_impossible" in qa and qa["is_impossible"]:
                    num_impossible += 1
                else:
                    num_possible += 1
                for answer in qa.get("answers", []):
                    contexts.append(context.lower())
                    questions.append(question.lower())
                    answers.append(answer)

    return num_questions, num_possible, num_impossible, contexts, questions, answers

# Load the training and validation datasets
try:
    num_train_questions, num_train_possible, num_train_impossible, train_contexts, train_questions, train_answers = load_data_file("spoken_train-v1.1.json")
    num_valid_questions, num_valid_possible, num_valid_impossible, valid_contexts, valid_questions, valid_answers = load_data_file("spoken_test-v1.1.json")
except Exception as e:
    logger.error(f"Error loading data: {e}")
    exit()

# Function to calculate and add end positions for each answer
def add_answer_end_positions(answers):
    """
    Calculate the end position of each answer based on its start position and length.
    Add this information to the answer dictionary.
    """
    for answer in answers:
        answer_text = answer.get("text", "").lower()
        answer_start = answer.get("answer_start", -1)
        answer["answer_end"] = answer_start + len(answer_text)

# Add end positions to training and validation datasets
add_answer_end_positions(train_answers)
add_answer_end_positions(valid_answers)

# Tokenizer and model configuration
MAX_LENGTH = 512
MODEL_PATH = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_PATH)

# Tokenize the training and validation datasets
train_encodings = tokenizer(train_questions, train_contexts, max_length=MAX_LENGTH, padding=True, truncation=True)
valid_encodings = tokenizer(valid_questions, valid_contexts, max_length=MAX_LENGTH, padding=True, truncation=True)

# Custom Dataset class to handle data preparation for PyTorch
class QADataset(Dataset):
    """
    Custom Dataset class to manage tokenized inputs and corresponding labels.
    """

    def __init__(self, encodings, answers):
        self.encodings = encodings
        self.answers = answers

    def __getitem__(self, idx):
        # Prepare tokenized inputs and corresponding answer positions
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["start_positions"] = torch.tensor(self.answers[idx].get("answer_start", -1))
        item["end_positions"] = torch.tensor(self.answers[idx].get("answer_end", -1))
        return item

    def __len__(self):
        # Return the total number of samples
        return len(self.encodings.input_ids)

# Create Dataset objects for training and validation
train_dataset = QADataset(train_encodings, train_answers)
valid_dataset = QADataset(valid_encodings, valid_answers)

# Initialize DataLoaders for batching
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=1)

# Load the pre-trained DistilBERT model for question answering
qa_model = DistilBertForQuestionAnswering.from_pretrained(MODEL_PATH).to(device)

# Set up the optimizer
optimizer = AdamW(qa_model.parameters(), lr=5e-5)

# Function to train the model for one epoch
def train_one_epoch(model, dataloader, optimizer):
    """
    Train the model for one epoch using the given DataLoader and optimizer.
    """
    model.train()
    total_loss = 0.0

    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()

        # Move data to the appropriate device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        start_positions = batch["start_positions"].to(device)
        end_positions = batch["end_positions"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader)  # Return the average loss for the epoch

# Function to evaluate the model on a validation dataset
def evaluate_model(model, dataloader):
    """
    Evaluate the model and compute the Word Error Rate (WER) and F1 score on the validation dataset.
    """
    model.eval()
    wer_list = []
    all_true_answers = []
    all_pred_answers = []

    for batch in tqdm(dataloader, desc="Evaluating"):
        # Move data to the appropriate device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        start_true = batch["start_positions"].to(device)
        end_true = batch["end_positions"].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Predict start and end positions
        start_pred = torch.argmax(outputs.start_logits, dim=1)
        end_pred = torch.argmax(outputs.end_logits, dim=1)

        # Decode predictions and true answers
        for i in range(len(start_true)):
            pred_answer = tokenizer.decode(input_ids[i][start_pred[i]:end_pred[i] + 1])
            true_answer = tokenizer.decode(input_ids[i][start_true[i]:end_true[i] + 1])
            if true_answer.strip():
                wer = jiwer.wer(true_answer, pred_answer)
                wer_list.append(wer)
                all_true_answers.append(true_answer)
                all_pred_answers.append(pred_answer)

    # Calculate F1 Score
    true_labels = [answer.split() for answer in all_true_answers]
    pred_labels = [answer.split() for answer in all_pred_answers]
    precision, recall, f1 = calculate_f1(true_labels, pred_labels)

    # Return the average WER and F1 score across all samples
    avg_wer = sum(wer_list) / len(wer_list) if wer_list else 0.0
    return avg_wer, precision, recall, f1

# Helper function to calculate F1 score
def calculate_f1(true_labels, pred_labels):
    """
    Calculate precision, recall, and F1 score for the given true and predicted labels.
    """
    precision_list = []
    recall_list = []
    f1_list = []

    for true, pred in zip(true_labels, pred_labels):
        true_set = set(true)
        pred_set = set(pred)
        common = true_set.intersection(pred_set)
        precision = len(common) / len(pred_set) if len(pred_set) > 0 else 0
        recall = len(common) / len(true_set) if len(true_set) > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)
    avg_f1 = np.mean(f1_list)

    return avg_precision, avg_recall, avg_f1

# Training loop with early stopping
EPOCHS = 5
best_wer = float("inf")
patience = 3
counter = 0

for epoch in range(EPOCHS):
    # Train the model for one epoch
    train_loss = train_one_epoch(qa_model, train_loader, optimizer)

    # Evaluate the model on the validation dataset
    wer_score, precision, recall, f1 = evaluate_model(qa_model, valid_loader)
    logger.info(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {train_loss:.4f}, WER Score: {wer_score:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    # Early stopping based on WER
    if wer_score < best_wer:
        best_wer = wer_score
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            logger.info("Early stopping triggered!")
            break


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 2320/2320 [02:28<00:00, 15.61it/s]
Evaluating: 100%|██████████| 15875/15875 [00:55<00:00, 286.86it/s]
INFO:__main__:Epoch 1/5, Train Loss: 5.9499, WER Score: 14.2205, Precision: 0.1986, Recall: 0.7467, F1 Score: 0.2289
Training: 100%|██████████| 2320/2320 [02:28<00:00, 15.62it/s]
Evaluating: 100%|██████████| 15875/15875 [00:53<00:00, 298.14it/s]
INFO:__main__:Epoch 2/5, Train Loss: 5.4005, WER Score: 5.6305, Precision: 0.4667, Recall: 0.6537, F1 Score: 0.4650
Training: 100%|██████████| 2320/2320 [02:28<00:00, 15.62it/s]
Evaluating: 100%|██████████| 15875/15875 [00:52<00:00, 302.57it/s]
INFO:__main__:Epoch 3/5, Train Loss: 4.8369, WER Score: 3.0083, Precision: 0.581

In [None]:
#Fine Tuned(model 2)

In [3]:
import torch
import json
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import jiwer
import logging
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Configure logging for detailed output
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Determine device for computation: GPU if available, otherwise CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Function to load data from a JSON file
def load_data_file(path):
    """
    Load and parse the dataset from a JSON file.
    Extract contexts, questions, and answers from the dataset.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            raw_data = json.load(f)
    except FileNotFoundError:
        logger.error(f"File not found: {path}")
        raise

    contexts, questions, answers = [], [], []
    num_questions, num_possible, num_impossible = 0, 0, 0

    # Extract data from the nested structure
    for group in raw_data["data"]:
        for paragraph in group["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                num_questions += 1
                if "is_impossible" in qa and qa["is_impossible"]:
                    num_impossible += 1
                else:
                    num_possible += 1
                for answer in qa.get("answers", []):
                    contexts.append(context.lower())
                    questions.append(question.lower())
                    answers.append(answer)

    return num_questions, num_possible, num_impossible, contexts, questions, answers

# Load the training and validation datasets
try:
    num_train_questions, num_train_possible, num_train_impossible, train_contexts, train_questions, train_answers = load_data_file("spoken_train-v1.1.json")
    num_valid_questions, num_valid_possible, num_valid_impossible, valid_contexts, valid_questions, valid_answers = load_data_file("spoken_test-v1.1.json")
except Exception as e:
    logger.error(f"Error loading data: {e}")
    exit()

# Function to calculate and add end positions for each answer
def add_answer_end_positions(answers):
    """
    Calculate the end position of each answer based on its start position and length.
    Add this information to the answer dictionary.
    """
    for answer in answers:
        answer_text = answer.get("text", "").lower()
        answer_start = answer.get("answer_start", -1)
        answer["answer_end"] = answer_start + len(answer_text)

# Add end positions to training and validation datasets
add_answer_end_positions(train_answers)
add_answer_end_positions(valid_answers)

# Tokenizer and model configuration
MAX_LENGTH = 512
MODEL_PATH = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_PATH)

# Preprocessing function to handle offset mappings
def preprocess_data(questions, contexts, answers, tokenizer, max_length=MAX_LENGTH):
    encodings = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_offsets_mapping=True
    )
    start_positions = []
    end_positions = []

    for i, answer in enumerate(answers):
        start_char = answer['answer_start']
        end_char = answer['answer_end']
        offsets = encodings['offset_mapping'][i]
        start_token, end_token = 0, 0
        for idx, (start, end) in enumerate(offsets):
            if start <= start_char < end:
                start_token = idx
            if start < end_char <= end:
                end_token = idx
        start_positions.append(start_token)
        end_positions.append(end_token)

    encodings.update({
        'start_positions': start_positions,
        'end_positions': end_positions
    })
    return encodings

# Preprocess the training and validation datasets
train_encodings = preprocess_data(train_questions, train_contexts, train_answers, tokenizer)
valid_encodings = preprocess_data(valid_questions, valid_contexts, valid_answers, tokenizer)

# Custom Dataset class to handle data preparation for PyTorch
class QADataset(Dataset):
    """
    Custom Dataset class to manage tokenized inputs and corresponding labels.
    """

    def __init__(self, encodings, answers):
        self.encodings = encodings
        self.answers = answers

    def __getitem__(self, idx):
        # Prepare tokenized inputs and corresponding answer positions
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['start_positions'] = torch.tensor(self.encodings['start_positions'][idx])
        item['end_positions'] = torch.tensor(self.encodings['end_positions'][idx])
        return item

    def __len__(self):
        # Return the total number of samples
        return len(self.encodings['input_ids'])

# Create Dataset objects for training and validation
train_dataset = QADataset(train_encodings, train_answers)
valid_dataset = QADataset(valid_encodings, valid_answers)

# Initialize DataLoaders for batching
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=1)

# Load the pre-trained DistilBERT model for question answering
qa_model = DistilBertForQuestionAnswering.from_pretrained(MODEL_PATH).to(device)

# Set up the optimizer and learning rate scheduler
optimizer = AdamW(qa_model.parameters(), lr=5e-5)
total_steps = len(train_loader) * 5
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Function to train the model for one epoch
def train_one_epoch(model, dataloader, optimizer, scheduler):
    """
    Train the model for one epoch using the given DataLoader, optimizer, and scheduler.
    """
    model.train()
    total_loss = 0.0

    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()

        # Move data to the appropriate device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        start_positions = batch["start_positions"].to(device)
        end_positions = batch["end_positions"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()
        scheduler.step()

    return total_loss / len(dataloader)  # Return the average loss for the epoch

# Function to evaluate the model on a validation dataset
def evaluate_model(model, dataloader):
    """
    Evaluate the model and compute the Word Error Rate (WER) and F1 score on the validation dataset.
    """
    model.eval()
    wer_list = []
    all_true_answers = []
    all_pred_answers = []

    for batch in tqdm(dataloader, desc="Evaluating"):
        # Move data to the appropriate device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        start_true = batch["start_positions"].to(device)
        end_true = batch["end_positions"].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Predict start and end positions
        start_pred = torch.argmax(outputs.start_logits, dim=1)
        end_pred = torch.argmax(outputs.end_logits, dim=1)

        # Decode predictions and true answers
        for i in range(len(start_true)):
            pred_answer = tokenizer.decode(input_ids[i][start_pred[i]:end_pred[i] + 1])
            true_answer = tokenizer.decode(input_ids[i][start_true[i]:end_true[i] + 1])
            if true_answer.strip():
                wer = jiwer.wer(true_answer, pred_answer)
                wer_list.append(wer)
                all_true_answers.append(true_answer)
                all_pred_answers.append(pred_answer)

    # Calculate F1 Score
    true_labels = [answer.split() for answer in all_true_answers]
    pred_labels = [answer.split() for answer in all_pred_answers]
    precision, recall, f1 = calculate_f1(true_labels, pred_labels)

    # Return the average WER and F1 score across all samples
    avg_wer = sum(wer_list) / len(wer_list) if wer_list else 0.0
    return avg_wer, precision, recall, f1

# Helper function to calculate F1 score
def calculate_f1(true_labels, pred_labels):
    """
    Calculate precision, recall, and F1 score for the given true and predicted labels.
    """
    precision_list = []
    recall_list = []
    f1_list = []

    for true, pred in zip(true_labels, pred_labels):
        true_set = set(true)
        pred_set = set(pred)
        common = true_set.intersection(pred_set)
        precision = len(common) / len(pred_set) if len(pred_set) > 0 else 0
        recall = len(common) / len(true_set) if len(true_set) > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)
    avg_f1 = np.mean(f1_list)

    return avg_precision, avg_recall, avg_f1

# Training loop with early stopping
EPOCHS = 5
best_wer = float("inf")
patience = 3
counter = 0

for epoch in range(EPOCHS):
    # Train the model for one epoch
    train_loss = train_one_epoch(qa_model, train_loader, optimizer, scheduler)

    # Evaluate the model on the validation dataset
    wer_score, precision, recall, f1 = evaluate_model(qa_model, valid_loader)
    logger.info(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {train_loss:.4f}, WER Score: {wer_score:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    # Early stopping based on WER
    if wer_score < best_wer:
        best_wer = wer_score
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            logger.info("Early stopping triggered!")
            break


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 2320/2320 [02:28<00:00, 15.62it/s]
Evaluating: 100%|██████████| 15875/15875 [00:54<00:00, 292.67it/s]
INFO:__main__:Epoch 1/5, Train Loss: 1.9182, WER Score: 1.8572, Precision: 0.6209, Recall: 0.6815, F1 Score: 0.6077
Training: 100%|██████████| 2320/2320 [02:28<00:00, 15.62it/s]
Evaluating: 100%|██████████| 15875/15875 [00:54<00:00, 292.79it/s]
INFO:__main__:Epoch 2/5, Train Loss: 1.0386, WER Score: 2.1329, Precision: 0.6312, Recall: 0.6892, F1 Score: 0.6149
Training: 100%|██████████| 2320/2320 [02:28<00:00, 15.62it/s]
Evaluating: 100%|██████████| 15875/15875 [00:54<00:00, 292.24it/s]
INFO:__main__:Epoch 3/5, Train Loss: 0.5879, WER Score: 2.1117, Precision: 0.6253

In [4]:
#pre processed(model3)
import torch
import json
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import jiwer
import logging
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Configure logging for detailed output
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Determine device for computation: GPU if available, otherwise CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Function to load data from a JSON file
def load_data_file(path):
    """
    Load and parse the dataset from a JSON file.
    Extract contexts, questions, and answers from the dataset.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            raw_data = json.load(f)
    except FileNotFoundError:
        logger.error(f"File not found: {path}")
        raise

    contexts, questions, answers = [], [], []
    num_questions, num_possible, num_impossible = 0, 0, 0

    # Extract data from the nested structure
    for group in raw_data["data"]:
        for paragraph in group["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                num_questions += 1
                if "is_impossible" in qa and qa["is_impossible"]:
                    num_impossible += 1
                else:
                    num_possible += 1
                for answer in qa.get("answers", []):
                    contexts.append(context.lower())
                    questions.append(question.lower())
                    answers.append(answer)

    return num_questions, num_possible, num_impossible, contexts, questions, answers

# Load the training and validation datasets
try:
    num_train_questions, num_train_possible, num_train_impossible, train_contexts, train_questions, train_answers = load_data_file("spoken_train-v1.1.json")
    num_valid_questions, num_valid_possible, num_valid_impossible, valid_contexts, valid_questions, valid_answers = load_data_file("spoken_test-v1.1.json")
except Exception as e:
    logger.error(f"Error loading data: {e}")
    exit()

# Function to calculate and add end positions for each answer
def add_answer_end_positions(answers):
    """
    Calculate the end position of each answer based on its start position and length.
    Add this information to the answer dictionary.
    """
    for answer in answers:
        answer_text = answer.get("text", "").lower()
        answer_start = answer.get("answer_start", -1)
        answer["answer_end"] = answer_start + len(answer_text)

# Add end positions to training and validation datasets
add_answer_end_positions(train_answers)
add_answer_end_positions(valid_answers)

# Tokenizer and model configuration
MAX_LENGTH = 512
MODEL_PATH = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_PATH)

# Preprocessing function to handle offset mappings
def preprocess_data(questions, contexts, answers, tokenizer, max_length=MAX_LENGTH):
    encodings = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_offsets_mapping=True
    )
    start_positions = []
    end_positions = []

    for i, answer in enumerate(answers):
        start_char = answer['answer_start']
        end_char = answer['answer_end']
        offsets = encodings['offset_mapping'][i]
        start_token, end_token = 0, 0
        for idx, (start, end) in enumerate(offsets):
            if start <= start_char < end:
                start_token = idx
            if start < end_char <= end:
                end_token = idx
        start_positions.append(start_token)
        end_positions.append(end_token)

    encodings.update({
        'start_positions': start_positions,
        'end_positions': end_positions
    })
    return encodings

# Preprocess the training and validation datasets
train_encodings = preprocess_data(train_questions, train_contexts, train_answers, tokenizer)
valid_encodings = preprocess_data(valid_questions, valid_contexts, valid_answers, tokenizer)

# Custom Dataset class to handle data preparation for PyTorch
class QADataset(Dataset):
    """
    Custom Dataset class to manage tokenized inputs and corresponding labels.
    """

    def __init__(self, encodings, answers):
        self.encodings = encodings
        self.answers = answers

    def __getitem__(self, idx):
        # Prepare tokenized inputs and corresponding answer positions
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['start_positions'] = torch.tensor(self.encodings['start_positions'][idx])
        item['end_positions'] = torch.tensor(self.encodings['end_positions'][idx])
        return item

    def __len__(self):
        # Return the total number of samples
        return len(self.encodings['input_ids'])

# Create Dataset objects for training and validation
train_dataset = QADataset(train_encodings, train_answers)
valid_dataset = QADataset(valid_encodings, valid_answers)

# Initialize DataLoaders for batching
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=1)

# Load the pre-trained DistilBERT model for question answering
qa_model = DistilBertForQuestionAnswering.from_pretrained(MODEL_PATH).to(device)

# Set up the optimizer and learning rate scheduler
optimizer = AdamW(qa_model.parameters(), lr=5e-5)
total_steps = len(train_loader) * 5
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Function to train the model for one epoch
def train_one_epoch(model, dataloader, optimizer, scheduler):
    """
    Train the model for one epoch using the given DataLoader, optimizer, and scheduler.
    """
    model.train()
    total_loss = 0.0

    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()

        # Move data to the appropriate device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        start_positions = batch["start_positions"].to(device)
        end_positions = batch["end_positions"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()
        scheduler.step()

    return total_loss / len(dataloader)  # Return the average loss for the epoch

# Function to evaluate the model on a validation dataset
def evaluate_model(model, dataloader):
    """
    Evaluate the model and compute the Word Error Rate (WER), Precision, Recall, and F1 score on the validation dataset.
    """
    model.eval()
    wer_list = []
    all_true_answers = []
    all_pred_answers = []

    for batch in tqdm(dataloader, desc="Evaluating"):
        # Move data to the appropriate device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        start_true = batch["start_positions"].to(device)
        end_true = batch["end_positions"].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Predict start and end positions
        start_pred = torch.argmax(outputs.start_logits, dim=1)
        end_pred = torch.argmax(outputs.end_logits, dim=1)

        # Decode predictions and true answers
        for i in range(len(start_true)):
            pred_answer = tokenizer.decode(input_ids[i][start_pred[i]:end_pred[i] + 1])
            true_answer = tokenizer.decode(input_ids[i][start_true[i]:end_true[i] + 1])
            if true_answer.strip():
                wer = jiwer.wer(true_answer, pred_answer)
                wer_list.append(wer)
                all_true_answers.append(true_answer)
                all_pred_answers.append(pred_answer)

    # Calculate F1 Score
    true_labels = [answer.split() for answer in all_true_answers]
    pred_labels = [answer.split() for answer in all_pred_answers]
    precision, recall, f1 = calculate_f1(true_labels, pred_labels)

    # Return the average WER and F1 score across all samples
    avg_wer = sum(wer_list) / len(wer_list) if wer_list else 0.0
    return avg_wer, precision, recall, f1

# Helper function to calculate F1 score
def calculate_f1(true_labels, pred_labels):
    """
    Calculate precision, recall, and F1 score for the given true and predicted labels.
    """
    precision_list = []
    recall_list = []
    f1_list = []

    for true, pred in zip(true_labels, pred_labels):
        true_set = set(true)
        pred_set = set(pred)
        common = true_set.intersection(pred_set)
        precision = len(common) / len(pred_set) if len(pred_set) > 0 else 0
        recall = len(common) / len(true_set) if len(true_set) > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)
    avg_f1 = np.mean(f1_list)

    return avg_precision, avg_recall, avg_f1

# Training loop with early stopping
EPOCHS = 5
best_wer = float("inf")
patience = 3
counter = 0

for epoch in range(EPOCHS):
    # Train the model for one epoch
    train_loss = train_one_epoch(qa_model, train_loader, optimizer, scheduler)

    # Evaluate the model on the validation dataset
    wer_score, precision, recall, f1 = evaluate_model(qa_model, valid_loader)
    logger.info(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {train_loss:.4f}, WER Score: {wer_score:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    # Early stopping based on WER
    if wer_score < best_wer:
        best_wer = wer_score
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            logger.info("Early stopping triggered!")
            break


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 2320/2320 [02:28<00:00, 15.62it/s]
Evaluating: 100%|██████████| 15875/15875 [00:54<00:00, 292.49it/s]
INFO:__main__:Epoch 1/5, Train Loss: 1.9238, WER Score: 2.3665, Precision: 0.6134, Recall: 0.6887, F1 Score: 0.6032
Training: 100%|██████████| 2320/2320 [02:28<00:00, 15.62it/s]
Evaluating: 100%|██████████| 15875/15875 [00:54<00:00, 292.50it/s]
INFO:__main__:Epoch 2/5, Train Loss: 1.0523, WER Score: 1.8325, Precision: 0.6344, Recall: 0.6887, F1 Score: 0.6187
Training: 100%|██████████| 2320/2320 [02:28<00:00, 15.62it/s]
Evaluating: 100%|██████████| 15875/15875 [00:54<00:00, 291.97it/s]
INFO:__main__:Epoch 3/5, Train Loss: 0.5978, WER Score: 1.8492, Precision: 0.6370

In [None]:
#post Processed(model4)
import torch
import json
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import jiwer
import logging
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Configure logging for detailed output
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Determine device for computation: GPU if available, otherwise CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Function to load data from a JSON file
def load_data_file(path):
    # Read and parse the JSON file
    with open(path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)

    # Initialize lists to hold contexts, questions, and answers
    contexts, questions, answers = [], [], []

    # Iterate through the dataset structure to extract the required fields
    for group in raw_data["data"]:
        for paragraph in group["paragraphs"]:
            context = paragraph["context"].lower()
            for qa in paragraph["qas"]:
                question = qa["question"].lower()
                for answer in qa["answers"]:
                    # Add context, question, and answer to respective lists
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers
    """
    Load and parse the dataset from a JSON file.
    Extract contexts, questions, and answers from the dataset.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            raw_data = json.load(f)
    except FileNotFoundError:
        logger.error(f"File not found: {path}")
        raise

    contexts, questions, answers = [], [], []
    num_questions, num_possible, num_impossible = 0, 0, 0

    # Extract data from the nested structure
    for group in raw_data["data"]:
        for paragraph in group["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                num_questions += 1
                if "is_impossible" in qa and qa["is_impossible"]:
                    num_impossible += 1
                else:
                    num_possible += 1
                for answer in qa.get("answers", []):
                    contexts.append(context.lower())
                    questions.append(question.lower())
                    answers.append(answer)

    return num_questions, num_possible, num_impossible, contexts, questions, answers

# Load the training and validation datasets
try:
    num_train_questions, num_train_possible, num_train_impossible, train_contexts, train_questions, train_answers = load_data_file("spoken_train-v1.1.json")
    num_valid_questions, num_valid_possible, num_valid_impossible, valid_contexts, valid_questions, valid_answers = load_data_file("spoken_test-v1.1.json")
except Exception as e:
    logger.error(f"Error loading data: {e}")
    exit()

# Function to calculate and add end positions for each answer
def add_answer_end_positions(answers):
    for answer in answers:
        # Convert the answer text to lowercase for consistency
        answer["text"] = answer["text"].lower()

        # Calculate the end position of the answer in the context
        answer["answer_end"] = answer["answer_start"] + len(answer["text"])
    """
    Calculate the end position of each answer based on its start position and length.
    Add this information to the answer dictionary.
    """
    for answer in answers:
        answer_text = answer.get("text", "").lower()
        answer_start = answer.get("answer_start", -1)
        answer["answer_end"] = answer_start + len(answer_text)

# Add end positions to training and validation datasets
add_answer_end_positions(train_answers)
add_answer_end_positions(valid_answers)
add_answer_end_positions(valid_answers)

# Tokenizer and model configuration
MAX_LENGTH = 512
MODEL_PATH = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_PATH)

# Preprocessing function to handle offset mappings
def preprocess_data(contexts, questions, answers, tokenizer, max_length=MAX_LENGTH):
    # Tokenize the questions and contexts together
    encodings = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_offsets_mapping=True
    )

    # Initialize lists to store the start and end positions in tokenized data
    start_positions, end_positions = [], []

    # Align answer positions with tokenized inputs using offset mapping
    for i, answer in enumerate(answers):
        start_char = answer["answer_start"]  # Character-level start position
        end_char = answer["answer_end"]  # Character-level end position
        offsets = encodings["offset_mapping"][i]  # Token offsets for the current input

        # Default to start and end positions at 0 if alignment fails
        start_token, end_token = 0, 0

        # Match character positions to token positions
        for idx, (start, end) in enumerate(offsets):
            if start <= start_char < end:  # Start position is within the token span
                start_token = idx
            if start < end_char <= end:  # End position is within the token span
                end_token = idx

        start_positions.append(start_token)
        end_positions.append(end_token)

    # Add start and end positions to the tokenized encodings
    encodings.update({
        "start_positions": start_positions,
        "end_positions": end_positions,
    })

    return encodings
    encodings = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_offsets_mapping=True
    )
    start_positions = []
    end_positions = []

    for i, answer in enumerate(answers):
        start_char = answer['answer_start']
        end_char = answer['answer_end']
        offsets = encodings['offset_mapping'][i]
        start_token, end_token = 0, 0
        for idx, (start, end) in enumerate(offsets):
            if start <= start_char < end:
                start_token = idx
            if start < end_char <= end:
                end_token = idx
        start_positions.append(start_token)
        end_positions.append(end_token)

    encodings.update({
        'start_positions': start_positions,
        'end_positions': end_positions
    })
    return encodings

# Preprocess the training and validation datasets
train_encodings = preprocess_data(train_questions, train_contexts, train_answers, tokenizer)
valid_encodings = preprocess_data(valid_questions, valid_contexts, valid_answers, tokenizer)

# Custom Dataset class to handle data preparation for PyTorch
class QADataset(Dataset):
    """
    Custom Dataset class to manage tokenized inputs and corresponding labels.
    """

    def __init__(self, encodings, answers):
        self.encodings = encodings
        self.answers = answers

    def __getitem__(self, idx):
        # Prepare tokenized inputs and corresponding answer positions
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['start_positions'] = torch.tensor(self.encodings['start_positions'][idx])
        item['end_positions'] = torch.tensor(self.encodings['end_positions'][idx])
        return item

    def __len__(self):
        # Return the total number of samples
        return len(self.encodings['input_ids'])

# Create Dataset objects for training and validation
train_dataset = QADataset(train_encodings, train_answers)
valid_dataset = QADataset(valid_encodings, valid_answers)

# Initialize DataLoaders for batching
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=1)

# Load the pre-trained DistilBERT model for question answering
qa_model = DistilBertForQuestionAnswering.from_pretrained(MODEL_PATH).to(device)

# Set up the optimizer and learning rate scheduler
optimizer = AdamW(qa_model.parameters(), lr=5e-5)
total_steps = len(train_loader) * 5
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Function to train the model for one epoch
def train_one_epoch(model, dataloader, optimizer, scheduler):
    """
    Train the model for one epoch using the given DataLoader, optimizer, and scheduler.
    """
    model.train()
    total_loss = 0.0

    for batch in tqdm(dataloader, desc="Training"):
        # Zero out gradients from previous step
        optimizer.zero_grad()

        # Move inputs and labels to the device (GPU or CPU)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        start_positions = batch["start_positions"].to(device)
        end_positions = batch["end_positions"].to(device)

        # Forward pass and loss computation
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions,
        )
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()
        scheduler.step()

    return total_loss / len(dataloader)  # Average loss for the epoch
    """
    Train the model for one epoch using the given DataLoader, optimizer, and scheduler.
    """
    model.train()
    total_loss = 0.0

    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()

        # Move data to the appropriate device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        start_positions = batch["start_positions"].to(device)
        end_positions = batch["end_positions"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()
        scheduler.step()

    return total_loss / len(dataloader)  # Return the average loss for the epoch

# Function to evaluate the model on a validation dataset
def evaluate_model(model, dataloader):
    """
    Evaluate the model and compute the Word Error Rate (WER), Precision, Recall, and F1 score on the validation dataset.
    """
    model.eval()
    wer_list = []
    all_true_answers = []
    all_pred_answers = []

    for batch in tqdm(dataloader, desc="Evaluating"):
        # Move inputs to the device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        start_true = batch["start_positions"].to(device)
        end_true = batch["end_positions"].to(device)

        # Forward pass without gradient computation
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Get predicted start and end positions
        start_pred = torch.argmax(outputs.start_logits, dim=1)
        end_pred = torch.argmax(outputs.end_logits, dim=1)

        # Decode predictions and true answers
        for i in range(len(start_true)):
            pred_answer = tokenizer.decode(input_ids[i][start_pred[i]:end_pred[i] + 1])
            true_answer = tokenizer.decode(input_ids[i][start_true[i]:end_true[i] + 1])
            if true_answer.strip():  # Avoid empty true answers
                wer = jiwer.wer(true_answer, pred_answer)
                wer_list.append(wer)
                all_true_answers.append(true_answer)
                all_pred_answers.append(pred_answer)

    # Calculate F1 Score
    true_labels = [answer.split() for answer in all_true_answers]
    pred_labels = [answer.split() for answer in all_pred_answers]
    precision, recall, f1 = calculate_f1(true_labels, pred_labels)

    # Return the average WER and F1 score across all samples
    avg_wer = sum(wer_list) / len(wer_list) if wer_list else 0.0
    return avg_wer, precision, recall, f1
    """
    Evaluate the model and compute the Word Error Rate (WER), Precision, Recall, and F1 score on the validation dataset.
    """
    model.eval()
    wer_list = []
    all_true_answers = []
    all_pred_answers = []

    for batch in tqdm(dataloader, desc="Evaluating"):
        # Move data to the appropriate device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        start_true = batch["start_positions"].to(device)
        end_true = batch["end_positions"].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Predict start and end positions
        start_pred = torch.argmax(outputs.start_logits, dim=1)
        end_pred = torch.argmax(outputs.end_logits, dim=1)

        # Decode predictions and true answers
        for i in range(len(start_true)):
            pred_answer = tokenizer.decode(input_ids[i][start_pred[i]:end_pred[i] + 1])
            true_answer = tokenizer.decode(input_ids[i][start_true[i]:end_true[i] + 1])
            if true_answer.strip():
                wer = jiwer.wer(true_answer, pred_answer)
                wer_list.append(wer)
                all_true_answers.append(true_answer)
                all_pred_answers.append(pred_answer)

    # Calculate F1 Score
    true_labels = [answer.split() for answer in all_true_answers]
    pred_labels = [answer.split() for answer in all_pred_answers]
    precision, recall, f1 = calculate_f1(true_labels, pred_labels)

    # Return the average WER and F1 score across all samples
    avg_wer = sum(wer_list) / len(wer_list) if wer_list else 0.0
    return avg_wer, precision, recall, f1

# Helper function to calculate F1 score
def calculate_f1(true_labels, pred_labels):
    """
    Calculate precision, recall, and F1 score for the given true and predicted labels.
    """
    precision_list = []
    recall_list = []
    f1_list = []

    for true, pred in zip(true_labels, pred_labels):
        true_set = set(true)
        pred_set = set(pred)
        common = true_set.intersection(pred_set)
        precision = len(common) / len(pred_set) if len(pred_set) > 0 else 0
        recall = len(common) / len(true_set) if len(true_set) > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)
    avg_f1 = np.mean(f1_list)

    return avg_precision, avg_recall, avg_f1

# Training loop with early stopping
EPOCHS = 5
best_wer = float("inf")
patience = 3
counter = 0

for epoch in range(EPOCHS):
    # Train the model for one epoch
    train_loss = train_one_epoch(qa_model, train_loader, optimizer, scheduler)

    # Evaluate the model on the validation dataset
    wer_score, precision, recall, f1 = evaluate_model(qa_model, valid_loader)
    logger.info(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {train_loss:.4f}, WER Score: {wer_score:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    # Early stopping based on WER
    if wer_score < best_wer:
        best_wer = wer_score
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            logger.info("Early stopping triggered!")
            break


ERROR:__main__:Error loading data: not enough values to unpack (expected 6, got 3)
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 2320/2320 [02:28<00:00, 15.62it/s]
Evaluating: 100%|██████████| 15875/15875 [00:54<00:00, 291.81it/s]
INFO:__main__:Epoch 1/5, Train Loss: 2.3039, WER Score: 1.7629, Precision: 0.5646, Recall: 0.6015, F1 Score: 0.5393
Training: 100%|██████████| 2320/2320 [02:28<00:00, 15.61it/s]
Evaluating: 100%|██████████| 15875/15875 [00:54<00:00, 291.86it/s]
INFO:__main__:Epoch 2/5, Train Loss: 1.3667, WER Score: 1.8195, Precision: 0.5859, Recall: 0.6299, F1 Score: 0.5641
Training:   1%|          | 27/2320 [00:01<02:26, 15.64it/s]