<a href="https://colab.research.google.com/github/Tizian15/QA-project/blob/main/EvaluateModels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets torch
!pip install evaluate
!pip install datasets
!pip install gradio



In [2]:
from google.colab import drive
drive.mount("/content/drive")

from datasets import load_from_disk

dataset_path = "/content/drive/MyDrive/TDS-Project/QA-project/content/business-questionnaire-dataset"

dataset = load_from_disk(dataset_path)
print(dataset)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
DatasetDict({
    train: Dataset({
        features: ['question', 'context', 'option', 'type', 'answer_length', 'answers_s3auf_mdeberta-v3-squad2-ft-busiQA-3ep', 'score_s3auf_mdeberta-v3-squad2-ft-busiQA-3ep', 'answers_s3auf/bert-finetuned-busiQA', 'score_s3auf/bert-finetuned-busiQA', 'answers_timpal0l/mdeberta-v3-base-squad2', 'score_timpal0l/mdeberta-v3-base-squad2', 'ranked_answer', 'ranked_score'],
        num_rows: 1586
    })
    valid: Dataset({
        features: ['question', 'context', 'option', 'type', 'answer_length', 'answers_s3auf_mdeberta-v3-squad2-ft-busiQA-3ep', 'score_s3auf_mdeberta-v3-squad2-ft-busiQA-3ep', 'answers_s3auf/bert-finetuned-busiQA', 'score_s3auf/bert-finetuned-busiQA', 'answers_timpal0l/mdeberta-v3-base-squad2', 'score_timpal0l/mdeberta-v3-base-squad2', 'ranked_answer', 'ranked_score'],
        num_rows: 680
    })
})


In [3]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# Load fine-tuned RoBERTa
roberta_model = AutoModelForQuestionAnswering.from_pretrained("/content/drive/MyDrive/fine_tuned_roberta")
roberta_tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/fine_tuned_roberta")

# Load fine-tuned mDeBERTa
mdeberta_model = AutoModelForQuestionAnswering.from_pretrained("/content/drive/MyDrive/fine_tuned_mdeberta_v3")
mdeberta_tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/fine_tuned_mdeberta_v3")

# Load fine-tuned DistilBERT
distilbert_model = AutoModelForQuestionAnswering.from_pretrained("/content/drive/MyDrive/fine_tuned_distilbert")
distilbert_tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/fine_tuned_distilbert")

print("✅ All fine-tuned models loaded successfully!")


✅ All fine-tuned models loaded successfully!


In [4]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# Load baseline RoBERTa
baseline_roberta_model = AutoModelForQuestionAnswering.from_pretrained("roberta-base")
baseline_roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Load baseline mDeBERTa
baseline_mdeberta_model = AutoModelForQuestionAnswering.from_pretrained("microsoft/mdeberta-v3-base")
baseline_mdeberta_tokenizer = AutoTokenizer.from_pretrained("microsoft/mdeberta-v3-base")

# Load baseline DistilBERT
baseline_distilbert_model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
baseline_distilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

print("✅ All baseline models loaded successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForQuestionAnswering were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ All baseline models loaded successfully!


In [5]:
train_df = dataset["train"].to_pandas()
valid_df = dataset["valid"].to_pandas()

train_df[["question", "ranked_answer"]].head(10)

Unnamed: 0,question,ranked_answer
0,What are your primary concerns about implement...,"{'answer': ' providing effective training', 'a..."
1,Which training methods are you planning to imp...,"{'answer': ' gamified learning elements', 'ans..."
2,What are your biggest challenges in online adv...,{'answer': ' finding qualified traffic and the...
3,"On a scale of 1 to 10, how would you rate the ...","{'answer': ' 2', 'answer_start': 50, 'model': ..."
4,Which sustainability initiatives has your comp...,{'answer': ' reducing water consumption throug...
5,What is your company's primary industry?,"{'answer': ' healthcare', 'answer_start': 31, ..."
6,What is your company’s annual revenue?,"{'answer': ' $10 million to $50 million', 'ans..."
7,What CRM platforms have you previously conside...,"{'answer': ' HubSpot and Zoho CRM.', 'answer_s..."
8,What are your company's main obstacles to achi...,"{'answer': ' lack of employee engagement', 'an..."
9,How many employees have access to sensitive co...,"{'answer': ' 7', 'answer_start': 15, 'model': ..."


In [6]:
import evaluate

# Load the SQuAD evaluation metric
metric = evaluate.load("squad")


In [7]:
import ast  # Needed to convert string representations of dictionaries

# Function to extract answer and answer_start from the "ranked_answer" column
def extract_answer_info(row):
    answer_data = row["ranked_answer"]
    if isinstance(answer_data, str):  # Convert string to dictionary if necessary
        answer_data = ast.literal_eval(answer_data)
    return answer_data["answer"], answer_data["answer_start"]

# Apply the function to extract relevant data
train_df["answer"], train_df["answer_start"] = zip(*train_df.apply(extract_answer_info, axis=1))

# Display the first few rows to verify the extraction
train_df[["question", "context", "answer", "answer_start"]].head(5)

Unnamed: 0,question,context,answer,answer_start
0,What are your primary concerns about implement...,Our primary concern regarding the implementati...,providing effective training,83
1,Which training methods are you planning to imp...,"For next year, we're focusing on enhancing our...",gamified learning elements,110
2,What are your biggest challenges in online adv...,Our two biggest challenges in online advertisi...,finding qualified traffic and the lack of tim...,52
3,"On a scale of 1 to 10, how would you rate the ...",I'd rate the ease of use of our current softwa...,2,50
4,Which sustainability initiatives has your comp...,We've implemented a variety of sustainability ...,reducing water consumption through improved t...,70


In [8]:
import torch

def evaluate_model(model, tokenizer, dataset, num_samples=100):
    results = []

    for i in range(num_samples):
        example = dataset.iloc[i]
        question = example["question"]
        context = example["context"]
        ground_truth = example["answer"]

        # Tokenize input
        inputs = tokenizer(question, context, return_tensors="pt", truncation=True)

        with torch.no_grad():
            outputs = model(**inputs)

        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        start_index = torch.argmax(start_logits)
        end_index = torch.argmax(end_logits)

        predicted_answer = tokenizer.decode(inputs["input_ids"][0][start_index:end_index+1])

        results.append({
            "prediction_text": predicted_answer,
            "reference_text": ground_truth
        })

    # Compute Exact Match and F1 Score
    predictions = [{"id": str(i), "prediction_text": res["prediction_text"]} for i, res in enumerate(results)]
    references = [{"id": str(i), "answers": {"text": [res["reference_text"]], "answer_start": [0]}} for i, res in enumerate(results)]

    return metric.compute(predictions=predictions, references=references)


In [14]:
# Evaluate Baseline Models
baseline_roberta_score = evaluate_model(baseline_roberta_model, baseline_roberta_tokenizer, train_df)
baseline_mdeberta_score = evaluate_model(baseline_mdeberta_model, baseline_mdeberta_tokenizer, train_df)
baseline_distilbert_score = evaluate_model(baseline_distilbert_model, baseline_distilbert_tokenizer, train_df)

print("📊 Baseline RoBERTa Score:", baseline_roberta_score)
print("📊 Baseline mDeBERTa Score:", baseline_mdeberta_score)
print("📊 Baseline DistilBERT Score:", baseline_distilbert_score)

# Evaluate Fine-Tuned Models
fine_tuned_roberta_score = evaluate_model(roberta_model, roberta_tokenizer, train_df)
fine_tuned_mdeberta_score = evaluate_model(mdeberta_model, mdeberta_tokenizer, train_df)
fine_tuned_distilbert_score = evaluate_model(distilbert_model, distilbert_tokenizer, train_df)

print("📊 Fine-Tuned RoBERTa Score:", fine_tuned_roberta_score)
print("📊 Fine-Tuned mDeBERTa Score:", fine_tuned_mdeberta_score)
print("📊 Fine-Tuned DistilBERT Score:", fine_tuned_distilbert_score)


KeyError: 'answer'

In [10]:
!pip install rouge_score
import evaluate

# Load ROUGE metric
rouge = evaluate.load("rouge")



In [11]:
def answer_question(model, tokenizer, question, context):
    inputs = tokenizer(question, context, return_tensors="pt", truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)

    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    start_index = torch.argmax(start_logits)
    end_index = torch.argmax(end_logits)

    answer = tokenizer.decode(inputs["input_ids"][0][start_index:end_index+1])
    return answer


In [12]:
def evaluate_rouge(model, tokenizer, dataset, num_samples=100):
    results = []

    for i in range(num_samples):
        example = dataset.iloc[i]
        question = example["question"]
        context = example["context"]
        ground_truth = example["answer"]

        # Generate model answer
        predicted_answer = answer_question(model, tokenizer, question, context)

        results.append({
            "prediction_text": predicted_answer,
            "reference_text": ground_truth
        })

    # Compute ROUGE scores
    predictions = [res["prediction_text"] for res in results]
    references = [res["reference_text"] for res in results]

    return rouge.compute(predictions=predictions, references=references)

# Compute ROUGE scores for all models
baseline_roberta_rouge = evaluate_rouge(baseline_roberta_model, baseline_roberta_tokenizer, train_df)
fine_tuned_roberta_rouge = evaluate_rouge(roberta_model, roberta_tokenizer, train_df)

baseline_mdeberta_rouge = evaluate_rouge(baseline_mdeberta_model, baseline_mdeberta_tokenizer, train_df)
fine_tuned_mdeberta_rouge = evaluate_rouge(mdeberta_model, mdeberta_tokenizer, train_df)

baseline_distilbert_rouge = evaluate_rouge(baseline_distilbert_model, baseline_distilbert_tokenizer, train_df)
fine_tuned_distilbert_rouge = evaluate_rouge(distilbert_model, distilbert_tokenizer, train_df)

# Print results
print("📊 Baseline RoBERTa ROUGE:", baseline_roberta_rouge)
print("📊 Fine-Tuned RoBERTa ROUGE:", fine_tuned_roberta_rouge)

print("📊 Baseline mDeBERTa ROUGE:", baseline_mdeberta_rouge)
print("📊 Fine-Tuned mDeBERTa ROUGE:", fine_tuned_mdeberta_rouge)

print("📊 Baseline DistilBERT ROUGE:", baseline_distilbert_rouge)
print("📊 Fine-Tuned DistilBERT ROUGE:", fine_tuned_distilbert_rouge)


📊 Baseline RoBERTa ROUGE: {'rouge1': 0.024399108077005946, 'rouge2': 0.005297570850202429, 'rougeL': 0.022352714105331767, 'rougeLsum': 0.022906249046111812}
📊 Fine-Tuned RoBERTa ROUGE: {'rouge1': 0.0832174688057041, 'rouge2': 0.04841775965969514, 'rougeL': 0.08039660143336613, 'rougeLsum': 0.08208655427773075}
📊 Baseline mDeBERTa ROUGE: {'rouge1': 0.056079892779767174, 'rouge2': 0.02878671812480838, 'rougeL': 0.05541923261790811, 'rougeLsum': 0.05561750693256173}
📊 Fine-Tuned mDeBERTa ROUGE: {'rouge1': 0.4885863858363858, 'rouge2': 0.3765056022408963, 'rougeL': 0.48696412411118284, 'rougeLsum': 0.48664712166918045}
📊 Baseline DistilBERT ROUGE: {'rouge1': 0.048707964302087896, 'rouge2': 0.03751635324540697, 'rougeL': 0.04887090119203191, 'rougeLsum': 0.047894458949150544}
📊 Fine-Tuned DistilBERT ROUGE: {'rouge1': 0.09864361843629485, 'rouge2': 0.05655898905589307, 'rougeL': 0.09675356952396025, 'rougeLsum': 0.09759535796220434}
