In [28]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [29]:
!pip install datasets transformers torch torchvision torchaudio evaluate tqdm
!pip install requests>=2.32.1
!pip install accelerate
!pip install transformers[sentencepiece]
!pip install accelerate -U
!pip install transformers[torch]
!pip install sacrebleu
!pip install nltk
!pip install rouge
!pip install evaluate rouge-score

Collecting rouge-score
  Using cached rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=0fc47cdac3f74e00f5264c64da3f949e16430a708fad01271c77a1365f50131c
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [1]:
import datasets
import json
import numpy as np
import evaluate
import torch
import torch.nn.functional as F
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
import collections
from tqdm.auto import tqdm
from transformers import TrainingArguments
from transformers import Trainer
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from rouge import Rouge

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the SQuAD JSON files
with open('/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/train-v2.0.json') as f:
    train_data = json.load(f)

with open('/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json') as g:
    dev_data = json.load(g)

# Extract the 'data' field which contains the actual dataset
training_data = train_data['data']
validation_data = dev_data['data']

In [4]:
# Overall, this function takes raw data with articles, questions, and answers,
# and transforms it into a structured dictionary separating titles, contexts,
# questions, answer texts, and answer starting positions.

# Function to transform the data into the required format
def transform_data(data):
    transformed_data = {
        'id': [],
        'title': [],
        'context': [],
        'question': [],
        'answers': []
    }
    for article in data['data']:
        title = article['title']
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                transformed_data['id'].append(qa['id'])
                transformed_data['title'].append(title)
                transformed_data['context'].append(context)
                transformed_data['question'].append(qa['question'])
                transformed_data['answers'].append({
                    'text': [answer['text'] for answer in qa['answers']],
                    'answer_start': [answer['answer_start'] for answer in qa['answers']]
                })
    return transformed_data


In [5]:
# Transform the data
train_transformed = transform_data(train_data)
dev_transformed = transform_data(dev_data)

# Create Dataset objects
train_dataset = Dataset.from_dict(train_transformed)
dev_dataset = Dataset.from_dict(dev_transformed)

# Create a DatasetDict with the Dataset objects to prepare it processing
raw_datasets = DatasetDict({
    'train': train_dataset,
    'validation': dev_dataset
})

In [6]:
def remove_empty_answers(dataset):
    def is_valid_example(example):
        return len(example['answers']['text']) > 0

    return dataset.filter(is_valid_example)

In [7]:
# filtering out examples (question-answer pairs) where there are no answers.
# This ensures your training data focuses on questions with actual answers.

train_dataset_cleaned = remove_empty_answers(raw_datasets["train"])

Filter:   0%|          | 0/130319 [00:00<?, ? examples/s]

In [8]:
# bring the train and validation sets together in a single dictionary

raw_datasets_processed = DatasetDict({
    'train': train_dataset_cleaned,
    'validation': raw_datasets["validation"]
})

In [9]:
raw_datasets["validation"]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 11873
})

In [10]:
# preprocesses validation data for question answering in a similar way to the training function.
# It performs tokenization, handles potential splitting due to the sliding window,
# and prepares the data for evaluation by the model.

def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []
    contexts = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])
        contexts.append(examples["context"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    inputs["context"] = contexts
    return inputs

In [11]:
def make_predictions(model, dataset, batch_size=16, no_answer_threshold=0.5):
    model.eval()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    predictions = []

    with torch.no_grad():
        for i in tqdm(range(0, len(dataset), batch_size)):
            batch = dataset[i:i + batch_size]

            # Prepare inputs and move them to the correct device
            inputs = {
                key: torch.tensor(val).to(device)
                for key, val in batch.items()
                if key in ["input_ids", "attention_mask"]
            }

            # Forward pass through the model
            outputs = model(**inputs)

            start_logits = outputs.start_logits.cpu().numpy()
            end_logits = outputs.end_logits.cpu().numpy()

            # Calculate no-answer probability using max logits
            no_answer_prob = F.softmax(torch.tensor(np.maximum(start_logits[:, 0], end_logits[:, 0])), dim=-1).cpu().numpy()

            for j, example_id in enumerate(batch["example_id"]):
                start_idx = np.argmax(start_logits[j])
                end_idx = np.argmax(end_logits[j])
                no_ans_prob = no_answer_prob[j]

                # Check if the no-answer probability exceeds the threshold
                if no_ans_prob > no_answer_threshold:
                    prediction = (example_id, None, None, no_ans_prob)
                else:
                    prediction = (example_id, start_idx, end_idx, no_ans_prob)

                predictions.append(prediction)

    return predictions

In [12]:
def postprocess_predictions(dataset, predictions):
    # Aggregate predictions by example_id
    example_predictions = collections.defaultdict(list)
    for example, (example_id, start_idx, end_idx, no_answer_prob) in zip(dataset, predictions):
        offsets = example["offset_mapping"]
        context = example["context"]

        if start_idx is None or end_idx is None or start_idx >= len(offsets) or end_idx >= len(offsets):
            text = ""
        else:
            while start_idx < len(offsets) and offsets[start_idx] is None:
                start_idx += 1
            while end_idx >= 0 and offsets[end_idx] is None:
                end_idx -= 1

            if start_idx >= len(offsets) or end_idx < 0 or offsets[start_idx] is None or offsets[end_idx] is None:
                text = ""
            else:
                start_char = offsets[start_idx][0]
                end_char = offsets[end_idx][1]
                text = context[start_char:end_char]

        example_predictions[example_id].append({
            "text": text,
            "no_answer_prob": no_answer_prob
        })

    # Choose the best prediction per example_id (e.g., the one with the lowest no_answer_prob)
    formatted_predictions = [
        {
            "id": example_id,
            "answer": min(preds, key=lambda x: x["no_answer_prob"])["text"],
            "no_answer_probability": min(preds, key=lambda x: x["no_answer_prob"])["no_answer_prob"]
        }
        for example_id, preds in example_predictions.items()
    ]

    return formatted_predictions

In [13]:
def save_predictions(predictions, filename):
    with open(filename, 'w') as f:
        json.dump(predictions, f)

In [14]:
def save_na_probs(na_probs_dict, filename):
    # Convert numpy float32 to regular Python floats for JSON serialization
    modified_dict = {k: float(v) for k, v in na_probs_dict.items()}
    with open(filename, 'w') as f:
        json.dump(modified_dict, f, indent=2)

In [15]:
# Overall, this code is initializing a tokenizer specifically designed

model_checkpoint = "ozgurkk/bert-base-uncased-finetuned-squad_v1"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [16]:

max_length = 384
stride = 128
validation_dataset = raw_datasets_processed["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [17]:
validation_dataset

Dataset({
    features: ['context', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 12134
})

In [18]:
# Load the pretrained model for question answering.

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

config.json:   0%|          | 0.00/673 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [19]:
predictions = make_predictions(model, validation_dataset)

  0%|          | 0/759 [00:00<?, ?it/s]

In [20]:
final_predictions = postprocess_predictions(validation_dataset, predictions)

# Create the no-answer probabilities dictionary
na_probs_dict = {pred['id']: pred['no_answer_probability'] for pred in final_predictions}

# Convert to dictionary format
pred_dict = {pred['id']: pred['answer'] for pred in final_predictions}

In [21]:
# Save your predictions to a file in your mounted drive
save_predictions(pred_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_bert_fine_tuned_squad.json')

# Save your predictions to a file in your mounted drive
save_na_probs(na_probs_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_bert_fine_tuned_squad.json')


In [22]:
# Paths to the evaluation script and data files
data_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json'
eval_script_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/evaluate-v2.0.py'
pred_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_bert_fine_tuned_squad.json'
na_prob_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_bert_fine_tuned_squad.json'
out_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_bert_fine_tuned_squad.json'
out_image_dir = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/images_bert_fine_tuned_squad'

# Command to run the evaluation script

command = [
    'python', eval_script_path,
    data_file_path,
    pred_file_path,
    '-n', na_prob_file_path,
    '-o', out_file_path,
    '-p', out_image_dir
]
import subprocess
# Use the evaluation script
subprocess.run(command, check=True)

CompletedProcess(args=['python', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/evaluate-v2.0.py', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_bert_fine_tuned_squad.json', '-n', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_bert_fine_tuned_squad.json', '-o', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_bert_fine_tuned_squad.json', '-p', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/images_bert_fine_tuned_squad'], returncode=0)

In [23]:
json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_bert_fine_tuned_squad.json'
with open(json_file_path, 'r') as f:
    pred_dict = json.load(f)

with open(json_file_path, 'w') as f:
    json.dump(pred_dict, f, indent=2)

In [24]:
# Extract references and hypotheses
references = []
hypotheses = []

for article in dev_data['data']:
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            id = qa['id']
            if qa['answers']:
              ground_truth_answers = [answer['text'] for answer in qa['answers']]
              if id in pred_dict:
                predicted_answer = pred_dict[id]
                # Each reference should be a list of lists of strings
                references.append(ground_truth_answers)
                hypotheses.append(predicted_answer)
print(references)
print(hypotheses)

['France', '', 'Denmark, Iceland and Norway', 'Rollo', '10th', 'William the Conqueror, led to the Norman conquest of England', '', 'Christian piety, becoming exponents of the Catholic orthodoxy', 'Northman', '9th century', 'In the course of the 10th century, the initially destructive incursions of Norse war bands into the rivers of France evolved into more permanent encampments that included local women and personal property. The Duchy of Normandy, which began in 911', '', '', '', 'Christianity', 'north', 'The Normans thereafter adopted the growing feudal doctrines of the rest of France, and worked them into a functional hierarchical system in both Normandy and in England. The new Norman rulers were culturally and ethnically distinct from the old French aristocracy, most of whom traced their lineage to Franks of the Carolingian dynasty. Most Norman knights remained poor and land-hungry, and by 1066 Normandy had been exporting fighting horsemen', 'the Pechenegs, the Bulgars, and especia

In [27]:
from evaluate import load
# Convert each hypothesis to a list to match bleu's expected format
hypotheses_bleu = hypotheses
references_bleu = references
print(hypotheses_bleu)
print(references_bleu)

bleu = evaluate.load('bleu')

bleu_scores = bleu.compute(predictions= hypotheses_bleu, references= references_bleu,
          max_order = 2)

print(f"BLEU score: {bleu_scores}")

# Function to compute ROUGE scores
def compute_rouge(predictions, references):
    rouge = Rouge()
    scores = rouge.get_scores(predictions, references, avg=True)
    return scores

# Filter out empty hypotheses and corresponding references
filtered_hypotheses = [hyp for hyp in hypotheses if hyp]
filtered_references = [ref for hyp, ref in zip(hypotheses, references) if hyp]

# Ensure that references are flat
flat_references = [ref[0] if isinstance(ref, list) else ref for ref in filtered_references]

print(filtered_hypotheses)
print(flat_references)


# Compute ROUGE scores
rouge = evaluate.load('rouge')

rouge_scores = rouge.compute(predictions=filtered_hypotheses, references=flat_references)
print(f"ROUGE scores: {rouge_scores}")


['France', '', 'Denmark, Iceland and Norway', 'Rollo', '10th', 'William the Conqueror, led to the Norman conquest of England', '', 'Christian piety, becoming exponents of the Catholic orthodoxy', 'Northman', '9th century', 'In the course of the 10th century, the initially destructive incursions of Norse war bands into the rivers of France evolved into more permanent encampments that included local women and personal property. The Duchy of Normandy, which began in 911', '', '', '', 'Christianity', 'north', 'The Normans thereafter adopted the growing feudal doctrines of the rest of France, and worked them into a functional hierarchical system in both Normandy and in England. The new Norman rulers were culturally and ethnically distinct from the old French aristocracy, most of whom traced their lineage to Franks of the Carolingian dynasty. Most Norman knights remained poor and land-hungry, and by 1066 Normandy had been exporting fighting horsemen', 'the Pechenegs, the Bulgars, and especia

ImportError: To be able to use evaluate-metric/rouge, you need to install the following dependencies['rouge_score'] using 'pip install rouge_score' for instance'

In [None]:
# Path to the existing JSON file
json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_bert_fine_tuned_squad.json'

# Load the existing JSON data
with open(json_file_path, 'r') as f:
    data = json.load(f)

data['BLEU'] = bleu_scores
data['ROUGE'] = rouge_scores

# Save the updated JSON data back to the file
with open(json_file_path, 'w') as f:
    json.dump(data, f, indent=2)  # indent=2 for pretty formatting

print(f"Updated JSON saved to {json_file_path}")


In [None]:
# Creating metrics for bert - level 0

model_checkpoint = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


max_length = 384
stride = 128
validation_dataset = raw_datasets_processed["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

predictions = make_predictions(model, validation_dataset)
final_predictions = postprocess_predictions(validation_dataset, predictions)

na_probs_dict = {pred['id']: pred['no_answer_probability'] for pred in final_predictions}
pred_dict = {pred['id']: pred['answer'] for pred in final_predictions}

save_predictions(pred_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_bert_level0.json')
save_na_probs(na_probs_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_bert_level0.json')

eval_script_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/evaluate-v2.0.py'
data_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json'
pred_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_bert_level0.json'
na_prob_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_bert_level0.json'
out_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_bert_level0.json'
out_image_dir = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/images_bert_level0'

command = [
    'python', eval_script_path,
    data_file_path,
    pred_file_path,
    '-n', na_prob_file_path,
    '-o', out_file_path,
    '-p', out_image_dir
]
subprocess.run(command, check=True)


references = []
hypotheses = []

for article in dev_data['data']:
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            id = qa['id']
            if qa['answers']:
              ground_truth_answers = [answer['text'] for answer in qa['answers']]
              if id in pred_dict:
                predicted_answer = pred_dict[id]
                references.append(ground_truth_answers)
                hypotheses.append(predicted_answer)

hypotheses_bleu = hypotheses
references_bleu = references

bleu = evaluate.load('bleu')
bleu_scores = bleu.compute(predictions= hypotheses_bleu, references= references_bleu,
          max_order = 2)

filtered_hypotheses = [hyp for hyp in hypotheses if hyp]
filtered_references = [ref for hyp, ref in zip(hypotheses, references) if hyp]
flat_references = [ref[0] if isinstance(ref, list) else ref for ref in filtered_references]

rouge = evaluate.load('rouge')
rouge_scores = rouge.compute(predictions=filtered_hypotheses, references=flat_references)


json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_bert_level0.json'
with open(json_file_path, 'r') as f:
    data = json.load(f)

data['BLEU'] = bleu_scores
data['ROUGE'] = rouge_scores

with open(json_file_path, 'w') as f:
    json.dump(data, f, indent=2)

In [None]:
# Creating metrics for roberta - level 0

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

model_checkpoint = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


max_length = 384
stride = 128
validation_dataset = raw_datasets_processed["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

predictions = make_predictions(model, validation_dataset)
final_predictions = postprocess_predictions(validation_dataset, predictions)

na_probs_dict = {pred['id']: pred['no_answer_probability'] for pred in final_predictions}
pred_dict = {pred['id']: pred['answer'] for pred in final_predictions}

save_predictions(pred_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_roberta_level0.json')
save_na_probs(na_probs_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_roberta_level0.json')

eval_script_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/evaluate-v2.0.py'
data_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json'
pred_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_roberta_level0.json'
na_prob_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_roberta_level0.json'
out_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_roberta_level0.json'
out_image_dir = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/images_roberta_level0'

command = [
    'python', eval_script_path,
    data_file_path,
    pred_file_path,
    '-n', na_prob_file_path,
    '-o', out_file_path,
    '-p', out_image_dir
]
subprocess.run(command, check=True)

json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_roberta_level0.json'
with open(json_file_path, 'r') as f:
    pred_dict = json.load(f)

with open(json_file_path, 'w') as f:
    json.dump(pred_dict, f, indent=2)

references = []
hypotheses = []

for article in dev_data['data']:
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            id = qa['id']
            if qa['answers']:
              ground_truth_answers = [answer['text'] for answer in qa['answers']]
              if id in pred_dict:
                predicted_answer = pred_dict[id]
                references.append(ground_truth_answers)
                hypotheses.append(predicted_answer)

hypotheses_bleu = hypotheses
references_bleu = references

bleu = evaluate.load('bleu')
bleu_scores = bleu.compute(predictions= hypotheses_bleu, references= references_bleu,
          max_order = 2)

filtered_hypotheses = [hyp for hyp in hypotheses if hyp]
filtered_references = [ref for hyp, ref in zip(hypotheses, references) if hyp]
flat_references = [ref[0] if isinstance(ref, list) else ref for ref in filtered_references]

rouge = evaluate.load('rouge')
rouge_scores = rouge.compute(predictions=filtered_hypotheses, references=flat_references)


json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_roberta_level0.json'
with open(json_file_path, 'r') as f:
    data = json.load(f)

data['BLEU'] = bleu_scores
data['ROUGE'] = rouge_scores

with open(json_file_path, 'w') as f:
    json.dump(data, f, indent=2)

In [None]:
# Creating metrics for albert - level 0

model_checkpoint = "albert/albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


max_length = 384
stride = 128
validation_dataset = raw_datasets_processed["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

predictions = make_predictions(model, validation_dataset)
final_predictions = postprocess_predictions(validation_dataset, predictions)

na_probs_dict = {pred['id']: pred['no_answer_probability'] for pred in final_predictions}
pred_dict = {pred['id']: pred['answer'] for pred in final_predictions}

save_predictions(pred_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_albert_level0.json')
save_na_probs(na_probs_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_albert_level0.json')

eval_script_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/evaluate-v2.0.py'
data_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json'
pred_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_albert_level0.json'
na_prob_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_albert_level0.json'
out_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_albert_level0.json'
out_image_dir = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/images_albert_level0'

command = [
    'python', eval_script_path,
    data_file_path,
    pred_file_path,
    '-n', na_prob_file_path,
    '-o', out_file_path,
    '-p', out_image_dir
]
subprocess.run(command, check=True)

json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_albert_level0.json'
with open(json_file_path, 'r') as f:
    pred_dict = json.load(f)

with open(json_file_path, 'w') as f:
    json.dump(pred_dict, f, indent=2)

references = []
hypotheses = []

for article in dev_data['data']:
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            id = qa['id']
            if qa['answers']:
              ground_truth_answers = [answer['text'] for answer in qa['answers']]
              if id in pred_dict:
                predicted_answer = pred_dict[id]
                references.append(ground_truth_answers)
                hypotheses.append(predicted_answer)

print(hypotheses)
print(references)

hypotheses_bleu = hypotheses
references_bleu = references

bleu = evaluate.load('bleu')
bleu_scores = bleu.compute(predictions= hypotheses_bleu, references= references_bleu,
          max_order = 2)

filtered_hypotheses = [hyp for hyp in hypotheses if hyp]
filtered_references = [ref for hyp, ref in zip(hypotheses, references) if hyp]
flat_references = [ref[0] if isinstance(ref, list) else ref for ref in filtered_references]

rouge = evaluate.load('rouge')
rouge_scores = rouge.compute(predictions=filtered_hypotheses, references=flat_references)


json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_albert_level0.json'
with open(json_file_path, 'r') as f:
    data = json.load(f)

data['BLEU'] = bleu_scores
data['ROUGE'] = rouge_scores

with open(json_file_path, 'w') as f:
    json.dump(data, f, indent=2)

In [None]:
# Creating metrics for roberta - level 1

model_checkpoint = "f-arnold/roberta-base-finetuned-squad"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

max_length = 384
stride = 128
validation_dataset = raw_datasets_processed["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

predictions = make_predictions(model, validation_dataset)
final_predictions = postprocess_predictions(validation_dataset, predictions)

na_probs_dict = {pred['id']: pred['no_answer_probability'] for pred in final_predictions}
pred_dict = {pred['id']: pred['answer'] for pred in final_predictions}

save_predictions(pred_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_roberta_level1.json')
save_na_probs(na_probs_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_roberta_level1.json')

eval_script_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/evaluate-v2.0.py'
data_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json'
pred_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_roberta_level1.json'
na_prob_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_roberta_level1.json'
out_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_roberta_level1.json'
out_image_dir = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/images_roberta_level1'

command = [
    'python', eval_script_path,
    data_file_path,
    pred_file_path,
    '-n', na_prob_file_path,
    '-o', out_file_path,
    '-p', out_image_dir
]
subprocess.run(command, check=True)

json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_roberta_level1.json'
with open(json_file_path, 'r') as f:
    pred_dict = json.load(f)

with open(json_file_path, 'w') as f:
    json.dump(pred_dict, f, indent=2)

references = []
hypotheses = []

for article in dev_data['data']:
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            id = qa['id']
            if qa['answers']:
              ground_truth_answers = [answer['text'] for answer in qa['answers']]
              if id in pred_dict:
                predicted_answer = pred_dict[id]
                references.append(ground_truth_answers)
                hypotheses.append(predicted_answer)

hypotheses_bleu = hypotheses
references_bleu = references

bleu = evaluate.load('bleu')
bleu_scores = bleu.compute(predictions= hypotheses_bleu, references= references_bleu,
          max_order = 2)

filtered_hypotheses = [hyp for hyp in hypotheses if hyp]
filtered_references = [ref for hyp, ref in zip(hypotheses, references) if hyp]
flat_references = [ref[0] if isinstance(ref, list) else ref for ref in filtered_references]

rouge = evaluate.load('rouge')
rouge_scores = rouge.compute(predictions=filtered_hypotheses, references=flat_references)


json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_roberta_level1.json'
with open(json_file_path, 'r') as f:
    data = json.load(f)

data['BLEU'] = bleu_scores
data['ROUGE'] = rouge_scores

with open(json_file_path, 'w') as f:
    json.dump(data, f, indent=2)

In [None]:
# Creating metrics for albert - level 1

model_checkpoint = "TugceCaglayan/albert-base-v2-finetuned-squad"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


max_length = 384
stride = 128
validation_dataset = raw_datasets_processed["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

predictions = make_predictions(model, validation_dataset)
final_predictions = postprocess_predictions(validation_dataset, predictions)

na_probs_dict = {pred['id']: pred['no_answer_probability'] for pred in final_predictions}
pred_dict = {pred['id']: pred['answer'] for pred in final_predictions}

save_predictions(pred_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_albert_level1.json')
save_na_probs(na_probs_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_albert_level1.json')

eval_script_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/evaluate-v2.0.py'
data_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json'
pred_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_albert_level1.json'
na_prob_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_albert_level1.json'
out_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_albert_level1.json'
out_image_dir = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/images_albert_level1'

command = [
    'python', eval_script_path,
    data_file_path,
    pred_file_path,
    '-n', na_prob_file_path,
    '-o', out_file_path,
    '-p', out_image_dir
]
subprocess.run(command, check=True)

json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_albert_level1.json'
with open(json_file_path, 'r') as f:
    pred_dict = json.load(f)

with open(json_file_path, 'w') as f:
    json.dump(pred_dict, f, indent=2)

references = []
hypotheses = []

for article in dev_data['data']:
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            id = qa['id']
            if qa['answers']:
              ground_truth_answers = [answer['text'] for answer in qa['answers']]
              if id in pred_dict:
                predicted_answer = pred_dict[id]
                references.append(ground_truth_answers)
                hypotheses.append(predicted_answer)

hypotheses_bleu = hypotheses
references_bleu = references

bleu = evaluate.load('bleu')
bleu_scores = bleu.compute(predictions= hypotheses_bleu, references= references_bleu,
          max_order = 2)

filtered_hypotheses = [hyp for hyp in hypotheses if hyp]
filtered_references = [ref for hyp, ref in zip(hypotheses, references) if hyp]
flat_references = [ref[0] if isinstance(ref, list) else ref for ref in filtered_references]

rouge = evaluate.load('rouge')
rouge_scores = rouge.compute(predictions=filtered_hypotheses, references=flat_references)


json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_albert_level1.json'
with open(json_file_path, 'r') as f:
    data = json.load(f)

data['BLEU'] = bleu_scores
data['ROUGE'] = rouge_scores

with open(json_file_path, 'w') as f:
    json.dump(data, f, indent=2)

In [None]:
# Creating metrics for roberta base - level 3

model_checkpoint = "ozgurkk/roberta-base-finetuned-squad-v3"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

max_length = 384
stride = 128
validation_dataset = raw_datasets_processed["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)



model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

predictions = make_predictions(model, validation_dataset)
final_predictions = postprocess_predictions(validation_dataset, predictions)

na_probs_dict = {pred['id']: pred['no_answer_probability'] for pred in final_predictions}
pred_dict = {pred['id']: pred['answer'] for pred in final_predictions}

save_predictions(pred_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_roberta_level3.json')
save_na_probs(na_probs_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_roberta_level3.json')

eval_script_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/evaluate-v2.0.py'
data_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json'
pred_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_roberta_level3.json'
na_prob_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_roberta_level3.json'
out_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_roberta_level3.json'
out_image_dir = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/images_roberta_level3'

command = [
    'python', eval_script_path,
    data_file_path,
    pred_file_path,
    '-n', na_prob_file_path,
    '-o', out_file_path,
    '-p', out_image_dir
]

import subprocess
subprocess.run(command, check=True)

json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_roberta_level3.json'
with open(json_file_path, 'r') as f:
    pred_dict = json.load(f)

with open(json_file_path, 'w') as f:
    json.dump(pred_dict, f, indent=2)

references = []
hypotheses = []

for article in dev_data['data']:
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            id = qa['id']
            if qa['answers']:
              ground_truth_answers = [answer['text'] for answer in qa['answers']]
              if id in pred_dict:
                predicted_answer = pred_dict[id]
                references.append(ground_truth_answers)
                hypotheses.append(predicted_answer)

print(hypotheses)
print(references)

hypotheses_bleu = hypotheses
references_bleu = references

bleu = evaluate.load('bleu')
bleu_scores = bleu.compute(predictions= hypotheses_bleu, references= references_bleu,
          max_order = 2)

filtered_hypotheses = [hyp for hyp in hypotheses if hyp]
filtered_references = [ref for hyp, ref in zip(hypotheses, references) if hyp]
flat_references = [ref[0] if isinstance(ref, list) else ref for ref in filtered_references]

rouge = evaluate.load('rouge')
rouge_scores = rouge.compute(predictions=filtered_hypotheses, references=flat_references)


json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_roberta_level3.json'
with open(json_file_path, 'r') as f:
    data = json.load(f)

data['BLEU'] = bleu_scores
data['ROUGE'] = rouge_scores

with open(json_file_path, 'w') as f:
    json.dump(data, f, indent=2)

In [None]:
# Creating metrics for roberta large - level 3

model_checkpoint = "ozgurkk/roberta-large-finetuned-squad"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

max_length = 384
stride = 128
validation_dataset = raw_datasets_processed["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

predictions = make_predictions(model, validation_dataset)
final_predictions = postprocess_predictions(validation_dataset, predictions)

na_probs_dict = {pred['id']: pred['no_answer_probability'] for pred in final_predictions}
pred_dict = {pred['id']: pred['answer'] for pred in final_predictions}

save_predictions(pred_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_roberta_large_level3.json')
save_na_probs(na_probs_dict, '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_roberta_large_level3.json')

eval_script_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/evaluate-v2.0.py'
data_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json'
pred_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_roberta_large_level3.json'
na_prob_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/na_probs_roberta_large_level3.json'
out_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_roberta_large_level3.json'
out_image_dir = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/images_roberta_large_level3'

command = [
    'python', eval_script_path,
    data_file_path,
    pred_file_path,
    '-n', na_prob_file_path,
    '-o', out_file_path,
    '-p', out_image_dir
]

import subprocess
subprocess.run(command, check=True)

json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/pred_roberta_large_level3.json'
with open(json_file_path, 'r') as f:
    pred_dict = json.load(f)

with open(json_file_path, 'w') as f:
    json.dump(pred_dict, f, indent=2)

!python "/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/evaluate-v2.0.py" \
    "/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json" \
    "/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/pred_v2.json" \
    --out-file "/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/eval_result_v2.json"

references = []
hypotheses = []

for article in dev_data['data']:
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            id = qa['id']
            if qa['answers']:
              ground_truth_answers = [answer['text'] for answer in qa['answers']]
              if id in pred_dict:
                predicted_answer = pred_dict[id]
                references.append(ground_truth_answers)
                hypotheses.append(predicted_answer)

hypotheses_bleu = hypotheses
references_bleu = references

bleu = evaluate.load('bleu')
bleu_scores = bleu.compute(predictions= hypotheses_bleu, references= references_bleu,
          max_order = 2)

filtered_hypotheses = [hyp for hyp in hypotheses if hyp]
filtered_references = [ref for hyp, ref in zip(hypotheses, references) if hyp]
flat_references = [ref[0] if isinstance(ref, list) else ref for ref in filtered_references]

rouge = evaluate.load('rouge')
rouge_scores = rouge.compute(predictions=filtered_hypotheses, references=flat_references)


json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_roberta_large_level3.json'
with open(json_file_path, 'r') as f:
    data = json.load(f)

data['BLEU'] = bleu_scores
data['ROUGE'] = rouge_scores

with open(json_file_path, 'w') as f:
    json.dump(data, f, indent=2)

In [None]:
import json
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Path to the JSON file
json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_bert_level0.json'


# Load the JSON data
with open(json_file_path, 'r') as f:
    data = json.load(f)

# Extract relevant metrics
exact = data['exact']
f1 = data['f1']
has_ans_exact = data['HasAns_exact']
has_ans_f1 = data['HasAns_f1']
no_ans_exact = data['NoAns_exact']
no_ans_f1 = data['NoAns_f1']
bleu = data['BLEU']['bleu']
rouge1 = data['ROUGE']['rouge1']
rouge2 = data['ROUGE']['rouge2']
rougeL = data['ROUGE']['rougeL']

# Prepare data for visualization
metrics_bert_level0 = {
    'Exact Match': exact,
    'F1 Score': f1,
    'Has Answer Exact': has_ans_exact,
    'Has Answer F1': has_ans_f1,
    'No Answer Exact': no_ans_exact,
    'No Answer F1': no_ans_f1
}

add_metrics_bert_level0 = {
    'BLEU': bleu,
    'ROUGE-1': rouge1,
    'ROUGE-2': rouge2,
    'ROUGE-L': rougeL
}

json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_roberta_level0.json'


# Load the JSON data
with open(json_file_path, 'r') as f:
    data = json.load(f)

# Extract relevant metrics
exact = data['exact']
f1 = data['f1']
has_ans_exact = data['HasAns_exact']
has_ans_f1 = data['HasAns_f1']
no_ans_exact = data['NoAns_exact']
no_ans_f1 = data['NoAns_f1']
bleu = data['BLEU']['bleu']
rouge1 = data['ROUGE']['rouge1']
rouge2 = data['ROUGE']['rouge2']
rougeL = data['ROUGE']['rougeL']

# Prepare data for visualization
metrics_roberta_level0 = {
    'Exact Match': exact,
    'F1 Score': f1,
    'Has Answer Exact': has_ans_exact,
    'Has Answer F1': has_ans_f1,
    'No Answer Exact': no_ans_exact,
    'No Answer F1': no_ans_f1
}

add_metrics_roberta_level0 = {
    'BLEU': bleu,
    'ROUGE-1': rouge1,
    'ROUGE-2': rouge2,
    'ROUGE-L': rougeL
}


json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_albert_level0.json'


# Load the JSON data
with open(json_file_path, 'r') as f:
    data = json.load(f)

# Extract relevant metrics
exact = data['exact']
f1 = data['f1']
has_ans_exact = data['HasAns_exact']
has_ans_f1 = data['HasAns_f1']
no_ans_exact = data['NoAns_exact']
no_ans_f1 = data['NoAns_f1']
bleu = data['BLEU']['bleu']
rouge1 = data['ROUGE']['rouge1']
rouge2 = data['ROUGE']['rouge2']
rougeL = data['ROUGE']['rougeL']

# Prepare data for visualization
metrics_albert_level0 = {
    'Exact Match': exact,
    'F1 Score': f1,
    'Has Answer Exact': has_ans_exact,
    'Has Answer F1': has_ans_f1,
    'No Answer Exact': no_ans_exact,
    'No Answer F1': no_ans_f1
}

add_metrics_albert_level0 = {
    'BLEU': bleu,
    'ROUGE-1': rouge1,
    'ROUGE-2': rouge2,
    'ROUGE-L': rougeL
}


json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_bert_fine_tuned_squad.json'


# Load the JSON data
with open(json_file_path, 'r') as f:
    data = json.load(f)

# Extract relevant metrics
exact = data['exact']
f1 = data['f1']
has_ans_exact = data['HasAns_exact']
has_ans_f1 = data['HasAns_f1']
no_ans_exact = data['NoAns_exact']
no_ans_f1 = data['NoAns_f1']
bleu = data['BLEU']['bleu']
rouge1 = data['ROUGE']['rouge1']
rouge2 = data['ROUGE']['rouge2']
rougeL = data['ROUGE']['rougeL']

# Prepare data for visualization
metrics_bert_level1 = {
    'Exact Match': exact,
    'F1 Score': f1,
    'Has Answer Exact': has_ans_exact,
    'Has Answer F1': has_ans_f1,
    'No Answer Exact': no_ans_exact,
    'No Answer F1': no_ans_f1
}

add_metrics_bert_level1 = {
    'BLEU': bleu,
    'ROUGE-1': rouge1,
    'ROUGE-2': rouge2,
    'ROUGE-L': rougeL
}


json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_roberta_level1.json'


# Load the JSON data
with open(json_file_path, 'r') as f:
    data = json.load(f)

# Extract relevant metrics
exact = data['exact']
f1 = data['f1']
has_ans_exact = data['HasAns_exact']
has_ans_f1 = data['HasAns_f1']
no_ans_exact = data['NoAns_exact']
no_ans_f1 = data['NoAns_f1']
bleu = data['BLEU']['bleu']
rouge1 = data['ROUGE']['rouge1']
rouge2 = data['ROUGE']['rouge2']
rougeL = data['ROUGE']['rougeL']

# Prepare data for visualization
metrics_roberta_level1 = {
    'Exact Match': exact,
    'F1 Score': f1,
    'Has Answer Exact': has_ans_exact,
    'Has Answer F1': has_ans_f1,
    'No Answer Exact': no_ans_exact,
    'No Answer F1': no_ans_f1
}

add_metrics_roberta_level1 = {
    'BLEU': bleu,
    'ROUGE-1': rouge1,
    'ROUGE-2': rouge2,
    'ROUGE-L': rougeL
}


json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_albert_level1.json'


# Load the JSON data
with open(json_file_path, 'r') as f:
    data = json.load(f)

# Extract relevant metrics
exact = data['exact']
f1 = data['f1']
has_ans_exact = data['HasAns_exact']
has_ans_f1 = data['HasAns_f1']
no_ans_exact = data['NoAns_exact']
no_ans_f1 = data['NoAns_f1']
bleu = data['BLEU']['bleu']
rouge1 = data['ROUGE']['rouge1']
rouge2 = data['ROUGE']['rouge2']
rougeL = data['ROUGE']['rougeL']

# Prepare data for visualization
metrics_albert_level1 = {
    'Exact Match': exact,
    'F1 Score': f1,
    'Has Answer Exact': has_ans_exact,
    'Has Answer F1': has_ans_f1,
    'No Answer Exact': no_ans_exact,
    'No Answer F1': no_ans_f1
}

add_metrics_albert_level1 = {
    'BLEU': bleu,
    'ROUGE-1': rouge1,
    'ROUGE-2': rouge2,
    'ROUGE-L': rougeL
}


json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_roberta_level3.json'


# Load the JSON data
with open(json_file_path, 'r') as f:
    data = json.load(f)

# Extract relevant metrics
exact = data['exact']
f1 = data['f1']
has_ans_exact = data['HasAns_exact']
has_ans_f1 = data['HasAns_f1']
no_ans_exact = data['NoAns_exact']
no_ans_f1 = data['NoAns_f1']
bleu = data['BLEU']['bleu']
rouge1 = data['ROUGE']['rouge1']
rouge2 = data['ROUGE']['rouge2']
rougeL = data['ROUGE']['rougeL']

# Prepare data for visualization
metrics_roberta_level3 = {
    'Exact Match': exact,
    'F1 Score': f1,
    'Has Answer Exact': has_ans_exact,
    'Has Answer F1': has_ans_f1,
    'No Answer Exact': no_ans_exact,
    'No Answer F1': no_ans_f1
}

add_metrics_roberta_level3 = {
    'BLEU': bleu,
    'ROUGE-1': rouge1,
    'ROUGE-2': rouge2,
    'ROUGE-L': rougeL
}


json_file_path = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/model_outputs/eval_results_roberta_large_level3.json'


# Load the JSON data
with open(json_file_path, 'r') as f:
    data = json.load(f)

# Extract relevant metrics
exact = data['exact']
f1 = data['f1']
has_ans_exact = data['HasAns_exact']
has_ans_f1 = data['HasAns_f1']
no_ans_exact = data['NoAns_exact']
no_ans_f1 = data['NoAns_f1']
bleu = data['BLEU']['bleu']
rouge1 = data['ROUGE']['rouge1']
rouge2 = data['ROUGE']['rouge2']
rougeL = data['ROUGE']['rougeL']

# Prepare data for visualization
metrics_roberta_large_level3 = {
    'Exact Match': exact,
    'F1 Score': f1,
    'Has Answer Exact': has_ans_exact,
    'Has Answer F1': has_ans_f1,
    'No Answer Exact': no_ans_exact,
    'No Answer F1': no_ans_f1
}

add_metrics_roberta_large_level3 = {
    'BLEU': bleu,
    'ROUGE-1': rouge1,
    'ROUGE-2': rouge2,
    'ROUGE-L': rougeL
}




In [None]:
# Combine all metrics into a single DataFrame
data = {
    'Model': [],
    'Metric': [],
    'Value': []
}

# Add metrics
for metric, value in metrics_bert_level0.items():
    data['Model'].append('BERT - Level 0')
    data['Metric'].append(metric)
    data['Value'].append(value)


for metric, value in metrics_roberta_level0.items():
    data['Model'].append('RoBERTa - Level 0')
    data['Metric'].append(metric)
    data['Value'].append(value)

for metric, value in metrics_albert_level0.items():
    data['Model'].append('ALBERT - Level 0')
    data['Metric'].append(metric)
    data['Value'].append(value)


for metric, value in metrics_bert_level1.items():
    data['Model'].append('BERT - Level 1')
    data['Metric'].append(metric)
    data['Value'].append(value)

for metric, value in metrics_roberta_level1.items():
    data['Model'].append('RoBERTa - Level 1')
    data['Metric'].append(metric)
    data['Value'].append(value)


for metric, value in metrics_albert_level1.items():
    data['Model'].append('ALBERT - Level 1')
    data['Metric'].append(metric)
    data['Value'].append(value)

for metric, value in metrics_roberta_level3.items():
    data['Model'].append('RoBERTa base - Level 3')
    data['Metric'].append(metric)
    data['Value'].append(value)


for metric, value in metrics_roberta_large_level3.items():
    data['Model'].append('RoBERTa Large - Level 3')
    data['Metric'].append(metric)
    data['Value'].append(value)



df = pd.DataFrame(data)


In [None]:
# Combine all metrics into a single DataFrame
add_data = {
    'Model': [],
    'Metric': [],
    'Value': []
}

# Add metrics
for metric, value in add_metrics_bert_level0.items():
    add_data['Model'].append('BERT - Level 0')
    add_data['Metric'].append(metric)
    add_data['Value'].append(value)


for metric, value in add_metrics_roberta_level0.items():
    add_data['Model'].append('RoBERTa - Level 0')
    add_data['Metric'].append(metric)
    add_data['Value'].append(value)

for metric, value in add_metrics_albert_level0.items():
    add_data['Model'].append('ALBERT - Level 0')
    add_data['Metric'].append(metric)
    add_data['Value'].append(value)


for metric, value in add_metrics_bert_level1.items():
    add_data['Model'].append('BERT - Level 1')
    add_data['Metric'].append(metric)
    add_data['Value'].append(value)

for metric, value in add_metrics_roberta_level1.items():
    add_data['Model'].append('RoBERTa - Level 1')
    add_data['Metric'].append(metric)
    add_data['Value'].append(value)


for metric, value in add_metrics_albert_level1.items():
    add_data['Model'].append('ALBERT - Level 1')
    add_data['Metric'].append(metric)
    add_data['Value'].append(value)

for metric, value in add_metrics_roberta_level3.items():
    add_data['Model'].append('RoBERTa base - Level 3')
    add_data['Metric'].append(metric)
    add_data['Value'].append(value)


for metric, value in add_metrics_roberta_large_level3.items():
    add_data['Model'].append('RoBERTa Large - Level 3')
    add_data['Metric'].append(metric)
    add_data['Value'].append(value)


add_df = pd.DataFrame(add_data)


In [None]:
from matplotlib import rcParams

model_colors = {
    'BERT - Level 0': '#AED6F1',
    'RoBERTa - Level 0': '#21618C',
    'ALBERT - Level 0': '#2980B9',
    'BERT - Level 1': '#F1948A',
    'RoBERTa - Level 1': '#B03A2E',
    'ALBERT - Level 1': '#E74C3C',
    'RoBERTa base - Level 3': '#1D8348',
    'RoBERTa Large - Level 3': '#52BE80'

}

palette = [model_colors[model] for model in df['Model'].unique()]

sns.set(style="whitegrid")


# Create the figure and axis
plt.figure(figsize=(12, 6))

# Create line plot
lineplot = sns.lineplot(
    data=df,
    x='Metric',
    y='Value',
    hue='Model',
    palette=model_colors,  # Use the custom color palette
    markers=True,          # Show markers on the lines
    style='Model',         # Ensure style differentiation for each model
    dashes=False,          # Use solid lines for all models
    linewidth=2.5,         # Increase line thickness
    markersize=10          # Increase marker size
)

# Customize plot
plt.title('Model Performance Metrics', fontsize=12, weight='bold')
plt.xlabel('Metric', fontsize=10, weight='bold')
plt.ylabel('Value', fontsize=10, weight='bold')
plt.xticks(rotation=0)
plt.legend(
    title='Model',
    bbox_to_anchor=(0.5, -0.15),
    loc='upper center',
    ncol=4,  # Number of columns in the legend (adjust as needed)
    fontsize=8,
    title_fontsize='10'
)

# Add light grid lines only for y-axis
plt.grid(True, which='major', axis='y', linestyle='--', linewidth=0.6, color='gray', alpha=0.7)

# Remove top and right spines for a cleaner look
sns.despine()

# Show plot
plt.tight_layout()
plt.show()

In [None]:
from matplotlib import rcParams

model_colors = {
    'BERT - Level 0': '#AED6F1',
    'RoBERTa - Level 0': '#21618C',
    'ALBERT - Level 0': '#2980B9',
    'BERT - Level 1': '#F1948A',
    'RoBERTa - Level 1': '#B03A2E',
    'ALBERT - Level 1': '#E74C3C',
    'RoBERTa base - Level 3': '#1D8348',
    'RoBERTa Large - Level 3': '#52BE80'

}

palette = [model_colors[model] for model in add_df['Model'].unique()]

sns.set(style="whitegrid")


# Create the figure and axis
plt.figure(figsize=(12, 6))

# Create line plot
lineplot = sns.lineplot(
    data = add_df,
    x=' Additional Metric',
    y='Value',
    hue='Model',
    palette=model_colors,  # Use the custom color palette
    markers=True,          # Show markers on the lines
    style='Model',         # Ensure style differentiation for each model
    dashes=False,          # Use solid lines for all models
    linewidth=2.5,         # Increase line thickness
    markersize=10          # Increase marker size
)

# Customize plot
plt.title('Additional Performance Metrics', fontsize=12, weight='bold')
plt.xlabel('Metric', fontsize=10, weight='bold')
plt.ylabel('Value', fontsize=10, weight='bold')
plt.xticks(rotation=0)
plt.legend(
    title='Model',
    bbox_to_anchor=(0.5, -0.15),
    loc='upper center',
    ncol=4,  # Number of columns in the legend (adjust as needed)
    fontsize=8,
    title_fontsize='10'
)

# Add light grid lines only for y-axis
plt.grid(True, which='major', axis='y', linestyle='--', linewidth=0.6, color='gray', alpha=0.7)

# Remove top and right spines for a cleaner look
sns.despine()

# Show plot
plt.tight_layout()
plt.show()