# Knowledge Extraction with open-source LLMs
## Level 0:
Choose one open-source LLM, use it as is to generate answers to the questions in the dev split of SQuAD 1.0, and evaluate the answers with the provided evaluation script.

In [None]:
# Install required libraries
!pip install datasets transformers tqdm
!pip install requests>=2.32.1
!pip install accelerate
!pip install transformers[sentencepiece]
!pip install transformers[torch]

import json
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from datasets import Dataset
import subprocess
import random
from google.colab import drive
import torch




In [None]:
# Seed for reproducibility
seed = 123
random.seed(seed)
# Seed torch on every device, not only when a GPU is present: any weights that
# are randomly initialized at load time depend on the torch RNG on CPU as well.
torch.manual_seed(seed)
if torch.cuda.is_available():
    # Seed all CUDA generators and request deterministic kernels on GPU runs.
    torch.cuda.manual_seed_all(seed)
    torch.use_deterministic_algorithms(True)

In [None]:
# Mount Google Drive at /content/drive (specific to the Google Colab environment).
# The data, evaluation script and result paths used in later cells live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

# Load the SQuAD dev set (dev-v2.0.json) from Google Drive.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open('/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json', encoding='utf-8') as g:
    dev_data = json.load(g)


In [None]:
def transform_data(data):
    """Flatten SQuAD-style nested JSON into a column-oriented dictionary.

    The input is walked as data['data'] -> article['paragraphs'] ->
    paragraph['qas']; every question contributes one entry to each column.

    Args:
        data: Dictionary with a 'data' list of articles. Each article has a
            'title' and a list of 'paragraphs'; each paragraph has a
            'context' and a list of 'qas'; each qa has 'id', 'question' and
            a list of 'answers' with 'text' and 'answer_start'.

    Returns:
        Dictionary with the keys 'id', 'title', 'context', 'question' and
        'answers'. All five lists have one entry per question. Each
        'answers' entry is a dict holding the parallel lists 'text' and
        'answer_start' (both empty when the question has no answers).
    """
    columns = ('id', 'title', 'context', 'question', 'answers')
    transformed_data = {column: [] for column in columns}

    for article in data['data']:
        title = article['title']
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                # Values are listed in the same order as `columns`.
                row = (
                    qa['id'],
                    title,
                    context,
                    qa['question'],
                    {
                        'text': [answer['text'] for answer in qa['answers']],
                        'answer_start': [answer['answer_start'] for answer in qa['answers']],
                    },
                )
                for column, value in zip(columns, row):
                    transformed_data[column].append(value)

    return transformed_data

In [None]:
# Flatten the nested dev-set JSON into one column per field
dev_transformed = transform_data(dev_data)

# Wrap the column dictionary in a Hugging Face Dataset object
dev_dataset = Dataset.from_dict(dev_transformed)

# Quick look at the dataset (features and number of rows)
dev_dataset


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 11873
})

In [None]:
# Show the first example in the dataset
dev_dataset[0]

{'id': '56ddde6b9a695914005b9628',
 'title': 'Normans',
 'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
 'question': 'In what country is Normandy located?',
 'answers': {'answer_start': [159, 159, 159, 159],
  'text': ['France', 'France', 'France', 'France']}}

In [None]:
# Choose a pre-trained model checkpoint from the Hugging Face Hub.
# NOTE(review): the recorded load output warns that the QA head weights
# ('qa_outputs.*') are newly initialized, i.e. the head is untrained and its
# values depend on the random seed. Presumably this is intentional for the
# "use it AS IS" baseline - confirm.
model_id = "albert/albert-base-v2"
#model_id = "FacebookAI/roberta-base"


# Initialize the tokenizer and the question-answering model from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForQuestionAnswering.from_pretrained(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Create a question-answering pipeline from the loaded model and tokenizer on that device
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=device)

In [None]:
## Example prediction of the model using qa_pipeline on the first dev example
q_example = dev_dataset[0]['question']
context_example = dev_dataset[0]['context']
print(f"Question: {q_example}")
print(f"from context : {context_example}")

# The pipeline returns a dict; it is printed as-is.
print(f"answer: {qa_pipeline(q_example, context_example)}")

Question: In what country is Normandy located?
from context : The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.
answer: {'score': 0.00013911955466028303, 'start': 57, 'end': 102, 'answer': 'Normanni) were the people who in the 10th and'}


In [None]:

# Run the model on every question of the dev set and collect the results.

# Predicted answer text, keyed by question id
preds = {}

# Estimated probability that the question has no answer, keyed by question id
pred_probs = {}

# Each dataset row is one question/context pair; predict and store it under the
# same id the evaluation data uses.
for counter, example in enumerate(dev_dataset, start=1):
    answer = qa_pipeline(question=example['question'], context=example['context'])
    preds[example['id']] = answer['answer']
    # This dictionary is saved and passed to the evaluation script with -n,
    # where higher values mean "more likely unanswerable". The pipeline's
    # 'score' is the confidence in the predicted span, so its complement is
    # stored instead of the raw score.
    pred_probs[example['id']] = 1.0 - answer['score']
    # Lightweight progress report
    if counter % 1000 == 0:
        print(counter)



You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000


In [None]:
# Paths to the evaluation script, its input data and its result files,
# all located under the challenge folder on the mounted Google Drive.
_challenge_dir = '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge'
_squad_dir = _challenge_dir + '/SQuAD'
_results_dir = _challenge_dir + '/Results'

# Inputs: evaluation script and gold dev data
eval_script_path = _squad_dir + '/evaluate-v2.0.py'
data_file_path = _squad_dir + '/dev-v2.0.json'

# Outputs: predictions, no-answer probabilities, metrics file and image directory
pred_file_path = _results_dir + '/pred_level0_bert.json'
na_prob_file_path = _results_dir + '/na_probs_level0_bert.json'
out_file_path = _results_dir + '/eval_level0_bert.json'
out_image_dir = _results_dir + '/images_level0_bert'

In [None]:

# Write the predictions and then the scores to their JSON files, one file each
for path, payload in ((pred_file_path, preds), (na_prob_file_path, pred_probs)):
    with open(path, 'w') as f:
        json.dump(payload, f)


In [None]:
# Build the argument list for the evaluation script:
# positional arguments first (gold data file, prediction file) ...
command = ['python', eval_script_path, data_file_path, pred_file_path]
# ... followed by the option flags and their values.
command += ['-n', na_prob_file_path]
command += ['-o', out_file_path]
command += ['-p', out_image_dir]

In [None]:
# Run the evaluation script as a subprocess.
# check=True raises CalledProcessError if the script exits with a non-zero status.
subprocess.run(command, check=True)

CompletedProcess(args=['python', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/evaluate-v2.0.py', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/SQuAD/dev-v2.0.json', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/Results/pred_level0_bert.json', '-n', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/Results/na_probs_level0_bert.json', '-o', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/Results/eval_level0_bert.json', '-p', '/content/drive/MyDrive/Colab Notebooks/E.ON_Data_Challenge/Results/images_level0_bert'], returncode=0)