# Data loading and data analysis

In [None]:
# TODO
# TODO: decide how the data should be loaded; one option is the Hugging Face
# `datasets` library, as described in
# https://huggingface.co/docs/datasets/v1.9.0/loading_datasets.html
from datasets import load_dataset
# Load the 'squad' dataset through the `datasets` library.
dataset = load_dataset('squad')

# Set up the generative model (open/closed model)

In [24]:
from transformers import AutoTokenizer, OPTForCausalLM, pipeline

class GenerativeModel:
    """Question answering with the ``facebook/opt-1.3b`` causal language model.

    Two prompting modes are offered:

    * "open":   the prompt contains the context followed by the question.
    * "closed": the prompt is the question alone.

    Every public method returns only the generated continuation, i.e. the
    decoded text with the prompt removed from its beginning.
    """

    def __init__(self, max_answer_length) -> None:
        """Load model and tokenizer once and build the text-generation pipeline.

        Args:
            max_answer_length: Passed as ``max_new_tokens`` to every
                generation call, i.e. the maximum number of generated tokens.
        """
        model_name = "facebook/opt-1.3b"
        self.model = OPTForCausalLM.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Pad on the left so that the generated text continues directly after
        # the prompt, exactly as it would without padding.
        self.tokenizer.padding_side = "left"
        # Hand the already loaded model/tokenizer to the pipeline instead of
        # the checkpoint name, so the weights are not loaded a second time.
        self.generator = pipeline('text-generation', model=self.model, tokenizer=self.tokenizer)
        self.max_answer_length = max_answer_length

    @staticmethod
    def _build_open_prompt(question, context):
        """Return the open-mode prompt: the context followed by the question."""
        return f"CONTEXT:\n{context}\nQUESTION:\n{question}"

    @staticmethod
    def _strip_prompts(prompts, answers):
        """Remove each prompt from the start of its corresponding generated text."""
        return [answer.removeprefix(prompt) for prompt, answer in zip(prompts, answers)]

    def get_open_model_answer(self, question, context, use_pipeline=False):
        """Answer a single question given its context.

        Args:
            question: The question text.
            context: The passage the question refers to.
            use_pipeline: If true, generate through ``self.generator``;
                otherwise call tokenizer and model directly.

        Returns:
            The generated text without the prompt.
        """
        prompt = self._build_open_prompt(question, context)
        return self._generate_answer(prompt, use_pipeline).removeprefix(prompt)

    def get_closed_model_answer(self, question, use_pipeline=False):
        """Answer a single question without any context.

        Args:
            question: The question text; it is used as the prompt unchanged.
            use_pipeline: If true, generate through ``self.generator``;
                otherwise call tokenizer and model directly.

        Returns:
            The generated text without the prompt.
        """
        prompt = question
        return self._generate_answer(prompt, use_pipeline).removeprefix(prompt)

    def get_open_batch_answers(self, questions, contexts):
        """Answer several questions, each with its own context, in one batch.

        Args:
            questions: Sequence of question texts.
            contexts: Sequence of passages, aligned with ``questions``.

        Returns:
            List of generated texts (prompts removed), in input order.

        Raises:
            AssertionError: If both sequences differ in length.
        """
        assert len(questions) == len(contexts), "questions and contexts should have the same length"
        prompts = [self._build_open_prompt(q, c) for q, c in zip(questions, contexts)]
        return self._strip_prompts(prompts, self._generate_batch_answers(prompts))

    # https://github.com/huggingface/transformers/issues/10704
    def get_closed_batch_answers(self, questions):
        """Answer several questions without context in one batch.

        Args:
            questions: Sequence of question texts; each is its own prompt.

        Returns:
            List of generated texts (prompts removed), in input order.
        """
        prompts = list(questions)
        # Strip the prompts here as well, so that this method returns the same
        # kind of value as the other three public methods.
        return self._strip_prompts(prompts, self._generate_batch_answers(prompts))

    def _generate_answer(self, prompt, use_pipeline):
        """Generate text for one prompt and return prompt + continuation."""
        if use_pipeline:
            return self.generator(prompt, max_new_tokens=self.max_answer_length)[0]['generated_text']

        inputs = self.tokenizer(prompt, return_tensors="pt")
        generate_ids = self.model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=self.max_answer_length,
        )
        decoded = self.tokenizer.batch_decode(
            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        return decoded[0]

    def _generate_batch_answers(self, prompts):
        """Generate text for a list of prompts; each result is prompt + continuation."""
        if not prompts:
            # Nothing to tokenize or generate for an empty batch.
            return []

        # Pad so that all prompts have the same length and can be computed as one batch.
        inputs = self.tokenizer(prompts, return_tensors="pt", padding=True)
        generate_ids = self.model.generate(
            inputs.input_ids,
            # The mask tells the model which positions are padding; without it
            # the pad tokens of shorter prompts are attended to like real input.
            attention_mask=inputs.attention_mask,
            max_new_tokens=self.max_answer_length,
        )
        return self.tokenizer.batch_decode(
            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

In [25]:
# Model instance used by the cells below; max_answer_length is forwarded as
# max_new_tokens, so each answer has at most 42 generated tokens.
generative_model = GenerativeModel(max_answer_length=42)

In [None]:
# Smoke test for a single question: print the reference data, then the answer
# of the generative model without context (closed) and with context (open).
single_context = """Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50."""
single_question = "Which NFL team represented the AFC at Super Bowl 50?"
single_correct_answers = ['Denver Broncos', 'Denver Broncos', 'Denver Broncos']

print(f"Context: {single_context}\nQuestion: {single_question}\nCorrect answer: {single_correct_answers}")
# Closed: the question alone is the prompt.
print("Closed generative Model:")
print(generative_model.get_closed_model_answer(single_question))
# Open: the prompt contains the context followed by the question.
print("Open generative Model:")
print(generative_model.get_open_model_answer(single_question, single_context))

# Evaluation

In [None]:
from transformers import pipeline

class ExtractiveModel:
    """Extractive question answering backed by a ``question-answering`` pipeline."""

    def __init__(self) -> None:
        """Create the pipeline, using the same checkpoint for model and tokenizer."""
        checkpoint = "deepset/roberta-base-squad2"
        self.pipeline = pipeline(
            "question-answering",
            model=checkpoint,
            tokenizer=checkpoint,
        )

    def get_extractive_batch_answer(self, question, context):
        """Return the ``"answer"`` field of the pipeline result for one question/context pair."""
        prediction = self.pipeline(question=question, context=context)
        return prediction["answer"]

    def get_extractive_batch_answers(self, questions, contexts):
        """Return one answer per (question, context) pair, in input order.

        The pairs are formed with ``zip``, so surplus items of the longer
        sequence are ignored.
        """
        return [
            self.get_extractive_batch_answer(one_question, one_context)
            for one_question, one_context in zip(questions, contexts)
        ]

In [None]:
# Extractive model instance used by the cells below.
extractive_model = ExtractiveModel()

In [None]:
# Smoke test for a single question: print the reference data, then the answer
# the extractive model returns for the question/context pair.
single_context = """Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50."""
single_question = "Which NFL team represented the AFC at Super Bowl 50?"
single_correct_answers = ['Denver Broncos', 'Denver Broncos', 'Denver Broncos']

print(f"Context: {single_context}\nQuestion: {single_question}\nCorrect answer: {single_correct_answers}")
print("Extractive Model:")
print(extractive_model.get_extractive_batch_answer(single_question, single_context))

## Compute the answers from the different models

In [None]:
# TEMP: evaluate on a small slice of the data only
from datasets import load_dataset
test_set = load_dataset('squad', split='validation[:20]')  # first 20 entries of the 'validation' split, used here as the test set
test_contexts = test_set['context']  # list of strings
test_questions = test_set['question']  # list of strings
test_answers = [d['text'] for d in test_set['answers']]  # list of lists of answers (the 'text' field of each 'answers' entry)
# TEMP end

In [None]:
%%time
# Closed setting: the generative model is prompted with the questions only.
closed_answers = generative_model.get_closed_batch_answers(test_questions)

In [None]:
%%time
# Open setting: each prompt contains the context followed by the question.
open_answers = generative_model.get_open_batch_answers(test_questions, test_contexts)

In [None]:
%%time
# Extractive model: one pipeline call per (question, context) pair.
extractive_answers = extractive_model.get_extractive_batch_answers(test_questions, test_contexts)

## Compute the scores for the answers

## Plots