In [7]:
import pandas as pd
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import torch
from huggingface_hub import login
import json

In [1]:
from datasets import load_dataset

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline
# from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

In [3]:
import torch
import re
import os

In [4]:
model_path = "courses/ANLP/project/flan-t5-small-question-decomp-ver-1.0.3/"

In [8]:
# Load the question-decomposition T5 model and its tokenizer from the local
# checkpoint directory named by `model_path`.
# NOTE(review): this cell was labelled "force load from safetensors", but no
# argument below forces that weight format -- confirm whether
# `use_safetensors=True` was intended.
model_decompose = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer_decompose = T5Tokenizer.from_pretrained(model_path)

def decompose(input_text: str) -> str:
    """Produce the textual decomposition of a question.

    The text is tokenized (with truncation) into PyTorch tensors, passed to
    ``model_decompose.generate`` with a budget of 128 new tokens, and the first
    generated sequence is decoded with special tokens stripped.

    Args:
        input_text: The question to decompose.

    Returns:
        The decoded model output as a single string.
    """
    encoded = tokenizer_decompose(input_text, truncation=True, return_tensors="pt")
    generated = model_decompose.generate(max_new_tokens=128, **encoded)
    first_sequence = generated[0]
    return tokenizer_decompose.decode(first_sequence, skip_special_tokens=True)

In [9]:
# A step label is a run of digits followed by ':' that starts a
# whitespace-delimited token and is not immediately followed by another digit.
# This keeps clock times or ratios such as "10:30" inside a question from being
# mistaken for the start of a new step.
_STEP_LABEL = re.compile(r'(?<!\S)(\d+):(?!\d)')


def decomposion_to_dict(decomposed: str) -> dict:
    """Parse a numbered decomposition string into a dictionary of steps.

    The input is expected to look like ``"1: <question> 2: <question> ..."``,
    with steps separated by spaces or newlines. Each step becomes an entry
    keyed by its label (kept as a string).

    Args:
        decomposed: The decomposition text to parse.

    Returns:
        A dict mapping each step label to ``{'q': <stripped question text>,
        'placeholders': <list of step numbers referenced as #<number>>}``.
        Steps whose text is empty after stripping are omitted, since there is
        nothing to answer for them. An input without any step label yields an
        empty dict.
    """
    result = {}
    labels = list(_STEP_LABEL.finditer(decomposed))

    for position, label in enumerate(labels):
        # A step's text runs from just after its label up to the next label,
        # or to the end of the string for the last step.
        if position + 1 < len(labels):
            stop = labels[position + 1].start()
        else:
            stop = len(decomposed)

        question = decomposed[label.end():stop].strip()
        if not question:
            continue

        result[label.group(1)] = {
            'q': question,
            # References to other steps are written as #<number>.
            'placeholders': re.findall(r'#(\d+)', question),
        }

    return result

In [10]:
# with open('worker_model/keys.json', 'r') as f:  
#     keys = json.load(f)
#     login(keys['hf_token_read'])

In [None]:
# Read the Hugging Face access token from the environment rather than embedding
# it in the notebook, where it would be saved and shared along with the file.
# When HF_TOKEN is unset, None is passed and `login` falls back to prompting.
login(os.environ.get("HF_TOKEN"))

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [13]:
# model_name = "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad"
# model_qa = pipeline('question-answering', model=model_name, tokenizer=model_name)

In [14]:
# model_name = "deepset/xlm-roberta-large-squad2"
# model_qa = pipeline('question-answering', model=model_name, tokenizer=model_name)

In [15]:
model_name = "deepset/roberta-base-squad2"
model_qa = pipeline('question-answering', model=model_name, tokenizer=model_name)

Device set to use cuda:0


In [16]:
unmasker = pipeline('fill-mask', model='bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [17]:
def find_first_root_question(questions_dict: dict) -> tuple[str, str] | None:
    for key, value in questions_dict.items():
        if not value.get('placeholders') and 'a' not in value:
            return key, value['q']
    return None

In [18]:
import re
from typing import Dict, Tuple

def update_qst_dict_with_answer(key_and_answer: Tuple[str, str], qst_dict: Dict[str, dict]) -> Dict[str, dict]:
    """Record an answer and substitute it into the questions that reference it.

    The entry for ``key`` receives the answer under ``'a'``. Every other entry
    that lists ``key`` among its placeholders has each ``#<key>`` reference in
    its question text replaced by the answer, and ``key`` removed from its
    placeholder list.

    Args:
        key_and_answer: ``(key, answer)`` pair; ``key`` is the label of the
            question that was just answered.
        qst_dict: Mapping of step label to entry. It is modified in place.

    Returns:
        The same ``qst_dict`` object, after the update.

    Raises:
        KeyError: If ``key`` is not a label in ``qst_dict``.
    """
    key, answer = key_and_answer

    # Step 1: store the answer on the question it belongs to.
    qst_dict[key]['a'] = answer

    # `#<key>` not followed by a further digit, so `#1` never matches inside
    # `#10`. This mirrors the `#(\d+)` pattern used to collect placeholders.
    reference = re.compile(fr'#{re.escape(key)}(?!\d)')
    replacement = str(answer)

    # Step 2: substitute the answer into every question that depends on it.
    for other_key, entry in qst_dict.items():
        if other_key == key:
            continue
        placeholders = entry.get('placeholders', [])
        if key not in placeholders:
            continue
        # A callable replacement inserts the answer verbatim. A plain string
        # would be parsed as a template, so a backslash in the answer would
        # raise re.error or be rewritten.
        entry['q'] = reference.sub(lambda _match: replacement, entry['q'])
        entry['placeholders'] = [ph for ph in placeholders if ph != key]

    return qst_dict

## Examples

In [None]:
context = "Snoopy is a beagle, a breed of dog known for its friendly and playful nature. He is a fictional character from the Peanuts comic strip created by Charles M. Schulz."
original_text = "What is the index of the the second letter in the word of the name of the kind of animal Snoopy is?"

In [None]:
decomposed_text = decompose(original_text)
decomposed_text

'1: return the second letter in the word of the name of the kind of animal Snoopy 2: return the index of #1'

In [None]:
qst_dict = decomposion_to_dict(decomposed_text)
qst_dict

{'1': {'q': 'return the second letter in the word of the name of the kind of animal Snoopy',
  'placeholders': []},
 '2': {'q': 'return the index of #1', 'placeholders': ['1']}}

In [None]:
available_qst = find_first_root_question(qst_dict)
available_qst

('1',
 'return the second letter in the word of the name of the kind of animal Snoopy')

In [None]:
QA_input = {
	'question': available_qst[1],
	'context': context
}
res = model_qa(QA_input)
current_answer = res['answer']
current_answer



'beagle'

In [None]:
qst_dict = update_qst_dict_with_answer((available_qst[0], current_answer), qst_dict)
qst_dict

{'1': {'q': 'return the second letter in the word of the name of the kind of animal Snoopy',
  'placeholders': [],
  'a': 'beagle'},
 '2': {'q': 'return the index of beagle', 'placeholders': []}}

In [None]:
available_qst = find_first_root_question(qst_dict)
available_qst

('2', 'return the index of beagle')

In [22]:
QA_input = {
	'question': available_qst[1],
	'context': context
}
res = model_qa(QA_input)
current_answer = res['answer']
current_answer

'Snoopy'

In [23]:
qst_dict = update_qst_dict_with_answer((available_qst[0], current_answer), qst_dict)
qst_dict

{'1': {'q': 'return the second letter in the word of the name of the kind of animal Snoopy',
  'placeholders': [],
  'a': 'beagle'},
 '2': {'q': 'return the index of beagle', 'placeholders': [], 'a': 'Snoopy'}}

In [24]:
available_qst = find_first_root_question(qst_dict)

In [25]:
available_qst

# The Rest

In [19]:
def get_question_options(question, current_answer=None):
    """Build a printable list of candidate answers for a question.

    A fill-mask prompt containing the question is passed to ``unmasker`` and
    the ``token_str`` of every returned prediction is collected. When
    ``current_answer`` is truthy it is placed first in the list.

    Args:
        question: The question text embedded in the prompt.
        current_answer: Optional answer to put at the head of the candidates.

    Returns:
        The ``str()`` rendering of the candidate list; the same text is also
        printed.
    """
    qst = f"""Question: "{question}"
    An optional answer is: "[MASK]"."""

    candidates = []
    if current_answer:
        candidates.append(current_answer)
    for prediction in unmasker(qst):
        candidates.append(prediction['token_str'])

    rendered = str(candidates)
    print(rendered)
    return rendered

In [20]:
def answer_complex_question(original_text: str, context: str, show_process=False, suggest_options=True) -> str | None:
    """Answer a multi-step question by decomposing it and answering each step.

    The question is decomposed into numbered sub-questions. While an
    answerable sub-question remains (no unresolved ``#<n>`` reference and no
    answer yet), it is sent to ``model_qa`` together with ``context``, and its
    answer is substituted into the sub-questions that reference it.

    Args:
        original_text: The complex question to answer.
        context: The passage the QA model extracts answers from.
        show_process: When true, print the intermediate questions, answers and
            scores. Note that ``get_question_options`` prints its candidate
            list regardless of this flag.
        suggest_options: When true, ask the QA model a second time with a list
            of candidate answers appended to the context, and keep that second
            answer only if it scores higher than the first and is not blank.

    Returns:
        The answer of the last sub-question that was processed, or ``None``
        when no sub-question could be processed.
    """
    decomposed_text = decompose(original_text)
    qst_dict = decomposion_to_dict(decomposed_text)
    qst_label = '0'  # placeholder label used until a sub-question is processed

    while (available_qst := find_first_root_question(qst_dict)):
        qst_label, qst_text = available_qst

        # 1) Get an answer for the available question.
        res1 = model_qa(question=qst_text, context=context)
        current_answer = res1['answer']

        if suggest_options:
            # 2) Help the model with options appended to the context.
            options = get_question_options(qst_text, current_answer)
            new_context = context + '\n' + options
            new_qst_text = f'From the list at the end of context: {qst_text}'

            # 3) Get a possibly improved answer. A whitespace-only span is
            #    never accepted: it would erase the reference it replaces in
            #    later sub-questions.
            res2 = model_qa(question=new_qst_text, context=new_context)
            if res2['score'] > res1['score'] and res2['answer'].strip():
                current_answer = res2['answer']

            if show_process:
                print(f'{new_qst_text=}')
                # Six decimals: scores are often far below 0.001.
                print(f"{res1['answer']=}\t{res1['score']=:.6f}")
                print(f"{res2['answer']=}\t{res2['score']=:.6f}")

        # 4) Update the question dictionary with the answer.
        qst_dict = update_qst_dict_with_answer((qst_label, current_answer), qst_dict)

        if show_process:
            print(f"Processed question {qst_label}: {qst_text} -> Answer: {current_answer}")

    # .get() on both levels: the placeholder label may be missing, or present
    # but never answered.
    return qst_dict.get(qst_label, {}).get('a')

In [21]:
context = (
    "Mount Everest is the highest mountain on Earth, located in the Himalayas between Nepal and China. "
    "It has an elevation of 8,848 meters and attracts climbers from all over the world. "
    "Its name starts with the letter 'E'."
)

original_text = "What is the height of the highest mountain on Earth?"

answer_complex_question(original_text, context, show_process=True)

['Mount Everest', 'mountain', 'yes', 'hill', 'summit', 'no']
new_qst_text='From the list at the end of context: return highest mountain on Earth'
res1['answer']='Mount Everest'	res1['score']=0.883721
res2['answer']='Mount Everest'	res2['score']=0.448290
Processed question 1: return highest mountain on Earth -> Answer: Mount Everest
['8,848 meters', 'everest', 'summit', 'yes', 'altitude', 'no']
new_qst_text='From the list at the end of context: return height of Mount Everest'
res1['answer']='8,848 meters'	res1['score']=0.912982
res2['answer']='8,848 meters'	res2['score']=0.174631
Processed question 2: return height of Mount Everest -> Answer: 8,848 meters


'8,848 meters'

In [47]:
context = (
    "The Himalayas are home to many of the world's tallest mountains. Mount Everest is the highest mountain on Earth, "
    "standing at an elevation of 8,848 meters above sea level. When measured from base to peak, considering its base on the Tibetan Plateau, "
    "the mountain rises approximately 4,000 meters. Everest lies on the border between Nepal and China and is part of the greater Himalayan mountain range. "
    "It attracts thousands of climbers each year despite the risks posed by its harsh conditions, including thin air, extreme cold, and avalanches. "
    "Its name begins with the letter 'E', and it was named after Sir George Everest, a British surveyor. "
    "Other famous peaks in the region include K2, which is the second-highest mountain at 8,611 meters, and Kangchenjunga at 8,586 meters. "
    "Although K2 is slightly lower than Everest in elevation, it is considered more technically challenging to climb. "
    "Everest’s elevation has been measured several times using different technologies, including satellite GPS, radar, and physical surveying, "
    "but the commonly accepted height remains 8,848 meters."
)

original_text = "What is the height of the highest mountain on Earth?"

answer_complex_question(original_text, context, show_process=True)

['Mount Everest', 'mountain', 'yes', 'hill', 'summit', 'no']
new_qst_text='From the list at the end of context: return highest mountain on Earth'
res1['answer']='Mount Everest'	res1['score']=0.903739
res2['answer']='Mount Everest'	res2['score']=0.499674
Processed question 1: return highest mountain on Earth -> Answer: Mount Everest
['8,848 meters above sea level', 'everest', 'summit', 'yes', 'altitude', 'no']
new_qst_text='From the list at the end of context: return height of Mount Everest'
res1['answer']='8,848 meters above sea level'	res1['score']=0.232520
res2['answer']='4,000 meters'	res2['score']=0.029349
Processed question 2: return height of Mount Everest -> Answer: 8,848 meters above sea level


'8,848 meters above sea level'

In [48]:
context = (
    "Jupiter is the largest planet in our solar system. "
    "It is known for its Great Red Spot, a massive storm that has been raging for centuries. "
    "Jupiter has a radius of about 69,911 kilometers and is mostly made of gas."
)

original_text = "What is the radius of the largest planet in our solar system?"

answer_complex_question(original_text, context, show_process=True, suggest_options=True)

['Jupiter', 'return', 'yes', 'no', 'zero', 'earth']
new_qst_text='From the list at the end of context: return the largest planet in our solar system'
res1['answer']='Jupiter'	res1['score']=0.326319
res2['answer']='Jupiter'	res2['score']=0.000001
Processed question 1: return the largest planet in our solar system -> Answer: Jupiter
['69,911 kilometers', 'yes', 'return', 'zero', 'no', 'jupiter']
new_qst_text='From the list at the end of context: return the radius of Jupiter'
res1['answer']='69,911 kilometers'	res1['score']=0.648343
res2['answer']='zero'	res2['score']=0.000126
Processed question 2: return the radius of Jupiter -> Answer: 69,911 kilometers


'69,911 kilometers'

In [49]:
context = (
    "The Declaration of Independence was signed in 1776, marking the formal separation of the Thirteen Colonies from British rule. "
    "The American Revolutionary War, however, had already begun in 1775 and lasted until 1783. "
    "One of the most well-known figures associated with the Declaration is Thomas Jefferson, who was the principal author. "
    "Thomas Jefferson later served as the third president of the United States. "
    "Other notable contributors included John Adams and Benjamin Franklin. "
    "Interestingly, while July 4th is celebrated as Independence Day, the actual signing took place over several weeks. "
    "At the time, George Washington had already been appointed commander of the Continental Army."
)

original_text = (
    "What is the number of years between the beginning of the war associated with the document written by the man who served "
    "as the third president of the United States and the year the document was signed?"
)

answer_complex_question(original_text, context, show_process=True)

['Thomas Jefferson', 'return', 'yes', 'no', 'president', 'answer']
new_qst_text='From the list at the end of context: return the man who served as the third president of the United States'
res1['answer']='Thomas Jefferson'	res1['score']=0.430063
res2['answer']='Thomas Jefferson'	res2['score']=0.041663
Processed question 1: return the man who served as the third president of the United States -> Answer: Thomas Jefferson
['1776', 'date', 'return', 'year', 'no', 'yes']
new_qst_text='From the list at the end of context: return the year the document was signed'
res1['answer']='1776'	res1['score']=0.974838
res2['answer']='1776'	res2['score']=0.146510
Processed question 2: return the year the document was signed -> Answer: 1776
['1783', 'year', '1776', 'time', 'years', '1800']
new_qst_text='From the list at the end of context: return the number of years between Thomas Jefferson and 1776'
res1['answer']='1783'	res1['score']=0.000022
res2['answer']='1800'	res2['score']=0.002899
Processed questi

'1800'

In [134]:
context = """
In 1085, Guadalajara was retaken by the Christian forces of Alfonso VI .
The chronicles say that the Christian army was led by Alvar Fanez de Minaya, one of the lieutenants of El Cid.
From 1085 until the Battle of Las Navas de Tolosa in 1212, the city suffered wars against the Almoravidand the Almohad Empires.
In spite of the wars, the Christian population could definitely settle down in the area thanks to the repopulation with people \
    from the North who received their first fuero in 1133 from Alfonso VII.
    In 1219, the king Fernando III gave a new fuero to the city .
    During the reign of Alfonso X of Castile, the protection of the king allowed the city to develop its economy by protecting merchants and allowing markets.
"""

original_text = "What is the year of the first fuero given to the city?"

answer_complex_question(original_text, context, show_process=True, suggest_options=True)

res1['answer']='Alfonso VII'	res1['score']=0.028263
res2['answer']='\n   '	res2['score']=0.001372
Processed question 1: return the first fuero given to the city -> Answer: Alfonso VII
res1['answer']='1133'	res1['score']=0.010991
res2['answer']='1133'	res2['score']=0.031238
Processed question 2: return year of Alfonso VII -> Answer: 1133


'1133'

In [91]:
original_text = "Which occured first, the Battle of Las Navas de Tolosa or king Fernando III gave a new fuero to the city?"
answer_complex_question(original_text, context, show_process=True, )

options=[{'score': 0.23625893890857697, 'token': 2709, 'token_str': 'return', 'sequence': 'question : " return when was the battle of las navas de tolosa " an optional answer is : " return ".'}, {'score': 0.015970222651958466, 'token': 2748, 'token_str': 'yes', 'sequence': 'question : " return when was the battle of las navas de tolosa " an optional answer is : " yes ".'}, {'score': 0.012277494184672832, 'token': 2053, 'token_str': 'no', 'sequence': 'question : " return when was the battle of las navas de tolosa " an optional answer is : " no ".'}, {'score': 0.010860994458198547, 'token': 2188, 'token_str': 'home', 'sequence': 'question : " return when was the battle of las navas de tolosa " an optional answer is : " home ".'}, {'score': 0.009617802686989307, 'token': 13407, 'token_str': 'farewell', 'sequence': 'question : " return when was the battle of las navas de tolosa " an optional answer is : " farewell ".'}]
['1212', 'return', 'yes', 'no', 'home', 'farewell']
Processed question

'Question'

# Eval

In [50]:
import datasets
import pandas as pd
from tqdm import tqdm
from difflib import SequenceMatcher
from datasets import load_dataset

AttributeError: module 'pyarrow' has no attribute 'PyExtensionType'

In [22]:
# Load the "drop" validation split into a DataFrame and index it by
# `query_id` so rows can be looked up by id.
drop_val = pd.DataFrame(load_dataset("drop", split="validation")).set_index("query_id")

In [23]:
from tqdm import tqdm

def evaluate_model(dataset):
    """Run the question-answering pipeline over every row of a DataFrame.

    Each row must provide the columns ``original_question``, ``context`` and
    ``answer``. A row whose prediction raises is reported by printing the
    exception and is recorded with an empty prediction, so a single failure
    does not abort the run.

    Args:
        dataset: DataFrame holding the rows to evaluate.

    Returns:
        A DataFrame with the columns ``question``, ``predicted_answer`` and
        ``true_answer``, one row per input row.
    """
    results = []
    # iterrows() is a generator without a length; pass the row count so the
    # progress bar can show a percentage and a time estimate.
    for _, row in tqdm(dataset.iterrows(), total=len(dataset)):
        question = row['original_question']
        context = row['context']
        answer = row['answer']
        try:
            prediction = answer_complex_question(original_text=question, context=context, show_process=True)
        except Exception as e:
            # Best effort: report the failure and continue with the next row.
            print(e)
            prediction = ""
        results.append({
            'question': question,
            'predicted_answer': prediction,
            'true_answer': answer
        })
    return pd.DataFrame(results)


In [139]:
break_val = pd.DataFrame(load_dataset("break_data", "QDMR-high-level", split="train"))

In [24]:
hotpotqa = load_dataset("hotpotqa/hotpot_qa", 'fullwiki')

In [None]:
df = pd.DataFrame(hotpotqa['train'])

In [33]:
df_val = pd.DataFrame(hotpotqa['validation'])

In [None]:
df_val.groupby(by=['level', 'type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,question,answer,supporting_facts,context
level,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
hard,bridge,5918,5918,5918,5918,5918
hard,comparison,1487,1487,1487,1487,1487


In [57]:
def answer_from_df(row):
    """Answer the question stored in one DataFrame row.

    Reads ``row['question']`` and ``row['context']['sentences']`` (a sequence
    of sequences of strings), concatenates all of those strings into a single
    context with no separator, and calls ``answer_complex_question`` with
    process output enabled. A call counter kept on the function object is
    incremented on entry and printed just before the pipeline runs.

    Args:
        row: A row providing ``question`` and ``context``.

    Returns:
        Whatever ``answer_complex_question`` returns for this row.
    """
    answer_from_df.iteration = answer_from_df.iteration + 1

    question = row['question']
    paragraphs = row['context']['sentences']
    flat_context = ''.join(''.join(paragraph) for paragraph in paragraphs)

    print(answer_from_df.iteration)
    return answer_complex_question(question, flat_context, True)


# Call counter reported by answer_from_df; starts at zero.
answer_from_df.iteration = 0

In [58]:
from time import time

In [59]:
start = time()
temp = df[(df.level == 'easy') & (df.type == 'comparison')][200:220].copy()
temp['pred_answer'] = temp.apply(answer_from_df, axis=1)
end = time()

1
['tree ferns', 'return', 'yes', 'no', 'answer', 'see']
new_qst_text='From the list at the end of context: return if Cyathea is a plant'
res1['answer']='tree ferns'	res1['score']=0.042398
res2['answer']='Cyathea chinensis'	res2['score']=0.000662
Processed question 1: return if Cyathea is a plant -> Answer: tree ferns
['decumbent', 'return', 'no', 'yes', 'answer', 'see']
new_qst_text='From the list at the end of context: return if Carpobrotus is a plant'
res1['answer']='decumbent'	res1['score']=0.002469
res2['answer']='decumbent'	res2['score']=0.000028
Processed question 2: return if Carpobrotus is a plant -> Answer: decumbent
['Erect when young', 'return', 'answer', 'yes', 'check', 'find']
new_qst_text='From the list at the end of context: return if both tree ferns and decumbent are true'
res1['answer']='Erect when young'	res1['score']=0.001008
res2['answer']='bi- or tripinnate'	res2['score']=0.006728
Processed question 3: return if both tree ferns and decumbent are true -> Answer: bi

In [41]:
duration = end - start

In [44]:
df.shape[0]

90447

In [46]:
# Extrapolate the measured run time to 12000 rows, expressed in hours
# (`duration` is a difference of time() values, i.e. seconds; 3600 s per hour).
# NOTE(review): the division by 20 presumably reflects the 20-row slice
# [200:220] that was timed -- confirm `duration` still refers to that run.
(12000 * duration/20)/3600

4.311411023139954

In [48]:
# Sample n rows from each (level, type) combination with a fixed seed.
# n is currently 2; this cell was previously labelled "exactly 2000 rows".
# NOTE(review): confirm the intended sample size before a full evaluation run.
df_sampled = (
    df.groupby(['level', 'type'], group_keys=False)
      .apply(lambda g: g.sample(n=2, random_state=42))
)

# Sanity-check the sample: rows per level, rows per type, and overall shape.
print(df_sampled['level'].value_counts())
print(df_sampled['type'].value_counts())
print(df_sampled.shape)

level
easy      4
hard      4
medium    4
Name: count, dtype: int64
type
bridge        6
comparison    6
Name: count, dtype: int64
(12, 7)


  .apply(lambda g: g.sample(n=2, random_state=42))


In [None]:
df_sampled['pred_answer'] = df_sampled.apply(answer_from_df, axis=1)
# df_sampled.to_pickle('courses/ANLP/project/hotpotqa_result_prediction_manager_worker.pkl')

['James Goldman', 'yes', 'no', 'applause', 'sorry', 'hello']
new_qst_text='From the list at the end of context: return author that Danny Burstein is a six-time Tony Award nominee for The Drowsy Chaperone (2006) , South Pacific (2008) , Follies , a musical with music and lyrics by Stephen Sondheim'
res1['answer']='James Goldman'	res1['score']=0.001724
res2['answer']='James Goldman'	res2['score']=0.000132
Processed question 1: return author that Danny Burstein is a six-time Tony Award nominee for The Drowsy Chaperone (2006) , South Pacific (2008) , Follies , a musical with music and lyrics by Stephen Sondheim -> Answer: James Goldman
['James Goldman', 'yes', 'no', 'applause', 'sorry', 'hello']
new_qst_text='From the list at the end of context: return author that Danny Burstein is a six-time Tony Award nominee for The Drowsy Chaperone (2006) , South Pacific (2008) , Follies , a musical with music and lyrics by Stephen Sondheim'
res1['answer']='James Goldman'	res1['score']=0.001724
res2['a

In [53]:
df_sampled.to_pickle('courses/ANLP/project/hotpotqa_result_prediction_manager_worker.pkl')

In [54]:
!ls courses/ANLP/project

flan-t5-small-question-decomp-ver-1.0.3
hotpotqa_result_prediction_manager_worker.pkl


In [36]:
temp[['context', 'question', 'answer', 'pred_answer']]

Unnamed: 0,context,question,answer,pred_answer
5691,"{'title': ['Cyathea geluensis', 'Carpobrotus',...",Are Cyathea and Carpobrotus similar plants?,no,bi- or tripinnate
5705,"{'title': ['Hong Sung-taek', 'Willie Benegas',...",What do The Man Who Skied Down Everest and Inn...,documentary,Rob DesLauriers
5754,"{'title': ['Cheryl Wilson', 'League of America...","Gândul Mâței and KMFDM, are what type of music...",band,industrial
5772,"{'title': ['Aicha Elbasri', 'Saint Genet', 'Te...","Robert Lepage and Jean Genet, are of which nat...",French,French
5797,"{'title': ['Voyager One (band)', 'Crank (song)...",how is Catherine Wheel and The Go-Betweens rel...,rock band,The breaking wheel
5808,"{'title': ['Why Girls Love Sailors', 'Mummy's ...",How are Fred Guiol and Stuart Paton alike?,film director and screenwriter.,never well-regarded
5815,"{'title': ['Out of My System', 'Feargal Sharke...",What do Constantine Maroulis and Feargal Shark...,singer,If This is Love
5857,{'title': ['Arthur C. Clarke's Mysterious Univ...,Both Arthur C. Clarke and John Richardson are ...,The New Observer,The New Observer
5935,"{'title': ['Nembe Creek Trunk Line', 'Sweet Cr...",What is a documentary film by Sandy Cioffi abo...,Sweet Crude,['Orthodox Jewish attitudes to homosexuality
6064,"{'title': ['Eriogonum', 'Plant Life Records', ...","Which genus of plant life is more palm like, P...",Pandanus is a genus of monocots with some 750 ...,Sago palms
