In [1]:
!pip install transformers sentence-transformers scipy torch evaluate


Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [2]:
!pip install -U datasets



# Load Dataset (DROP)

In [3]:
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
from difflib import SequenceMatcher

# Load DROP dataset
drop_dataset = load_dataset("drop", split="train")

# Load Break dataset (QDMR-high-level is the structured version)
break_dataset = load_dataset("break_data", "QDMR-high-level", split="train")

# Convert to DataFrame for easier processing
drop_df = pd.DataFrame(drop_dataset)
break_df = pd.DataFrame(break_dataset)

print(f"🔹 DROP size: {len(drop_df)}")
print(f"🔹 Break size: {len(break_df)}")

# Print examples from DROP
print("🔹 Example from DROP:")
for i in range(3):
    ex = drop_df.iloc[i]
    print(f"\nExample {i+1}")
    print(ex)
    print(f"Question: {ex['question']}")
    print(f"Passage: {ex['passage'][:300]}...")  # Trimmed for readability
    print(f"Answer spans: {ex['answers_spans']['spans']}\n")

# Print examples from Break
print("🔹 Example from Break:")
for i in range(3):
    ex = break_df.iloc[i]
    print(f"\nExample {i+1}")
    print(ex)
    print(f"Question: {ex['question_text']}")
    print(f"Decomposition: {ex['decomposition']}\n")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/10.3M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/77400 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9535 [00:00<?, ? examples/s]

README.md: 0.00B [00:00, ?B/s]

QDMR-high-level/train-00000-of-00001.par(…):   0%|          | 0.00/2.41M [00:00<?, ?B/s]

QDMR-high-level/validation-00000-of-0000(…):   0%|          | 0.00/438k [00:00<?, ?B/s]

QDMR-high-level/test-00000-of-00001.parq(…):   0%|          | 0.00/263k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17503 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3130 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3195 [00:00<?, ? examples/s]

🔹 DROP size: 77400
🔹 Break size: 17503
🔹 Example from DROP:

Example 1
section_id                                                nfl_2201
query_id                      f16c0ee7-f131-4a8b-a6ac-4d275ea68066
passage          To start the season, the Lions traveled south ...
question         How many points did the buccaneers need to tie...
answers_spans                {'spans': ['3'], 'types': ['number']}
Name: 0, dtype: object
Question: How many points did the buccaneers need to tie in the first?
Passage: To start the season, the Lions traveled south to Tampa, Florida to take on the Tampa Bay Buccaneers. The Lions scored first in the first quarter with a 23-yard field goal by Jason Hanson. The Buccaneers tied it up with a 38-yard field goal by Connor Barth, then took the lead when Aqib Talib intercep...
Answer spans: ['3']


Example 2
section_id                                                nfl_2201
query_id                      c9582e03-b01b-42ed-83e0-b90a5334aefa
passage          To s

In [4]:
from datasets import load_dataset
import pandas as pd

# Load DROP splits
drop_train = pd.DataFrame(load_dataset("drop", split="train"))
drop_val = pd.DataFrame(load_dataset("drop", split="validation"))

# Set index for fast lookup
drop_train.set_index("query_id", inplace=True)
drop_val.set_index("query_id", inplace=True)

def safe_extract_answer(answer_obj):
    if isinstance(answer_obj, dict):
        spans = answer_obj.get("spans", [])
        if isinstance(spans, list) and spans:
            return spans[0]
    return ""

def match_break_with_drop(break_split_name, drop_df, drop_split_name):
    print(f"\n🔹 Matching Break {break_split_name} with DROP {drop_split_name}")
    break_df = pd.DataFrame(load_dataset("break_data", "QDMR-high-level", split=break_split_name))

    # Filter Break rows that are from DROP
    break_drop = break_df[break_df["question_id"].str.contains("DROP", na=False)].copy()
    break_drop["query_id"] = break_drop["question_id"].apply(lambda x: x.split("_")[-1])

    matched = []
    for _, row in break_drop.iterrows():
        qid = row["query_id"]
        if qid in drop_df.index:
            drop_row = drop_df.loc[qid]
            matched.append({
                "question_id": row["question_id"],
                "split": drop_split_name,
                "original_question": row["question_text"],
                "sub_questions": row["decomposition"],
                "context": drop_row["passage"],
                "answer": safe_extract_answer(drop_row["answers_spans"]),
            })

    return pd.DataFrame(matched)

# Match separately
train_matched = match_break_with_drop("train", drop_train, "train")
val_matched = match_break_with_drop("validation", drop_val, "validation")

# Add column: number of sub-questions and filter to keep only those with 2 or more sub-questions
train_matched['num_subquestions'] = train_matched['sub_questions'].apply(
    lambda x: len(x.split(';')) if isinstance(x, str) else 0
)
val_matched['num_subquestions'] = val_matched['sub_questions'].apply(
    lambda x: len(x.split(';')) if isinstance(x, str) else 0
)

train_drop = train_matched[train_matched['num_subquestions'] >= 2]
val_drop = val_matched[val_matched['num_subquestions'] >= 2]


print(f"\n✅ Total matched:")
print(f"  🔹 Train: {len(train_matched)}")
print(f"  🔹 Validation: {len(val_matched)}")
print(f"  🔹 Multi sub-question Train: {len(train_drop)}")
print(f"  🔹 Multi sub-question Validation: {len(val_drop)}")

print("\n🔹 First 3 training examples:")
print(train_drop.head(3).to_string(index=False))



🔹 Matching Break train with DROP train

🔹 Matching Break validation with DROP validation

✅ Total matched:
  🔹 Train: 7705
  🔹 Validation: 1273
  🔹 Multi sub-question Train: 6646
  🔹 Multi sub-question Validation: 1090

🔹 First 3 training examples:
                                                question_id split                                                                         original_question                                                                                                                                                                         sub_questions                                                                                                                                                                                                                                                                                                                                                                                                                             

In [5]:
def flatten_context(x):
    if isinstance(x, pd.Series):
        return " ".join(x.astype(str).values)
    return str(x)

val_drop['context'] = val_drop['context'].apply(flatten_context)
train_drop['context'] = train_drop['context'].apply(flatten_context)

# train_drop.head()
val_drop.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_drop['context'] = val_drop['context'].apply(flatten_context)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_drop['context'] = train_drop['context'].apply(flatten_context)


Unnamed: 0,question_id,split,original_question,sub_questions,context,answer,num_subquestions
0,DROP_dev_history_10_778cc4f0-4264-4780-9a05-57...,validation,"Which side suffered more casualties, the Feder...",return casualties of the Federales ;return ca...,"More than 5,000 US troops of General John J. P...",the Federales,3
1,DROP_dev_history_10_a0c02482-0031-400d-b6f0-aa...,validation,"How many total people were killed, captured an...",return people who were killed ;return number ...,"More than 5,000 US troops of General John J. P...",80,7
2,DROP_dev_history_10_b4a57b06-bcd2-4ed5-82de-48...,validation,What happened first: Battle of Carrizal or pri...,return date of Battle of Carrizal ;return dat...,"More than 5,000 US troops of General John J. P...",Battle of Carrizal,3
3,DROP_dev_history_1002_45be3d97-6563-4ac3-a107-...,validation,How many years after Nyaungyan captured Nyaung...,return year when Nyaungyan captured Nyaungshw...,"After the fall of Pegu in December 1599, Lower...",2,3
4,DROP_dev_history_1002_79bd0ad5-b0a9-4a6b-947e-...,validation,How many years after capturing Nyaungshwe did ...,return year that Nyaungyan capture Nyaungshwe ...,"After the fall of Pegu in December 1599, Lower...",2,3


# Load Dataset (HotpotQA)

In [6]:
import pandas as pd

# Load the pickle
hotpotqa_df = pd.read_pickle("hotpotqa_result_prediction_manager_worker_old.pkl")

# Map existing columns to DROP-style schema
hotpotqa_df = hotpotqa_df.rename(columns={
    "id": "question_id",
    "question": "original_question",
    "answer": "answer",
    "context": "context",
    "pred_answer": "pred_answer_manager_worker"
})

def context_to_text(c):
    if isinstance(c, dict):
        titles = c.get("title", [])
        sents  = c.get("sentences", []) or c.get("sent", []) or []
        titles_txt = " | ".join(map(str, titles)) if isinstance(titles, list) else str(titles)
        if isinstance(sents, list) and len(sents) and isinstance(sents[0], list):
            sents_txt = " ".join(" ".join(x) for x in sents)
        elif isinstance(sents, list):
            sents_txt = " ".join(map(str, sents))
        else:
            sents_txt = str(sents)
        return (titles_txt + " — " + sents_txt).strip()
    return "" if pd.isna(c) else str(c)

hotpotqa_df["context_text"] = hotpotqa_df["context"].apply(context_to_text)

# Add the required columns (sub_questions + split + num_subquestions)
hotpotqa_df["sub_questions"] = ""      # HotpotQA doesn’t provide decomposition here
hotpotqa_df["split"] = "test"          # mark as test set by default
hotpotqa_df["num_subquestions"] = hotpotqa_df["sub_questions"].apply(lambda x: len(x.split(";")) if x else 0)

print(hotpotqa_df.head())


                    question_id  \
81477  5a8056255542995d8a8ddf6a   
82167  5abbe54f554299114383a085   
62705  5a8b31a155429950cd6afc6a   
32634  5ae223d3554299492dc91bc9   
2886   5a8930c3554299669944a4f3   

                                       original_question  \
81477  Danny Burstein  is a six-time Tony Award nomin...   
82167  The 1992–93 NBA season was the Magic's fourth ...   
62705  When was the counterpart to a high adventure p...   
32634  Who collaborated with a Scottish-born American...   
2886   What  is a 2013 horror omnibus film made up of...   

                      answer    type level  \
81477          James Goldman  bridge  easy   
82167  Golden State Warriors  bridge  easy   
62705                   1978  bridge  easy   
32634              Brian Eno  bridge  easy   
2886        Horror Stories 2  bridge  easy   

                                        supporting_facts  \
81477  {'title': ['Danny Burstein', 'Danny Burstein',...   
82167  {'title': ['1992–93 Orla

In [12]:
import pandas as pd
from datasets import load_dataset

# Load HotpotQA prediction dataframe
hotpotqa_df = pd.read_pickle("hotpotqa_result_prediction_manager_worker_old.pkl")

# Rename columns for consistency
hotpotqa_df = hotpotqa_df.rename(columns={
    "id": "question_id",
    "question": "original_question",
    "answer": "answer",
    "context": "context",
    "pred_answer": "pred_answer_manager_worker"
})

# Normalize the context field
def context_to_text(c):
    if isinstance(c, dict):
        titles = c.get("title", [])
        sents  = c.get("sentences", []) or c.get("sent", []) or []
        titles_txt = " | ".join(map(str, titles)) if isinstance(titles, list) else str(titles)
        if isinstance(sents, list) and len(sents) and isinstance(sents[0], list):
            sents_txt = " ".join(" ".join(x) for x in sents)
        elif isinstance(sents, list):
            sents_txt = " ".join(map(str, sents))
        else:
            sents_txt = str(sents)
        return (titles_txt + " — " + sents_txt).strip()
    return "" if pd.isna(c) else str(c)

hotpotqa_df["context_text"] = hotpotqa_df["context"].apply(context_to_text)

# Load Break data (validation + test sets)
break_val = pd.DataFrame(load_dataset("break_data", "QDMR-high-level", split="validation"))
break_test = pd.DataFrame(load_dataset("break_data", "QDMR-high-level", split="test"))
break_all = pd.concat([break_val, break_test], ignore_index=True)

# Filter only HotpotQA-origin questions
break_hot = break_all[break_all["question_id"].str.startswith("HOTPOT")].copy()

# Extract HotpotQA-style ID
break_hot["hotpotqa_id"] = break_hot["question_id"].apply(lambda x: x.split("_")[-1])

print(hotpotqa_df.head())
print("#########")
print(break_hot.head())

# Merge on exact ID
merged_df = pd.merge(
    hotpotqa_df,
    break_hot[["hotpotqa_id", "decomposition"]],
    left_on="question_id",
    right_on="hotpotqa_id",
    how="inner"
)

# Add sub-questions and their count
merged_df = merged_df.rename(columns={"decomposition": "sub_questions"})
merged_df["num_subquestions"] = merged_df["sub_questions"].apply(lambda x: len(x.split(";")) if isinstance(x, str) else 0)
merged_df["split"] = "test"

print(f"\n✅ Matched {len(merged_df)} HotpotQA questions with Break decompositions")

print("\n🔹 Example rows:")
print(merged_df.head(3).to_string(index=False))


                    question_id  \
81477  5a8056255542995d8a8ddf6a   
82167  5abbe54f554299114383a085   
62705  5a8b31a155429950cd6afc6a   
32634  5ae223d3554299492dc91bc9   
2886   5a8930c3554299669944a4f3   

                                       original_question  \
81477  Danny Burstein  is a six-time Tony Award nomin...   
82167  The 1992–93 NBA season was the Magic's fourth ...   
62705  When was the counterpart to a high adventure p...   
32634  Who collaborated with a Scottish-born American...   
2886   What  is a 2013 horror omnibus film made up of...   

                      answer    type level  \
81477          James Goldman  bridge  easy   
82167  Golden State Warriors  bridge  easy   
62705                   1978  bridge  easy   
32634              Brian Eno  bridge  easy   
2886        Horror Stories 2  bridge  easy   

                                        supporting_facts  \
81477  {'title': ['Danny Burstein', 'Danny Burstein',...   
82167  {'title': ['1992–93 Orla

#Define Model

In [None]:
from transformers import pipeline

# Load pretrained QA pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")


# Evaluate Pretrained Model (Zero-shot)

In [14]:
from tqdm import tqdm

def evaluate_model(dataset, qa_pipeline):
    results = []
    for _, row in tqdm(dataset.iterrows(), total=len(dataset)):
        question = row['original_question']
        context  = row['context_text']
        out = qa_pipeline({"question": question, "context": context})  # pass a dict
        results.append(out['answer'])  # extract the answer string
    dataset["pred_answer_oneshot"] = results
    return dataset  # return the full table with the new column

# Drop
# zero_shot_results = evaluate_model(val_drop, qa_pipeline)
# zero_shot_results.head()

# HotpotQA
zero_shot_results = evaluate_model(hotpotqa_df, qa_pipeline)
print(zero_shot_results.head())

zero_shot_results.to_csv("zero_shot_results_hotpotqa.csv", index=False)

  1%|          | 72/9600 [11:26<25:13:13,  9.53s/it]


KeyboardInterrupt: 

# Fine tuning

In [None]:

from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")

def prepare_data_with_span(df):
    dataset = Dataset.from_pandas(df[['original_question', 'context', 'answer']])

    def add_positions(example):
        question = example["original_question"]
        context = example["context"]
        answer = example["answer"]

        tokenized = tokenizer(
            question,
            context,
            truncation=True,
            padding="max_length",
            max_length=512,
            return_offsets_mapping=True
        )

        answer_start = context.find(answer)
        answer_end = answer_start + len(answer)

        # Default values if not found
        start_pos = end_pos = 0
        for i, (start, end) in enumerate(tokenized["offset_mapping"]):
            if start <= answer_start < end:
                start_pos = i
            if start < answer_end <= end:
                end_pos = i
                break

        tokenized["start_positions"] = start_pos
        tokenized["end_positions"] = end_pos

        tokenized.pop("offset_mapping")
        return tokenized

    return dataset.map(add_positions)


print("Preparing train and validation datasets...")
# Drop
train_dataset = prepare_data_with_span(train_drop)
val_dataset = prepare_data_with_span(val_drop)


In [None]:
from transformers import AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

training_args = TrainingArguments(
    output_dir="./results",
    # evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

trainer.train()


# Evaluate Fine-Tuned Model

In [None]:
fine_tuned_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

fine_tuned_results = evaluate_model(val_drop, fine_tuned_pipeline)
fine_tuned_results.head()

In [None]:
zero_shot_results.to_csv("zero_shot_results.csv", index=False)
fine_tuned_results.to_csv("fine_tuned_results.csv", index=False)

# Compare Results Using Semantic Textual Similarity

In [None]:
!pip install -U sentence-transformers


In [None]:
print(zero_shot_results.head())

For pred_answer_manager_worker & hotpotqa case:

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load STS model
sts_model = SentenceTransformer('all-MiniLM-L6-v2')

# Merge results into a single DataFrame
comparison_df = pd.DataFrame({
    "question": zero_shot_results["original_question"],
    "true_answer": zero_shot_results["answer"],
    "zero_shot_answer": zero_shot_results["pred_answer_oneshot"],
    "pred_answer_manager_worker": zero_shot_results["pred_answer_manager_worker"]
})

# Compute similarity per row
def compute_similarity(row):
    true = row["true_answer"]
    zshot = row["zero_shot_answer"]
    manager_worker = row["pred_answer_manager_worker"]
    sims = util.cos_sim(
        sts_model.encode([true], convert_to_tensor=True),
        sts_model.encode([zshot, manager_worker], convert_to_tensor=True)
    )
    return pd.Series({
        "sts_zero_shot": float(sims[0][0]),
        "sts_manager_worker": float(sims[0][1])
    })

# Apply per row
similarity_scores = comparison_df.apply(compute_similarity, axis=1)
comparison_df = pd.concat([comparison_df, similarity_scores], axis=1)

# Add a column: which model is better
comparison_df["manager_worker_better"] = comparison_df["sts_manager_worker"] >= comparison_df["sts_zero_shot"]

# Show improvement stats
improved = comparison_df["manager_worker_better"].sum()
total = len(comparison_df)


print(f"✅ manager_worker model improved on {improved}/{total} examples ({100*improved/total:.1f}%)")

# Optionally: save for inspection
comparison_df.to_csv("comparison_with_sts.csv", index=False)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 5))
sns.histplot(comparison_df['sts_manager_worker'], color='blue', label='Manager-Worker', kde=True, bins=20, alpha=0.6)
sns.histplot(comparison_df['sts_zero_shot'], color='red', label='Zero-shot', kde=True, bins=20, alpha=0.6)

plt.title("Distribution of STS Scores")
plt.xlabel("STS Score")
plt.ylabel("Count")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


For fine tuned & DROP case:

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load STS model
sts_model = SentenceTransformer('all-MiniLM-L6-v2')

# Merge results into a single DataFrame
comparison_df = pd.DataFrame({
    "question": zero_shot_results["question"],
    "true_answer": zero_shot_results["true_answer"],
    "zero_shot_answer": zero_shot_results["predicted_answer"],
    "fine_tuned_answer": fine_tuned_results["predicted_answer"]
})

# Compute similarity per row
def compute_similarity(row):
    true = row["true_answer"]
    zshot = row["zero_shot_answer"]
    fine = row["fine_tuned_answer"]
    sims = util.cos_sim(
        sts_model.encode([true], convert_to_tensor=True),
        sts_model.encode([zshot, fine], convert_to_tensor=True)
    )
    return pd.Series({
        "sts_zero_shot": float(sims[0][0]),
        "sts_fine_tuned": float(sims[0][1])
    })

# Apply per row
similarity_scores = comparison_df.apply(compute_similarity, axis=1)
comparison_df = pd.concat([comparison_df, similarity_scores], axis=1)

# Add a column: which model is better
comparison_df["fine_tuned_better"] = comparison_df["sts_fine_tuned"] > comparison_df["sts_zero_shot"]

# Show improvement stats
improved = comparison_df["fine_tuned_better"].sum()
total = len(comparison_df)

print(f"✅ Fine-tuned model improved on {improved}/{total} examples ({100*improved/total:.1f}%)")

# Optionally: save for inspection
comparison_df.to_csv("comparison_with_sts.csv", index=False)


In [None]:
"fine_tuned_answeravg_zero_shot = comparison_df["sts_zero_shot"].mean()
avg_fine_tuned = comparison_df["sts_fine_tuned"].mean()

print(f"🔹 Average Zero-shot STS:     {avg_zero_shot:.4f}")
print(f"🔹 Average Fine-tuned STS:   {avg_fine_tuned:.4f}")
print(f"🔼 Absolute improvement:     {avg_fine_tuned - avg_zero_shot:.4f}")
