## Connect to drive (to save and load data)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the notebook can save and load data and models there.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Project Constants

In [None]:
# Saving locations
# Short model name; also used to build the model folder name and the wandb run name.
model_direct_name = "flan-t5-small"
# Checkpoint identifier passed to from_pretrained() in the training cell.
model_name = "google/" + model_direct_name

# Version tags: they select the model output folder and the data folder respectively.
model_version = "1.0.3"
data_version = "1.0.3"

# Root project folder on the mounted Google Drive.
base_path = "/content/drive/My Drive/Manager"
all_models_path = base_path + "/Model Versions"

# Note: model_folder_name carries its own leading "/", so it is appended directly.
model_folder_name = "/" + model_direct_name + "-question-decomp-ver-" + model_version
model_path = all_models_path + model_folder_name

# Versioned data folder; data_path is the training split that the training cell reads.
data_folder_name = base_path + "/Data/Ver-" + data_version
data_path = data_folder_name + "/train_question_decomp.jsonl"

# for wandb
run_name = model_direct_name + "-decomp-run-" + model_version

# Format Break Dataset

In [None]:
!pip install pandas
!pip install pathlib

Collecting pathlib
  Downloading pathlib-1.0.1-py3-none-any.whl.metadata (5.1 kB)
Downloading pathlib-1.0.1-py3-none-any.whl (14 kB)
Installing collected packages: pathlib
Successfully installed pathlib-1.0.1


In [None]:
import zipfile
import pandas as pd
import json
from io import TextIOWrapper

# Location of the zipped Break dataset inside the project's Data folder on Drive.
zip_path = base_path + "/Data" + "/Break-dataset.zip"

def read_csv_from_zip(zip_path, inner_path):
    """Load one CSV member of a ZIP archive into a pandas DataFrame.

    Args:
        zip_path: Path of the ZIP archive on disk.
        inner_path: Path of the CSV file inside the archive.

    Returns:
        The parsed CSV as a DataFrame. The member is decoded as UTF-8 text
        and read straight from the archive without extracting it to disk.
    """
    with zipfile.ZipFile(zip_path) as archive, archive.open(inner_path) as raw_member:
        text_stream = TextIOWrapper(raw_member, 'utf-8')
        return pd.read_csv(text_stream)

def convert_to_finetune_format(df):
    """Turn Break rows into {"input", "output"} records for fine-tuning.

    The "decomposition" column holds steps separated by ";". Each step is
    stripped and numbered from 1, and the steps are joined with newlines:
    "1: first step\\n2: second step".

    Rows that cannot produce a usable example are skipped instead of being
    turned into garbage targets:
      * rows whose question or decomposition is missing (NaN/None); without
        this a missing decomposition became the literal target "1: nan" and
        a missing question raised AttributeError on .strip();
      * rows whose question or decomposition is empty after stripping.
    Empty steps (e.g. from a trailing ";") are dropped before numbering so
    no "N: " line with nothing after it is emitted.

    Args:
        df: DataFrame with "question_text" and "decomposition" columns.

    Returns:
        A list of dicts with keys "input" (the question) and "output"
        (the numbered, newline-joined steps), in the original row order.

    Raises:
        KeyError: if either required column is absent from df.
    """
    data = []
    # dropna removes rows with a missing value in either required column.
    valid_rows = df.dropna(subset=["question_text", "decomposition"])
    for question, decomposition in zip(valid_rows["question_text"], valid_rows["decomposition"]):
        question = str(question).strip()
        raw_steps = (step.strip() for step in str(decomposition).split(";"))
        non_empty_steps = [step for step in raw_steps if step]
        if not question or not non_empty_steps:
            continue
        steps = [f"{i}: {step}" for i, step in enumerate(non_empty_steps, start=1)]
        data.append({
            "input": question,
            "output": "\n".join(steps)
        })
    return data

# Locations of the three QDMR-high-level splits inside the ZIP archive
train_path = "Break-dataset/QDMR-high-level/train.csv"
dev_path = "Break-dataset/QDMR-high-level/dev.csv"
test_path = "Break-dataset/QDMR-high-level/test.csv"

# Read every split first (train, dev, test order) ...
train_df, dev_df, test_df = (
    read_csv_from_zip(zip_path, split_path)
    for split_path in (train_path, dev_path, test_path)
)

# ... then convert each one to the {"input", "output"} fine-tuning records
train_data, dev_data, test_data = (
    convert_to_finetune_format(split_df)
    for split_df in (train_df, dev_df, test_df)
)

In [None]:
import os

# Save the data as jsonl files: one JSON object per line, one file per split.
# The versioned data folder is created first so a new data_version does not
# fail with FileNotFoundError on the first write.
os.makedirs(data_folder_name, exist_ok=True)

for split_name, split_rows in (("train", train_data), ("dev", dev_data), ("test", test_data)):
    split_file = data_folder_name + "/" + split_name + "_question_decomp.jsonl"
    # Explicit utf-8 to match the encoding used when these files are read back.
    with open(split_file, "w", encoding="utf-8") as f:
        for row in split_rows:
            f.write(json.dumps(row) + "\n")

# Creating The Manager via Finetuning

In [None]:
# Hugging Face libraries used by the cells below: `datasets` (Dataset) and
# `transformers` (T5 tokenizer/model, Trainer, TrainingArguments).
!pip install datasets
!pip install transformers



## Training

In [None]:
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments
import json

# --- 1. Load combined dataset ---
# Every non-blank line of the JSONL file is parsed as one training record.
data = []
with open(data_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            data.append(json.loads(line))
dataset = Dataset.from_list(data)  # wrap the records in a Hugging Face Dataset

# --- 2. Load tokenizer and model ---
# Both are loaded from the same pretrained checkpoint (model_name).
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# --- 3. Tokenization function ---
def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs and loss labels.

    Args:
        examples: Batch mapping with "input" (questions) and "output"
            (target decompositions), each a list of strings.

    Returns:
        The tokenizer output for the inputs (truncated/padded to 256 tokens)
        with an added "labels" entry holding the target token ids, also
        truncated/padded to 256, where every padding position is -100.
    """
    inputs = examples["input"]
    targets = examples["output"]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=256, truncation=True, padding="max_length")

    # Mask padding in the labels with -100, the index the loss ignores. Without this the
    # model is trained (and its loss is measured) mostly on predicting padding tokens,
    # because every target is padded out to 256 positions.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# --- 4. Tokenize the dataset ---
# batched=True hands preprocess_function a batch of examples per call; the raw text
# columns are removed so only the tokenizer outputs and "labels" remain.
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["input", "output"])

# --- 5. Setup training arguments ---
# No checkpoints are written during training (save_strategy="no"); the final model is
# saved explicitly in a later cell. Metrics are logged to Weights & Biases every 100 steps.
training_args = TrainingArguments(
    learning_rate=3e-5,
    num_train_epochs=5,
    weight_decay=0.05,
    logging_steps=100,
    per_device_train_batch_size=8,
    save_strategy="no",
    save_steps=0,
    report_to="wandb",
    # Use the versioned run name from the constants cell instead of a hard-coded string,
    # so wandb runs can be told apart by model name and model_version.
    run_name=run_name,
    fp16=False # not allowed on current GPU!
)

# --- 6. Initialize Trainer ---
# Only a training set is supplied; evaluation is done separately in the Testing / Dev cell.
# NOTE(review): newer transformers releases appear to deprecate `tokenizer=` in favour of
# `processing_class=` — confirm against the installed version before changing it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# --- 7. Train! ---
# Runs the full fine-tuning loop configured by training_args.
trainer.train()

Map:   0%|          | 0/17503 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
100,22.1056
200,4.4206
300,2.4318
400,1.2333
500,0.5846
600,0.297
700,0.1764
800,0.1271
900,0.1109
1000,0.0993


TrainOutput(global_step=10940, training_loss=0.34234627413357416, metrics={'train_runtime': 3786.5538, 'train_samples_per_second': 23.112, 'train_steps_per_second': 2.889, 'total_flos': 8134103759585280.0, 'train_loss': 0.34234627413357416, 'epoch': 5.0})

In [None]:
# --- 8. Save final model ---
# Writes to model_path (the versioned folder under "Model Versions"); the later cells
# load both the model and the tokenizer back from this same folder.
trainer.save_model(model_path)

## Testing / Dev

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset
import json
from tqdm import tqdm
import numpy as np
import torch
# from datasets import load_metric

# Which saved split to evaluate on: "dev" or "test".
testOrDev = "dev"

# Load dataset
with open(data_folder_name + "/" + testOrDev + "_question_decomp.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]
dataset = Dataset.from_list(data) # Convert to Hugging Face Dataset

# Load the fine-tuned model and tokenizer from the saved model folder.
# trust_remote_code is not passed: it only permits running custom modelling code shipped
# with a checkpoint, which the built-in T5 class does not need, and it has no influence
# on which weight file format is loaded.
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

def evaluate(model, tokenizer, dataset, max_input_length=512, max_output_length=128):
    """Generate a prediction for every example and collect it with its reference.

    Args:
        model: Seq2seq model used for generation; switched to eval mode here.
        tokenizer: Tokenizer used to encode inputs and decode generations.
        dataset: Iterable of examples with "input" and "output" text fields.
        max_input_length: Inputs are truncated to this many tokens.
        max_output_length: Passed to generate() as max_length.

    Returns:
        (preds, refs): two parallel lists of whitespace-stripped strings.
    """
    model.eval()
    predictions = []
    references = []

    # Examples are generated one at a time, with a progress bar over the dataset.
    for record in tqdm(dataset):
        question = record["input"]
        reference = record["output"]

        encoded = tokenizer(
            question,
            return_tensors="pt",
            truncation=True,
            max_length=max_input_length,
        )
        # Put the input tensors on the same device as the model.
        encoded = encoded.to(model.device)

        with torch.no_grad():
            generated = model.generate(**encoded, max_length=max_output_length)

        decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
        predictions.append(decoded.strip())
        references.append(reference.strip())

    return predictions, references

# metric_em = load_metric("exact_match")
# metric_f1 = load_metric("f1")

def compute_metrics(preds, refs):
    """Compute exact-match and token-level F1 (both in percent) over paired strings.

    Exact match is computed on whitespace-normalized text: runs of spaces and
    newlines are collapsed to a single space before comparing. The references
    join decomposition steps with "\\n" while decoded predictions separate them
    with spaces, so comparing the raw strings reports a mismatch for
    multi-step outputs whose steps are identical.

    F1 is the bag-of-tokens F1 on whitespace-split tokens, averaged over pairs.

    Args:
        preds: Sequence of predicted strings.
        refs: Sequence of reference strings, paired positionally with preds.

    Returns:
        (em, f1) as percentages. (0.0, 0.0) when there are no pairs, instead
        of the NaN that averaging an empty list would give.
    """
    from collections import Counter

    def normalize(text):
        # Collapse every whitespace run (including newlines) to one space.
        return " ".join(text.split())

    def f1_score(pred, ref):
        pred_tokens = pred.split()
        ref_tokens = ref.split()
        common = Counter(pred_tokens) & Counter(ref_tokens)
        num_same = sum(common.values())
        if num_same == 0:
            return 0
        precision = num_same / len(pred_tokens)
        recall = num_same / len(ref_tokens)
        return (2 * precision * recall) / (precision + recall)

    pairs = list(zip(preds, refs))
    if not pairs:
        return 0.0, 0.0

    em_score = np.mean([normalize(p) == normalize(r) for p, r in pairs]) * 100
    f1_scores = [f1_score(p, r) for p, r in pairs]
    return em_score, np.mean(f1_scores) * 100

# Use the GPU when one is available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

preds, refs = evaluate(model, tokenizer, dataset)
em, f1 = compute_metrics(preds, refs)

# Label the result with the split that was actually evaluated; the label was previously
# fixed to "Dev" even when testOrDev was set to "test".
print(f"{testOrDev.capitalize()} Set — EM: {em:.2f}%, F1: {f1:.2f}%")

100%|██████████| 3130/3130 [22:22<00:00,  2.33it/s]

Dev Set — EM: 0.26%, F1: 84.44%





# Run the saved model

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline

# Load the fine-tuned model and tokenizer from the saved model folder.
# trust_remote_code is not passed: it only permits running custom modelling code shipped
# with a checkpoint, which the built-in T5 class does not need, and it has no influence
# on which weight file format is loaded.
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

## Simple Decomposition

In [None]:
def decompose(input_text):
    """Generate a decomposition for one question with the loaded model.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        input_text: The question to decompose.

    Returns:
        The decoded generation (at most 128 new tokens) with special tokens removed.
    """
    # Move the encoded inputs to the model's device, as the other generation helpers in
    # this notebook do; otherwise generation fails once the model has been moved to a GPU.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# listOfStrings = [
#     "Danny Burstein  is a six-time Tony Award nominee; for 'The Drowsy Chaperone' (2006), 'South Pacific' (2008), 'Follies' , a musical with music and lyrics by Stephen Sondheim and a book by what author?",
#     "Sid Avery was an American photographer and director who was best known for capturing the private moments of legendary Hollywood celebrities like which British-American actress, businesswoman, and humanitarian?",
#     "Peter Pitegoff was the Dean at the University of Maine School of Law, the school is located in Portland, Maine, and is Maine's only what?",
#     "Beyond Our Ken (1958–1964) is a radio comedy programme, the predecessor to 'Round the Horne' (1965–1968), both programmes starred which English actor, whose most notable role was in the 1960s BBC radio?",
#     "Warrel Dane is the lead singer of what American power metal band, founded in Seattle in 1985?",
#     "What role on 'Love in the Moonlight' is  a South Korean actress and model recognized for, who also featured in a South Korean television series starring So Ji-sub and Shin Min-a ?",
#     "Carrie Pilby is a 2016 American comedy film directed by Susan Johnson, the film stars which Irish actor and musician, best known for portraying Captain Killian 'Hook' Jones on the TV show 'Once Upon a Time'?",
#     "What day month and year did rick lazio become well known for running against and losing to hilary clinton for the u.s senate?",
#     "In what sovereign states with a capital city that was named after Guadalupe Victoria, is there a species of salamander in the family Plethodontidae?",
#     "'Heathens' is a song by American musical duo Twenty One Pilots, released on which date, 'Heathens' peaked at number two on the US 'Billboard' Hot 100, tying with 'Stressed Out'?",
# ]


# for string in listOfStrings:
#     print(decompose(string) + "\n\n")

# Single example question run through the fine-tuned model.
sample_question = (
    "Zora Fair was a native to South Carolina who nearly exposed the military campaign conducted through Georgia from when to when?"
)
print(decompose(sample_question))

1: return native to South Carolina 2: return #1 that Zora Fair was a native to South Carolina 3: return when was #2 conducted through Georgia from


## Update The DB with decomposition

In [None]:
import pandas as pd
import re

df = pd.read_csv(base_path + "/comparison_with_sts_and_decomposition.csv")

# The "decomposition" column is read from the CSV; the line below regenerates it instead.
# df["decomposition"] = df["question"].apply(lambda x: decompose(x))


def _word_count(question):
    # Number of whitespace-separated words in the question
    return len(str(question).split())


def _long_flag(word_count):
    # Questions with more than 30 words are flagged as long (stored as text flags)
    if word_count > 30:
        return "TRUE"
    return "FALSE"


def _step_count(decomposition):
    # Each decomposition step starts with a "<number>:" marker
    return len(re.findall(r"\d+:", str(decomposition)))


# question length in words
df["question_length"] = df["question"].apply(_word_count)

# whether the question is long or not
df["is_long"] = df["question_length"].apply(_long_flag)

# number of steps in the decomposition
df["num_decompositions"] = df["decomposition"].apply(_step_count)

In [None]:
# Save the table with the added columns to a new file, leaving the source CSV untouched
# (index=False keeps the row index out of the output).
df.to_csv(base_path + "/updated_comparison.csv", index=False)

## Self-Consistency Decoding

In [None]:
from collections import Counter

def self_consistent_decompose(text, model, tokenizer, num_samples=5):
    """Sample several decompositions and pick the one generated most often.

    Args:
        text: The question to decompose.
        model: Seq2seq model used for sampling.
        tokenizer: Tokenizer used to encode the question and decode samples.
        num_samples: How many sequences to sample in one generate() call.

    Returns:
        (most_common, decoded): the most frequent decoded string and the list
        of all decoded samples in generation order.
    """
    # Sampling configuration: top-k / nucleus sampling at temperature 0.7.
    sampling_kwargs = dict(
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        num_return_sequences=num_samples,
        max_new_tokens=128,
    )

    encoded = tokenizer(text, return_tensors="pt").to(model.device)
    sequences = model.generate(**encoded, **sampling_kwargs)

    decoded = []
    for sequence in sequences:
        decoded.append(tokenizer.decode(sequence, skip_special_tokens=True))

    # Majority vote over the exact decoded strings.
    vote_counts = Counter(decoded)
    winner = vote_counts.most_common(1)[0][0]
    return winner, decoded

In [None]:
# Example question: sample 5 decompositions and report the most frequent one.
# `text` and `all_outputs` are reused by the verifier re-ranking cell further down.
text = "What South African politician formed a government of national unity with nelson Mandela's African National Congress and Mangosuthu Buthelezi's Inkatha Freedom Party?"
best_output, all_outputs = self_consistent_decompose(text, model, tokenizer, num_samples=5)
print("Best:", best_output)
print("All Samples:", all_outputs)

Best: 1: return South African politician that formed a government of national unity with nelson Mandela 's African National Congress and Mangosuthu Buthelezi 's Inkatha Freedom Party 2: return South African politician that formed a government of national unity with nelson Mandela 's African National Congress and Mangosuthu Buthelezi 's Inkatha Freedom Party 3: return South African politician that formed a government of national unity with nelson Mandela 's African National Congress and Mangosuthu Buthe
All Samples: ["1: return South African politician that formed a government of national unity with nelson Mandela 's African National Congress and Mangosuthu Buthelezi 's Inkatha Freedom Party 2: return South African politician that formed a government of national unity with nelson Mandela 's African National Congress and Mangosuthu Buthelezi 's Inkatha Freedom Party 3: return South African politician that formed a government of national unity with nelson Mandela 's African National Congr

## Verifier-Based Re-Ranking (Using Previous Self-Consistency Outputs)

In [None]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model that verifier_rerank uses to score candidates against the question.
verifier_model = SentenceTransformer("all-MiniLM-L6-v2")

def verifier_rerank(text, candidates):
    """Pick the candidate whose embedding is most similar to the input text.

    Uses the notebook-level `verifier_model` to embed the text and every
    candidate, then scores each candidate by cosine similarity to the text.

    Args:
        text: The original question.
        candidates: List of candidate decomposition strings.

    Returns:
        (best, scored): the highest-scoring candidate and a list of
        (candidate, score) pairs in the original candidate order.
    """
    query_vector = verifier_model.encode(text, convert_to_tensor=True)
    candidate_vectors = verifier_model.encode(candidates, convert_to_tensor=True)

    # Row 0 holds the similarity of the single query against every candidate.
    similarity_row = util.pytorch_cos_sim(query_vector, candidate_vectors)[0]
    winner_position = similarity_row.argmax().item()

    winner = candidates[winner_position]
    scored_candidates = list(zip(candidates, similarity_row.tolist()))
    return winner, scored_candidates

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Re-rank the sampled decompositions; relies on `text` and `all_outputs` left in the
# kernel by the self-consistency cell, so run that cell first.
best_reranked, score_list = verifier_rerank(text, all_outputs)
print("Verifier Best:", best_reranked)

Verifier Best: 1: return Argentine former professional boxer that Kelly Robert Pavlik is an American former professional boxer who competed from 2000 to 2012 2: return #1 that won the unified WBC , WBO , Ring magazine and lineal middleweight titles 3: return #2 that made three successful defenses before losing them to Sergio Mart­nez
