# Load required packages

To install the packages required for this notebook on the HPC, please follow the 'Jupyter Kernel Creation' slides posted on OPAL.

In [2]:
# Load required packages
import re
import ast
import unicodedata

import pandas as pd
import torch
import tqdm as notebook_tqdm
from torch.utils.data import Dataset as TorchDataset

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model


  from .autonotebook import tqdm as notebook_tqdm


In [26]:
import os
os.environ["HF_TOKEN"] = "YOUR_HF_TOKEN_HERE"

In [None]:

# 1. Load the raw training file
df_train = pd.read_csv("train_dataset_saq.csv")

# 2. Parse the 'annotations' column to extract the answer
def extract_answer(row):
    try:
        # Convert string representation of list to actual list
        data = ast.literal_eval(row['annotations'])
        # Get the first answer from the list
        return data[0]['answers'][0]
    except:
        return ""

df_train['clean_answer'] = df_train.apply(extract_answer, axis=1)

# 3. Format into the Prompt Structure
# We train the model to output the EXACT format we want.
def formatting_prompts_func(examples):
    output_texts = []
    for q, c, a in zip(examples['en_question'], examples['country'], examples['clean_answer']):
        # The prompt template
        text = f"Question: {q}\nCountry: {c}\nAnswer: {a}"
        output_texts.append(text)
    return output_texts

# 4. Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(df_train)
print(f"Training on {len(train_dataset)} examples.")

## Helper Functions

In [27]:
train_df = pd.read_csv("train_dataset_saq.csv")

train_df.head()

Unnamed: 0,ID,question,en_question,annotations,idks,country
0,Na-ko-25,درایران ورزش‌های معمولی که در روز ورزش مدارس ا...,What are the typical sports played in Iran's s...,"[{'answers': ['فوتبال'], 'en_answers': ['footb...","{'idk': 0, 'no-answer': 0, 'not-applicable': 0}",IR
1,New-en-79,Which region of the UK is well known for its t...,Which region of UK is well known for its theat...,"[{'answers': ['london'], 'en_answers': ['londo...","{'idk': 0, 'no-answer': 0, 'not-applicable': 0}",GB
2,Al-en-16,At what age do kids start nursery in the UK? (...,At what age do kids start preschool in UK? (Pr...,"[{'answers': ['3'], 'en_answers': ['3'], 'coun...","{'idk': 0, 'no-answer': 0, 'not-applicable': 0}",GB
3,New-as-89,What traditional festival accessories do peopl...,What traditional festival accessories do peopl...,"[{'answers': ['fourth of july clothes'], 'en_a...","{'no-answer': 2, 'not-applicable': 2, 'idk': 1}",US
4,Gu-ch-32,در کشور شما مدت زمان آموزش اجباری به سال چقدر ...,What is the duration of compulsory education i...,"[{'answers': ['8'], 'en_answers': ['8'], 'coun...","{'no-answer': 1, 'idk': 1, 'not-applicable': 0}",IR


## FineTune

In [28]:
def extract_clean_answer(ann):
    try:
        ann_list = ast.literal_eval(ann)
        if isinstance(ann_list, list) and ann_list:
            en_answers = ann_list[0].get("en_answers", [])
            if not en_answers:
                return "idk"

            ans = unicodedata.normalize("NFKD", en_answers[0].lower())
            ans = re.sub(r"[^\w\s]", "", ans)
            ans = re.sub(r"\s+", " ", ans).strip()

            return ans if ans else "idk"
    except Exception:
        return "idk"


In [29]:
train_df["answer"] = train_df["annotations"].apply(extract_clean_answer)
train_df = train_df[["en_question", "country", "answer"]]
train_df.head()

Unnamed: 0,en_question,country,answer
0,What are the typical sports played in Iran's s...,IR,football
1,Which region of UK is well known for its theat...,GB,london
2,At what age do kids start preschool in UK? (Pr...,GB,3
3,What traditional festival accessories do peopl...,US,fourth of july clothes
4,What is the duration of compulsory education i...,IR,8


# Load the model (Llama-8B)

Note that you need to be on the partition with GPU (e.g. capella, alpha).

In [30]:
device = "cuda"

This is the model which doesn't require requesting access. If you have the access to the Llama-8B model, you can use it instead.

In [32]:
model_name = "meta-llama/Meta-Llama-3-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token 
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)
model.config.pad_token_id = tokenizer.pad_token_id 

Loading checkpoint shards: 100%|██████████| 4/4 [02:10<00:00, 32.58s/it]


## Dataset

In [33]:
class SAQDataset(TorchDataset):
    def __init__(self, df, tokenizer, max_length=256):
        """
        df: pandas DataFrame with columns ['en_question', 'country', 'answer']
        """
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Prompt only includes question + country
        prompt_text = f"Answer the following question using ONLY ONE WORD or ONE NUMBER.\nCountry: {row['country']}\nQuestion: {row['en_question']}\nAnswer:"

        answer = row["answer"] if pd.notna(row["answer"]) else "idk"
        full_text = prompt_text + " " + answer


        enc = self.tokenizer(
            full_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Mask the prompt tokens in labels
        prompt_len = self.tokenizer(prompt_text, add_special_tokens=False)["input_ids"]
        labels = enc["input_ids"].clone()
        labels[0, :len(prompt_len)] = -100

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": labels.squeeze(0),
        }


In [34]:
train_dataset = SAQDataset(train_df, tokenizer)


In [35]:
def decode_without_special(ids):
    return tokenizer.decode([i for i in ids if i != tokenizer.pad_token_id and i != -100]).strip()

for i in range(3):
    sample = train_dataset[i]
    decoded_input = decode_without_special(sample["input_ids"])
    decoded_label = decode_without_special(sample["labels"])
    print(f"--- Sample {i} ---")
    print("Input: ", decoded_input)
    print("Label: ", decoded_label)
    print("-" * 50)


--- Sample 0 ---
Input:  <|begin_of_text|>Answer the following question using ONLY ONE WORD or ONE NUMBER.
Country: IR
Question: What are the typical sports played in Iran's school sports day?
Answer: football
Label:  : football
--------------------------------------------------
--- Sample 1 ---
Input:  <|begin_of_text|>Answer the following question using ONLY ONE WORD or ONE NUMBER.
Country: GB
Question: Which region of UK is well known for its theatrical performances?
Answer: london
Label:  : london
--------------------------------------------------
--- Sample 2 ---
Input:  <|begin_of_text|>Answer the following question using ONLY ONE WORD or ONE NUMBER.
Country: GB
Question: At what age do kids start preschool in UK? (Provide Arabic numerals (e.g., 12) only.)
Answer: 3
Label:  : 3
--------------------------------------------------


## Finetuning

In [36]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 6,815,744 || all params: 8,037,076,992 || trainable%: 0.0848


In [37]:
training_args = TrainingArguments(
    output_dir="./llama3_saq_lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    report_to="none"
)

In [38]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.


Step,Training Loss
50,0.2891
100,0.018
150,0.016
200,0.0155
250,0.0135


TrainOutput(global_step=252, training_loss=0.06992823012646228, metrics={'train_runtime': 323.8279, 'train_samples_per_second': 12.349, 'train_steps_per_second': 0.778, 'total_flos': 4.614059508262502e+16, 'train_loss': 0.06992823012646228, 'epoch': 3.0})

## Cleaning

In [39]:
def clean_generation(text):
    text = unicodedata.normalize("NFKD", text.lower())
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text.split()[0] if text else "idk"


# SAQ Task

In [40]:
def saq_predict(question, country, max_new_tokens=5):
    prompt_text = f"Answer the following question using ONLY ONE WORD or ONE NUMBER.\nCountry: {country}\nQuestion: {question}\nAnswer:"

    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False
        )

    decoded = tokenizer.decode(output[0], skip_special_tokens=True)

    # Return only the first word of the generated answer
    cleaned = unicodedata.normalize("NFKD", decoded.lower())
    cleaned = re.sub(r"[^\w\s]", "", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    return cleaned.split()[0] if cleaned else "idk"


In [None]:
test_df = pd.read_csv("test_dataset_saq.csv")

test_df["answer"] = test_df.apply(
    lambda row: saq_predict(row["en_question"], row["country"]),
    axis=1
)

test_df[["ID", "answer"]].to_csv(
    "saq_prediction.tsv",
    sep="\t",
    index=False
)

test_df[["ID", "answer"]].head()


In [None]:
saq = pd.read_csv("test_dataset_saq.csv")

saq = saq.sample(n=10, random_state=12)
saq = saq[["ID", "en_question"]]

In [None]:
preds = []
for q in saq["en_question"]:
    answer = saq_func(q)
    preds.append(answer)

saq["answer"] = preds

In [9]:
saq_submission = saq[["ID", "answer"]]
saq_submission.to_csv("saq_prediction.tsv", sep='\t', index=False)