<a href="https://colab.research.google.com/github/shannn1/goodRAG/blob/main/baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Fine-tune

In [36]:
from datasets import Dataset, DatasetDict, load_dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
import wandb
from peft import get_peft_model, LoraConfig, TaskType
ds = load_dataset("lighteval/natural_questions_clean")

In [37]:
df_train = ds["train"]
df_validation = ds["validation"]

In [38]:
def preprocess_dataset(dataset):
    dataset = dataset.remove_columns([col for col in dataset.column_names if col not in ['document', 'question', 'short_answers']])
    def process_short_answers(example):
        example['short_answers'] = example['short_answers'][0] if isinstance(example['short_answers'], list) and len(example['short_answers']) > 0 else None
        return example

    dataset = dataset.map(process_short_answers)

    dataset = dataset.filter(lambda x: x['short_answers'] is not None)

    return dataset

df_train = preprocess_dataset(df_train)
df_validation = preprocess_dataset(df_validation)

In [6]:
datasets = DatasetDict({
    "train": df_train,
    "validation": df_validation
})

In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [8]:
for name, param in model.named_parameters():
    if "decoder.block.9" not in name and "decoder.block.10" not in name and "decoder.block.11" not in name and "decoder.block.8" not in name:  # train only last 2 layers
        param.requires_grad = False

trainable_params = [name for name, param in model.named_parameters() if param.requires_grad]
print("Trainable Parameters:", trainable_params)

Trainable Parameters: ['decoder.block.8.layer.0.SelfAttention.q.weight', 'decoder.block.8.layer.0.SelfAttention.k.weight', 'decoder.block.8.layer.0.SelfAttention.v.weight', 'decoder.block.8.layer.0.SelfAttention.o.weight', 'decoder.block.8.layer.0.layer_norm.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.8.layer.1.EncDecAttention.k.weight', 'decoder.block.8.layer.1.EncDecAttention.v.weight', 'decoder.block.8.layer.1.EncDecAttention.o.weight', 'decoder.block.8.layer.1.layer_norm.weight', 'decoder.block.8.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.2.DenseReluDense.wo.weight', 'decoder.block.8.layer.2.layer_norm.weight', 'decoder.block.9.layer.0.SelfAttention.q.weight', 'decoder.block.9.layer.0.SelfAttention.k.weight', 'decoder.block.9.layer.0.SelfAttention.v.weight', 'decoder.block.9.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.0.layer_norm.weight', 'decoder.block.9.layer.1.EncDecAttention.q.weight', 'decoder.block.9.layer.1.EncDecAt

In [11]:
def preprocess_function(examples):
    inputs = [f"question: {q}  context: {doc}" for q, doc in zip(examples["question"], examples["document"])]
    targets = examples["short_answers"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    labels = tokenizer(targets, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = datasets.map(preprocess_function, batched=True, remove_columns=datasets["train"].column_names)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Map:   0%|          | 0/106926 [00:00<?, ? examples/s]

Map:   0%|          | 0/4289 [00:00<?, ? examples/s]

In [12]:
tokenized_datasets.save_to_disk("./tokenized_datasets")

Saving the dataset (0/1 shards):   0%|          | 0/106926 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4289 [00:00<?, ? examples/s]

In [9]:
from datasets import load_from_disk
tokenized_datasets = load_from_disk("tokenized_datasets")

In [10]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [11]:
# training
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",    # eval for every epoch
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=50,
    logging_strategy="epoch",
    report_to="wandb",
    run_name="t5-large-finetune1",
    fp16=True
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Seq2SeqTrainer(


In [12]:
wandb.init(project="newsqa-finetuning", name="t5-base-finetuning3")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [34]:
import gc
torch.cuda.empty_cache()
gc.collect()

21032

In [14]:
trainable_params = sum(param.numel() for param in model.parameters() if param.requires_grad)
print(f"Trainable Parameters: {trainable_params}")

Trainable Parameters: 37757952


In [15]:
trainer.train()
model.save_pretrained("./finetuned_t5_2")
tokenizer.save_pretrained("./finetuned_t5_2")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,1.6448,1.577095
2,1.5224,1.541625


Epoch,Training Loss,Validation Loss
1,1.6448,1.577095
2,1.5224,1.541625
3,1.4737,1.53061


('./finetuned_t5_2/tokenizer_config.json',
 './finetuned_t5_2/special_tokens_map.json',
 './finetuned_t5_2/spiece.model',
 './finetuned_t5_2/added_tokens.json',
 './finetuned_t5_2/tokenizer.json')

In [16]:
wandb.finish()

0,1
eval/loss,█▃▁
eval/runtime,▁▃█
eval/samples_per_second,█▆▁
eval/steps_per_second,█▆▁
train/epoch,▁▁▅▅███
train/global_step,▁▁▅▅███
train/grad_norm,▂▁█
train/learning_rate,█▅▁
train/loss,█▃▁

0,1
eval/loss,1.53061
eval/runtime,98.0989
eval/samples_per_second,43.721
eval/steps_per_second,2.742
total_flos,1.9534029421805568e+17
train/epoch,3.0
train/global_step,20049.0
train/grad_norm,1.64546
train/learning_rate,0.0
train/loss,1.4737


## Inference

In [17]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "./finetuned_t5_2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [35]:
def generate_answer(question, max_length=50):
    input_text = f"question: {question}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=3,
            early_stopping=True
        )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return answer

In [39]:
df_validation = df_validation.map(
    lambda example: {"predicted_answer": generate_answer(example["question"])},
    batched=False
)


Map:   0%|          | 0/4289 [00:00<?, ? examples/s]

In [25]:
df_validation

Dataset({
    features: ['document', 'question', 'short_answers', 'predicted_answer'],
    num_rows: 4289
})

## Upload to hf

In [26]:
from huggingface_hub import login

In [27]:
login(token="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")

In [None]:
import pandas as pd
from datasets import Dataset
hf_dataset = Dataset.from_pandas(df_validation)

In [None]:
dataset_name = "my-preprocessed-dataset"
tokenized_datasets.push_to_hub(dataset_name)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/107 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Shannnh/my-preprocessed-dataset/commit/6f700570677146286792ee83194533cd0c36b395', commit_message='Upload dataset', commit_description='', oid='6f700570677146286792ee83194533cd0c36b395', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Shannnh/my-preprocessed-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Shannnh/my-preprocessed-dataset'), pr_revision=None, pr_num=None)

In [40]:
df_validation.push_to_hub("baseline-dataset-t5-base-2", private=False)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Shannnh/baseline-dataset-t5-base-2/commit/25feaa365a7af330f382790aabb494b4543934f0', commit_message='Upload dataset', commit_description='', oid='25feaa365a7af330f382790aabb494b4543934f0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Shannnh/baseline-dataset-t5-base-2', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Shannnh/baseline-dataset-t5-base-2'), pr_revision=None, pr_num=None)

## Metrics

In [45]:
from datasets import load_dataset
dataset = load_dataset("Shannnh/baseline-dataset-t5-base-1")

In [46]:
data = dataset["validation"]
def calculate_f1(predicted, ground_truth):
    pred_tokens = predicted.split()
    gt_tokens = ground_truth.split()
    common = set(pred_tokens) & set(gt_tokens)
    if len(common) == 0:
        return 0.0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)

f1_scores = [calculate_f1(pred, gt) for pred, gt in zip(data["predicted_answer"], data["short_answers"])]
average_f1 = sum(f1_scores) / len(f1_scores)
print(f"Average F1 Score: {average_f1:.4f}")

Average F1 Score: 0.0343


In [47]:
def calculate_exact_match(predicted, ground_truth):
    return int(predicted.strip() == ground_truth.strip())

em_scores = [calculate_exact_match(pred, gt) for pred, gt in zip(data["predicted_answer"], data["short_answers"])]
average_em = sum(em_scores) / len(em_scores)
print(f"Exact Match Score: {average_em:.4f}")

Exact Match Score: 0.0075


In [49]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

def calculate_q_bleu(predicted, ground_truth, question):
    # Tokenize inputs
    ref_tokens = ground_truth.split()
    hyp_tokens = predicted.split()
    question_tokens = question.split()

    reference = [ref_tokens]
    hypothesis = hyp_tokens
    bleu_score = sentence_bleu(reference, hypothesis)

    stop_words = set(stopwords.words('english'))
    important_ref_tokens = [token for token in ref_tokens if token.lower() not in stop_words]
    important_hyp_tokens = [token for token in hyp_tokens if token.lower() not in stop_words]

    key_match = len(set(important_ref_tokens) & set(important_hyp_tokens)) / max(len(set(important_ref_tokens)), 1)

    question_match = len(set(question_tokens) & set(hyp_tokens)) / max(len(set(question_tokens)), 1)

    q_bleu = 0.7 * bleu_score + 0.2 * key_match + 0.1 * question_match

    return q_bleu

data = dataset["validation"]
q_bleu_scores = [
    calculate_q_bleu(pred, gt, q)
    for pred, gt, q in zip(data["predicted_answer"], data["short_answers"], data["question"])
]
average_q_bleu = sum(q_bleu_scores) / len(q_bleu_scores)
print(f"Average Q-BLEU Score: {average_q_bleu:.4f}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Average Q-BLEU Score: 0.0193
