In [30]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/hw4
!pwd

## Fine-tune

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [4]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
import wandb

splits = {
    'train': 'data/train-00000-of-00001-ec54fbe500fc3b5c.parquet',
    'validation': 'data/validation-00000-of-00001-3cf888b12fff1dd6.parquet'
}
df_train = pd.read_parquet("hf://datasets/lucadiliello/newsqa/" + splits["train"])
df_validation = pd.read_parquet("hf://datasets/lucadiliello/newsqa/" + splits["validation"])

In [5]:
def preprocess_dataframe(df):
    df = df[['context', 'question', 'answers']].copy()
    df['answers'] = df['answers'].apply(lambda x: x[0] if x else None)
    return df
df_train = preprocess_dataframe(df_train)
df_validation = preprocess_dataframe(df_validation)

In [6]:
train_dataset = Dataset.from_pandas(df_train)
validation_dataset = Dataset.from_pandas(df_validation)
datasets = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset
})

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
#model.to(device)

In [12]:
for name, param in model.named_parameters():
    if "decoder.block.5" not in name and "decoder.block.4" not in name:  # train only last 2 layers
        param.requires_grad = False

trainable_params = [name for name, param in model.named_parameters() if param.requires_grad]
print("Trainable Parameters:", trainable_params)

Trainable Parameters: ['decoder.block.4.layer.0.SelfAttention.q.weight', 'decoder.block.4.layer.0.SelfAttention.k.weight', 'decoder.block.4.layer.0.SelfAttention.v.weight', 'decoder.block.4.layer.0.SelfAttention.o.weight', 'decoder.block.4.layer.0.layer_norm.weight', 'decoder.block.4.layer.1.EncDecAttention.q.weight', 'decoder.block.4.layer.1.EncDecAttention.k.weight', 'decoder.block.4.layer.1.EncDecAttention.v.weight', 'decoder.block.4.layer.1.EncDecAttention.o.weight', 'decoder.block.4.layer.1.layer_norm.weight', 'decoder.block.4.layer.2.DenseReluDense.wi.weight', 'decoder.block.4.layer.2.DenseReluDense.wo.weight', 'decoder.block.4.layer.2.layer_norm.weight', 'decoder.block.5.layer.0.SelfAttention.q.weight', 'decoder.block.5.layer.0.SelfAttention.k.weight', 'decoder.block.5.layer.0.SelfAttention.v.weight', 'decoder.block.5.layer.0.SelfAttention.o.weight', 'decoder.block.5.layer.0.layer_norm.weight', 'decoder.block.5.layer.1.EncDecAttention.q.weight', 'decoder.block.5.layer.1.EncDecAt

In [None]:
'''
def answer_question(context, question):
    inputs = tokenizer(context, question, return_tensors="pt", truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores) + 1
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start_index:end_index])
    )
    return answer

    last_hidden_state = outputs.last_hidden_state
    token_logits = last_hidden_state.mean(dim=2)
    predicted_token_ids = torch.argmax(token_logits, dim=-1)
    generated_text = tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True)
    return generated_text

In [13]:
def preprocess_function(examples):
    inputs = [f"question: {q}  context: {c}" for q, c in zip(examples["question"], examples["context"])]
    targets = examples["answers"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    labels = tokenizer(targets, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = datasets.map(preprocess_function, batched=True, remove_columns=datasets["train"].column_names)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Map:   0%|          | 0/74160 [00:00<?, ? examples/s]

Map:   0%|          | 0/4212 [00:00<?, ? examples/s]

In [14]:
# training
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",    # eval for every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    report_to="wandb",
    run_name="t5-small-finetune1"
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Seq2SeqTrainer(


In [15]:
wandb.init(project="newsqa-finetuning", name="t5-small-finetuning")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [13]:
import gc
torch.cuda.empty_cache()
gc.collect()

69

In [16]:
trainer.train()
model.save_pretrained("./finetuned_t5")
tokenizer.save_pretrained("./finetuned_t5")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.9362,0.945433
2,1.1405,0.929015
3,1.111,0.925078


('./finetuned_t5/tokenizer_config.json',
 './finetuned_t5/special_tokens_map.json',
 './finetuned_t5/spiece.model',
 './finetuned_t5/added_tokens.json',
 './finetuned_t5/tokenizer.json')

In [None]:
wandb.finish()

## Inference

In [19]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "./finetuned_t5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [17]:
splits = {
    'validation': 'data/validation-00000-of-00001-3cf888b12fff1dd6.parquet'
}
df_validation = pd.read_parquet("hf://datasets/lucadiliello/newsqa/" + splits["validation"])

In [18]:
def preprocess_dataframe(df):
    df = df[['context', 'question', 'answers']].copy()
    df['answers'] = df['answers'].apply(lambda x: x[0] if x else None)
    return df

df_validation = preprocess_dataframe(df_validation)

In [20]:
def generate_answer(context, question, max_length=50):
    input_text = f"question: {question}  context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=3,
            early_stopping=True
        )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return answer

In [21]:
df_validation["predicted_answer"] = df_validation.apply(
    lambda row: generate_answer(row["context"], row["question"]),
    axis=1
)

In [22]:
df_validation

Unnamed: 0,context,question,answers,predicted_answer
0,(CNN) -- What could be more powerful than the ...,What will be nominated?,three different videos,Iron Eyes Cody
1,(CNN) -- What could be more powerful than the ...,What does the Harrison Ford video feature?,"getting his chest waxed,",your own environmental videos here on CNN's Ec...
2,(CNN) -- What could be more powerful than the ...,What videos will you send?,environmental,your own environmental videos here on CNN's Ec...
3,(CNN) -- What could be more powerful than the ...,What is Ford getting waxed?,his chest,his chest
4,(CNN) -- What could be more powerful than the ...,Who got his chest waxed?,Harrison Ford,Harrison Ford
...,...,...,...,...
4207,WASHINGTON (CNN) -- President Obama on Friday ...,What will the new system include?,give detainees greater latitude in selecting l...,give detainees greater latitude in selecting l...
4208,NEW YORK (CNN) -- John and Elizabeth Calvert e...,who are growing?,friends,Calverts
4209,NEW YORK (CNN) -- John and Elizabeth Calvert e...,Where did the couple live?,Hilton Head Island,Hilton Head Island
4210,NEW YORK (CNN) -- John and Elizabeth Calvert e...,when was the last seen of John and Elizabeth?,"March 3,","March 3,"


## Upload to hf

In [23]:
from huggingface_hub import notebook_login

In [28]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:
import pandas as pd
from datasets import Dataset
hf_dataset = Dataset.from_pandas(df_validation)

In [29]:
hf_dataset.push_to_hub("baseline-dataset-1", private=False)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Shannnh/baseline-dataset-1/commit/d0bda3098d435755580efaa94b249bbb40d45d35', commit_message='Upload dataset', commit_description='', oid='d0bda3098d435755580efaa94b249bbb40d45d35', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Shannnh/baseline-dataset-1', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Shannnh/baseline-dataset-1'), pr_revision=None, pr_num=None)

## Metrics

In [33]:
from datasets import load_dataset
dataset = load_dataset("Shannnh/baseline-dataset-1")

README.md:   0%|          | 0.00/397 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.53M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4212 [00:00<?, ? examples/s]

In [34]:
print(dataset)

In [35]:
data = dataset["train"]
def calculate_f1(predicted, ground_truth):
    pred_tokens = predicted.split()
    gt_tokens = ground_truth.split()
    common = set(pred_tokens) & set(gt_tokens)
    if len(common) == 0:
        return 0.0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)

f1_scores = [calculate_f1(pred, gt) for pred, gt in zip(data["predicted_answer"], data["answers"])]
average_f1 = sum(f1_scores) / len(f1_scores)
print(f"Average F1 Score: {average_f1:.4f}")

In [37]:
average_f1

0.4901781422950149

In [36]:
def calculate_exact_match(predicted, ground_truth):
    return int(predicted.strip() == ground_truth.strip())

em_scores = [calculate_exact_match(pred, gt) for pred, gt in zip(data["predicted_answer"], data["answers"])]
average_em = sum(em_scores) / len(em_scores)
print(f"Exact Match Score: {average_em:.4f}")

In [38]:
average_em

0.31837606837606836

In [39]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

def calculate_q_bleu(predicted, ground_truth, question):
    # Tokenize inputs
    ref_tokens = ground_truth.split()
    hyp_tokens = predicted.split()
    question_tokens = question.split()

    reference = [ref_tokens]
    hypothesis = hyp_tokens
    bleu_score = sentence_bleu(reference, hypothesis)

    stop_words = set(stopwords.words('english'))
    important_ref_tokens = [token for token in ref_tokens if token.lower() not in stop_words]
    important_hyp_tokens = [token for token in hyp_tokens if token.lower() not in stop_words]

    key_match = len(set(important_ref_tokens) & set(important_hyp_tokens)) / max(len(set(important_ref_tokens)), 1)

    question_match = len(set(question_tokens) & set(hyp_tokens)) / max(len(set(question_tokens)), 1)

    q_bleu = 0.7 * bleu_score + 0.2 * key_match + 0.1 * question_match

    return q_bleu

data = {
    "predicted_answer": ["The capital of France is Paris.", "Python is a programming language."],
    "answers": ["Paris is the capital of France.", "Python is a widely used programming language."],
    "questions": ["What is the capital of France?", "What is Python?"]
}

q_bleu_scores = [
    calculate_q_bleu(pred, gt, q)
    for pred, gt, q in zip(data["predicted_answer"], data["answers"], data["questions"])
]
average_q_bleu = sum(q_bleu_scores) / len(q_bleu_scores)
print(f"Average Q-BLEU Score: {average_q_bleu:.4f}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [40]:
average_q_bleu

0.135