In [1]:
# @title Requirements

!pip install transformers datasets

Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.17.1 dill-0.3.8 multiprocess-0.70.16


In [29]:
# @title Libraries

from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
import torch
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
from datasets import load_dataset, load_metric

In [30]:
# @title Task 1: Use a pre-trained google/flan-t5-small as the model:

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [31]:
# @title Task 2: Verify if the summarization task works:


input_text = """World War I[j] or the First World War (28 July 1914 – 11 November 1918) was a global conflict fought between two coalitions: the Allies and the Central Powers. Battles took place throughout Europe, the Middle East, Africa, the Pacific, and parts of Asia. One of the deadliest wars in history, it ultimately resulted in an estimated 9 million soldiers dead and 23 million wounded, plus another 5 million civilian deaths from numerous causes. Millions more died as a result of genocide, and the war was a major factor in the 1918 Spanish flu pandemic."""

input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
summary_ids = model.generate(input_ids, max_length=100, num_beams=4)

summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Generated Summary:", summary_text)


Generated Summary: World War I[j] or the First World War (28 July 1914 – 11 November 1918) was a global conflict fought between two coalitions: the Allies and the Central Powers


In [32]:
# @title Task 3: Verify if the Q&A task works:


context = "Albert Einstein was a German-born theoretical physicist who developed the theory of relativity."
question = "Who was Albert Einstein?"

input_text = f"question: {question} context: {context}"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

output = model.generate(input_ids)
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

print(" ")
print(decoded_output)

 
physicist




In [33]:
# @title Task 4: Verify if English to French translation task works:

english_text = "Hello, how are you?"

input_ids = tokenizer.encode("translate English to French: " + english_text, return_tensors="pt")
output_ids = model.generate(input_ids, max_length=100, num_beams=4)

french_translation = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("English Input:", english_text)
print("French Translation:", french_translation)


English Input: Hello, how are you?
French Translation: Hello, c'est-à-dire?


In [34]:
# @title Task 5: Programmatically print the names of all the model layers and their dimensions:

for name, param in model.named_parameters():
    print(name, param.size())

shared.weight torch.Size([32128, 512])
encoder.block.0.layer.0.SelfAttention.q.weight torch.Size([384, 512])
encoder.block.0.layer.0.SelfAttention.k.weight torch.Size([384, 512])
encoder.block.0.layer.0.SelfAttention.v.weight torch.Size([384, 512])
encoder.block.0.layer.0.SelfAttention.o.weight torch.Size([512, 384])
encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight torch.Size([32, 6])
encoder.block.0.layer.0.layer_norm.weight torch.Size([512])
encoder.block.0.layer.1.DenseReluDense.wi_0.weight torch.Size([1024, 512])
encoder.block.0.layer.1.DenseReluDense.wi_1.weight torch.Size([1024, 512])
encoder.block.0.layer.1.DenseReluDense.wo.weight torch.Size([512, 1024])
encoder.block.0.layer.1.layer_norm.weight torch.Size([512])
encoder.block.1.layer.0.SelfAttention.q.weight torch.Size([384, 512])
encoder.block.1.layer.0.SelfAttention.k.weight torch.Size([384, 512])
encoder.block.1.layer.0.SelfAttention.v.weight torch.Size([384, 512])
encoder.block.1.layer.0.SelfAttention.o

In [35]:
# @title Task 6: Programmatically print the total number of parameters/weights in this model:

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total parameters:", total_params)

Total parameters: 76961152


In [36]:
# @title Task 7: Set the tensor in the final layer (decoder.final_layer_norm.weight) to all zeros:

model.decoder.final_layer_norm.weight.data.fill_(0)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [37]:
# @title Task 8: Verify if the Q&A task works after resetting the weights of the above layer:

context = "Albert Einstein was a German-born theoretical physicist who developed the theory of relativity."
question = "Who was Albert Einstein?"

input_text = f"question: {question} context: {context}"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

output = model.generate(input_ids)
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

print(" ")
print(" DECODED OUTPUT: ", decoded_output) # NOT WORKING AFTER SETTING THE TENSORS IN THE FINAL LAYER TO ZERO

 
 DECODED OUTPUT:  


In [None]:
# @title Task 9: Replace the decoder.final_layer_norm.weight with a layer of smaller dimensions and adjust all the dependent layers to match the dimension

# Load the original model
original_model = T5ForConditionalGeneration.from_pretrained("t5-small")
original_config = original_model.config

# Modify the configuration to reduce the dimension of the final layer
new_dim = 256 # @param {type:"number"}
original_config.d_model = new_dim

num_layers = original_config.num_layers
scale_factor = new_dim / original_config.d_model
for i in range(num_layers):
    layer = original_model.decoder.layers[i]
    layer.self_attn.k_proj.weight.data *= scale_factor
    layer.self_attn.v_proj.weight.data *= scale_factor
    layer.self_attn.q_proj.weight.data *= scale_factor
    layer.self_attn.out_proj.weight.data *= scale_factor
    layer.fc1.weight.data *= scale_factor
    layer.fc2.weight.data *= scale_factor

# Replace the final layer normalization weight tensor with smaller dimensions
new_weight = torch.zeros(new_dim)
original_model.decoder.final_layer_norm.weight.data = new_weight

# original_model.save_pretrained("modified_flan_t5_small")


In [12]:
# @title Task 10: Reload the original google/flan-t5-small model:

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
# @title Task 11: Train the model for Q&A task


squad_dataset = load_dataset("squad")

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")

# task-specific prefix
task_prefix = "answer:"

def preprocess(example):
    context = example["context"]
    question = example["question"]
    answer = example["answers"]["text"][0]
    input_text = f"{task_prefix} {question} context: {context}"
    target_text = answer
    return {"input_text": input_text, "target_text": target_text}

train_dataset = squad_dataset["train"].map(preprocess, remove_columns=["context", "question", "answers"])
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
lr = 5e-5 # @param {type:"number"}
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3 # @param {type:"number"}
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

model.train()

for epoch in range(num_epochs):
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch in progress_bar:
        input_ids = tokenizer(batch["input_text"], return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
        labels = tokenizer(batch["target_text"], return_tensors="pt", padding=True, truncation=True, max_length=32).input_ids.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Update tqdm progress bar current loss
        progress_bar.set_postfix({"loss": loss.item()})

model.save_pretrained("qa_model")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Epoch 1/3: 100%|██████████| 10950/10950 [36:42<00:00,  4.97it/s, loss=0.287]
Epoch 2/3: 100%|██████████| 10950/10950 [36:42<00:00,  4.97it/s, loss=0.139]
Epoch 3/3: 100%|██████████| 10950/10950 [36:49<00:00,  4.96it/s, loss=0.21]


In [None]:
# @title Task 11: Evaluate the model

In [None]:
squad_validation_dataset = load_dataset("squad", split="validation")

trained_model = T5ForConditionalGeneration.from_pretrained("qa_model")
# tokenizer = T5Tokenizer.from_pretrained("qa_model")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model.to(device)

In [27]:
# task-specific prefix
task_prefix = "question:"

predictions = []
references = []

def generate_answer(context, question):

    input_text = f"{task_prefix} {question} context: {context}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)

    output_ids = trained_model.generate(input_ids, max_length=32, num_beams=4, early_stopping=True)

    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return answer


for example in squad_validation_dataset:

    context = example["context"]
    question = example["question"]

    prediction = generate_answer(context, question)
    reference = example["answers"]["text"][0]

    predictions.append({"prediction_text": prediction, "id": example["id"]})
    references.append({"answers": {"answer_start": [0], "text": [reference]}, "id": example["id"]})

# Initialize SQuAD evaluation metric
squad_metric = load_metric("squad")

results = squad_metric.compute(predictions=predictions, references=references)

print("Evaluation Scores:", results)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluation Scores: {'exact_match': 62.44087038789026, 'f1': 77.24002690297677}
