In [70]:
! pip install accelerate



In [71]:
! pip install datasets



In [72]:
!pip install rouge_score



In [73]:
!pip install evaluate



# ***All Imports in 1 Place***

In [1]:
# pip install accelerate
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Install necessary libraries
# !pip install transformers datasets

# Import everything we'll use
import datasets
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import nltk
import evaluate
import numpy as np
import torch

# ***1. Use a pre-trained google/flan-t5-small as the model.***

In [2]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small", device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
input_text = "Best way to stay fit?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = model.generate(input_ids, max_new_tokens=100)

In [4]:
tokenizer.decode(outputs[0])

'<pad> Use a sock to keep your body fit</s>'

# ***2. Verify if the summarization task works.***

In [5]:
# Text to for summarization
text = """There are many types of cancer treatment. The types of treatment that you have will depend on the type of cancer you have and how advanced it is. Some people with cancer will have only one treatment. But most people have a combination of treatments, such as surgery with chemotherapy and/or radiation therapy. You may also have immunotherapy, targeted therapy, or hormone therapy.
Clinical trials might also be an option for you. Clinical trials are research studies that involve people. Understanding what they are and how they work can help you decide if taking part in a trial is a good option for you.
When you need treatment for cancer, you have a lot to learn and think about. It is normal to feel overwhelmed and confused. But, talking with your doctor and learning all you can about all your treatment options, including clinical trials, can help you make a decision you feel good about."""

prompt = f"summarize the treatments: {text}"

In [6]:
# Tokenize the prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Generate the summary
outputs = model.generate(input_ids, max_new_tokens=100)
tokenizer.decode(outputs[0])

'<pad> Learn about cancer treatment. Learn about clinical trials.</s>'

### **Model has given the generic summary of the given text, where as the ask was to summarize the treatments listed in the context.**

# ***3. Verify if the Q&A task works.***

In [7]:
# Text to for summarization
text = """There are many types of cancer treatment. The types of treatment that you have will depend on the type of cancer you have and how advanced it is. Some people with cancer will have only one treatment. But most people have a combination of treatments, such as surgery with chemotherapy and/or radiation therapy. You may also have immunotherapy, targeted therapy, or hormone therapy.
Clinical trials might also be an option for you. Clinical trials are research studies that involve people. Understanding what they are and how they work can help you decide if taking part in a trial is a good option for you.
When you need treatment for cancer, you have a lot to learn and think about. It is normal to feel overwhelmed and confused. But, talking with your doctor and learning all you can about all your treatment options, including clinical trials, can help you make a decision you feel good about."""

prompt = f"""
Context: {text}
Answer the given questions from the given:
1. Cancer treatment depends on what?
2. What is advised to do in case of cancer?
"""

In [8]:
# Tokenize the prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Generate the summary
outputs = model.generate(input_ids, max_new_tokens=100)
tokenizer.decode(outputs[0])

'<pad> 1. the type of cancer you have and how advanced it is 2. Talk with your doctor and learning all you can about all your treatment options, including clinical trials</s>'

### **Q&A task seems to be working well. It could able to answer both the question from the given context**

# ***4. Verify if English to French translation task works.***

In [9]:
# Text to for summarization
text = """There are many types of cancer treatment. The types of treatment that you have will depend on the type of cancer you have and how advanced it is. Some people with cancer will have only one treatment. But most people have a combination of treatments, such as surgery with chemotherapy and/or radiation therapy. You may also have immunotherapy, targeted therapy, or hormone therapy.
Clinical trials might also be an option for you. Clinical trials are research studies that involve people. Understanding what they are and how they work can help you decide if taking part in a trial is a good option for you.
When you need treatment for cancer, you have a lot to learn and think about. It is normal to feel overwhelmed and confused. But, talking with your doctor and learning all you can about all your treatment options, including clinical trials, can help you make a decision you feel good about."""

prompt = f"""
Translate to French: {text}
"""

In [10]:
# Tokenize the prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Generate the summary
outputs = model.generate(input_ids, max_new_tokens=len(input_ids[0]))
tokenizer.decode(outputs[0])

"<pad>Les types de traitements qui vous avez en fonction de l'âge de cancer. Les types de traitements qui vous avez en fonction de l'âge de cancer. Les types de traitements qui vous avez en fonction de l'âge de cancer. Les types de traitements qui vous avez en fonction de l'âge de cancer. Les types de traitements qui vous avez en fonction de l'âge de cancer. Les types de traitements qui vous avez en fonction de l'âge de cancer. Les types de traitements qui vous avez en fonction de l'âge de cancer. Les types de traitements qui vous avez en fonction de l'âge de cancer. Les types de traitements qui vous avez en fonction de l'âge de cancer. Les types de traitements qui vous"

### **English to French translation is not working well. The same sentence is being repeated in the translation output**

# ***5. Programmatically print the names of all the model layers and their dimensions.***

In [11]:
# Iterate through all modules of the model
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Module):  # Check if it's a layer
        print(name)

        # Print dimensions for parameterizable layers
        for param_name, param in module.named_parameters():
            print(f"\t {param_name}: {list(param.shape)}")


	 shared.weight: [32128, 512]
	 encoder.block.0.layer.0.SelfAttention.q.weight: [384, 512]
	 encoder.block.0.layer.0.SelfAttention.k.weight: [384, 512]
	 encoder.block.0.layer.0.SelfAttention.v.weight: [384, 512]
	 encoder.block.0.layer.0.SelfAttention.o.weight: [512, 384]
	 encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight: [32, 6]
	 encoder.block.0.layer.0.layer_norm.weight: [512]
	 encoder.block.0.layer.1.DenseReluDense.wi_0.weight: [1024, 512]
	 encoder.block.0.layer.1.DenseReluDense.wi_1.weight: [1024, 512]
	 encoder.block.0.layer.1.DenseReluDense.wo.weight: [512, 1024]
	 encoder.block.0.layer.1.layer_norm.weight: [512]
	 encoder.block.1.layer.0.SelfAttention.q.weight: [384, 512]
	 encoder.block.1.layer.0.SelfAttention.k.weight: [384, 512]
	 encoder.block.1.layer.0.SelfAttention.v.weight: [384, 512]
	 encoder.block.1.layer.0.SelfAttention.o.weight: [512, 384]
	 encoder.block.1.layer.0.layer_norm.weight: [512]
	 encoder.block.1.layer.1.DenseReluDense.wi_0.weight

# ***6. Programmatically print the total number of parameters/weights in this model***

In [12]:
# Calculate total parameters
total_params = sum(p.numel() for p in model.parameters())

# Print total parameters
print(f"Total number of parameters: {total_params:,}")

# Calculate total trainable parameters (optional)
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

# Print total trainable parameters (optional)
print(f"Total number of trainable parameters: {total_trainable_params:,}")

Total number of parameters: 76,961,152
Total number of trainable parameters: 76,961,152


# ***7. Set the tensor in final layer (decoder.final_layer_norm.weight) to all zeros.***

In [13]:
# Access the tensor
final_layer_norm_weight = model.decoder.final_layer_norm.weight

# Set all elements to zero
final_layer_norm_weight.data.fill_(0.0)

# Print confirmation
print(model.decoder.final_layer_norm.weight.data)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

# ***8. Verify if the Q&A task works after resetting the weights of the above layer***

In [14]:
# Text to for summarization
text = """There are many types of cancer treatment. The types of treatment that you have will depend on the type of cancer you have and how advanced it is. Some people with cancer will have only one treatment. But most people have a combination of treatments, such as surgery with chemotherapy and/or radiation therapy. You may also have immunotherapy, targeted therapy, or hormone therapy.
Clinical trials might also be an option for you. Clinical trials are research studies that involve people. Understanding what they are and how they work can help you decide if taking part in a trial is a good option for you.
When you need treatment for cancer, you have a lot to learn and think about. It is normal to feel overwhelmed and confused. But, talking with your doctor and learning all you can about all your treatment options, including clinical trials, can help you make a decision you feel good about."""

prompt = f"""
Context: {text}
Answer the given questions from the given:
1. Cancer treatment depends on what?
2. What is advised to do in case of cancer?
"""

In [15]:
# Tokenize the prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Generate the summary
outputs = model.generate(input_ids, max_new_tokens=100)
tokenizer.decode(outputs[0])

'<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

### **Q&A is not working after updating the final layer weights to all zeros**

# ***9. Replace the decoder.final_layer_norm.weight with a layer of smaller dimensions and adjust all the dependent layers to match the dimension***

In [16]:
import torch.nn as nn

In [17]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [18]:
class T5LayerNorm(nn.Module):
    def __init__(self, normalized_shape, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps

    def forward(self, x):
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        return self.weight * x + self.bias

In [20]:
#Changing normalized_shape from 512 -> 256
model.decoder.final_layer_norm = T5LayerNorm(normalized_shape=256)

In [22]:
#Adjusting lm_head which follows the final_layer_norm
model.lm_head = nn.Linear(in_features=256, out_features=32128, bias=False)

In [24]:
#Adjusting layer_norm, DenseReluDense.wo present in 7th block in T5 Stack
model.decoder.block[7].layer[2].layer_norm = T5LayerNorm(normalized_shape=256)
model.decoder.block[7].layer[2].DenseReluDense.wo = nn.Linear(in_features=1024, out_features=256, bias=False)

In [26]:
#modified model
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

# ***10. Reload the original google/flan-t5-small model.***

In [27]:
# Load the tokenizer, model, and data collator
MODEL_NAME = "google/flan-t5-small"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# ***11. Train the model for a Q&A task that takes a context as addi'onal input along with the question. You can use SQuAD dataset (h_ps://rajpurkar.github.io/SQuAD-explorer/ )***

In [28]:
# Acquire the training data from Hugging Face
DATA_NAME = "squad_v2"
squad_answers_qa = load_dataset(DATA_NAME)

# Split the training dataset into Train and Validation
squad_answers_qa = squad_answers_qa["train"].train_test_split(test_size=0.3)

Downloading readme:   0%|          | 0.00/8.18k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [29]:
# Define the preprocessing function, and other utilities
def replace_answers_if_empty_list(answers):
    for i in range(len(answers)):

      answers[i] = ";".join(answers[i]["text"])
    return answers

def create_input_prompts(examples):
    input_prompts = []
    for i in range(len(examples['id'])):
      prompt = f"""
      Context: {examples['context'][i]}
      Ask: From the above context, answer the below question.
      {examples["question"][i]}
      """
      input_prompts.append(prompt)
    return input_prompts

def preprocess_function(examples):
   """Add prefix to the sentences, tokenize the text, and set the labels"""
   # The "inputs" are the tokenized answer:
   inputs = create_input_prompts(examples)
   model_inputs = tokenizer(inputs, max_length=128, truncation=True)

   examples["answers"] = replace_answers_if_empty_list(examples["answers"])
   # The "labels" are the tokenized outputs:
   labels = tokenizer(text_target=examples["answers"],
                      max_length=512,
                      truncation=True)
   model_inputs["labels"] = labels["input_ids"]

   return model_inputs

In [30]:
# Map the preprocessing function across our dataset
tokenized_dataset = squad_answers_qa.map(preprocess_function, batched=True)

Map:   0%|          | 0/91223 [00:00<?, ? examples/s]

Map:   0%|          | 0/39096 [00:00<?, ? examples/s]

In [31]:
# Selecting the first 100 examples to run training.
first_100_dataset_train = tokenized_dataset['train'].select(range(100))
first_30_dataset_validation = tokenized_dataset['test'].select(range(30))

In [32]:
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")
def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

   return result

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [34]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 16
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 3

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="./results",
   evaluation_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=first_100_dataset_train,
   eval_dataset=first_30_dataset_validation,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,2.591416,0.06,0.055556,0.06,0.06
2,No log,2.536174,0.06,0.055556,0.06,0.06
3,No log,2.509837,0.06,0.055556,0.06,0.06


TrainOutput(global_step=21, training_loss=1.6599807739257812, metrics={'train_runtime': 131.3323, 'train_samples_per_second': 2.284, 'train_steps_per_second': 0.16, 'total_flos': 13941787852800.0, 'train_loss': 1.6599807739257812, 'epoch': 3.0})

# ***12. Evaluate the quality of the model*** **bold text**

In [38]:
#Select a small chunk from the test set to test the quality of the model

test_data = tokenized_dataset['test'].select(range(31,100))

In [40]:
results = trainer.evaluate(eval_dataset=test_data)
print(results)



{'eval_loss': 2.3790457248687744, 'eval_rouge1': 0.11050088990592423, 'eval_rouge2': 0.08695652173913043, 'eval_rougeL': 0.11001416584940613, 'eval_rougeLsum': 0.10966910028694925, 'eval_runtime': 21.9684, 'eval_samples_per_second': 3.141, 'eval_steps_per_second': 0.819, 'epoch': 3.0}
