In [1]:
import torch
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from transformers import T5Tokenizer, T5ForConditionalGeneration

### 1. Use a pre-trained google/flan-t5-small as the model

In [2]:
model_name = "google_flan_t5/"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### 2. Verify if the summarizaton task works.

In [3]:
input_text = '''Transfer learning, where a model is first pre-trained on a data-rich task before being finetuned on a downstream task, has emerged as a powerful technique in natural language
                        processing (NLP). The effectiveness of transfer learning has given rise to a diversity of
                        approaches, methodology, and practice. In this paper, we explore the landscape of transfer
                        learning techniques for NLP by introducing a unified framework that converts all text-based
                        language problems into a text-to-text format. Our systematic study compares pre-training
                        objectives, architectures, unlabeled data sets, transfer approaches, and other factors on
                        dozens of language understanding tasks. By combining the insights from our exploration
                        with scale and our new “Colossal Clean Crawled Corpus”, we achieve state-of-the-art results
                        on many benchmarks covering summarization, question answering, text classification, and
                        more. To facilitate future work on transfer learning for NLP, we release our data set,
                        pre-trained models, and code.'''
                        
inputs = tokenizer.encode('''summarize: ''' + input_text,
                          return_tensors='pt',
                          max_length=512,
                          truncation=True)
summarization_ids = model.generate(inputs, max_length=120, min_length=40, num_beams=4)
summarization = tokenizer.decode(summarization_ids[0], skip_special_tokens = True)
print(summarization)

We explore the landscape of transfer learning techniques for NLP by introducing a unified framework that converts all text-based language problems into a text-to-text format. By combining the insights from our exploration with scale and our new “Colossal Clean Crawled Corpus”, we achieve state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more.


### 3. Verify if the Q&A task works

In [4]:
question = "What year did the Berlin Wall fall?"
context = "The Berlin Wall, a significant symbol of the Cold War, was finally brought down in 1989, leading to the reunification of East and West Germany"
input_text = f"question: {question} context: {context}"

inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512,truncation=True)
qa_ids = model.generate(inputs, max_length=50)
answer = tokenizer.decode(qa_ids[0], skip_special_tokens = True)
print(answer)

1989


### 4. Verify if English to French translation task works.

In [5]:
input_text = "To facilitate future work on transfer learning, we release our data set"

inputs = tokenizer.encode("translate English to French: "+ input_text, return_tensors="pt", max_length=512,truncation=True)
language_ids = model.generate(inputs, max_length=50)
language_translation = tokenizer.decode(language_ids[0], skip_special_tokens = True)
print(language_translation)

Pour faciliter la mise en uvre de l'apprentissage de transfert, nous remettent à l'élaboration de nos données.


### 5. Programmatically print the names of all the model layers and their dimensions

In [6]:
for name, params in model.named_parameters():
    print(f"Layer Name: {name}, Dimension: {params.size()}")

Layer Name: shared.weight, Dimension: torch.Size([32128, 512])
Layer Name: encoder.block.0.layer.0.SelfAttention.q.weight, Dimension: torch.Size([384, 512])
Layer Name: encoder.block.0.layer.0.SelfAttention.k.weight, Dimension: torch.Size([384, 512])
Layer Name: encoder.block.0.layer.0.SelfAttention.v.weight, Dimension: torch.Size([384, 512])
Layer Name: encoder.block.0.layer.0.SelfAttention.o.weight, Dimension: torch.Size([512, 384])
Layer Name: encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight, Dimension: torch.Size([32, 6])
Layer Name: encoder.block.0.layer.0.layer_norm.weight, Dimension: torch.Size([512])
Layer Name: encoder.block.0.layer.1.DenseReluDense.wi_0.weight, Dimension: torch.Size([1024, 512])
Layer Name: encoder.block.0.layer.1.DenseReluDense.wi_1.weight, Dimension: torch.Size([1024, 512])
Layer Name: encoder.block.0.layer.1.DenseReluDense.wo.weight, Dimension: torch.Size([512, 1024])
Layer Name: encoder.block.0.layer.1.layer_norm.weight, Dimension: tor

### 6. Programmatically print the total number of parameters/weights in this model.

In [7]:
print(sum(p.numel() for p in model.parameters()))

76961152


### 7. Set the tensor in final layer (decoder.final_layer_norm.weight) to all zeros.

In [8]:
model.decoder.final_layer_norm.weight.data.zero_()

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

### 8. Verify if the Q&A task works after resettng the weights of the above layer.

In [9]:
question = "What year did the Berlin Wall fall?"
context = "The Berlin Wall, a significant symbol of the Cold War, was finally brought down in 1989, leading to the reunification of East and West Germany"
input_text = f"question: {question} context: {context}"

inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512,truncation=True)
qa_ids = model.generate(inputs, max_length=50)
answer = tokenizer.decode(qa_ids[0])
print(answer)

<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


### 10. Train the model for a Q&A task that takes a context as additional input along with the question. You can use SQuAD dataset

In [11]:
model_name = "google_flan_t5/"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


#### Training_Dataset

In [13]:
train_dataset = pd.read_parquet('squad/train-00000-of-00001.parquet').sample(1000)

train_dataset['answers_text'] = train_dataset['answers'].apply(lambda x : x['text'][0] if x['text'] else '')
train_dataset['input_text'] = "question: " + train_dataset['question'] + ' context: ' + train_dataset['context']
train_dataset = train_dataset[train_dataset.answers_text.str.len() > 0 ]

input_encodings = tokenizer(train_dataset.input_text.tolist(), return_tensors="pt", max_length=512,truncation=True, padding = True)
target_encodings = tokenizer(train_dataset.answers_text.tolist(), return_tensors="pt", max_length=512,truncation=True, padding = True)

#### Test_Dataset

In [14]:
def ans(text):
    try:
        return text['text'][0]
    except:
        return ''
test_dataset = pd.read_parquet('squad/validation-00000-of-00001.parquet').sample(100)
test_dataset['input_text'] = "question: " + test_dataset['question'] + ' context: ' + test_dataset['context']
test_input_encodings = tokenizer(test_dataset.input_text.tolist(), return_tensors="pt", max_length=512,truncation=True, padding = True)
test_dataset['answers_text'] = test_dataset['answers'].apply(ans)
truths = test_dataset['answers_text'].tolist()

#### Benchmarking without Fine-Tuning

In [15]:
with torch.no_grad():
    inputs = {k: v for k, v in test_input_encodings.items()}
    outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=20)
    
predictions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

In [16]:
from collections import Counter
import string
import re

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punct(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_punct(lower(s)))

def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)


em_scores = [exact_match_score(pred, ans) for pred, ans in zip(predictions, truths)]
f1_scores = [f1_score(pred, ans) for pred, ans in zip(predictions, truths)]

average_em = sum(em_scores) / len(em_scores)
average_f1 = sum(f1_scores) / len(f1_scores)

print(f"Average Exact Match (EM): {average_em:.2f}")
print(f"Average F1 Score: {average_f1:.2f}")

Average Exact Match (EM): 0.43
Average F1 Score: 0.59


#### Fine Tuning

In [17]:
from torch.utils.data import Dataset, DataLoader

class SQuADT5Dataset(Dataset):
    def __init__(self, input_encodings, target_encodings):
        self.input_encodings = input_encodings
        self.target_encodings = target_encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.input_encodings.items()}
        item['labels'] = target_encodings['input_ids'][idx]
        return item

    def __len__(self):
        return len(self.input_encodings.input_ids)

In [18]:
train_dataset = SQuADT5Dataset(input_encodings, target_encodings)

#### Freezing all layers except last two

In [19]:
for parameter in model.parameters():
    parameter.requires_grad = False
    
for parameter in model.decoder.final_layer_norm.parameters():
    parameter.requires_grad = True
    
for parameter in model.lm_head.parameters():
    parameter.requires_grad = True

In [20]:
def print_layer_trainable_status(model):
    for name, param in model.named_parameters():
        print(f"{name}: {'trainable' if param.requires_grad else 'frozen'}")

In [21]:
print_layer_trainable_status(model)

shared.weight: frozen
encoder.block.0.layer.0.SelfAttention.q.weight: frozen
encoder.block.0.layer.0.SelfAttention.k.weight: frozen
encoder.block.0.layer.0.SelfAttention.v.weight: frozen
encoder.block.0.layer.0.SelfAttention.o.weight: frozen
encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight: frozen
encoder.block.0.layer.0.layer_norm.weight: frozen
encoder.block.0.layer.1.DenseReluDense.wi_0.weight: frozen
encoder.block.0.layer.1.DenseReluDense.wi_1.weight: frozen
encoder.block.0.layer.1.DenseReluDense.wo.weight: frozen
encoder.block.0.layer.1.layer_norm.weight: frozen
encoder.block.1.layer.0.SelfAttention.q.weight: frozen
encoder.block.1.layer.0.SelfAttention.k.weight: frozen
encoder.block.1.layer.0.SelfAttention.v.weight: frozen
encoder.block.1.layer.0.SelfAttention.o.weight: frozen
encoder.block.1.layer.0.layer_norm.weight: frozen
encoder.block.1.layer.1.DenseReluDense.wi_0.weight: frozen
encoder.block.1.layer.1.DenseReluDense.wi_1.weight: frozen
encoder.block.1.la

#### Trainig_Loop - Only 1 epoch with Batch Size = 64

In [22]:
from torch.utils.data import DataLoader
from torch.optim import AdamW

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5)

In [23]:
num_epochs = 1
print_every = 1
model.train()
for e in range(num_epochs):
    counter = 0
    for batch in train_loader:
        counter += 1
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        if counter % print_every == 0:
            print("Epoch: {}/{}...".format(e+1, num_epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()))

Epoch: 1/1... Step: 1... Loss: 43.167633...
Epoch: 1/1... Step: 2... Loss: 43.115242...
Epoch: 1/1... Step: 3... Loss: 42.771740...
Epoch: 1/1... Step: 4... Loss: 43.050915...
Epoch: 1/1... Step: 5... Loss: 42.636387...
Epoch: 1/1... Step: 6... Loss: 40.962044...
Epoch: 1/1... Step: 7... Loss: 40.944424...
Epoch: 1/1... Step: 8... Loss: 43.161983...
Epoch: 1/1... Step: 9... Loss: 41.652943...
Epoch: 1/1... Step: 10... Loss: 43.153229...
Epoch: 1/1... Step: 11... Loss: 43.107143...
Epoch: 1/1... Step: 12... Loss: 43.187603...
Epoch: 1/1... Step: 13... Loss: 42.171211...
Epoch: 1/1... Step: 14... Loss: 41.593601...
Epoch: 1/1... Step: 15... Loss: 42.631622...
Epoch: 1/1... Step: 16... Loss: 42.248489...


In [24]:
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [25]:
with torch.no_grad():
    inputs = {k: v for k, v in test_input_encodings.items()}
    outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=20)
    
predictions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

In [26]:
em_scores = [exact_match_score(pred, ans) for pred, ans in zip(predictions, truths)]
f1_scores = [f1_score(pred, ans) for pred, ans in zip(predictions, truths)]

average_em = sum(em_scores) / len(em_scores)
average_f1 = sum(f1_scores) / len(f1_scores)

print(f"Average Exact Match (EM): {average_em:.2f}")
print(f"Average F1 Score: {average_f1:.2f}")

Average Exact Match (EM): 0.43
Average F1 Score: 0.59
