In [None]:
! pip install datasets transformers rouge-score nltk py7zr

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 5.5 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 15.7 MB/s 
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
Collecting py7zr
  Downloading py7zr-0.20.2-py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 2.6 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting dill<0.3.6
  Downloading dill-0.3.5.1-py2.py3-none-any.whl (95 kB)
[K     |████████████████████████████████| 95 kB 2.6 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 57.2 MB/s 
[?25hCollecting xxhash
  D

In [None]:
# Mount Google Drive into the Colab VM so the project folder under /content/drive is reachable.
# Colab-only: `google.colab` is not importable in a regular Jupyter kernel.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the project folder on Drive; the relative output directories used later
# ("test-dialogue-summarization", "t5_results") are resolved against this directory.
%cd /content/drive/MyDrive/NLP Project

/content/drive/MyDrive/NLP Project


# Fine-tuning a model on a summarization task

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers.modeling_utils import unwrap_model
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from transformers import AutoTokenizer
from datasets import load_dataset, load_metric
import nltk
import numpy as np


## Loading the dataset

In [None]:
# SAMSum dialogue-summarization corpus; the recorded output shows train/validation/test
# splits of 14732/818/819 examples.
raw_datasets = load_dataset("samsum")
# ROUGE scorer, read as a global by compute_metrics.
# NOTE(review): `load_metric` is the legacy `datasets` metric API — confirm it still exists
# in the installed `datasets` version before re-running.
metric = load_metric("rouge")

Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.88k [00:00<?, ?B/s]

Downloading and preparing dataset samsum/samsum to /root/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e...


Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Dataset samsum downloaded and prepared to /root/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    """Score generated summaries against the reference summaries with ROUGE.

    Relies on the notebook-level globals `tokenizer` (whichever checkpoint is
    currently loaded) and `metric` (the ROUGE metric), and on the NLTK `punkt`
    data having been downloaded before the first evaluation.

    Args:
        eval_pred: `(predictions, labels)` pair of token-id arrays handed over
            by the trainer. `predictions` may also arrive as a tuple whose
            first element holds the generated ids.

    Returns:
        dict mapping every ROUGE variant to its mid F-measure as a percentage,
        plus `gen_len` (mean number of non-padding tokens per prediction), all
        rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the loss-masking value (and the filler used when batches of
    # different length are concatenated); it is not a real token id, so map it
    # to the pad token before decoding. The original only did this for labels.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each ROUGE variant, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (padding excluded)
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
import torch.nn as nn


class CustomTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer whose loss adds plain cross-entropy on top of the label-smoothed loss.

    The custom branch only runs when `self.label_smoother` is set (the trainer
    creates it for a non-zero `label_smoothing_factor`). Without label
    smoothing the labels stay in `inputs` and the model's own loss is returned
    unchanged, i.e. the trainer then behaves like a stock `Seq2SeqTrainer`.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the training loss for one batch.

        Args:
            model: the (possibly wrapped) model being trained.
            inputs: dict of batch tensors; `labels` is popped from it when
                label smoothing is active.
            return_outputs: if True, return `(loss, outputs)` instead of `loss`.
            num_items_in_batch: unused; accepted so that transformers releases
                which pass this argument to `compute_loss` keep working.

        Returns:
            The loss tensor, or `(loss, outputs)` when `return_outputs` is True.

        Raises:
            ValueError: if label smoothing is off and the model output is a
                dict that contains no "loss" entry.
        """
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
                loss = self.label_smoother(outputs, labels, shift_labels=True)
            else:
                loss = self.label_smoother(outputs, labels)

            # Labels were popped, so the model returned no loss and the logits
            # are the first output (same lookup for dict and tuple outputs).
            logits = outputs["logits"] if isinstance(outputs, dict) else outputs[0]
            # Flatten to (tokens, vocab). The class dimension must be the
            # vocabulary size taken from the logits themselves;
            # `config.num_labels` is the classification-head label count and
            # produced a shape mismatch here. Positions labelled -100 are
            # skipped by CrossEntropyLoss's default ignore_index.
            loss_fct = nn.CrossEntropyLoss()
            cross_loss = loss_fct(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
            loss = loss + cross_loss
        else:
            if isinstance(outputs, dict) and "loss" not in outputs:
                raise ValueError(
                    "The model did not return a loss from the inputs, only the following keys: "
                    f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
                )
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        return (loss, outputs) if return_outputs else loss

## BART

### Preprocessing the data

In [None]:
# Hugging Face Hub id of the BART checkpoint; the tokenizer and model cells below load from it.
model_checkpoint = "facebook/bart-base"

In [None]:
# Tokenizer matching `model_checkpoint`; read as a global by preprocess_function and compute_metrics.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
max_input_length = 512   # dialogues are truncated to this many tokens
max_target_length = 128  # summaries are truncated to this many tokens


def preprocess_function(examples):
    """Tokenize one batch of SAMSum rows for seq2seq training.

    Uses the notebook-level `tokenizer`. No padding is applied here; sequences
    keep their own (truncated) length.

    Args:
        examples: batch dict with the list-valued columns "dialogue" and "summary".

    Returns:
        The tokenizer output for the dialogues with an extra "labels" entry
        holding the token ids of the summaries.
    """
    model_inputs = tokenizer(examples["dialogue"], max_length=max_input_length, truncation=True)

    # `text_target=` tokenizes the summaries as targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Tokenize every split in batches. The original columns (dialogue, summary, id) stay in the
# dataset next to the new ones; the trainer logs that it ignores them.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/15 [00:00<?, ?ba/s]

  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Splits handed to the trainer below (training and per-epoch evaluation respectively).
tokenized_datasets_train = tokenized_datasets['train']
tokenized_datasets_val = tokenized_datasets['validation']

### Fine-tuning the model

In [None]:
# Load the pretrained seq2seq weights for `model_checkpoint`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [None]:
# In this iteration I changed the following parameters:
# new
# max_input_length = 1024
# max_target_length = 256
# old
# max_input_length = 512
# max_target_length = 128

# new (not done yet)
# learning_rate = 1e-5
# old
# learning_rate = 2e-5

# All of these runs were also done with our custom loss function added.

In [None]:
# Hyper-parameters for the BART fine-tuning run.
batch_size = 32
args = Seq2SeqTrainingArguments(
    output_dir="test-dialogue-summarization",
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",   # evaluate at the end of every epoch
    logging_steps=10,              # added
    predict_with_generate=True,    # evaluate on generated summaries so ROUGE can be computed
    fp16=True,                     # mixed-precision training
    # Options that were tried and left disabled:
    # gradient_accumulation_steps=2,
    # save_total_limit=2,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Pads inputs and labels per batch. `model` is passed so the collator can derive the
# decoder input ids from the labels.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# BART trainer built on CustomTrainer (custom loss).
# For a stock-loss baseline, construct a plain `Seq2SeqTrainer` with exactly the same arguments.
trainer = CustomTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using cuda_amp half precision backend


In [None]:
# `punkt` is the sentence-tokenizer data used by nltk.sent_tokenize in compute_metrics;
# it must be present before the first trainer.evaluate() call.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Baseline scores on the validation split before fine-tuning in this run.
# NOTE(review): the recorded output below already shows ROUGE-1 ~47.9, about the same as after
# training — presumably `model` had already been fine-tuned earlier in the same kernel.
# Re-run from a fresh kernel to confirm the true pre-training baseline.
trainer.evaluate() #before training

The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, dialogue, id. If summary, dialogue, id are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 818
  Batch size = 32


{'eval_loss': 1.5149409770965576,
 'eval_rouge1': 47.8974,
 'eval_rouge2': 25.0226,
 'eval_rougeL': 40.6141,
 'eval_rougeLsum': 44.3016,
 'eval_gen_len': 18.1663,
 'eval_runtime': 44.2467,
 'eval_samples_per_second': 18.487,
 'eval_steps_per_second': 0.588}

In [None]:
# Fine-tune BART for the configured 5 epochs, evaluating on the validation split after each one.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, dialogue, id. If summary, dialogue, id are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14732
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2305
  Number of trainable parameters = 139420416


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.5405,1.515237,48.106,25.006,40.7695,44.4789,18.2787
2,1.4416,1.504386,48.4932,25.5341,41.2128,44.9875,18.2702
3,1.3602,1.509629,48.3065,25.9549,41.3665,44.9538,18.1112
4,1.2857,1.505367,48.7956,26.0232,41.5933,45.3164,18.1418
5,1.3638,1.507278,48.5343,25.8065,41.4085,45.145,18.2775


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, dialogue, id. If summary, dialogue, id are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 818
  Batch size = 32


Saving model checkpoint to test-dialogue-summarization/checkpoint-500
Configuration saved in test-dialogue-summarization/checkpoint-500/config.json
Model weights saved in test-dialogue-summarization/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-dialogue-summarization/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-dialogue-summarization/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, dialogue, id. If summary, dialogue, id are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 818
  Batch size = 32
Saving model checkpoint to test-dialogue-summarization/checkpoint-1000
Configuration saved in test-dialogue-summarization/checkpoint-1000/config.json
Model weights saved in test-dialogue-summarization/checkpoint-1000

TrainOutput(global_step=2305, training_loss=1.3976633634587947, metrics={'train_runtime': 2166.8283, 'train_samples_per_second': 33.994, 'train_steps_per_second': 1.064, 'total_flos': 1.9573314388992e+16, 'train_loss': 1.3976633634587947, 'epoch': 5.0})

In [None]:
# Final validation scores after fine-tuning.
# NOTE(review): the recorded numbers below (ROUGE-1 47.51, loss 1.530) do not match the epoch-5
# row of the training log (48.53, 1.507) although both use the same 818 validation examples —
# presumably the outputs come from different runs; re-run to confirm.
trainer.evaluate() #after training

The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, dialogue, id. If summary, dialogue, id are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 818
  Batch size = 32


{'eval_loss': 1.5303208827972412,
 'eval_rouge1': 47.5105,
 'eval_rouge2': 24.1888,
 'eval_rougeL': 40.0868,
 'eval_rougeLsum': 43.8536,
 'eval_gen_len': 18.1125,
 'eval_runtime': 44.2729,
 'eval_samples_per_second': 18.476,
 'eval_steps_per_second': 0.587,
 'epoch': 5.0}

## T5

### Preprocessing the data

In [None]:
# Switch to the T5 checkpoint; the next cell reloads `tokenizer` and `model` from it.
model_checkpoint = "t5-base"

In [None]:
# Reload tokenizer and model for T5. This rebinds the globals `tokenizer` and `model`, so
# compute_metrics decodes with the T5 tokenizer from here on.
# The imports below repeat names already imported in the import cell near the top.
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--t5-base/snapshots/23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,


In [None]:
max_input_length = 512   # "summarize: " + dialogue is truncated to this many tokens
max_target_length = 128  # summaries are truncated to this many tokens


def preprocess_function(examples):
    """Tokenize one batch of SAMSum rows for T5.

    Every dialogue is prefixed with the T5 task prompt "summarize: ". Uses the
    notebook-level `tokenizer`.

    No padding is applied here: the `DataCollatorForSeq2Seq` used by the
    trainer pads each batch to its longest sequence and fills label padding
    with -100. Padding every example to 512/128 tokens up front only made each
    batch larger than necessary.

    Args:
        examples: batch dict with the list-valued columns "dialogue" and "summary".

    Returns:
        The tokenizer output for the prefixed dialogues with an extra "labels"
        entry holding the token ids of the summaries.
    """
    task_prefix = "summarize: "
    model_inputs = tokenizer(
        [task_prefix + dialogue for dialogue in examples["dialogue"]],
        max_length=max_input_length,
        truncation=True,
    )

    # Tokenize the summaries as targets (same call style as the BART cell).
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Re-tokenize all splits with the T5 preprocess_function; this rebinds `tokenized_datasets`,
# replacing the BART-tokenized version.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)



  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Uncomment to sample a small subset for quick development runs:
# tokenized_datasets_train = tokenized_datasets['train'].select(range(100))
# tokenized_datasets_val = tokenized_datasets['validation'].select(range(70))


# Full train/validation splits used for the actual T5 run.
tokenized_datasets_train = tokenized_datasets['train']
tokenized_datasets_val = tokenized_datasets['validation']

### Fine-tuning the model

In [None]:
# Hyper-parameters for the T5 fine-tuning run.
batch_size = 8
training_args = Seq2SeqTrainingArguments(
    output_dir="t5_results",
    do_train=True,
    do_eval=True,
    num_train_epochs=5,
    learning_rate=1e-4,
    warmup_steps=500,
    weight_decay=0.1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    predict_with_generate=True,
    logging_steps=10,
    save_total_limit=3,
    # Options that were tried and left disabled:
    # label_smoothing_factor=0.1,  # raised an error when enabled
    # logging_dir="logs",
)

# Collator and trainer for T5, mirroring the BART setup.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Evaluate on the validation split before training, as a baseline for comparison.
# The recorded result below shows ROUGE-1 24.58 at this point.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, dialogue, id. If summary, dialogue, id are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 818
  Batch size = 8
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 2.411588191986084,
 'eval_rouge1': 24.5814,
 'eval_rouge2': 7.6806,
 'eval_rougeL': 20.811,
 'eval_rougeLsum': 22.5157,
 'eval_gen_len': 18.3191,
 'eval_runtime': 106.4884,
 'eval_samples_per_second': 7.682,
 'eval_steps_per_second': 0.967}

In [None]:

# Fine-tune T5 for the configured 5 epochs. With save_total_limit=3 older checkpoints in
# t5_results/ are deleted as new ones are written (see the recorded log below).
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, dialogue, id. If summary, dialogue, id are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14732
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9210
  Number of trainable parameters = 222903552


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.4492,1.406961,47.1957,23.9462,39.7559,43.5966,16.8154
2,1.3351,1.377671,48.0974,24.7391,40.6019,44.6338,17.0562
3,1.1758,1.374614,48.1377,24.8276,40.6396,44.4451,16.9425
4,1.1536,1.386299,48.4272,25.0362,40.9575,44.7725,16.978
5,0.9342,1.401495,48.1292,24.7742,40.5648,44.3401,17.0868


Saving model checkpoint to t5_results/checkpoint-500
Configuration saved in t5_results/checkpoint-500/config.json
Model weights saved in t5_results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in t5_results/checkpoint-500/tokenizer_config.json
Special tokens file saved in t5_results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to t5_results/checkpoint-1000
Configuration saved in t5_results/checkpoint-1000/config.json
Model weights saved in t5_results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in t5_results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in t5_results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to t5_results/checkpoint-1500
Configuration saved in t5_results/checkpoint-1500/config.json
Model weights saved in t5_results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in t5_results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in t5_results/checkpoint-15

Saving model checkpoint to t5_results/checkpoint-2000
Configuration saved in t5_results/checkpoint-2000/config.json
Model weights saved in t5_results/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in t5_results/checkpoint-2000/tokenizer_config.json
Special tokens file saved in t5_results/checkpoint-2000/special_tokens_map.json
Deleting older checkpoint [t5_results/checkpoint-500] due to args.save_total_limit
Saving model checkpoint to t5_results/checkpoint-2500
Configuration saved in t5_results/checkpoint-2500/config.json
Model weights saved in t5_results/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in t5_results/checkpoint-2500/tokenizer_config.json
Special tokens file saved in t5_results/checkpoint-2500/special_tokens_map.json
Deleting older checkpoint [t5_results/checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to t5_results/checkpoint-3000
Configuration saved in t5_results/checkpoint-3000/config.json
Model weights saved in t5_res

TrainOutput(global_step=9210, training_loss=1.2890792537072064, metrics={'train_runtime': 11606.1589, 'train_samples_per_second': 6.347, 'train_steps_per_second': 0.794, 'total_flos': 4.48558382186496e+16, 'train_loss': 1.2890792537072064, 'epoch': 5.0})

In [None]:
# Final validation scores after training; the recorded result matches the epoch-5 row of the
# training log above.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, dialogue, id. If summary, dialogue, id are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 818
  Batch size = 8


{'eval_loss': 1.4014945030212402,
 'eval_rouge1': 48.1292,
 'eval_rouge2': 24.7742,
 'eval_rougeL': 40.5648,
 'eval_rougeLsum': 44.3401,
 'eval_gen_len': 17.0868,
 'eval_runtime': 108.3146,
 'eval_samples_per_second': 7.552,
 'eval_steps_per_second': 0.951,
 'epoch': 5.0}