In [None]:
import os
# Set in the first cell, i.e. before the cells below import torch.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is the CUDA switch for synchronous kernel launches, typically
# enabled to localise device-side asserts; it presumably slows training — confirm it is still needed.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

# GPU-only diagnostics; byte counts are converted to GiB and rounded to one decimal.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))

    # The 'Memory Usage' label actually reports the device's total memory.
    print('Memory Usage:',round(torch.cuda.get_device_properties(0).total_memory/1024**3,1), 'GB')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    # memory_reserved() is the current name of the deprecated memory_cached().
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')


Using device: cuda

Tesla T4
Memory Usage: 14.8 GB
Allocated: 2.1 GB
Cached:    11.6 GB




In [None]:
import torch
import numpy as np
import random

def set_random_seed(seed):
    """Seed every random number generator this notebook uses with `seed`.

    Seeds Python's `random`, NumPy's global generator, and PyTorch (via
    `manual_seed` and `cuda.manual_seed_all`), then turns on cuDNN's
    deterministic flag.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True


set_random_seed(0)

In [None]:
! pip install datasets transformers rouge-score nltk py7zr



In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# %cd /content/drive/MyDrive/NLP Project with SCL

# Fine-tuning a model on a summarization task

## Loading the dataset

In [None]:
from datasets import load_dataset, load_metric

# Load the "samsum" dataset; later cells index its 'train', 'validation' and 'test' splits.
raw_datasets = load_dataset("samsum")

# ROUGE metric object.
# NOTE(review): not referenced in the visible part of the notebook — presumably used by a later evaluation cell.
metric = load_metric("rouge")



  0%|          | 0/3 [00:00<?, ?it/s]

## BART

### Preprocessing the data

In [None]:
# Checkpoint identifier passed to AutoTokenizer.from_pretrained in the next cell.
model_checkpoint = "facebook/bart-base"

In [None]:
from transformers import AutoTokenizer
    
# Tokenizer for model_checkpoint; check_token_length and preprocess_function read it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--bart-base/snapshots/aadd2ab0ae0c8268c7c9693540e9904811f36177/config.json
Model config BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gra

In [None]:
def check_token_length(dataset, max_tokens=1000, tok=None):
    """Return (and print) the indices of dialogues that tokenize to more than `max_tokens` ids.

    Args:
        dataset: object whose ``dataset['dialogue']`` is a sequence of dialogue strings.
        max_tokens: a dialogue is reported when its ``input_ids`` length is strictly greater
            than this value. Defaults to 1000.
        tok: callable that maps a string to a mapping with an ``'input_ids'`` entry.
            Defaults to the module-level ``tokenizer``.

    Returns:
        List of integer positions, in ascending order, of the over-long dialogues.
    """
    if tok is None:
        tok = tokenizer
    # Read the column once instead of evaluating dataset['dialogue'] again on every iteration.
    dialogues = dataset['dialogue']
    ids = [
        index
        for index, dialogue in enumerate(dialogues)
        if len(tok(dialogue)['input_ids']) > max_tokens
    ]
    print(ids)
    return ids
def remove_idx(list_idx, dataset):
    """Return `dataset` without the rows whose positions are listed in `list_idx`.

    Args:
        list_idx: iterable of integer row positions to drop.
        dataset: object supporting ``len()`` and ``select(indices)``.

    Returns:
        Whatever ``dataset.select`` returns for the kept positions (ascending order).
    """
    # Build the lookup set once rather than once per row.
    excluded = set(list_idx)
    kept = [i for i in range(len(dataset)) if i not in excluded]
    return dataset.select(kept)
    
# Find, per split, the dialogues whose plain tokenization exceeds check_token_length's threshold ...
train_ids=check_token_length(raw_datasets['train'])
validation_ids=check_token_length(raw_datasets['validation'])
test_ids = check_token_length(raw_datasets['test'])
# ... and build filtered copies of each split without those rows.
changed_datasets_train=remove_idx(train_ids, raw_datasets['train'])
changed_datasets_val = remove_idx(validation_ids, raw_datasets['validation'])
changed_datasets_test = remove_idx(test_ids, raw_datasets['test'])

Token indices sequence length is longer than the specified maximum sequence length for this model (1081 > 1024). Running this sequence through the model will result in indexing errors


[4269, 8198]
[]
[]


In [None]:
# Token-length limits read by preprocess_function when it calls the tokenizer with truncation=True.
max_input_length = 1024
max_target_length = 128

def make_one_hot_sequence(input_ids, sequence_ids):
    """Build the per-token speaker/utterance tag list for one dialogue.

    Args:
        input_ids: token id list of the dialogue.
        sequence_ids: list of dicts, each with ``'spk'`` and ``'utt'`` entries holding
            ``[start, end)`` positions into `input_ids`.

    Returns:
        A list that starts and ends with 0. In between, each turn contributes its
        speaker id once per speaker position followed by -1 once per utterance
        position. Speaker ids start at 1 and are handed out in order of first
        appearance; two turns share an id when their speaker token slices are equal.
    """
    speaker_by_tokens = {}
    tags = [0]
    for turn in sequence_ids:
        spk_start, spk_end = turn['spk'][0], turn['spk'][1]
        key = str(input_ids[spk_start:spk_end])
        if key not in speaker_by_tokens:
            # Ids are consecutive, so the next free id is the current count plus one.
            speaker_by_tokens[key] = len(speaker_by_tokens) + 1
        tags.extend([speaker_by_tokens[key]] * (spk_end - spk_start))
        tags.extend([-1] * (turn['utt'][1] - turn['utt'][0]))
    tags.append(0)
    return tags


def preprocess_function(examples):
    r"""Tokenize a batch of dialogues turn by turn and attach speaker/utterance tags.

    Each dialogue is split on "\r\n" into turns, and each turn is split at its first ':'
    into a speaker piece and an utterance piece (the ':' stays with the utterance). The
    pieces are tokenized separately and stitched back together, with the "\r\n" tokens
    re-inserted after every utterance except the last one.

    Reads the module-level ``tokenizer``, ``max_input_length``, ``max_target_length``
    and ``make_one_hot_sequence``.

    Args:
        examples: batch mapping with ``'dialogue'`` and ``'summary'`` lists of strings.

    Returns:
        dict with ``'input_ids'``, ``'attention_mask'``, ``'spk_utt_pos'`` (output of
        ``make_one_hot_sequence`` per dialogue) and ``'labels'`` (token ids of the summaries,
        truncated to ``max_target_length``). The stitched inputs are NOT truncated.
    """
    # Tokenize the separator once; [1:-1] drops the first and last ids (presumably the special tokens).
    newline_encoding = tokenizer(["\r\n"])
    slash_n = newline_encoding['input_ids'][0][1:-1]
    slash_n_mask = newline_encoding['attention_mask'][0][1:-1]
    inputs_list=[]
    masks_list=[]
    pos_list=[]
    for dialogue in examples['dialogue']:
        # breaking the dialogue for spk:utt info
        broken=[]
        for utt in dialogue.split("\r\n"):
            # NOTE(review): for a turn without ':' find() returns -1, so the turn's last character
            # becomes the "utterance" piece — confirm such turns do not occur in the data.
            first_ind = utt.find(':')
            broken.append(utt[:first_ind])
            broken.append(utt[first_ind:])

        # One tokenizer call provides both the ids and the masks.
        encoded_broken = tokenizer(broken)
        tokenized_broken = encoded_broken['input_ids']
        attention_broken = encoded_broken['attention_mask']

        # adding \r\n tokens just before the final id of every utterance except the last one
        for i in range(1, len(tokenized_broken)-1, 2):
            tokenized_broken[i][-1:-1] = slash_n
            attention_broken[i][-1:-1] = slash_n_mask
        joined = tokenized_broken[0]

        # annotating for spk_utt_pos
        assoc_dict={}
        assoc_dict['spk'] = [1, len(tokenized_broken[0])-1] # the range is actually exclusive of the last index.
        odd_bool = True  # True -> the next piece is an utterance, False -> a speaker
        running_length = len(tokenized_broken[0])
        sequence_ids=[]
        for inner in tokenized_broken[1:]:
            # Stitching drops the previous piece's last id and this piece's first id, so this
            # piece's content occupies [running_length-1, running_length+len(inner)-3).
            if odd_bool==True:
                assoc_dict['utt']=[running_length-1, running_length+len(inner)-3]
                odd_bool=False
                sequence_ids.append(assoc_dict)
                assoc_dict={}
            else:
                assoc_dict['spk']=[running_length-1, running_length+len(inner)-3]
                odd_bool=True
            joined = joined[:-1]+inner[1:]
            running_length += (len(inner)-2)

        # test for CUDA assert error: report (but keep) inputs longer than the model limit
        if len(joined) > max_input_length:
            print(f"input tokens list length {len(joined)} exceeds {max_input_length}; example is kept untruncated")
            print(tokenizer.decode(joined))

        # creating inputs list
        inputs_list.append(joined)
        pos_list.append(make_one_hot_sequence(joined, sequence_ids))

        # creating new mask, stitched exactly like the ids
        joined_mask = attention_broken[0]
        for inner_attention in attention_broken[1:]:
            joined_mask = joined_mask[:-1]+inner_attention[1:]
        masks_list.append(joined_mask)

    model_inputs = {
        'input_ids': inputs_list,
        'attention_mask': masks_list,
        'spk_utt_pos': pos_list,
    }
    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Run preprocess_function over each filtered split in batches; the *_o variables keep the original columns.
tokenized_datasets_train_o = changed_datasets_train.map(preprocess_function, batched=True)
tokenized_datasets_val_o = changed_datasets_val.map(preprocess_function, batched=True)
tokenized_datasets_test_o = changed_datasets_test.map(preprocess_function, batched=True)

# tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
# Drop the raw text/id columns so only the columns added by preprocess_function remain.
tokenized_datasets_train = tokenized_datasets_train_o.remove_columns(['id', 'dialogue', 'summary'])
tokenized_datasets_val = tokenized_datasets_val_o.remove_columns(['id', 'dialogue', 'summary'])
tokenized_datasets_test = tokenized_datasets_test_o.remove_columns(['id', 'dialogue', 'summary'])



  0%|          | 0/1 [00:00<?, ?ba/s]

  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


In [None]:
# tokenized_datasets_train = tokenized_datasets_train.select(range(2500))
# tokenized_datasets_val = tokenized_datasets_val.select(range(500))
# tokenized_datasets_train = tokenized_datasets_train
# tokenized_datasets_val = tokenized_datasets_val

In [None]:
from transformers import Seq2SeqTrainer
from transformers.modeling_utils import unwrap_model
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES


class CustomTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer subclass with an explicit `compute_loss` hook."""

    def compute_loss(self, model, inputs, return_outputs=False):
        """Run `model` on `inputs` and return the loss (optionally with the outputs).

        When a label smoother is configured and `inputs` contains "labels", the labels
        are removed from `inputs` and the loss comes from the label smoother; otherwise
        the loss is read from the model outputs.

        Raises:
            ValueError: if the outputs are a dict without a "loss" entry and no label
                smoothing is applied.
        """
        smoothing_applies = self.label_smoother is not None and "labels" in inputs
        labels = inputs.pop("labels") if smoothing_applies else None
        outputs = model(**inputs)

        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is None:
            outputs_are_dict = isinstance(outputs, dict)
            if outputs_are_dict and "loss" not in outputs:
                raise ValueError(
                    "The model did not return a loss from the inputs, only the following keys: "
                    f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
                )
            # Index rather than use .loss: the model may return a tuple instead of a ModelOutput.
            loss = outputs["loss"] if outputs_are_dict else outputs[0]
        elif unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
            loss = self.label_smoother(outputs, labels, shift_labels=True)
        else:
            loss = self.label_smoother(outputs, labels)

        if return_outputs:
            return (loss, outputs)
        return loss


In [None]:
from transformers import PreTrainedTokenizerBase
from transformers.utils import PaddingStrategy
from transformers import DataCollatorForSeq2Seq
from typing import Optional, Any, Union
import numpy as np


class CustomCollatorForSeq2Seq(DataCollatorForSeq2Seq):
    r"""
    Data collator that will dynamically pad the inputs received, as well as the labels and the
    per-token ``spk_utt_pos`` tags (padded with 0) when the features carry them.
    Args:
        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
            The tokenizer used for encoding the data.
        model ([`PreTrainedModel`]):
            The model that is being trained. If set and has the *prepare_decoder_input_ids_from_labels*, use it to
            prepare the *decoder_input_ids*
            This is useful when using *label_smoothing* to avoid calculating loss twice.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence
              is provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
              acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
              lengths).
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (`int`, *optional*):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        label_pad_token_id (`int`, *optional*, defaults to -100):
            The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
        return_tensors (`str`):
            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
    """

    tokenizer: PreTrainedTokenizerBase
    model: Optional[Any] = None
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    label_pad_token_id: int = -100
    return_tensors: str = "pt"

    def _pad_feature(self, features, key, pad_value):
        """Pad ``feature[key]`` of every feature, in place, to one common length.

        The common length is the longest entry, rounded up to a multiple of
        ``pad_to_multiple_of`` when that is set. Padding goes on the tokenizer's
        padding side. List entries stay lists; other entries are concatenated with
        NumPy and cast to int64.
        """
        target_length = max(len(feature[key]) for feature in features)
        if self.pad_to_multiple_of is not None:
            target_length = (
                    (target_length + self.pad_to_multiple_of - 1)
                    // self.pad_to_multiple_of
                    * self.pad_to_multiple_of
            )

        pad_right = self.tokenizer.padding_side == "right"
        for feature in features:
            remainder = [pad_value] * (target_length - len(feature[key]))
            if isinstance(feature[key], list):
                feature[key] = feature[key] + remainder if pad_right else remainder + feature[key]
            elif pad_right:
                feature[key] = np.concatenate([feature[key], remainder]).astype(np.int64)
            else:
                feature[key] = np.concatenate([remainder, feature[key]]).astype(np.int64)

    def __call__(self, features, return_tensors=None):
        """Collate `features` (a list of per-example dicts) into one padded batch.

        Note that the dicts in `features` are modified in place by the padding step.
        """
        if return_tensors is None:
            return_tensors = self.return_tensors
        labels = [feature["labels"] for feature in features] if "labels" in features[0].keys() else None
        # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
        # same length to return tensors.
        if labels is not None:
            self._pad_feature(features, "labels", self.label_pad_token_id)
        # Same for the speaker/utterance tags; batches without them are collated as usual.
        if "spk_utt_pos" in features[0].keys():
            self._pad_feature(features, "spk_utt_pos", 0)

        features = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=return_tensors,
        )

        # prepare decoder_input_ids
        if (
                labels is not None
                and self.model is not None
                and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
        ):
            decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(labels=features["labels"])
            features["decoder_input_ids"] = decoder_input_ids

        return features


In [None]:
from torch import nn
from transformers import BartForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers.modeling_utils import unwrap_model
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES

from transformers.models.bart.modeling_bart import BartConfig
import torch
from typing import *
from transformers.modeling_outputs import Seq2SeqLMOutput
from transformers.models.bart.modeling_bart import shift_tokens_right
import random
from tqdm import tqdm
import gc
import itertools

class BartWithSCL(BartForConditionalGeneration):
    def __init__(self, config: BartConfig):
        """Build the underlying BART model and give the SCL settings their default values.

        The defaults equal those of `set_losses_list` / `set_scl_coeff`, so both
        attributes exist even when the setters are never called.
        """
        super().__init__(config)
        self.SCLossesList = ['token']
        self.scl_coeff = 1e-1

    def set_losses_list(self, SCLossesList=['token']):
        """Store the SCL loss names in ``self.SCLossesList``.

        A copy is stored, so neither the shared default list nor the caller's list is
        aliased by the instance.
        """
        self.SCLossesList = list(SCLossesList)
    def set_scl_coeff(self, scl_coeff=1e-1):
        """Store `scl_coeff` (default 0.1) on the instance as ``self.scl_coeff``."""
        self.scl_coeff=scl_coeff
    def token_scl(self,
                  last_hidden_state: torch.FloatTensor,
                  spk_utt_pos: torch.LongTensor,
    ) -> torch.FloatTensor:
        r"""
        Token-level supervised contrastive loss.

        last_hidden_state (torch.FloatTensor) of shape (batch_size, sequence_length, n_dims):
            Output of the last layer of the encoder.
        spk_utt_pos (torch.LongTensor) of shape (batch_size, sequence_length,):
            Per-token tags: 0 at the first position and from the end of the dialogue on,
            a positive speaker id on speaker tokens, a negative value on utterance tokens.
        Returns:
            The loss summed over the batch and divided by batch_size. For every speaker with
            at least two utterances, one random pair of that speaker's utterances contributes
            -log(sigmoid(h_a . h_b)) summed over all token pairs. For every speaker, one random
            utterance paired with a random utterance of another speaker contributes
            -log(1 - sigmoid(h_a . h_b) + 1e-5) summed over all token pairs. Examples with fewer
            than two speakers (or no parsable turn) contribute nothing; if that is every example,
            the result is the plain number 0.0.
        """
        batch_scl = 0
        for i in range(len(spk_utt_pos)):
            batch_element = spk_utt_pos[i]
            # Parse the tag sequence into [start, end, speaker_id] spans, one spk/utt pair per turn.
            spk_utt_list = []
            spk_dict = {'start': 0, 'end': 0, 'spk_id': 0, 'bool': False}
            utt_dict = {'start': 0, 'end': 0, 'spk_id': 0, 'bool': False}
            for j in range(len(batch_element)):
                if batch_element[j] == 0 and j > 0:
                    # First 0 after position 0 ends the dialogue: close the last turn and stop.
                    utt_dict['end'] = j
                    utt_dict['bool'] = False
                    spk_utt_list.append({'spk': [spk_dict['start'], spk_dict['end'], spk_dict['spk_id']],
                                         'utt': [utt_dict['start'], utt_dict['end'], utt_dict['spk_id']]})
                    break
                if batch_element[j] > 0 and spk_dict['bool'] == False:
                    # A new speaker span starts: close the previous turn (none exists yet at j == 1).
                    utt_dict['end'] = j
                    utt_dict['bool'] = False
                    if j > 1:
                        spk_utt_list.append({'spk': [spk_dict['start'], spk_dict['end'], spk_dict['spk_id']],
                                             'utt': [utt_dict['start'], utt_dict['end'], utt_dict['spk_id']]})
                    spk_dict['start'] = j
                    spk_dict['bool'] = True
                    spk_dict['spk_id'] = batch_element[j]

                if batch_element[j] < 0 and spk_dict['bool'] == True:
                    # First utterance token after a speaker span: the utterance inherits the speaker id.
                    spk_dict['end'] = j
                    spk_dict['bool'] = False
                    utt_dict['spk_id'] = spk_dict['spk_id']
                    utt_dict['start'] = j
                    utt_dict['bool'] = True
            # uniq spks; skip examples without any parsed turn or without a real first speaker
            if not spk_utt_list or spk_utt_list[0]['spk'][2]==0:
                continue
            uniq_spks = list(set([int(dic['spk'][2]) for dic in spk_utt_list]))
            if len(uniq_spks)==1:
                continue
            # spk_utt_states: speaker id -> list of (n_tokens, n_dims) utterance states, in dialogue order
            spk_utt_states = {spk: [] for spk in uniq_spks}
            for dic in spk_utt_list:
                utt_spk = int(dic['utt'][2])
                if utt_spk in spk_utt_states:
                    spk_utt_states[utt_spk].append(last_hidden_state[i, dic['utt'][0]:dic['utt'][1]])

            # positive samples: two DIFFERENT utterances of the same speaker
            L_pos = 0
            for spk in uniq_spks:
                if len(spk_utt_states[spk]) > 1:
                    id1, id2 = random.sample(list(range(len(spk_utt_states[spk]))), 2)
                    mat_mul = torch.einsum('ij, kj->ik', spk_utt_states[spk][id1], spk_utt_states[spk][id2])
                    # logsigmoid is the numerically stable form of log(sigmoid(x)).
                    L_pos += torch.sum(-1 * torch.nn.functional.logsigmoid(mat_mul))
                    L_pos = torch.nan_to_num(L_pos, posinf = 1e10, neginf = -1e10)

            # negative samples: one utterance of this speaker against one of another speaker
            L_neg = 0
            for spk in uniq_spks:
                # Only speakers that actually own an utterance can be sampled.
                other_spks = [other for other in uniq_spks if other != spk and spk_utt_states[other]]
                if not spk_utt_states[spk] or not other_spks:
                    continue

                spk2 = random.choice(other_spks)

                id1 = random.randint(0, len(spk_utt_states[spk])-1)
                id2 = random.randint(0, len(spk_utt_states[spk2])-1)

                mat_mul = torch.einsum('ij, kj->ik', spk_utt_states[spk][id1], spk_utt_states[spk2][id2])
                sigm = torch.sigmoid(mat_mul)
                # 1e-5 keeps the log finite when sigmoid saturates at 1.
                log = torch.log(1 - sigm+1e-5)
                L_neg += torch.sum(-1 * log)

                L_neg = torch.nan_to_num(L_neg, posinf = 1e10, neginf = -1e10)

            batch_scl += L_pos
            batch_scl += L_neg
        batch_scl /= last_hidden_state.size(0)
        gc.collect()
        return batch_scl
    
    def turn_scl(self,
                  last_hidden_state: torch.FloatTensor,
                  spk_utt_pos: torch.LongTensor,
    ) -> torch.FloatTensor:
        r"""
        Turn-level supervised contrastive loss.

        last_hidden_state (torch.FloatTensor) of shape (batch_size, sequence_length, n_dims):
            Output of the last layer of the encoder.
        spk_utt_pos (torch.LongTensor) of shape (batch_size, sequence_length,):
            Per-token tags: 0 at the first position and from the end of the dialogue on,
            a positive speaker id on speaker tokens, a negative value on utterance tokens.
        Returns:
            The loss summed over the batch and divided by batch_size. Every utterance is
            mean-pooled over its tokens. For every speaker with at least two utterances, one
            random pair of that speaker's utterances contributes -log(sigmoid(u_a . u_b)). For
            every speaker, one random utterance paired with a random utterance of another speaker
            contributes -log(1 - sigmoid(u_a . u_b) + 1e-5). Examples with fewer than two speakers
            (or no parsable turn) contribute nothing; if that is every example, the result is the
            plain number 0.0.
        """
        batch_scl = 0
        for i in range(len(spk_utt_pos)):
            batch_element = spk_utt_pos[i]
            # Parse the tag sequence into [start, end, speaker_id] spans, one spk/utt pair per turn.
            spk_utt_list = []
            spk_dict = {'start': 0, 'end': 0, 'spk_id': 0, 'bool': False}
            utt_dict = {'start': 0, 'end': 0, 'spk_id': 0, 'bool': False}
            for j in range(len(batch_element)):
                if batch_element[j] == 0 and j > 0:
                    # First 0 after position 0 ends the dialogue: close the last turn and stop.
                    utt_dict['end'] = j
                    utt_dict['bool'] = False
                    spk_utt_list.append({'spk': [spk_dict['start'], spk_dict['end'], spk_dict['spk_id']],
                                         'utt': [utt_dict['start'], utt_dict['end'], utt_dict['spk_id']]})
                    break
                if batch_element[j] > 0 and spk_dict['bool'] == False:
                    # A new speaker span starts: close the previous turn (none exists yet at j == 1).
                    utt_dict['end'] = j
                    utt_dict['bool'] = False
                    if j > 1:
                        spk_utt_list.append({'spk': [spk_dict['start'], spk_dict['end'], spk_dict['spk_id']],
                                             'utt': [utt_dict['start'], utt_dict['end'], utt_dict['spk_id']]})
                    spk_dict['start'] = j
                    spk_dict['bool'] = True
                    spk_dict['spk_id'] = batch_element[j]

                if batch_element[j] < 0 and spk_dict['bool'] == True:
                    # First utterance token after a speaker span: the utterance inherits the speaker id.
                    spk_dict['end'] = j
                    spk_dict['bool'] = False
                    utt_dict['spk_id'] = spk_dict['spk_id']
                    utt_dict['start'] = j
                    utt_dict['bool'] = True
            # uniq spks; skip examples without any parsed turn or without a real first speaker
            if not spk_utt_list or spk_utt_list[0]['spk'][2]==0:
                continue
            uniq_spks = list(set([int(dic['spk'][2]) for dic in spk_utt_list]))
            if len(uniq_spks)==1:
                continue
            # spk_utt_states: speaker id -> list of mean-pooled (n_dims,) utterance vectors, in dialogue order
            spk_utt_states = {spk: [] for spk in uniq_spks}
            for dic in spk_utt_list:
                utt_spk = int(dic['utt'][2])
                if utt_spk in spk_utt_states:
                    mean_pool = torch.mean(last_hidden_state[i, dic['utt'][0]:dic['utt'][1]], 0)
                    spk_utt_states[utt_spk].append(mean_pool)

            # positive samples: two DIFFERENT utterances of the same speaker
            L_pos = 0
            for spk in uniq_spks:
                if len(spk_utt_states[spk]) > 1:
                    id1, id2 = random.sample(list(range(len(spk_utt_states[spk]))), 2)
                    # torch.dot gives the inner product; einsum('i, j->') would give sum(a) * sum(b).
                    mat_mul = torch.dot(spk_utt_states[spk][id1], spk_utt_states[spk][id2])
                    # logsigmoid is the numerically stable form of log(sigmoid(x)).
                    L_pos += torch.sum(-1 * torch.nn.functional.logsigmoid(mat_mul))

            # negative samples: one utterance of this speaker against one of another speaker
            L_neg = 0
            for spk in uniq_spks:
                # Only speakers that actually own an utterance can be sampled.
                other_spks = [other for other in uniq_spks if other != spk and spk_utt_states[other]]
                if not spk_utt_states[spk] or not other_spks:
                    continue

                spk2 = random.choice(other_spks)

                id1 = random.randint(0, len(spk_utt_states[spk])-1)
                id2 = random.randint(0, len(spk_utt_states[spk2])-1)

                mat_mul = torch.dot(spk_utt_states[spk][id1], spk_utt_states[spk2][id2])
                sigm = torch.sigmoid(mat_mul)
                # 1e-5 keeps the log finite when sigmoid saturates at 1.
                log = torch.log(1 - sigm+1e-5)
                L_neg += torch.sum(-1 * log)

            batch_scl += L_pos
            batch_scl += L_neg
        batch_scl /= last_hidden_state.size(0)
        gc.collect()
        return batch_scl
    
    def global_scl(self,
                  last_hidden_state: torch.FloatTensor,
                  spk_utt_pos: torch.LongTensor,
    ) -> torch.FloatTensor:
        r"""
        Global (speaker-centroid) supervised contrastive loss.

        last_hidden_state (torch.FloatTensor) of shape (batch_size, sequence_length, n_dims):
            Output of the last layer of the encoder.
        spk_utt_pos (torch.LongTensor) of shape (batch_size, sequence_length,):
            Per-token tags: 0 at the first position and from the end of the dialogue on,
            a positive speaker id on speaker tokens, a negative value on utterance tokens.
        Returns:
            The loss summed over the batch and divided by batch_size. Every utterance is
            mean-pooled over its tokens. For every speaker with at least two utterances, one
            utterance u is held out and c is the mean of that speaker's remaining utterances:
            the positive term is -log(sigmoid(u . c)) and the negative term is
            -log(1 - sigmoid(v . c) + 1e-5) for a random utterance v of another speaker.
            Examples with fewer than two speakers (or no parsable turn) contribute nothing; if
            that is every example, the result is the plain number 0.0.
        """
        batch_scl = 0
        for i in range(len(spk_utt_pos)):
            batch_element = spk_utt_pos[i]
            # Parse the tag sequence into [start, end, speaker_id] spans, one spk/utt pair per turn.
            spk_utt_list = []
            spk_dict = {'start': 0, 'end': 0, 'spk_id': 0, 'bool': False}
            utt_dict = {'start': 0, 'end': 0, 'spk_id': 0, 'bool': False}
            for j in range(len(batch_element)):
                if batch_element[j] == 0 and j > 0:
                    # First 0 after position 0 ends the dialogue: close the last turn and stop.
                    utt_dict['end'] = j
                    utt_dict['bool'] = False
                    spk_utt_list.append({'spk': [spk_dict['start'], spk_dict['end'], spk_dict['spk_id']],
                                         'utt': [utt_dict['start'], utt_dict['end'], utt_dict['spk_id']]})
                    break
                if batch_element[j] > 0 and spk_dict['bool'] == False:
                    # A new speaker span starts: close the previous turn (none exists yet at j == 1).
                    utt_dict['end'] = j
                    utt_dict['bool'] = False
                    if j > 1:
                        spk_utt_list.append({'spk': [spk_dict['start'], spk_dict['end'], spk_dict['spk_id']],
                                             'utt': [utt_dict['start'], utt_dict['end'], utt_dict['spk_id']]})
                    spk_dict['start'] = j
                    spk_dict['bool'] = True
                    spk_dict['spk_id'] = batch_element[j]

                if batch_element[j] < 0 and spk_dict['bool'] == True:
                    # First utterance token after a speaker span: the utterance inherits the speaker id.
                    spk_dict['end'] = j
                    spk_dict['bool'] = False
                    utt_dict['spk_id'] = spk_dict['spk_id']
                    utt_dict['start'] = j
                    utt_dict['bool'] = True
            # uniq spks; skip examples without any parsed turn or without a real first speaker
            if not spk_utt_list or spk_utt_list[0]['spk'][2]==0:
                continue
            uniq_spks = list(set([int(dic['spk'][2]) for dic in spk_utt_list]))
            if len(uniq_spks)==1:
                continue
            # spk_utt_states: speaker id -> list of mean-pooled (n_dims,) utterance vectors, in dialogue order
            spk_utt_states = {spk: [] for spk in uniq_spks}
            for dic in spk_utt_list:
                utt_spk = int(dic['utt'][2])
                if utt_spk in spk_utt_states:
                    mean_pool = torch.mean(last_hidden_state[i, dic['utt'][0]:dic['utt'][1]], 0)
                    spk_utt_states[utt_spk].append(mean_pool)

            L_pos = 0
            L_neg = 0
            for spk in uniq_spks:
                if len(spk_utt_states[spk]) > 1:
                    # positive sample: a held-out utterance against the mean of the speaker's other utterances
                    ids = random.choice(list(range(len(spk_utt_states[spk]))))

                    spk_mean_exc = torch.mean(torch.vstack([spk_utt_states[spk][temp] for temp in range(len(spk_utt_states[spk])) if temp != ids]), 0)

                    # torch.dot gives the inner product; einsum('i, j->') would give sum(a) * sum(b).
                    pos_mat_mul = torch.dot(spk_utt_states[spk][ids], spk_mean_exc)
                    # logsigmoid is the numerically stable form of log(sigmoid(x)).
                    L_pos += torch.sum(-1 * torch.nn.functional.logsigmoid(pos_mat_mul))

                    # negative sample: another speaker's utterance against the same mean;
                    # only speakers that actually own an utterance can be sampled.
                    other_spks = [other for other in uniq_spks if other != spk and spk_utt_states[other]]
                    if not other_spks:
                        continue

                    spk2 = random.choice(other_spks)
                    id_neg = random.choice(list(range(len(spk_utt_states[spk2]))))
                    neg_mat_mul = torch.dot(spk_utt_states[spk2][id_neg], spk_mean_exc)
                    neg_sigm = torch.sigmoid(neg_mat_mul)
                    # 1e-5 keeps the log finite when sigmoid saturates at 1.
                    neg_log = torch.log(1 - neg_sigm+1e-5)
                    L_neg += torch.sum(-1 * neg_log)

            batch_scl += L_pos
            batch_scl += L_neg
        batch_scl /= last_hidden_state.size(0)
        gc.collect()
        return batch_scl

    def forward(
            self,
            input_ids: torch.LongTensor = None,
            attention_mask: Optional[torch.Tensor] = None,
            spk_utt_pos: Optional[torch.Tensor] = None,  # extra input consumed by the contrastive losses
            decoder_input_ids: Optional[torch.LongTensor] = None,
            decoder_attention_mask: Optional[torch.LongTensor] = None,
            head_mask: Optional[torch.Tensor] = None,
            decoder_head_mask: Optional[torch.Tensor] = None,
            cross_attn_head_mask: Optional[torch.Tensor] = None,
            encoder_outputs: Optional[List[torch.FloatTensor]] = None,
            past_key_values: Optional[List[torch.FloatTensor]] = None,
            inputs_embeds: Optional[torch.FloatTensor] = None,
            decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
            labels: Optional[torch.LongTensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Seq2SeqLMOutput]:
        r"""
        Run the seq2seq model and, when `labels` are given, add the contrastive terms selected in
        `self.SCLossesList` ('token', 'turn', 'global') to the language-modeling loss, weighted by
        `self.scl_coeff`.

        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        spk_utt_pos (`torch.Tensor`, *optional*):
            Passed unchanged to `token_scl` / `turn_scl` / `global_scl`. For the 'turn' and 'global' terms,
            positions where `spk_utt_pos > 0` are additionally removed from the encoder attention mask.
            Presumably these mark speaker-name tokens and the tensor is shaped like `attention_mask`;
            confirm against the data collator.

        The contrastive terms are computed from extra encoder passes over the raw inputs, so they are only
        added when `labels` is given and `encoder_outputs` is not supplied by the caller. Otherwise the
        returned loss (if any) is the language-modeling loss alone.

        Returns:
            A `Seq2SeqLMOutput`, or a plain tuple when `return_dict` is false, whose loss (first element,
            when `labels` is given) is `lm_loss + self.scl_coeff * contrastive_loss`.

        Raises:
            ValueError: if a 'turn' or 'global' term has to be computed but `spk_utt_pos` is `None`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if use_cache:
                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            use_cache = False
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # The auxiliary encoder passes only feed the loss, so they are skipped when there are no labels,
        # and they re-encode the raw inputs, so they are skipped when the caller passed `encoder_outputs`.
        compute_scl = labels is not None and encoder_outputs is None
        use_token = compute_scl and 'token' in self.SCLossesList
        use_turn = compute_scl and 'turn' in self.SCLossesList
        use_global = compute_scl and 'global' in self.SCLossesList

        # Always bound, so the loss section below can test them instead of hitting a NameError.
        token_hidden = None
        masked_hidden = None

        if use_token or use_turn or use_global:
            encoder = self.get_encoder()
            shared_encoder_kwargs = dict(
                input_ids=input_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

            if use_token:
                # Index 0 is the last hidden state for both the tuple and the ModelOutput return forms.
                token_hidden = encoder(attention_mask=attention_mask, **shared_encoder_kwargs)[0]

            if use_turn or use_global:
                if spk_utt_pos is None:
                    raise ValueError(
                        "`spk_utt_pos` is required to compute the 'turn'/'global' contrastive losses."
                    )
                # With no attention mask every position is attended to; this assumes spk_utt_pos has the
                # shape an attention mask would have - TODO confirm against the collator.
                base_mask = attention_mask if attention_mask is not None else torch.ones_like(spk_utt_pos)
                # Hide the positions flagged by spk_utt_pos from the encoder for the turn/global views.
                tog_attention_mask = torch.where(spk_utt_pos > 0, 0, base_mask)
                masked_hidden = encoder(attention_mask=tog_attention_mask, **shared_encoder_kwargs)[0]

        lm_logits = self.lm_head(outputs[0])
        lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        # Sum of the enabled contrastive terms; stays 0 when none could be computed.
        sc_loss = 0
        if token_hidden is not None:
            sc_loss += self.token_scl(last_hidden_state=token_hidden, spk_utt_pos=spk_utt_pos)
        if masked_hidden is not None:
            if use_turn:
                sc_loss += self.turn_scl(last_hidden_state=masked_hidden, spk_utt_pos=spk_utt_pos)
            if use_global:
                sc_loss += self.global_scl(last_hidden_state=masked_hidden, spk_utt_pos=spk_utt_pos)

        loss = None
        if masked_lm_loss is not None:
            loss = masked_lm_loss + (self.scl_coeff * sc_loss)

        if not return_dict:
            output = (lm_logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


### Fine-tuning the model

In [None]:
# from models import BartWithSCL
# from datacollator import CustomCollatorForSeq2Seq
# from trainer import CustomTrainer


from transformers import BartForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers.modeling_utils import unwrap_model
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES

In [None]:
# Contrastive-loss settings for this run, named in one place so that changing them between
# experiments is an explicit, visible edit rather than a tweak to inline literals.
SCL_LOSSES = ['turn', 'token', 'global']  # terms that BartWithSCL.forward looks up by name
SCL_COEFF = 1  # presumably the weight forward() applies to the summed contrastive loss

model = BartWithSCL.from_pretrained(model_checkpoint)
model.set_losses_list(SCL_LOSSES)
model.set_scl_coeff(SCL_COEFF)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--bart-base/snapshots/aadd2ab0ae0c8268c7c9693540e9904811f36177/config.json
Model config BartConfig {
  "_name_or_path": "bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2

In [None]:
# Per-device batch size, shared by the training and evaluation loops.
batch_size = 4

args = Seq2SeqTrainingArguments(
    output_dir="test-joint2batch3c1",
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=2,  # effective train batch = batch_size * 2 per device
    fp16=True,
    # --- evaluation and logging ---
    evaluation_strategy="epoch",
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,  # evaluate on generated summaries so ROUGE can be computed
    logging_steps=10,
    # --- data handling ---
    # Keep every dataset column instead of letting the Trainer drop the ones it does not recognise.
    remove_unused_columns=False,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Batch builder handed to the trainer. CustomCollatorForSeq2Seq is defined elsewhere in this
# notebook; presumably it also batches `spk_utt_pos`, which the model's forward() accepts - TODO confirm.
data_collator = CustomCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import nltk
import numpy as np
import torch
torch.cuda.empty_cache()
def compute_metrics(eval_pred, max_examples_to_print=50):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. ``-100`` entries in
            either array are treated as padding.
        max_examples_to_print: Upper bound on how many prediction/reference pairs are
            printed for inspection. Fewer are printed when the evaluation set is smaller.

    Returns:
        Dict mapping each ROUGE key to its mid F-measure scaled to 0-100, plus ``gen_len``
        (mean number of non-padding tokens per prediction); all values rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # -100 is a padding marker, not a vocabulary id, so it cannot be decoded; map it to the
    # pad token in both arrays before decoding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Print a bounded sample for eyeballing; never index past the end of a small eval set.
    for i in range(min(max_examples_to_print, len(decoded_preds))):
        print("-----------",i,"--------------")
        print("------>Predictions by Model")
        print(decoded_preds[i])
        print("----->Predictions Original")
        print(decoded_labels[i])
        print("**************************")

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each ROUGE variant, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (padding excluded)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Wire the model, training arguments, datasets, collator and metric function together.
# CustomTrainer is defined elsewhere in this notebook; its constructor is called here with the
# model and args positionally and everything else by keyword.
trainer = CustomTrainer(
    model,
    args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_val,  # used for the per-epoch evaluation and trainer.evaluate()
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # decodes predictions and reports ROUGE + gen_len
)

Using cuda_amp half precision backend


In [None]:
import nltk
# compute_metrics calls nltk.sent_tokenize, which needs the 'punkt' tokenizer data;
# this must run before the first evaluation.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Author's run note: "1" (presumably scl_coeff=1), losses turn+token+global, per-device batch size 3.
# NOTE(review): the log below reports batch size 3 and 2 epochs, while the args cell as saved sets
# batch_size = 4 and num_train_epochs=5 - the output comes from an earlier kernel state.
trainer.train()

***** Running training *****
  Num examples = 14730
  Num Epochs = 2
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 2
  Total optimization steps = 4910
  Number of trainable parameters = 139420416
You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,49.0013,149.014801,42.3925,18.3488,34.5186,38.6423,17.846


Saving model checkpoint to test-joint2batch3c1/checkpoint-500
Configuration saved in test-joint2batch3c1/checkpoint-500/config.json
Model weights saved in test-joint2batch3c1/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-joint2batch3c1/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-joint2batch3c1/checkpoint-500/special_tokens_map.json
Saving model checkpoint to test-joint2batch3c1/checkpoint-1000
Configuration saved in test-joint2batch3c1/checkpoint-1000/config.json
Model weights saved in test-joint2batch3c1/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-joint2batch3c1/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test-joint2batch3c1/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to test-joint2batch3c1/checkpoint-1500
Configuration saved in test-joint2batch3c1/checkpoint-1500/config.json
Model weights saved in test-joint2batch3c1/checkpoint-1500/pytorch_model.bin
tokenizer config

----------- 0 --------------
------>Predictions by Model
A wants to go with Tom to the animal shelter tomorrow.
He wants to get a
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
----------- 1 --------------
------>Predictions by Model
Emma wants to buy a new advent calendar for her kids.
Rob and Lauren are
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
----------- 2 --------------
------>Predictions by Model
Jackie is pregnant with Madison.
She doesn't want to talk about it.
She
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
********************

Saving model checkpoint to test-joint2batch3c1/checkpoint-2500
Configuration saved in test-joint2batch3c1/checkpoint-2500/config.json
Model weights saved in test-joint2batch3c1/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in test-joint2batch3c1/checkpoint-2500/tokenizer_config.json
Special tokens file saved in test-joint2batch3c1/checkpoint-2500/special_tokens_map.json
Saving model checkpoint to test-joint2batch3c1/checkpoint-3000
Configuration saved in test-joint2batch3c1/checkpoint-3000/config.json
Model weights saved in test-joint2batch3c1/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in test-joint2batch3c1/checkpoint-3000/tokenizer_config.json
Special tokens file saved in test-joint2batch3c1/checkpoint-3000/special_tokens_map.json


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,49.0013,149.014801,42.3925,18.3488,34.5186,38.6423,17.846
2,183.944,101.739494,43.2962,19.8798,35.7604,39.4281,17.3386


Saving model checkpoint to test-joint2batch3c1/checkpoint-3500
Configuration saved in test-joint2batch3c1/checkpoint-3500/config.json
Model weights saved in test-joint2batch3c1/checkpoint-3500/pytorch_model.bin
tokenizer config file saved in test-joint2batch3c1/checkpoint-3500/tokenizer_config.json
Special tokens file saved in test-joint2batch3c1/checkpoint-3500/special_tokens_map.json
Saving model checkpoint to test-joint2batch3c1/checkpoint-4000
Configuration saved in test-joint2batch3c1/checkpoint-4000/config.json
Model weights saved in test-joint2batch3c1/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in test-joint2batch3c1/checkpoint-4000/tokenizer_config.json
Special tokens file saved in test-joint2batch3c1/checkpoint-4000/special_tokens_map.json
Saving model checkpoint to test-joint2batch3c1/checkpoint-4500
Configuration saved in test-joint2batch3c1/checkpoint-4500/config.json
Model weights saved in test-joint2batch3c1/checkpoint-4500/pytorch_model.bin
tokenizer c

----------- 0 --------------
------>Predictions by Model
A wants to get a puppy for her son.
She will take him to the animal
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
----------- 1 --------------
------>Predictions by Model
Rob and Lauren are going to buy an advent calendar for their kids.
Emma and Rob
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
----------- 2 --------------
------>Predictions by Model
Madison is pregnant.
She is taking every commitment seriously.
She wants an abortion.
She
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=4910, training_loss=266.5653718408402, metrics={'train_runtime': 9462.5422, 'train_samples_per_second': 3.113, 'train_steps_per_second': 0.519, 'total_flos': 4489631853557760.0, 'train_loss': 266.5653718408402, 'epoch': 2.0})

In [None]:
# Author's run note: "10" (presumably scl_coeff=10), losses turn+token+global, per-device batch size 3.
# NOTE(review): reaching this configuration requires editing and re-running the model/args cells
# above; the saved cells do not reproduce it as-is.
trainer.train()

***** Running training *****
  Num examples = 14730
  Num Epochs = 2
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 2
  Total optimization steps = 4910
  Number of trainable parameters = 139420416
You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,444.943,1512.910767,26.1159,4.9683,20.9895,23.5482,16.379
2,1447.518,940.1474,26.6048,5.4063,21.2669,23.796,15.9474


Saving model checkpoint to test-joint2batch3c10/checkpoint-500
Configuration saved in test-joint2batch3c10/checkpoint-500/config.json
Model weights saved in test-joint2batch3c10/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-joint2batch3c10/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-joint2batch3c10/checkpoint-500/special_tokens_map.json
Saving model checkpoint to test-joint2batch3c10/checkpoint-1000
Configuration saved in test-joint2batch3c10/checkpoint-1000/config.json
Model weights saved in test-joint2batch3c10/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-joint2batch3c10/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test-joint2batch3c10/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to test-joint2batch3c10/checkpoint-1500
Configuration saved in test-joint2batch3c10/checkpoint-1500/config.json
Model weights saved in test-joint2batch3c10/checkpoint-1500/pytorch_model.bin
tok

----------- 0 --------------
------>Predictions by Model
Benny is going to eat a puppy for her birthday.
She will bring her puppy
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
----------- 1 --------------
------>Predictions by Model
Emma and Lauren are going to celebrate the anniversary of their wedding.
They are going
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
----------- 2 --------------
------>Predictions by Model
Jackie is worried about her daughter's pregnancy.
She wants to talk to her parents
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was preg

Saving model checkpoint to test-joint2batch3c10/checkpoint-2500
Configuration saved in test-joint2batch3c10/checkpoint-2500/config.json
Model weights saved in test-joint2batch3c10/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in test-joint2batch3c10/checkpoint-2500/tokenizer_config.json
Special tokens file saved in test-joint2batch3c10/checkpoint-2500/special_tokens_map.json
Saving model checkpoint to test-joint2batch3c10/checkpoint-3000
Configuration saved in test-joint2batch3c10/checkpoint-3000/config.json
Model weights saved in test-joint2batch3c10/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in test-joint2batch3c10/checkpoint-3000/tokenizer_config.json
Special tokens file saved in test-joint2batch3c10/checkpoint-3000/special_tokens_map.json
Saving model checkpoint to test-joint2batch3c10/checkpoint-3500
Configuration saved in test-joint2batch3c10/checkpoint-3500/config.json
Model weights saved in test-joint2batch3c10/checkpoint-3500/pytorch_model.bi

----------- 0 --------------
------>Predictions by Model
Beth will bring the puppy to the puppy.
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
----------- 1 --------------
------>Predictions by Model
Emma and Lauren are going to celebrate the anniversary of the wedding.
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
----------- 2 --------------
------>Predictions by Model
Jackie is going to marry Martha.
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
**************************
----------- 3 --------------
------>Predictions by Model
Marla is l



Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=4910, training_loss=2774.420476281813, metrics={'train_runtime': 9417.6812, 'train_samples_per_second': 3.128, 'train_steps_per_second': 0.521, 'total_flos': 4489631853557760.0, 'train_loss': 2774.420476281813, 'epoch': 2.0})

In [None]:
# Author's run note: "0.1" (presumably scl_coeff=0.1), losses turn+token+global.
# NOTE(review): reaching this configuration requires editing and re-running the model/args cells
# above; the saved cells do not reproduce it as-is.
trainer.train()

***** Running training *****
  Num examples = 14730
  Num Epochs = 2
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 2
  Total optimization steps = 4910
  Number of trainable parameters = 139420416
You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,4.2725,13.099684,46.7574,23.4471,39.2711,43.2073,18.2274
2,6.3249,6.558846,47.08,24.177,40.3434,43.6921,17.6785


Saving model checkpoint to test-jointbatch6/checkpoint-500
Configuration saved in test-jointbatch6/checkpoint-500/config.json
Model weights saved in test-jointbatch6/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-jointbatch6/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-jointbatch6/checkpoint-500/special_tokens_map.json
Saving model checkpoint to test-jointbatch6/checkpoint-1000
Configuration saved in test-jointbatch6/checkpoint-1000/config.json
Model weights saved in test-jointbatch6/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-jointbatch6/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test-jointbatch6/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to test-jointbatch6/checkpoint-1500
Configuration saved in test-jointbatch6/checkpoint-1500/config.json
Model weights saved in test-jointbatch6/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in test-jointbatch6/checkpo

----------- 0 --------------
------>Predictions by Model
A wants to get a puppy for her son.
She will take him to the animal
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
----------- 1 --------------
------>Predictions by Model
Emma wants to buy an advent calendar for her kids.
Rob, Rob and Lauren
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
----------- 2 --------------
------>Predictions by Model
Madison is pregnant.
She doesn't want to talk about it because she's worried about
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
****************

Saving model checkpoint to test-jointbatch6/checkpoint-2500
Configuration saved in test-jointbatch6/checkpoint-2500/config.json
Model weights saved in test-jointbatch6/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in test-jointbatch6/checkpoint-2500/tokenizer_config.json
Special tokens file saved in test-jointbatch6/checkpoint-2500/special_tokens_map.json
Saving model checkpoint to test-jointbatch6/checkpoint-3000
Configuration saved in test-jointbatch6/checkpoint-3000/config.json
Model weights saved in test-jointbatch6/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in test-jointbatch6/checkpoint-3000/tokenizer_config.json
Special tokens file saved in test-jointbatch6/checkpoint-3000/special_tokens_map.json
Saving model checkpoint to test-jointbatch6/checkpoint-3500
Configuration saved in test-jointbatch6/checkpoint-3500/config.json
Model weights saved in test-jointbatch6/checkpoint-3500/pytorch_model.bin
tokenizer config file saved in test-jointbatch6/ch

----------- 0 --------------
------>Predictions by Model
A wants to get a puppy for her son.
She will take him to the animal
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
----------- 1 --------------
------>Predictions by Model
Emma wants to buy an advent calendar for her kids.
Rob and Lauren like it
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
----------- 2 --------------
------>Predictions by Model
Madison is pregnant.
She doesn't want to talk about it because she's worried about
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
*************



Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=4910, training_loss=5.850273959748371, metrics={'train_runtime': 9968.9159, 'train_samples_per_second': 2.955, 'train_steps_per_second': 0.493, 'total_flos': 4493420674928640.0, 'train_loss': 5.850273959748371, 'epoch': 2.0})

In [None]:
# Evaluate the current trainer on the eval_dataset it was constructed with; the returned dict
# carries the loss plus the metrics from compute_metrics, prefixed with "eval_".
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 818
  Batch size = 3
You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


-------------------------
------>Predictions by Model
A wants to get a puppy for his son.
He wants to name it Lemmy
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants to buy an advent calendar for her kids.
Rob doesn't like the
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Jackie is pregnant with Madison's child.
She doesn't want to talk about it
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
**************************
-----------

{'eval_loss': 9.662054061889648,
 'eval_rouge1': 42.9668,
 'eval_rouge2': 19.5059,
 'eval_rougeL': 35.4994,
 'eval_rougeLsum': 39.1085,
 'eval_gen_len': 16.6002,
 'eval_runtime': 313.8526,
 'eval_samples_per_second': 2.606,
 'eval_steps_per_second': 0.87}

In [None]:
# Author's run note: "1" (presumably scl_coeff=1) with the 'turn' loss only.
# NOTE(review): the log below saves to "test-tokenbatch6", which suggests the 'token' loss instead -
# confirm which loss list this run actually used.
trainer.train()

***** Running training *****
  Num examples = 14730
  Num Epochs = 5
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 2
  Total optimization steps = 6135
  Number of trainable parameters = 139420416
You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
0,3.8408,3.806844,46.0617,23.3331,38.5567,42.5037,18.099
1,3.7812,3.588556,46.6812,23.9701,39.7915,43.322,17.7848
2,3.8944,3.551383,47.6012,24.41,40.2943,44.1606,18.1112
3,3.4776,3.566261,47.8461,24.7176,40.3763,44.1576,18.2017
4,3.2812,3.505859,48.0901,24.9524,40.7897,44.4671,18.1699


Saving model checkpoint to test-tokenbatch6/checkpoint-500
Configuration saved in test-tokenbatch6/checkpoint-500/config.json
Model weights saved in test-tokenbatch6/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-500/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-1000
Configuration saved in test-tokenbatch6/checkpoint-1000/config.json
Model weights saved in test-tokenbatch6/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 818
  Batch size = 6


-------------------------
------>Predictions by Model
A wants to get a puppy for her son.
She will take him to the animal
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants to buy an advent calendar for her kids.
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Madison is pregnant.
She doesn't want to talk about it because she has money problems
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
**************************
---------------

Saving model checkpoint to test-tokenbatch6/checkpoint-1500
Configuration saved in test-tokenbatch6/checkpoint-1500/config.json
Model weights saved in test-tokenbatch6/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-1500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-1500/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-2000
Configuration saved in test-tokenbatch6/checkpoint-2000/config.json
Model weights saved in test-tokenbatch6/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-2000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-2000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 818
  Batch size = 6


-------------------------
------>Predictions by Model
A wants to get a puppy for her son.
She will take him to the animal
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants to buy an advent calendar for her kids.
Rob used to get one
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Madison is pregnant.
Madison doesn't want to talk about it.
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
**************************
---------------------

Saving model checkpoint to test-tokenbatch6/checkpoint-2500
Configuration saved in test-tokenbatch6/checkpoint-2500/config.json
Model weights saved in test-tokenbatch6/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-2500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-2500/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-3000
Configuration saved in test-tokenbatch6/checkpoint-3000/config.json
Model weights saved in test-tokenbatch6/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-3000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-3000/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-3500
Configuration saved in test-tokenbatch6/checkpoint-3500/config.json
Model weights saved in test-tokenbatch6/checkpoint-3500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/ch

-------------------------
------>Predictions by Model
A wants to get a puppy for her son.
She will take him to the animal
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants an advent calendar for her kids.
Rob used to get one every year
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Madison is pregnant.
She doesn't want to talk about it because she's worried about
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
*********************

Saving model checkpoint to test-tokenbatch6/checkpoint-4000
Configuration saved in test-tokenbatch6/checkpoint-4000/config.json
Model weights saved in test-tokenbatch6/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-4000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-4000/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-4500
Configuration saved in test-tokenbatch6/checkpoint-4500/config.json
Model weights saved in test-tokenbatch6/checkpoint-4500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-4500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-4500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 818
  Batch size = 6


-------------------------
------>Predictions by Model
A wants to get a puppy for her son.
She took him to the animal shelter
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants an advent calendar for her kids.
Rob used to get one every year
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Madison is pregnant.
Madison doesn't want to talk about it because she's worried about
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
**************

Saving model checkpoint to test-tokenbatch6/checkpoint-5000
Configuration saved in test-tokenbatch6/checkpoint-5000/config.json
Model weights saved in test-tokenbatch6/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-5000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-5000/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-5500
Configuration saved in test-tokenbatch6/checkpoint-5500/config.json
Model weights saved in test-tokenbatch6/checkpoint-5500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-5500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-5500/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-6000
Configuration saved in test-tokenbatch6/checkpoint-6000/config.json
Model weights saved in test-tokenbatch6/checkpoint-6000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/ch

-------------------------
------>Predictions by Model
A wants to get a puppy for her son.
She will take him to the animal
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants to buy an advent calendar for her kids.
Rob used to get one
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Madison is pregnant.
Madison doesn't want to talk about it because she's worried about
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
*********************



Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=6135, training_loss=3.834668092642165, metrics={'train_runtime': 8826.7114, 'train_samples_per_second': 8.344, 'train_steps_per_second': 0.695, 'total_flos': 1.408779533371392e+16, 'train_loss': 3.834668092642165, 'epoch': 5.0})

In [None]:
# Launch a full fine-tuning run with the `trainer` configured in an earlier cell;
# the returned TrainOutput is displayed as the cell result.
# "10 turn" is the author's label for this experiment.
# NOTE(review): presumably a turn-level auxiliary loss with weight 10 — confirm
# against the trainer / loss definition, which is not shown in this cell.
# NOTE(review): the log below writes checkpoints to test-tokenbatch6/, the same
# directory used by the other runs in this notebook, so checkpoints from
# different experiments share paths — consider a distinct output_dir per run.
trainer.train() #10 turn

***** Running training *****
  Num examples = 14730
  Num Epochs = 5
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 2
  Total optimization steps = 6135
  Number of trainable parameters = 139420416
You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
0,18.7632,20.683386,47.3007,23.9175,39.7057,43.3966,18.4218
1,20.2536,19.311647,47.8681,24.7162,40.3091,44.2441,18.1919
2,20.7405,19.342619,47.7472,24.873,40.5831,44.0438,18.1748
3,16.4131,19.211998,48.1637,25.0107,40.6033,44.2349,18.3557
4,18.3121,19.103617,48.1236,24.7877,40.5943,44.3159,18.302


Saving model checkpoint to test-tokenbatch6/checkpoint-500
Configuration saved in test-tokenbatch6/checkpoint-500/config.json
Model weights saved in test-tokenbatch6/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-500/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-1000
Configuration saved in test-tokenbatch6/checkpoint-1000/config.json
Model weights saved in test-tokenbatch6/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 818
  Batch size = 6


-------------------------
------>Predictions by Model
A wants to get a puppy for her son.
She will take him to the animal
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants to buy an advent calendar for her kids.
Rob used to get one
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Madison is pregnant but she doesn't want to talk about it.
Iggy's friend
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
**************************
--------

Saving model checkpoint to test-tokenbatch6/checkpoint-1500
Configuration saved in test-tokenbatch6/checkpoint-1500/config.json
Model weights saved in test-tokenbatch6/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-1500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-1500/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-2000
Configuration saved in test-tokenbatch6/checkpoint-2000/config.json
Model weights saved in test-tokenbatch6/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-2000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-2000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 818
  Batch size = 6


-------------------------
------>Predictions by Model
A will go to the animal shelter tomorrow afternoon to get a puppy for her son.
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma is looking for an advent calendar for her children.
Rob used to get one
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Madison is pregnant but she doesn't want to talk about it.
Iggy's friend
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
******************

Saving model checkpoint to test-tokenbatch6/checkpoint-2500
Configuration saved in test-tokenbatch6/checkpoint-2500/config.json
Model weights saved in test-tokenbatch6/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-2500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-2500/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-3000
Configuration saved in test-tokenbatch6/checkpoint-3000/config.json
Model weights saved in test-tokenbatch6/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-3000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-3000/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-3500
Configuration saved in test-tokenbatch6/checkpoint-3500/config.json
Model weights saved in test-tokenbatch6/checkpoint-3500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/ch

-------------------------
------>Predictions by Model
A is going to the animal shelter tomorrow afternoon.
She wants to get a puppy for
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma is looking for an advent calendar for her children.
Rob used to get one
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Madison is pregnant but she doesn't want to talk about it.
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
**************************
--

Saving model checkpoint to test-tokenbatch6/checkpoint-4000
Configuration saved in test-tokenbatch6/checkpoint-4000/config.json
Model weights saved in test-tokenbatch6/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-4000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-4000/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-4500
Configuration saved in test-tokenbatch6/checkpoint-4500/config.json
Model weights saved in test-tokenbatch6/checkpoint-4500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-4500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-4500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 818
  Batch size = 6


-------------------------
------>Predictions by Model
A will go to the animal shelter tomorrow afternoon to get a puppy for her son.
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants to buy an advent calendar for her children.
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Madison is pregnant but she doesn't want to talk about it.
Iggy's friend
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
**************************
-------------

Saving model checkpoint to test-tokenbatch6/checkpoint-5000
Configuration saved in test-tokenbatch6/checkpoint-5000/config.json
Model weights saved in test-tokenbatch6/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-5000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-5000/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-5500
Configuration saved in test-tokenbatch6/checkpoint-5500/config.json
Model weights saved in test-tokenbatch6/checkpoint-5500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-5500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-5500/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-6000
Configuration saved in test-tokenbatch6/checkpoint-6000/config.json
Model weights saved in test-tokenbatch6/checkpoint-6000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/ch

-------------------------
------>Predictions by Model
A will go to the animal shelter tomorrow afternoon to get a puppy for her son.
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants to buy an advent calendar for her children.
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Madison is pregnant but she doesn't want to talk about it.
Iggy's friend
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
**************************
-------------



Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=6135, training_loss=19.84097158965194, metrics={'train_runtime': 8277.4414, 'train_samples_per_second': 8.898, 'train_steps_per_second': 0.741, 'total_flos': 1.408779533371392e+16, 'train_loss': 19.84097158965194, 'epoch': 5.0})

In [None]:
# Launch a full fine-tuning run with the `trainer` configured in an earlier cell;
# the returned TrainOutput is displayed as the cell result.
# "1 token" is the author's label for this experiment.
# NOTE(review): presumably a token-level auxiliary loss with weight 1 — confirm
# against the trainer / loss definition, which is not shown in this cell.
# NOTE(review): the log below writes checkpoints to test-tokenbatch6/; if other
# experiments use the same output_dir their checkpoints will collide — verify.
trainer.train() #1 token

***** Running training *****
  Num examples = 14730
  Num Epochs = 5
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 2
  Total optimization steps = 6135
  Number of trainable parameters = 139420416
You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
0,27.6522,113.108192,44.2466,20.5821,36.3503,40.269,17.7005
1,37.3822,47.435345,46.0824,22.6507,38.4254,41.9365,17.7457
2,48.1901,33.907181,46.754,23.2996,39.1978,42.814,17.7628
3,35.3732,48.250854,47.7365,24.3356,40.0902,43.8587,18.1516
4,48.9033,47.6273,48.3144,24.9937,40.4409,44.3047,18.0208


Saving model checkpoint to test-tokenbatch6/checkpoint-500
Configuration saved in test-tokenbatch6/checkpoint-500/config.json
Model weights saved in test-tokenbatch6/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-500/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-1000
Configuration saved in test-tokenbatch6/checkpoint-1000/config.json
Model weights saved in test-tokenbatch6/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 818
  Batch size = 6


-------------------------
------>Predictions by Model
A wants to get a puppy for her son, but he doesn't know what to
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants to buy an advent calendar for her kids.
Rob suggests buying fabric/
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Jackie is pregnant with Madison.
Iggy doesn't want to talk about it because
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
**************************
-

Saving model checkpoint to test-tokenbatch6/checkpoint-1500
Configuration saved in test-tokenbatch6/checkpoint-1500/config.json
Model weights saved in test-tokenbatch6/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-1500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-1500/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-2000
Configuration saved in test-tokenbatch6/checkpoint-2000/config.json
Model weights saved in test-tokenbatch6/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-2000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-2000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 818
  Batch size = 6


-------------------------
------>Predictions by Model
A will take her son to the animal shelter tomorrow afternoon.
He wants to get a
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants to buy an advent calendar for her kids.
Rob and Lauren think it
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Madison is pregnant.
She doesn't want to talk about it because she's worried about
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
*********

Saving model checkpoint to test-tokenbatch6/checkpoint-2500
Configuration saved in test-tokenbatch6/checkpoint-2500/config.json
Model weights saved in test-tokenbatch6/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-2500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-2500/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-3000
Configuration saved in test-tokenbatch6/checkpoint-3000/config.json
Model weights saved in test-tokenbatch6/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-3000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-3000/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-3500
Configuration saved in test-tokenbatch6/checkpoint-3500/config.json
Model weights saved in test-tokenbatch6/checkpoint-3500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/ch

-------------------------
------>Predictions by Model
A will take her son to the animal shelter tomorrow afternoon.
He will get a puppy
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants to buy an advent calendar for her kids.
Rob and Lauren like the
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Madison is pregnant.
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
**************************
-------------------------
------>Predictio

Saving model checkpoint to test-tokenbatch6/checkpoint-4000
Configuration saved in test-tokenbatch6/checkpoint-4000/config.json
Model weights saved in test-tokenbatch6/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-4000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-4000/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-4500
Configuration saved in test-tokenbatch6/checkpoint-4500/config.json
Model weights saved in test-tokenbatch6/checkpoint-4500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-4500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-4500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 818
  Batch size = 6


-------------------------
------>Predictions by Model
A will take her son to the animal shelter tomorrow afternoon.
He wants to get a
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants to make an advent calendar for her kids.
Rob and Lauren like the
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Madison is pregnant.
She doesn't want to talk about it because she's worried about
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
********

Saving model checkpoint to test-tokenbatch6/checkpoint-5000
Configuration saved in test-tokenbatch6/checkpoint-5000/config.json
Model weights saved in test-tokenbatch6/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-5000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-5000/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-5500
Configuration saved in test-tokenbatch6/checkpoint-5500/config.json
Model weights saved in test-tokenbatch6/checkpoint-5500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-5500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-5500/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-6000
Configuration saved in test-tokenbatch6/checkpoint-6000/config.json
Model weights saved in test-tokenbatch6/checkpoint-6000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/ch

-------------------------
------>Predictions by Model
A will take her son to the animal shelter tomorrow afternoon.
A wants to get a
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants to buy an advent calendar for her kids.
Rob likes the idea.
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Madison is pregnant.
She doesn't want to talk about it because people get excited and
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
***********



Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=6135, training_loss=57.83391118675301, metrics={'train_runtime': 8371.5747, 'train_samples_per_second': 8.798, 'train_steps_per_second': 0.733, 'total_flos': 1.408779533371392e+16, 'train_loss': 57.83391118675301, 'epoch': 5.0})

In [None]:
# Launch a full fine-tuning run with the `trainer` configured in an earlier cell.
# "10 token" is the author's label for this experiment.
# NOTE(review): presumably a token-level auxiliary loss with weight 10 — confirm
# against the trainer / loss definition, which is not shown in this cell.
# NOTE(review): the log below writes checkpoints to test-tokenbatch6/; if other
# experiments use the same output_dir their checkpoints will collide — verify.
trainer.train() #10 token

***** Running training *****
  Num examples = 14730
  Num Epochs = 5
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 2
  Total optimization steps = 6135
  Number of trainable parameters = 139420416


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
0,710.5417,1068.385864,39.2495,15.3607,31.5469,35.186,16.8191
1,321.4755,434.005768,45.1613,21.6478,37.0812,41.4696,18.3472
2,56.1664,693.369019,45.9286,22.1405,38.2911,41.9762,17.6406
3,351.4464,269.512085,45.8383,22.3104,38.2796,41.9801,17.7323
4,839.3077,437.032196,46.3977,22.7926,38.7734,42.5666,17.6785


Saving model checkpoint to test-tokenbatch6/checkpoint-500
Configuration saved in test-tokenbatch6/checkpoint-500/config.json
Model weights saved in test-tokenbatch6/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-500/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-1000
Configuration saved in test-tokenbatch6/checkpoint-1000/config.json
Model weights saved in test-tokenbatch6/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 818
  Batch size = 6


-------------------------
------>Predictions by Model
A will get a puppy for her son tomorrow.
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants Rob to buy an advent calendar for Christmas for her and Lauren.
They
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Jackie is pregnant and wants to have an abortion.
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
**************************
-------------------------
------>Predictions by M

Saving model checkpoint to test-tokenbatch6/checkpoint-1500
Configuration saved in test-tokenbatch6/checkpoint-1500/config.json
Model weights saved in test-tokenbatch6/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-1500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-1500/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-2000
Configuration saved in test-tokenbatch6/checkpoint-2000/config.json
Model weights saved in test-tokenbatch6/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-2000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-2000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 818
  Batch size = 6


-------------------------
------>Predictions by Model
A wants to buy a puppy for her son.
She will take him to the animal
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants to buy an advent calendar for her kids.
Rob is going to buy
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Jackie is pregnant.
She doesn't want to talk about it because she's afraid
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
**************************
------

Saving model checkpoint to test-tokenbatch6/checkpoint-2500
Configuration saved in test-tokenbatch6/checkpoint-2500/config.json
Model weights saved in test-tokenbatch6/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-2500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-2500/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-3000
Configuration saved in test-tokenbatch6/checkpoint-3000/config.json
Model weights saved in test-tokenbatch6/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-3000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-3000/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-3500
Configuration saved in test-tokenbatch6/checkpoint-3500/config.json
Model weights saved in test-tokenbatch6/checkpoint-3500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/ch

-------------------------
------>Predictions by Model
A will go with B to the animal shelter tomorrow.
She will get a puppy for
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants to buy an advent calendar for her kids.
Rob likes the idea.
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Jackie is pregnant with Madison.
She doesn't want to talk about it because she
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
***********************

Saving model checkpoint to test-tokenbatch6/checkpoint-4000
Configuration saved in test-tokenbatch6/checkpoint-4000/config.json
Model weights saved in test-tokenbatch6/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-4000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-4000/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-4500
Configuration saved in test-tokenbatch6/checkpoint-4500/config.json
Model weights saved in test-tokenbatch6/checkpoint-4500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-4500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-4500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 818
  Batch size = 6


-------------------------
------>Predictions by Model
A wants to get a puppy for her son.
She will take him to the animal
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants to buy an advent calendar for her kids.
Rob is going to buy
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Madison is pregnant.
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
**************************
-------------------------
------>Predictions by Model
Marla 

Saving model checkpoint to test-tokenbatch6/checkpoint-5000
Configuration saved in test-tokenbatch6/checkpoint-5000/config.json
Model weights saved in test-tokenbatch6/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-5000/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-5000/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-5500
Configuration saved in test-tokenbatch6/checkpoint-5500/config.json
Model weights saved in test-tokenbatch6/checkpoint-5500/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/checkpoint-5500/tokenizer_config.json
Special tokens file saved in test-tokenbatch6/checkpoint-5500/special_tokens_map.json
Saving model checkpoint to test-tokenbatch6/checkpoint-6000
Configuration saved in test-tokenbatch6/checkpoint-6000/config.json
Model weights saved in test-tokenbatch6/checkpoint-6000/pytorch_model.bin
tokenizer config file saved in test-tokenbatch6/ch

-------------------------
------>Predictions by Model
A wants to get a puppy for her son.
She will get one for him.
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants to buy an advent calendar for her kids.
Rob will buy one.
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Madison is pregnant.
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
**************************
-------------------------
------>Predictions by Model
Marla found a 



Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=6135, training_loss=630.6079075524938, metrics={'train_runtime': 8086.5653, 'train_samples_per_second': 9.108, 'train_steps_per_second': 0.759, 'total_flos': 1.410922066305024e+16, 'train_loss': 630.6079075524938, 'epoch': 5.0})

In [None]:
# Re-run evaluation now that training has finished; the returned dict holds the
# eval loss and the metrics produced by the trainer's compute_metrics callback.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 818
  Batch size = 6


-------------------------
------>Predictions by Model
A wants to get a puppy for her son.
She will get one for him.
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
Emma wants to buy an advent calendar for her kids.
Rob will buy one.
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Madison is pregnant.
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
**************************
-------------------------
------>Predictions by Model
Marla found a 

{'eval_loss': 390.1603698730469,
 'eval_rouge1': 46.3977,
 'eval_rouge2': 22.7926,
 'eval_rougeL': 38.7734,
 'eval_rougeLsum': 42.5666,
 'eval_gen_len': 17.6785,
 'eval_runtime': 113.04,
 'eval_samples_per_second': 7.236,
 'eval_steps_per_second': 1.212,
 'epoch': 5.0}

## T5

### Preprocessing the data

In [None]:
# Checkpoint identifier passed to `from_pretrained` for both the T5 tokenizer
# and the T5 model in the next cell.
model_checkpoint = "t5-base"

In [None]:
# Every Hugging Face class the T5 section uses, imported in one place.
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Tokenizer and model are both loaded from `model_checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Could not locate the tokenizer configuration file, will try to use the model config instead.


Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--t5-base/snapshots/23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

loading file spiece.model from cache at /root/.cache/huggingface/hub/models--t5-base/snapshots/23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9/spiece.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--t5-base/snapshots/23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--t5-base/snapshots/23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--t5-base/snapshots/23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9/pytorch_model.bin
All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.


In [None]:
# Token budgets: dialogues are padded/truncated to 512 tokens, summaries to 128.
max_input_length = 512
max_target_length = 128


def preprocess_function(examples):
    """Tokenize one batch of dialogue/summary pairs for T5 summarization.

    Args:
        examples: Batch mapping with a "dialogue" list (source texts) and a
            "summary" list (target texts), as supplied by a batched
            `Dataset.map` call.

    Returns:
        The tokenizer output for the prefixed dialogues, with an added
        "labels" entry holding the tokenized summaries in which every padding
        token id has been replaced by -100.
    """
    prefix = "summarize: "
    prompts = [prefix + text for text in examples["dialogue"]]

    # Encode the source side, padded/truncated to exactly `max_input_length`.
    encoded = tokenizer(
        prompts,
        padding="max_length",
        max_length=max_input_length,
        truncation=True,
    )

    # Encode the target side with the same tokenizer, padded/truncated to
    # exactly `max_target_length`.
    targets = tokenizer(
        examples["summary"],
        padding="max_length",
        max_length=max_target_length,
        truncation=True,
    )

    # Mask padding positions with -100 (the value PyTorch's cross-entropy loss
    # ignores by default) so padded label positions can be excluded from the loss.
    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for token_ids in targets["input_ids"]:
        masked_labels.append([-100 if tok == pad_id else tok for tok in token_ids])

    encoded["labels"] = masked_labels
    return encoded

In [None]:
# Apply the T5 preprocessing to `raw_datasets`; batched=True makes `map` pass
# batches of examples (a dict of lists) to preprocess_function rather than single rows.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Development switch: set to a (n_train, n_val) tuple such as (100, 70) to run
# on a small subset while iterating; leave as None to use the full splits.
dev_subset_sizes = None

tokenized_datasets_train = tokenized_datasets['train']
tokenized_datasets_val = tokenized_datasets['validation']

if dev_subset_sizes is not None:
    # Keep only the first n examples of each split for quick experiments.
    n_train, n_val = dev_subset_sizes
    tokenized_datasets_train = tokenized_datasets_train.select(range(n_train))
    tokenized_datasets_val = tokenized_datasets_val.select(range(n_val))

### Fine-tuning the model

In [None]:
# Training configuration for the T5 fine-tuning run.
batch_size = 8
training_args = Seq2SeqTrainingArguments(
    output_dir="t5_results",
    num_train_epochs=5,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=1e-4,
    warmup_steps=500,
    weight_decay=0.1,
    # label_smoothing_factor=0.1, ## causes to throw an error
    predict_with_generate=True,
    # Without an explicit limit, generation during evaluation falls back to the
    # model config's default max_length. The saved outputs show eval_gen_len of
    # about 18 tokens and predictions cut off mid-sentence, which depresses
    # ROUGE. Allow generations as long as the tokenized reference summaries.
    generation_max_length=max_target_length,
    # logging_dir="logs",
    logging_steps=10,
    save_total_limit=3,
)

# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# `CustomTrainer` and `compute_metrics` are not defined in this section; they
# are expected to come from earlier cells of the notebook.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Evaluate the checkpoint before any fine-tuning so the post-training metrics
# have a baseline to be compared against.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, id, dialogue. If summary, id, dialogue are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 818
  Batch size = 8
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


-------------------------
------>Predictions by Model
a hamster named lemmy is named after his dead hamster.
----->Predictions Original
A will go to the animal shelter tomorrow to get a puppy for her son.
They already visited the shelter last Monday and the son chose the puppy.
**************************
-------------------------
------>Predictions by Model
x-mas advent calendars are filled with various things.
i love the idea
----->Predictions Original
Emma and Rob love the advent calendar.
Lauren fits inside calendar various items, for instance, small toys and Christmas decorations.
Her children are excited whenever they get the calendar.
**************************
-------------------------
------>Predictions by Model
Madison is pregnant Iggy: ok, she's probably worrying about it Iggy:
----->Predictions Original
Madison is pregnant but she doesn't want to talk about it.
Patricia Stevens got married and she thought she was pregnant.
**************************
-------------------------

{'eval_loss': 2.411588191986084,
 'eval_rouge1': 24.5734,
 'eval_rouge2': 7.6639,
 'eval_rougeL': 20.7835,
 'eval_rougeLsum': 22.5135,
 'eval_gen_len': 18.3191,
 'eval_runtime': 135.4583,
 'eval_samples_per_second': 6.039,
 'eval_steps_per_second': 0.76}

In [None]:

# Fine-tune T5 using the trainer configured above (num_train_epochs=5, with an
# evaluation pass at the end of every epoch per `evaluation_strategy="epoch"`).
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, id, dialogue. If summary, id, dialogue are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14732
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9210
  Number of trainable parameters = 222903552


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored

In [None]:
# Evaluate the model in whatever state the training cell left it.
# NOTE(review): the saved output of the preceding `trainer.train()` cell ends in
# a KeyboardInterrupt, so these metrics may not reflect a complete 5-epoch run.
trainer.evaluate()