In [1]:
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))

    # All figures below are byte counts for device 0 scaled by 1024**3 and rounded to one decimal.
    print('Memory Usage:',round(torch.cuda.get_device_properties(0).total_memory/1024**3,1), 'GB')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    # memory_reserved() replaces the deprecated memory_cached() and reports the same quantity.
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')


Using device: cuda

Tesla T4
Memory Usage: 14.8 GB
Allocated: 0.0 GB
Cached:    0.0 GB




In [2]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
import torch
import numpy as np
import random

def set_random_seed(seed):
    """Seed every random number generator this notebook relies on.

    Applies `seed` to PyTorch (CPU and all CUDA devices), to NumPy's global
    generator and to Python's `random` module, then switches cuDNN's
    `deterministic` flag on.
    """
    seeders = (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed, random.seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
set_random_seed(0)

In [4]:
! pip install datasets transformers rouge-score nltk py7zr

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
%cd /content/drive/MyDrive/NLP Project with SCL

/content/drive/MyDrive/NLP Project with SCL


In [7]:
! dir

BART\ Global-SamSUM.ipynb
BART\ Joint-SamSUM.ipynb
BART\ TOKEN-SamSUM.ipynb
BART\ Turn-SamSUM.ipynb
datacollator.py
Hiteshmodels.py
HiteshToken\ NLP\ Project\ with\ SCL.ipynb
models.py
myTrainer.py
NLP\ Project\ with\ SCL.ipynb
__pycache__
t5_results
t5-token-b4c0.1
T5\ with\ TOKEN\ SCL.ipynb
test-dialogue-summarization
test-dialogue-summarization-hitesh
test-dialogue-summarization-token-batch16
test-global-SCL-batch16
test-global-SCL-batch6
test-jointbatch6
test-tokenbatch6
test-turn-batch6
Token\ NLP\ Project\ with\ SCL.ipynb
trainer.py
Turn\ NLP\ Project\ with\ SCL\ (1).ipynb
Turn\ NLP\ Project\ with\ SCL.ipynb


# Fine-tuning a model on a summarization task

## Loading the dataset

In [8]:
from datasets import load_dataset, load_metric

# Load the "samsum" dataset; later cells read its 'train', 'validation' and 'test' splits.
raw_datasets = load_dataset("samsum")

# ROUGE scorer consumed by compute_metrics further down.
# NOTE(review): the cell output suggests load_metric emits a warning here — presumably a
# deprecation notice; confirm before upgrading the `datasets` package.
metric = load_metric("rouge")



  0%|          | 0/3 [00:00<?, ?it/s]

  metric = load_metric("rouge")


## BART

### Preprocessing the data

In [9]:
model_checkpoint = "/content/drive/MyDrive/NLP Project with SCL/test-global-SCL-batch16/checkpoint-6000"

In [10]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [11]:
def check_token_length(dataset, max_tokens=1000, tokenize=None):
    """Print and return the positions of dialogues that tokenize to more than `max_tokens` ids.

    Args:
        dataset: object whose ``dataset['dialogue']`` yields a sequence of dialogue strings.
        max_tokens: threshold; a dialogue is reported when its token count is strictly greater.
        tokenize: callable mapping a string to a mapping with an ``'input_ids'`` entry.
            Defaults to the notebook-level ``tokenizer``.

    Returns:
        List of integer positions within ``dataset['dialogue']`` that exceed the threshold.
    """
    if tokenize is None:
        tokenize = tokenizer
    # Read the column a single time instead of once per loop iteration; for a
    # Hugging Face Dataset each read can materialise the whole column.
    dialogues = dataset['dialogue']
    ids = [
        position
        for position, dialogue in enumerate(dialogues)
        if len(tokenize(dialogue)['input_ids']) > max_tokens
    ]
    print(ids)
    return ids
def remove_idx(list_idx, dataset):
    """Return `dataset` without the rows whose positions appear in `list_idx`.

    Args:
        list_idx: iterable of integer row positions to drop.
        dataset: object supporting ``len()`` and ``select(indices)``.

    Returns:
        The result of ``dataset.select`` on the kept positions, in ascending order.
    """
    # Build the lookup set once rather than per row; this also keeps the
    # function correct when `list_idx` is a one-shot iterator.
    excluded = set(list_idx)
    kept = [position for position in range(len(dataset)) if position not in excluded]
    return dataset.select(kept)
    
# First report the over-long dialogues of every split, then build the filtered splits.
train_ids, validation_ids, test_ids = (
    check_token_length(raw_datasets[split_name])
    for split_name in ('train', 'validation', 'test')
)
changed_datasets_train, changed_datasets_val, changed_datasets_test = (
    remove_idx(long_ids, raw_datasets[split_name])
    for long_ids, split_name in (
        (train_ids, 'train'),
        (validation_ids, 'validation'),
        (test_ids, 'test'),
    )
)

Token indices sequence length is longer than the specified maximum sequence length for this model (1081 > 1024). Running this sequence through the model will result in indexing errors


[4269, 8198]
[]




[]


In [12]:
max_input_length = 1024
max_target_length = 128

def make_one_hot_sequence(input_ids, sequence_ids):
    """Build a per-position label list marking speaker tokens and utterance tokens.

    The result starts and ends with 0. In between, for every entry of
    `sequence_ids` (a dict with 'spk' and 'utt' [start, end) index pairs), the
    speaker span contributes one speaker id per position and the utterance
    span contributes -1 per position. Speaker ids start at 1 and are assigned
    in order of first appearance; two spans share an id when the slices of
    `input_ids` they cover are equal.

    Args:
        input_ids: token id sequence the 'spk' ranges index into.
        sequence_ids: list of {'spk': [start, end], 'utt': [start, end]} dicts.

    Returns:
        List of ints: 0 at both ends, speaker ids (>= 1) and -1 in between.
    """
    labels = [0]
    speaker_lookup = {}
    for turn in sequence_ids:
        spk_start, spk_end = turn['spk'][0], turn['spk'][1]
        # The stringified token slice identifies the speaker; the next free id
        # is always one more than the number of speakers seen so far.
        speaker_key = str(input_ids[spk_start:spk_end])
        speaker_id = speaker_lookup.setdefault(speaker_key, len(speaker_lookup) + 1)
        labels.extend([speaker_id] * (spk_end - spk_start))
        utt_start, utt_end = turn['utt'][0], turn['utt'][1]
        labels.extend([-1] * (utt_end - utt_start))
    labels.append(0)
    return labels


def preprocess_function(examples):
    """Tokenize a batch of dialogues turn by turn and label tokens as speaker or utterance.

    Every dialogue is split on CRLF into turns, and every turn is split at its
    first ':' into a speaker piece and an utterance piece (the ':' stays with
    the utterance). The pieces are tokenized separately, the CRLF tokens are
    re-inserted before the last id of every utterance piece except the final
    one, and the pieces are concatenated into one sequence; at each join the
    trailing id of the left part and the leading id of the right part are dropped.

    Args:
        examples: batch mapping with a "dialogue" list and a "summary" list of strings.

    Returns:
        dict with "input_ids", "attention_mask", "spk_utt_pos" (built by
        make_one_hot_sequence) and "labels" (summary token ids truncated to
        max_target_length), one entry per example.

    Note:
        Inputs are not truncated here; sequences longer than max_input_length
        are only reported on stdout.
    """
    # Tokenize the CRLF separator once; [1:-1] strips the first and last id the
    # tokenizer emits (presumably the sequence start/end special tokens).
    newline_encoding = tokenizer(["\r\n"])
    slash_n = newline_encoding['input_ids'][0][1:-1]
    slash_n_mask = newline_encoding['attention_mask'][0][1:-1]
    inputs_list=[]
    masks_list=[]
    pos_list=[]
    for dialogue in examples['dialogue']:
        # breaking the dialogue for spk:utt info
        broken=[]
        for utt in dialogue.split("\r\n"):
            first_ind = utt.find(':')
            # NOTE(review): a turn without ':' gives first_ind == -1, so only its last
            # character lands in the utterance piece. Left as is so inputs do not change.
            broken.append(utt[:first_ind])
            broken.append(utt[first_ind:])

        # A single tokenizer call provides both the ids and the masks of all pieces.
        encoded_broken = tokenizer(broken)
        tokenized_broken = encoded_broken['input_ids']
        attention_broken = encoded_broken['attention_mask']

        # adding \r\n tokens
        # NOTE(review): assumes the separator tokenizes to at least two ids — confirm for other tokenizers.
        for i in range(1, len(tokenized_broken)-1, 2):
            tokenized_broken[i].insert(-1, slash_n[0])
            tokenized_broken[i].insert(-1, slash_n[1])
            attention_broken[i].insert(-1, slash_n_mask[0])
            attention_broken[i].insert(-1, slash_n_mask[1])
        joined = tokenized_broken[0]

        # annotating for spk_utt_pos
        # 'spk' / 'utt' hold [start, end) positions inside `joined`.
        assoc_dict={}
        assoc_dict['spk'] = [1, len(tokenized_broken[0])-1] # the range is actually exclusive of the last index.
        odd_bool = True
        running_length = len(tokenized_broken[0])
        sequence_ids=[]
        for inner in tokenized_broken[1:]:
            if odd_bool==True:
                # Odd pieces are utterances: they complete the current speaker/utterance pair.
                assoc_dict['utt']=[running_length-1, running_length+len(inner)-3]
                odd_bool=False
                sequence_ids.append(assoc_dict)
                assoc_dict={}
            else:
                # Even pieces are speakers: they open a new pair.
                assoc_dict['spk']=[running_length-1, running_length+len(inner)-3]
                odd_bool=True
            joined = joined[:-1]+inner[1:]
            running_length += (len(inner)-2)

        # test for CUDA assert error
        if len(joined) > max_input_length:
            # The example is reported but still kept at full length.
            print(f"input tokens list length {len(joined)} exceeds max_input_length={max_input_length}; example is kept untruncated")
            print(tokenizer.decode(joined))

        # creating inputs list
        inputs_list.append(joined)
        pos_list.append(make_one_hot_sequence(joined, sequence_ids))

        # creating new mask
        joined_mask = attention_broken[0]
        for inner_attention in attention_broken[1:]:
            joined_mask = joined_mask[:-1]+inner_attention[1:]
        masks_list.append(joined_mask)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True)

    # Every field is built above, so no throw-away tokenization of the raw dialogues is needed.
    return {
        'input_ids': inputs_list,
        'attention_mask': masks_list,
        'spk_utt_pos': pos_list,
        'labels': labels["input_ids"],
    }

In [13]:
# Tokenize every filtered split in batches and drop its raw 'id', 'dialogue' and 'summary' columns.
tokenized_datasets_train, tokenized_datasets_val, tokenized_datasets_test = (
    filtered_split
    .map(preprocess_function, batched=True)
    .remove_columns(['id', 'dialogue', 'summary'])
    for filtered_split in (changed_datasets_train, changed_datasets_val, changed_datasets_test)
)



  0%|          | 0/1 [00:00<?, ?ba/s]



In [14]:
# tokenized_datasets_train = tokenized_datasets_train.select(range(2500))
# tokenized_datasets_val = tokenized_datasets_val.select(range(500))
# tokenized_datasets_train = tokenized_datasets_train
# tokenized_datasets_val = tokenized_datasets_val

### Fine-tuning the model

In [15]:
from models import BartWithSCL
from datacollator import CustomCollatorForSeq2Seq
from trainer import CustomTrainer


from transformers import BartForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers.modeling_utils import unwrap_model
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES

In [16]:
# Instantiate the project's BartWithSCL class (from models.py) with the weights stored at model_checkpoint.
model = BartWithSCL.from_pretrained(model_checkpoint)
# Restrict the model to the 'global' loss entry; the meaning of each entry lives in models.py — TODO confirm.
model.set_losses_list(['global'])
# Called without arguments, so whatever default set_scl_coeff defines applies — verify in models.py.
model.set_scl_coeff()

In [17]:
# Per-device batch size shared by training and evaluation.
batch_size = 6
args = Seq2SeqTrainingArguments(
    # First positional argument: the output directory for this run.
    "test-global-SCL-batch6",
    evaluation_strategy = "epoch",
    # eval_steps=5,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Gradients are accumulated over 2 steps, i.e. 2 * batch_size examples per device per update.
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    # save_total_limit=2,
    num_train_epochs=5,
    logging_steps = 10, ## added
    # Evaluation uses generated sequences, which compute_metrics then decodes.
    predict_with_generate=True,
    # Presumably needed so the extra 'spk_utt_pos' column reaches the collator/model — TODO confirm.
    remove_unused_columns=False, ## added
    fp16=True,
)

In [18]:
# Project collator from datacollator.py; presumably it also batches the extra 'spk_utt_pos' field — TODO confirm.
data_collator = CustomCollatorForSeq2Seq(tokenizer, model=model)

In [19]:
import nltk
import numpy as np
import torch
torch.cuda.empty_cache()
def compute_metrics(eval_pred):
    """Decode generated and reference summaries, print a sample, and score them with ROUGE.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays.

    Returns:
        dict mapping every ROUGE key to its mid F-measure times 100, plus
        "gen_len" (mean number of non-pad prediction tokens); all values are
        rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # -100 is not a vocabulary id and cannot be decoded, so map it to the pad token.
    # Labels contain it; predictions may contain it as padding, depending on the
    # transformers version (the replacement is a no-op otherwise).
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Show at most the first 50 pairs; clamping avoids an IndexError on smaller evaluation sets.
    for i in range(min(50, len(decoded_preds))):
        # print(tokenized_datasets_val["dialogue"][i])
        print("----------",i,"---------------")
        print("------>Predictions by Model")
        print(decoded_preds[i])
        print("----->Predictions Original")
        print(decoded_labels[i])
        print("**************************")
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [20]:

# Build the project's CustomTrainer (imported from trainer.py) from the model, arguments,
# tokenized train/validation splits, collator and metric function defined in earlier cells.
trainer = CustomTrainer(
    model,
    args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Using cuda_amp half precision backend


In [21]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
print()




In [24]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 818
  Batch size = 6


spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward tensor([[ 0,  1, -1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ..., -1, -1,  0],
        [ 0,  1, -1,  ...,  0,  0,  0],
        [ 0,  1, -1,  ...,  0,  0,  0]], device='cuda:0')
torch.Size([6, 321, 768])


spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward tensor([[ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0],
        [ 0,  1, -1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ..., -1, -1,  0],
        [ 0,  1, -1,  ...,  0,  0,  0],
        [ 0,  1,  1,  ...,  0,  0,  0]], device='cuda:0')
torch.Size([6, 266, 768])
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forward None
spk_utt_pos in forwar

KeyboardInterrupt: ignored

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

## T5

### Preprocessing the data

In [None]:
model_checkpoint = "t5-base"

In [None]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
max_input_length = 512
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of dialogues and summaries for the T5 run.

    Each dialogue is prefixed with "summarize: " and encoded with padding and
    truncation to max_input_length. Summaries are encoded with padding and
    truncation to max_target_length, and every pad id in them is replaced by
    -100 before being stored under "labels".

    Args:
        examples: batch mapping with "dialogue" and "summary" lists of strings.

    Returns:
        The tokenizer output for the prefixed dialogues with an added "labels" entry.
    """
    prefixed_dialogues = ["summarize: " + text for text in examples["dialogue"]]
    model_inputs = tokenizer(
        prefixed_dialogues,
        padding="max_length",
        max_length=max_input_length,
        truncation=True,
    )

    # Encode the targets with the same tokenizer.
    summary_encoding = tokenizer(
        examples["summary"],
        padding="max_length",
        max_length=max_target_length,
        truncation=True,
    )

    # Swap pad ids for -100 in every target sequence.
    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for summary_ids in summary_encoding["input_ids"]:
        masked_labels.append([token if token != pad_id else -100 for token in summary_ids])
    model_inputs["labels"] = masked_labels

    return model_inputs

In [None]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

In [None]:
# sample a small set for development
# tokenized_datasets_train = tokenized_datasets['train'].select(range(100))
# tokenized_datasets_val = tokenized_datasets['validation'].select(range(70))


tokenized_datasets_train = tokenized_datasets['train']
tokenized_datasets_val = tokenized_datasets['validation']

### Fine-tuning the model

In [None]:
# Parameters
# Per-device batch size shared by training and evaluation.
batch_size=8
training_args = Seq2SeqTrainingArguments(
    output_dir="t5_results",
    num_train_epochs=5,
    do_train=True,
    do_eval=True,
    evaluation_strategy = "epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=1e-4,
    warmup_steps=500,
    weight_decay=0.1,
    # label_smoothing_factor=0.1, ## causes to throw an error
    # Evaluation uses generated sequences, which compute_metrics then decodes.
    predict_with_generate=True,
    # logging_dir="logs",
    logging_steps=10,
    save_total_limit=3,
)


# Stock seq2seq collator from transformers, built around the T5 tokenizer and model loaded above.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Reuses the project's CustomTrainer and the compute_metrics function defined in the BART section.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# evaluate before training for comparison
trainer.evaluate()

In [None]:

trainer.train()

In [None]:
trainer.evaluate()