In [1]:
!pip install transformers sentencepiece
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

# Sample news article used to try out the summarizer (a one-element batch).
src_text = [
    """China’s Huawei overtook Samsung Electronics as the world’s biggest seller of mobile phones in the second quarter of 2020, shipping 55.8 million devices compared to Samsung’s 53.7 million, according to data from research firm Canalys. While Huawei’s sales fell 5 per cent from the same quarter a year earlier, South Korea’s Samsung posted a bigger drop of 30 per cent, owing to disruption from the coronavirus in key markets such as Brazil, the United States and Europe, Canalys said. Huawei’s overseas shipments fell 27 per cent in Q2 from a year earlier, but the company increased its dominance of the China market which has been faster to recover from COVID-19 and where it now sells over 70 per cent of its phones. “Our business has demonstrated exceptional resilience in these difficult times,” a Huawei spokesman said. “Amidst a period of unprecedented global economic slowdown and challenges, we’re continued to grow and further our leadership position.” Nevertheless, Huawei’s position as number one seller may prove short-lived once other markets recover given it is mainly due to economic disruption, a senior Huawei employee with knowledge of the matter told Reuters. Apple is due to release its Q2 iPhone shipment data on Friday."""
]

# Checkpoint to load: the Pegasus XSum model.
model_name = 'google/pegasus-xsum'

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the tokenizer and the model for the checkpoint, and move the model
# onto the selected device.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

# Tokenize the batch (truncating over-long input, padding to the longest
# item) and get PyTorch tensors on the same device as the model.
batch = tokenizer(
    src_text,
    truncation=True,
    padding='longest',
    return_tensors="pt",
).to(device)

# Generate the abstractive summary as token ids, then decode back to text.
translated = model.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

# tgt_text holds the decoded summaries as a list of strings.
print(tgt_text)



['Huawei has overtaken Samsung as the world’s biggest seller of mobile phones, according to data from research firm Canalys.']


In [11]:
# --- Project directories (relative to the notebook's working directory) ---
path_models = './models/'
path_outputs = './outputs/'
path_processed_data = './processed_data/'

# --- Run configuration ---
#covid_test_prediction_t5_bart_v3
data_name = 'covid_filtered_ner_test_v3'

# Both files are joined with path_processed_data by the cells that use them:
# input_filename is read, out_filename is written.
input_filename = 'covid_test_prediction_t5_bart_v3_2.json'
out_filename = 'final_inference_1.json'



In [4]:
import os
import json
import joblib
import time
import pandas as pd
import numpy as np
import random
from IPython.display import display, HTML
import torch

# Load the existing predictions. The `with` blocks guarantee the file handles
# are closed even if reading fails; the encoding is pinned so the result does
# not depend on the platform's locale default.
with open(os.path.join(path_processed_data, input_filename), "r", encoding="utf-8") as jsonFile:
    jsonContent = jsonFile.read()
details_dict = json.loads(jsonContent)

with open(os.path.join(path_processed_data, 'covid_test_ner_v3.json'), "r", encoding="utf-8") as jsonFile2:
    jsonContent2 = jsonFile2.read()
details_dict2 = json.loads(jsonContent2)

# Source text, reference headlines and the NER-filtered text.
X = details_dict['text']
Y = details_dict['headlines']
S1 = details_dict['ner_filtered_text']

# BART predictions.
P1 = details_dict['predict_base_bart_fullTrained']
P2 = details_dict['predict_bart_ner_fullTrained']
P3 = details_dict['predict_bart_ner_nerTrained']
P4 = details_dict['predict_bart_full_nerTrained']

# T5 predictions.
P5 = details_dict['predict_base_t5_full_fullTrained']
P6 = details_dict['predict_t5_3s_fullTrained']
P7 = details_dict['predict_t5_ner_fullTrained']
P8 = details_dict['predict_t5_full_nerTrained']
P9 = details_dict['predict_t5_3s_nerTrained']
P10 = details_dict['predict_t5_ner_nerTrained']

# One column per field, keyed exactly as in the input JSON.
details = {
    'text' : X,
    'headlines' : Y,
    'ner_filtered_text' : S1,
    'predict_base_bart_fullTrained' : P1,
    'predict_bart_ner_fullTrained' : P2,
    'predict_bart_ner_nerTrained' : P3,
    'predict_bart_full_nerTrained' : P4,

    'predict_base_t5_full_fullTrained' : P5,
    'predict_t5_3s_fullTrained' : P6,
    'predict_t5_ner_fullTrained' : P7,
    'predict_t5_full_nerTrained' : P8,
    'predict_t5_3s_nerTrained': P9,
    'predict_t5_ner_nerTrained' : P10
}

df_score = pd.DataFrame(details)


In [5]:
# Sanity check: number of entries in X (the 'text' field of the loaded JSON).
print(len(X))

5000


In [14]:
# Pegasus output for every article in X. Each entry is the list returned by
# batch_decode for that single article.
P11 = []

for idx, article in enumerate(X):
    # Progress marker every 100 articles.
    if idx % 100 == 0:
        print("i = ", idx)

    # Encode one article as PyTorch tensors on the same device as the model.
    encoded = tokenizer(article, truncation=True, padding='longest', return_tensors="pt").to(device)

    # Generate the summary token ids, then decode them back to text.
    generated = model.generate(**encoded)
    P11.append(tokenizer.batch_decode(generated, skip_special_tokens=True))


i =  0
i =  100
i =  200
i =  300
i =  400
i =  500
i =  600
i =  700
i =  800
i =  900
i =  1000
i =  1100
i =  1200
i =  1300
i =  1400
i =  1500
i =  1600
i =  1700
i =  1800
i =  1900
i =  2000
i =  2100
i =  2200
i =  2300
i =  2400
i =  2500
i =  2600
i =  2700
i =  2800
i =  2900
i =  3000
i =  3100
i =  3200
i =  3300
i =  3400
i =  3500
i =  3600
i =  3700
i =  3800
i =  3900
i =  4000
i =  4100
i =  4200
i =  4300
i =  4400
i =  4500
i =  4600
i =  4700
i =  4800
i =  4900


In [15]:
# Preview only the first few summaries; echoing all of P11 floods the
# notebook output with one entry per article.
P11[:10]

[['A selection of photographs from around the world this week:'],
 ['US President Donald Trump has suspended all flights to and from Europe amid an outbreak of the deadly virus coronavirus.'],
 ['Images courtesy of AFP, EPA, Getty Images and Reuters'],
 ['By MOUSSA Associated Press SYDNEY Tom Hanks wife Rita Wilson isolated stable condition Australian hospital Thursday contracting new coronavirus Australian officials said.'],
 ['The Board Directors Arabian Horse Breeders Alliance wish good health coming months look forward seeing 8 11 2021.'],
 ['The Trump administration has announced travel restrictions for 26 European nations amid a deadly virus outbreak.'],
 ["On Tuesday Bernie Sanders defeated Joe Biden in Missouri's Democratic presidential primary."],
 ['Health officials in South Korea say they are confident they have stopped the spread of Middle East Respiratory Syndrome (MERS).'],
 ["Arsenal's Premier League match against Brighton on Saturday has been called off after the club's

In [16]:
# Each entry of P11 is a one-element list (the batch_decode result for a
# single article); keep just the summary string.
P11_ = [decoded[0] for decoded in P11]

# The summaries are stored as a column next to X, so the two must line up.
assert len(P11_) == len(X), "expected exactly one Pegasus summary per article in X"

# Show the count and a short preview instead of printing every summary.
print(len(P11_))
print(P11_[:5])



In [17]:
# All previous columns plus the new Pegasus predictions.
details = {
    'text' : X,
    'headlines' : Y,
    'ner_filtered_text' : S1,
    'predict_base_bart_fullTrained' : P1,
    'predict_bart_ner_fullTrained' : P2,
    'predict_bart_ner_nerTrained' : P3,
    'predict_bart_full_nerTrained' : P4,

    'predict_base_t5_full_fullTrained' : P5,
    'predict_t5_3s_fullTrained' : P6,
    'predict_t5_ner_fullTrained' : P7,
    'predict_t5_full_nerTrained' : P8,
    'predict_t5_3s_nerTrained': P9,
    'predict_t5_ner_nerTrained' : P10,
    'predict_pegasus_base' : P11_
}

# Serialize first, then write inside a `with` block so the handle is closed
# (and the data flushed) even if the write raises.
jsonString = json.dumps(details)
with open(os.path.join(path_processed_data, out_filename), "w", encoding="utf-8") as jsonFile:
    jsonFile.write(jsonString)

In [1]:
import gc

import torch

# Run the garbage collector first so tensors held only by unreachable Python
# objects are released back to PyTorch's caching allocator; only then ask the
# allocator to release its unused cached blocks.
gc.collect()
torch.cuda.empty_cache()

"""Script for fine-tuning Pegasus
Example usage:
  # use XSum dataset as example, with first 1000 docs as training data
  from datasets import load_dataset
  dataset = load_dataset("xsum")
  train_texts, train_labels = dataset['train']['document'][:1000], dataset['train']['summary'][:1000]
  
  # use Pegasus Large model as base for fine-tuning
  model_name = 'google/pegasus-large'
  train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
  trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
  trainer.train()
 
Reference:
  https://huggingface.co/transformers/master/custom_datasets.html

"""

from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch


class PegasusDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized source texts with tokenized targets.

    Args:
        encodings: mapping of field name -> per-example sequences for the
            source texts (every field is returned as a tensor).
        labels: mapping with an 'input_ids' entry holding the per-example
            target token ids, and optionally an 'attention_mask' entry.
    """

    # Label value excluded from the loss (the default ignore_index of
    # PyTorch's cross-entropy loss).
    IGNORE_INDEX = -100

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        label_ids = torch.tensor(self.labels['input_ids'][idx])
        # Padded target positions must not contribute to the loss: where the
        # target attention mask is 0, replace the pad id with IGNORE_INDEX.
        # Without a mask the ids are passed through unchanged.
        if 'attention_mask' in self.labels:
            label_mask = torch.tensor(self.labels['attention_mask'][idx])
            label_ids = label_ids.masked_fill(label_mask == 0, self.IGNORE_INDEX)
        item['labels'] = label_ids
        return item

    def __len__(self):
        """Number of examples, taken from the target 'input_ids'."""
        return len(self.labels['input_ids'])

      
def prepare_data(model_name,
                 train_texts, train_labels,
                 val_texts=None, val_labels=None,
                 test_texts=None, test_labels=None):
  """
  Tokenize the train / validation / test splits for fine-tuning.

  A validation or test dataset is built only when both its texts and its
  labels are given; otherwise that slot is None.

  Returns:
    (train_dataset, val_dataset, test_dataset, tokenizer)
  """
  tokenizer = PegasusTokenizer.from_pretrained(model_name)

  def _to_dataset(texts, labels):
    # Sources and targets go through the same tokenizer, truncated and padded.
    return PegasusDataset(
      tokenizer(texts, truncation=True, padding=True),
      tokenizer(labels, truncation=True, padding=True),
    )

  def _optional_split(texts, labels):
    # A split is skipped unless both halves were supplied.
    if texts is None or labels is None:
      return None
    return _to_dataset(texts, labels)

  train_dataset = _to_dataset(train_texts, train_labels)
  val_dataset = _optional_split(val_texts, val_labels)
  test_dataset = _optional_split(test_texts, test_labels)

  return train_dataset, val_dataset, test_dataset, tokenizer


def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results',
                        num_train_epochs=2000, training_args_overrides=None):
  """
  Prepare configurations and base model for fine-tuning

  Args:
    model_name: checkpoint name passed to `from_pretrained`.
    tokenizer: tokenizer handed to the Trainer.
    train_dataset: training dataset.
    val_dataset: optional evaluation dataset; when given, evaluation runs
      every 100 steps with an eval batch size of 1.
    freeze_encoder: when True, encoder parameters are excluded from training.
    output_dir: directory for checkpoints.
    num_train_epochs: total number of training epochs (default keeps the
      previous hard-coded value of 2000).
    training_args_overrides: optional dict of extra/overriding keyword
      arguments for `TrainingArguments`, applied last (for example
      {'fp16': True, 'gradient_accumulation_steps': 8}).

  Returns:
    A configured `Trainer`; call `.train()` on it to start fine-tuning.
  """
  torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

  if freeze_encoder:
    for param in model.model.encoder.parameters():
      param.requires_grad = False

  # Settings shared by both the with-validation and the train-only setups.
  args_kwargs = dict(
    output_dir=output_dir,             # output directory
    num_train_epochs=num_train_epochs, # total number of training epochs
    per_device_train_batch_size=1,     # batch size per device during training, can increase if memory allows
    save_steps=500,                    # number of updates steps before checkpoint saves
    save_total_limit=5,                # limit the total amount of checkpoints and deletes the older checkpoints
    warmup_steps=500,                  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                 # strength of weight decay
    logging_dir='./logs',              # directory for storing logs
    logging_steps=10,
  )

  # Evaluation settings only make sense when there is something to evaluate on.
  if val_dataset is not None:
    args_kwargs.update(
      per_device_eval_batch_size=1,    # batch size for evaluation, can increase if memory allows
      evaluation_strategy='steps',     # evaluation strategy to adopt during training
      eval_steps=100,                  # number of update steps before evaluation
    )

  # Caller-supplied overrides win over the defaults above.
  if training_args_overrides:
    args_kwargs.update(training_args_overrides)

  training_args = TrainingArguments(**args_kwargs)

  trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset (None when no validation data was given)
    tokenizer=tokenizer
  )

  return trainer


if __name__=='__main__':
  # use XSum dataset as example, with first 1000 docs as training data
  from datasets import load_dataset
  dataset = load_dataset("xsum")
  # Slice the rows first so only the 1000 selected examples are materialised,
  # instead of loading the whole 'document' and 'summary' columns and then
  # discarding most of them.
  train_subset = dataset['train'][:1000]
  train_texts, train_labels = train_subset['document'], train_subset['summary']

  # use Pegasus Large model as base for fine-tuning
  model_name = 'google/pegasus-large'
  train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
  trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
  trainer.train()

Using custom data configuration default
Reusing dataset xsum (/home/ubuntu/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)


  0%|          | 0/3 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1000
  Num Epochs = 2000
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 2000000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33msrikanthy[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Step,Training Loss


RuntimeError: CUDA out of memory. Tried to allocate 376.00 MiB (GPU 0; 14.76 GiB total capacity; 13.57 GiB already allocated; 143.75 MiB free; 13.64 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF