In [1]:
import pandas as pd
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Adafactor
import torch
from sklearn.model_selection import train_test_split
import tqdm
import sys
torch.bfloat16 = None

In [2]:
data_raw = pd.read_csv('data/news_summary.csv',encoding='latin-1')

In [3]:
data_raw.head()

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [4]:
data = pd.DataFrame()
data['news'] = data_raw['ctext']
data['summary'] = data_raw['text']

In [5]:
data.head()

Unnamed: 0,news,summary
0,The Daman and Diu administration on Wednesday ...,The Administration of Union Territory Daman an...
1,"From her special numbers to TV?appearances, Bo...",Malaika Arora slammed an Instagram user who tr...
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotels in Mumbai and other Indian cities are t...,Hotels in Maharashtra will train their staff t...


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4514 entries, 0 to 4513
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   news     4396 non-null   object
 1   summary  4514 non-null   object
dtypes: object(2)
memory usage: 70.7+ KB


In [7]:
data = data.dropna()

In [8]:
torch_device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [9]:
def get_val_loss(model,val_loader):
    temp = model.eval()
    num_batches = 0
    loss_sum = 0
    for batch in val_loader:
        encoder_input = batch["encoder_input"].to(torch_device)
        encoder_attention_mask = batch["encoder_attention_masks"].to(torch_device)
        decoder_input = batch["decoder_input"].to(torch_device)
        decoder_output = batch["decoder_output"].to(torch_device)
        with torch.no_grad():
            output = model(input_ids=encoder_input,
                         attention_mask=encoder_attention_mask,
                         decoder_input_ids=decoder_input,
                        labels=decoder_output)
        loss_sum += output.loss.item()
        num_batches +=1
    return loss_sum/num_batches
    

In [10]:
X = list(data.news)
y = list(data.summary)
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)

In [11]:
model_name = 'google/pegasus-xsum'
tokenizer = PegasusTokenizer.from_pretrained(model_name)

In [12]:
#Gives a dictionary of tensors
input_enc_train = tokenizer.prepare_seq2seq_batch(X_train, truncation=True, padding='longest', return_tensors="pt")
target_enc_train = tokenizer.prepare_seq2seq_batch(y_train, truncation=True, padding='longest', return_tensors="pt")

In [13]:
input_enc_val = tokenizer.prepare_seq2seq_batch(X_test, truncation=True, padding='longest', return_tensors="pt")
target_enc_val = tokenizer.prepare_seq2seq_batch(y_test, truncation=True, padding='longest', return_tensors="pt")

In [14]:
class NewsSummaryDataset(torch.utils.data.Dataset):
    def __init__(self,input_encodings,target_encodings):
        self.input_encodings = input_encodings
        self.target_encodings = target_encodings
    
    def __getitem__(self,idx):
        data = {}
        data["encoder_input"]=self.input_encodings["input_ids"][idx]
        data["encoder_attention_masks"] = self.input_encodings["attention_mask"][idx]
        data["decoder_output"] = self.target_encodings["input_ids"][idx]
        decoder_input = self.target_encodings.input_ids[idx]
        #For right shifting input
        decoder_input = torch.roll(decoder_input,1,-1)
        decoder_input[0] = torch.tensor(0)
        data["decoder_input"] = decoder_input
        return data
    
    def __len__(self):
        return self.target_encodings.input_ids.shape[0]


        

In [15]:
train_dataset = NewsSummaryDataset(input_enc_train,target_enc_train)
val_dataset = NewsSummaryDataset(input_enc_val,target_enc_val)

In [28]:
INPUT_BATCH_SIZE=8
train_loader = torch.utils.data.DataLoader(train_dataset,shuffle=True,batch_size=INPUT_BATCH_SIZE)
val_loader = torch.utils.data.DataLoader(val_dataset,shuffle=True,batch_size=INPUT_BATCH_SIZE)

In [29]:
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
x = model.train()

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
optim = Adafactor(model.parameters(),lr=1e-7,relative_step=False)

In [36]:
EPOCH = 5
prev_val_loss = 10000
for epoch in range(EPOCH):
    pbar = tqdm.tqdm(train_loader)
    loss_sum =0
    num_batch=0
    for batch in pbar:
        optim.zero_grad()
        encoder_input = batch["encoder_input"].to(torch_device)
        encoder_attention_mask = batch["encoder_attention_masks"].to(torch_device)
        decoder_input = batch["decoder_input"].to(torch_device)
        decoder_output = batch["decoder_output"].to(torch_device)
        output = model(input_ids=encoder_input,
                       attention_mask=encoder_attention_mask,
                       decoder_input_ids=decoder_input,
                       labels=decoder_output)
        loss = output[0]
        loss.backward()
        optim.step()
        loss_sum += loss.item()
        num_batch+=1
        pbar.set_description("Epoch: %s, Train loss: %f"%(epoch,loss_sum/num_batch))
    
    val_loss = get_val_loss(model,val_loader)
    sys.stdout.write("         Val loss: %f"%val_loss)
    sys.stdout.flush()

    #Breaking criteria
    if prev_val_loss < val_loss:
        break

    prev_val_loss = val_loss

    #saving model checkpoint
    model.save_pretrained("./model_finetuned/pegasus_finetuned_for_summary")
    tokenizer.save_pretrained("./model_finetuned/pegasus_finetuned_for_summary")

Epoch: 0, Train loss: 4.857804: 100%|██████████| 440/440 [09:08<00:00,  1.25s/it]


         Val loss: 4.499187

Epoch: 1, Train loss: 4.856783: 100%|██████████| 440/440 [09:08<00:00,  1.25s/it]


         Val loss: 4.499187

In [33]:
x= model.eval()

In [34]:
input_enc_test = tokenizer.prepare_seq2seq_batch(X_test[0:2], truncation=True, padding='longest', return_tensors="pt").to(torch_device)
translated = model.generate(**input_enc_test)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

RuntimeError: all only supports torch.uint8 dtype

In [270]:
tgt_text

['A woman has been arrested for trespassing at the White House.',
 'Priyanka Chopra has said that "I definitely think it is no one\'s business what a family decides to do." "I definitely think it is no one\'s business what a family decides to do," she added. "I definitely think it is no one\'s business what a family decides to do."']

In [None]:
traget_enc_test = tokenizer.prepare_seq2seq_batch(y_test[0:2], truncation=True, padding='longest', return_tensors="pt")

In [273]:
y_test[0:2]

['A woman, who was arrested twice last week for trying to jump the White House fence, has been arrested for the third time for scaling a fence at the Treasury Building, next to the White House. The woman, who told officers that she wanted to speak to President Donald Trump, has been charged with unlawful entry and contempt of court.',
 "A tea stall in Gujarat's Vadnagar railway station where PM Narendra Modi used to sell tea during his childhood is set to become a tourist spot. Union Culture Minister Mahesh Sharma visited the place on Sunday and said the tea stall will be developed as a part of a ?100-crore project to develop Vadnagar and surrounding areas as tourist destinations. "]

In [16]:
from tqdm.auto import tqdm
from time import sleep

In [17]:
for i in tqdm(range(10)):
    sleep(3)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [30]:
torch.__version__


'1.1.0'

In [31]:
python.__version__

NameError: name 'python' is not defined

In [36]:
type(encoder_input)

torch.Tensor

In [2]:
t = torch.Tensor([1,0,1])

In [3]:
t.bool()

tensor([1, 0, 1], dtype=torch.uint8)

In [22]:
torch.bfloat16

AttributeError: module 'torch' has no attribute 'bfloat16'

In [18]:
torch.bfloat16 = None

In [24]:
torch.bfloat16