In [1]:
import pandas as pd
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Adafactor, AdamW
import torch
from sklearn.model_selection import train_test_split
import tqdm
import sys
torch.bfloat16 = None

In [2]:
!nvidia-smi

Tue Jan 12 11:06:04 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06    Driver Version: 450.51.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:04:00.0 Off |                    0 |
| N/A   32C    P0    24W / 250W |      8MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   31C    P0    24W / 250W |      8MiB / 16160MiB |      0%      Default |
|       

In [4]:
data_raw = pd.read_csv('../data/news_summary.csv',encoding='latin-1')

In [5]:
data_raw.head()

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [6]:
data = pd.DataFrame()
data['news'] = data_raw['ctext']
data['summary'] = data_raw['text']

In [7]:
data.head()

Unnamed: 0,news,summary
0,The Daman and Diu administration on Wednesday ...,The Administration of Union Territory Daman an...
1,"From her special numbers to TV?appearances, Bo...",Malaika Arora slammed an Instagram user who tr...
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotels in Mumbai and other Indian cities are t...,Hotels in Maharashtra will train their staff t...


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4514 entries, 0 to 4513
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   news     4396 non-null   object
 1   summary  4514 non-null   object
dtypes: object(2)
memory usage: 70.7+ KB


In [9]:
data = data.dropna()

In [10]:
device = torch.device('cuda:2') if torch.cuda.is_available() else torch.device('cpu')

In [11]:
def get_val_loss(model,val_loader):
    temp = model.eval()
    num_batches = 0
    loss_sum = 0
    for batch in val_loader:
        encoder_input = batch["encoder_input"].to(device)
        encoder_attention_mask = batch["encoder_attention_masks"].to(device)
        decoder_input = batch["decoder_input"].to(device)
        decoder_output = batch["decoder_output"].to(device)
        with torch.no_grad():
            output = model(input_ids=encoder_input,
                         attention_mask=encoder_attention_mask,
                         decoder_input_ids=decoder_input,
                        labels=decoder_output)
        loss_sum += output.loss.item()
        num_batches +=1
    return loss_sum/num_batches
    

In [12]:
X = list(data.news)
y = list(data.summary)
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)

In [13]:
model_name =   '../model/pegasus_news_summary' #'google/pegasus-xsum'
tokenizer = PegasusTokenizer.from_pretrained(model_name)

In [14]:
#Gives a dictionary of tensors
input_enc_train = tokenizer.prepare_seq2seq_batch(X_train, truncation=True, padding='longest', return_tensors="pt")
target_enc_train = tokenizer.prepare_seq2seq_batch(y_train, truncation=True, padding='longest', return_tensors="pt")

In [15]:
input_enc_val = tokenizer.prepare_seq2seq_batch(X_test, truncation=True, padding='longest', return_tensors="pt")
target_enc_val = tokenizer.prepare_seq2seq_batch(y_test, truncation=True, padding='longest', return_tensors="pt")

In [16]:
class NewsSummaryDataset(torch.utils.data.Dataset):
    def __init__(self,input_encodings,target_encodings):
        self.input_encodings = input_encodings
        self.target_encodings = target_encodings
    
    def __getitem__(self,idx):
        data = {}
        data["encoder_input"]=self.input_encodings["input_ids"][idx]
        data["encoder_attention_masks"] = self.input_encodings["attention_mask"][idx]
        data["decoder_output"] = self.target_encodings["input_ids"][idx]
        decoder_input = self.target_encodings.input_ids[idx]
        #For right shifting input
        decoder_input = torch.roll(decoder_input,1,-1)
        decoder_input[0] = torch.tensor(0)
        data["decoder_input"] = decoder_input
        return data
    
    def __len__(self):
        return self.target_encodings.input_ids.shape[0]


        

In [17]:
train_dataset = NewsSummaryDataset(input_enc_train,target_enc_train)
val_dataset = NewsSummaryDataset(input_enc_val,target_enc_val)

In [18]:
INPUT_BATCH_SIZE=8
train_loader = torch.utils.data.DataLoader(train_dataset,shuffle=True,batch_size=INPUT_BATCH_SIZE)
val_loader = torch.utils.data.DataLoader(val_dataset,shuffle=True,batch_size=INPUT_BATCH_SIZE)

In [19]:
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
x = model.train()

In [20]:
optim = AdamW(model.parameters(),lr=1e-6)

In [22]:
#34 epochs till now
EPOCH = 20
prev_val_loss = 10000
for epoch in range(EPOCH):
    pbar = tqdm.tqdm(train_loader)
    loss_sum =0
    num_batch=0
    for batch in pbar:
        optim.zero_grad()
        encoder_input = batch["encoder_input"].to(device)
        encoder_attention_mask = batch["encoder_attention_masks"].to(device)
        decoder_input = batch["decoder_input"].to(device)
        decoder_output = batch["decoder_output"].to(device)
        output = model(input_ids=encoder_input,
                       attention_mask=encoder_attention_mask,
                       decoder_input_ids=decoder_input,
                       labels=decoder_output)
        loss = output[0]
        loss.backward()
        optim.step()
        loss_sum += loss.item()
        num_batch+=1
        pbar.set_description("Epoch: %s, Train loss: %f"%(epoch,loss_sum/num_batch))
    
    val_loss = get_val_loss(model,val_loader)
    sys.stdout.write("         Val loss: %f"%val_loss)
    sys.stdout.flush()

    #Breaking criteria
    if prev_val_loss < val_loss:
        break

    prev_val_loss = val_loss

    #saving model checkpoint
    model.save_pretrained("../model/pegasus_news_summary")
    tokenizer.save_pretrained("../model/pegasus_news_summary")

Epoch: 0, Train loss: 0.942607: 100%|██████████| 440/440 [07:18<00:00,  1.00it/s]


         Val loss: 1.072890

Epoch: 1, Train loss: 0.936294: 100%|██████████| 440/440 [07:12<00:00,  1.02it/s]


         Val loss: 1.069782

Epoch: 2, Train loss: 0.930634: 100%|██████████| 440/440 [07:24<00:00,  1.01s/it]


         Val loss: 1.066664

Epoch: 3, Train loss: 0.924967: 100%|██████████| 440/440 [07:23<00:00,  1.01s/it]


         Val loss: 1.064443

Epoch: 4, Train loss: 0.919701: 100%|██████████| 440/440 [07:24<00:00,  1.01s/it]


         Val loss: 1.062250

Epoch: 5, Train loss: 0.914822: 100%|██████████| 440/440 [07:25<00:00,  1.01s/it]


         Val loss: 1.060834

Epoch: 6, Train loss: 0.909529: 100%|██████████| 440/440 [07:26<00:00,  1.01s/it]


         Val loss: 1.058897

Epoch: 7, Train loss: 0.905031: 100%|██████████| 440/440 [07:19<00:00,  1.00it/s]


         Val loss: 1.057524

Epoch: 8, Train loss: 0.900296: 100%|██████████| 440/440 [07:25<00:00,  1.01s/it]


         Val loss: 1.056268

Epoch: 9, Train loss: 0.895962: 100%|██████████| 440/440 [07:25<00:00,  1.01s/it]


         Val loss: 1.055066

Epoch: 10, Train loss: 0.891695: 100%|██████████| 440/440 [07:18<00:00,  1.00it/s]


         Val loss: 1.054102

Epoch: 11, Train loss: 0.887606: 100%|██████████| 440/440 [07:24<00:00,  1.01s/it]


         Val loss: 1.053197

Epoch: 12, Train loss: 0.883236: 100%|██████████| 440/440 [07:21<00:00,  1.00s/it]


         Val loss: 1.052608

Epoch: 13, Train loss: 0.878913: 100%|██████████| 440/440 [07:22<00:00,  1.01s/it]


         Val loss: 1.051757

Epoch: 14, Train loss: 0.874781: 100%|██████████| 440/440 [07:23<00:00,  1.01s/it]


         Val loss: 1.051069

Epoch: 15, Train loss: 0.870452: 100%|██████████| 440/440 [07:23<00:00,  1.01s/it]


         Val loss: 1.050568

Epoch: 16, Train loss: 0.866729: 100%|██████████| 440/440 [07:17<00:00,  1.01it/s]


         Val loss: 1.050332

Epoch: 17, Train loss: 0.862927: 100%|██████████| 440/440 [07:24<00:00,  1.01s/it]


         Val loss: 1.049702

Epoch: 18, Train loss: 0.859234: 100%|██████████| 440/440 [07:20<00:00,  1.00s/it]


         Val loss: 1.049763

In [23]:
x= model.eval()

In [27]:
input_enc_test = tokenizer.prepare_seq2seq_batch(X_test[2:10], truncation=True, padding='longest', return_tensors="pt").to(device)
translated = model.generate(**input_enc_test)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

In [51]:
i=6
tgt_text[i]

'Be it biscuits or nut crackers, drug peddlers are lacing them with LSD and selling them at higher price, agencies dealing with narcotics claimed. "Most of the drugs, especially LSD, coming from Germany, India are being coated on eatables," they added. Notably, earlier LSD-laced sugar cubes were seized'

In [52]:
X_test[2:10][i]

'Believe it or not, eatables laced with LSD are the emerging modus operandi followed by drug peddlers across the country, agencies dealing against narcotics claimed. Be it biscuits or nut crackers, drug peddlers are lacing them with LSD and selling them at higher price.The revelation came days after Task Force sleuths arrested nine people, including two Nigerians, in two separate cases at Masab Tank and Banjara Hills in Hyderabad. Along with 300 grams of cocaine, 42 grams of MDMA and three biscuits laced with LSD were recovered.According to sources, during interrogation, they told police that to save themselves from being caught at the airport, they started coating LSD on eatables such as biscuits and nut crackers. They further told police that this is not the first time they have laced eatables with LSD. "Most of the drugs, especially LSD, coming from Germany to India are being coated on eatables," they reportedly confessed.LSD LACED EATABLESThe officers said that earlier LSD-laced su

In [53]:
y_test[2:10][i]

'Drug peddlers are coating biscuits and nut crackers with Lysergic acid diethylamide (LSD) drug to avoid being caught at airports, Task Force officials said. "These biscuits were detected and seized only because the peddlers were carrying another drug, which was detected in the scanning machine," an officer said. This was revealed after Task Force arrested nine people carrying LSD-laced biscuits.'

In [37]:
traget_enc_test = tokenizer.prepare_seq2seq_batch(y_test[0:2], truncation=True, padding='longest', return_tensors="pt")

In [34]:
y_test[2:10][i]

['US President Barack Obama has declared January 16 as Religious Freedom Day in accordance with a yearly tradition. Urging his countrymen to stand against religious intolerance, Obama said, "religious freedom is a principle based not on shared ancestry, culture, ethnicity, or faith but on a shared commitment to liberty." He further said that America\'s strength comes from its diversity.',
 "The Supreme Court on Thursday rejected a petition seeking a stay on the release of Madhur Bhandarkar's directorial 'Indu Sarkar'. The court noted that the film is an artistic expression within the parameters of law. The petition was filed by Priya Singh Paul, who claimed to be the daughter of Sanjay Gandhi and was alleging that the movie is derogatory.",
 'Congress leader Captain Amarinder Singh will take oath as the Chief Minister of Punjab on March 16, after he led his party to victory in the Punjab elections, winning 77 of 117 seats. Stating that Prime Minister Narendra Modi had talked to him aft

In [16]:
from tqdm.auto import tqdm
from time import sleep

In [17]:
for i in tqdm(range(10)):
    sleep(3)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [30]:
torch.__version__


'1.1.0'

In [31]:
python.__version__

NameError: name 'python' is not defined

In [36]:
type(encoder_input)

torch.Tensor

In [2]:
t = torch.Tensor([1,0,1])

In [3]:
t.bool()

tensor([1, 0, 1], dtype=torch.uint8)

In [22]:
torch.bfloat16

AttributeError: module 'torch' has no attribute 'bfloat16'

In [18]:
torch.bfloat16 = None

In [24]:
torch.bfloat16