In [1]:
!nvidia-smi

Wed Sep 23 17:48:21 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    31W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install transformers



In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import torch
import re
import time
import pandas as pd
from sklearn.model_selection import train_test_split
import logging
logging.basicConfig(level=logging.ERROR)
import warnings
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('/content/drive/My Drive/Colab Files/stories_with_summary_train_cleaned.csv')

In [6]:
BATCH_SIZE = 16
SHUFFLE_SIZE = 1024

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
learning_rate = 1e-3
print(device)

cuda:0


In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

task_specific_params = model.config.task_specific_params
if task_specific_params is not None:
    model.config.update(task_specific_params.get("summarization", {}))
    

optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate, weight_decay=0.0001)

In [8]:
df.head(1)

Unnamed: 0,article,summary,cleaned_stories
0,It's official: U.S. President Barack Obama wan...,['Syrian official: Obama climbed to the top of...,it is official u.s. president barack obama wan...


In [9]:
class SummaryDataset(torch.utils.data.Dataset):
    def __init__(self, articles, highlights):
        self.x = articles
        self.y = highlights
        
    def __getitem__(self, index):
        x = tokenizer.encode_plus(model.config.prefix + self.x[index], max_length=512,truncation=True ,return_tensors="pt", pad_to_max_length=True)
        y = tokenizer.encode(self.y[index], max_length=150, truncation=True,return_tensors="pt", pad_to_max_length=True)
        return x['input_ids'].view(-1), x['attention_mask'].view(-1), y.view(-1)
        
    def __len__(self):
        return len(self.x)

In [10]:
df_train,df_test = train_test_split(df,test_size = 0.1)
df_val ,df_test = train_test_split(df_test,test_size =0.5)

train_dataset = SummaryDataset(articles = df_train.cleaned_stories.values, highlights = df_train.summary.values)
train_data_loader = torch.utils.data.DataLoader(train_dataset,batch_size=BATCH_SIZE)

val_dataset = SummaryDataset(articles = df_val.cleaned_stories.values, highlights = df_val.summary.values)
val_data_loader = torch.utils.data.DataLoader(val_dataset,batch_size=BATCH_SIZE)

test_dataset = SummaryDataset(articles = df_test.cleaned_stories.values, highlights = df_test.summary.values)
test_data_loader = torch.utils.data.DataLoader(test_dataset,batch_size=BATCH_SIZE)

In [11]:
pad_token_id = tokenizer.pad_token_id
def step(inputs_ids, attention_mask, y):
    y_ids = y[:, :-1].contiguous()
    lm_labels = y[:, 1:].clone()
    lm_labels[y[:, 1:] == pad_token_id] = -100
    output = model(inputs_ids, attention_mask=attention_mask, decoder_input_ids=y_ids, lm_labels=lm_labels)
    return output[0] # loss

In [12]:
EPOCHS = 1
log_interval = 2000
train_loss = []
val_loss = []
for epoch in range(EPOCHS):
    model.train() 
    start_time = time.time()
    for i, (inputs_ids, attention_mask, y) in enumerate(train_data_loader):
        inputs_ids = inputs_ids.to(device)
        attention_mask = attention_mask.to(device)
        y = y.to(device)
        
        
        optimizer.zero_grad()
        loss = step(inputs_ids, attention_mask, y)
        train_loss.append(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
            
        if (i + 1) % log_interval == 0:
            with torch.no_grad():
                x, x_mask, y = next(iter(val_data_loader))
                x = x.to(device)
                x_mask = x_mask.to(device)
                y = y.to(device)
                
                v_loss = step(x, x_mask, y)
                v_loss = v_loss.item()
                
                
                elapsed = time.time() - start_time
                print('| epoch {:3d} | [{:5d}/{:5d}] | '
                  'ms/batch {:5.2f} | '
                  'loss {:5.2f} | val loss {:5.2f}'.format(
                    epoch, i, len(train_data_loader),
                    elapsed * 1000 / log_interval,
                    loss.item(), v_loss))
                start_time = time.time()
                val_loss.append(v_loss)

| epoch   0 | [ 1999/ 5207] | ms/batch 793.36 | loss  3.44 | val loss  3.73
| epoch   0 | [ 3999/ 5207] | ms/batch 793.65 | loss  3.59 | val loss  4.02


In [14]:
pip install rouge-score

Collecting rouge-score
  Downloading https://files.pythonhosted.org/packages/1f/56/a81022436c08b9405a5247b71635394d44fe7e1dbedc4b28c740e09c2840/rouge_score-0.0.4-py2.py3-none-any.whl
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


In [15]:
from rouge_score import rouge_scorer
from rouge_score import scoring

class RougeScore:
    '''
    mostly from https://github.com/google-research/text-to-text-transfer-transformer/blob/master/t5/evaluation/metrics.py 
    '''
    
    def __init__(self, score_keys=None)-> None:
        super().__init__()
        if score_keys is None:  
            self.score_keys = ["rouge1", "rouge2", "rougeLsum"]
        
        self.scorer = rouge_scorer.RougeScorer(self.score_keys)
        self.aggregator = scoring.BootstrapAggregator()
        
        
    @staticmethod
    def prepare_summary(summary):
            # Make sure the summary is not bytes-type
            # Add newlines between sentences so that rougeLsum is computed correctly.
            summary = summary.replace(" . ", " .\n")
            return summary
    
    def __call__(self, target, prediction):
        """Computes rouge score.''
        Args:
        targets: string
        predictions: string
        """

        target = self.prepare_summary(target)
        prediction = self.prepare_summary(prediction)
        
        self.aggregator.add_scores(self.scorer.score(target=target, prediction=prediction))

        return 
    
    def reset_states(self):
        self.rouge_list = []

    def result(self):
        result = self.aggregator.aggregate()
        
        for key in self.score_keys:
            score_text = "%s = %.2f, 95%% confidence [%.2f, %.2f]"%(
                key,
                result[key].mid.fmeasure*100,
                result[key].low.fmeasure*100,
                result[key].high.fmeasure*100
            )
            print(score_text)
        
        return {key: result[key].mid.fmeasure*100 for key in self.score_keys}

In [16]:
rouge_score = RougeScore()
predictions = []
for i, (input_ids, attention_mask, y) in enumerate(test_data_loader):
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    y = y.to(device)
        
    summaries = model.generate(input_ids=input_ids, attention_mask=attention_mask)
    pred = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]
    real = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in y]
    for pred_sent, real_sent in zip(pred, real):
        rouge_score(pred_sent, real_sent)
        predictions.append(str("pred sentence: " + pred_sent + "\n\n real sentence: " + real_sent))
    if i > 40:
        break
    
rouge_score.result()

rouge1 = 21.03, 95% confidence [20.36, 21.65]
rouge2 = 4.68, 95% confidence [4.29, 5.09]
rougeLsum = 14.35, 95% confidence [13.88, 14.79]


{'rouge1': 21.028184502046578,
 'rouge2': 4.684724848699995,
 'rougeLsum': 14.349246908595841}

In [17]:
for pred in predictions[:10]:
    print("------")
    print(pred)
    print("------")  

------
pred sentence: ', 'She is a question of the midst of a mania induced rage', "Michichichelle's mother is now a "lelele"]

 real sentence: ['Philip Lerman oversaw the missing child cases for "America's Most Wanted"', "60 cases were solved but not the personal one: his stepsister's disappearance", 'He saw brave families hold out hope, and show would air and re-air their cases', 'Lerman says every family of a missing child is holding onto hope after women found']
------
------
pred sentence: ', 'The hefner says he will work to keep the puppy', "The puppy's ring and the bentley", 'His research for a bullet by not 100,000 life']

 real sentence: ['Hugh Hefner and Crystal Harris are fighting over their shared Cavalier King Charles spaniel', '"We both love the puppy," Hefner said', '"The puppy's valuable, but not $100,000 worth," he said']
------
------
pred sentence: 'NEW: The death penalty has been dismissed for his facebook comment', 'The death death penalty is not in court', "The br

In [19]:
torch.save(model.state_dict(), '/content/drive/My Drive/Colab Files/t5_summarization_model.h5')