In [1]:
!nvidia-smi

Thu Sep 24 11:30:45 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/f4/9f93f06dd2c57c7cd7aa515ffbf9fcfd8a084b92285732289f4a5696dd91/transformers-3.2.0-py3-none-any.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 2.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 16.6MB/s 
[?25hCollecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 20.9MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl 

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import torch
import re
import time
import pandas as pd
from sklearn.model_selection import train_test_split
import logging
logging.basicConfig(level=logging.ERROR)
import warnings
warnings.filterwarnings('ignore')

## Read Data from Drive

In [5]:
df = pd.read_csv('/content/drive/My Drive/Colab Files/stories_with_summary_train_cleaned.csv')

## Check for GPU 

In [6]:
BATCH_SIZE = 16
SHUFFLE_SIZE = 1024

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
learning_rate = 1e-4
print(device)

cuda:0


## Load T5 Transformer model for fine tuning from Hugging Face

In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

task_specific_params = model.config.task_specific_params
if task_specific_params is not None:
    model.config.update(task_specific_params.get("summarization", {}))
    

optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate, weight_decay=0.0001)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1197.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242065649.0, style=ProgressStyle(descri…




In [8]:
df.head(1)

Unnamed: 0,article,summary,cleaned_stories
0,It's official: U.S. President Barack Obama wan...,['Syrian official: Obama climbed to the top of...,it is official u.s. president barack obama wan...


## Create Dataset Class

In [9]:
class SummaryDataset(torch.utils.data.Dataset):
    def __init__(self, articles, highlights):
        self.x = articles
        self.y = highlights
        
    def __getitem__(self, index):
        x = tokenizer.encode_plus(model.config.prefix + self.x[index], max_length=512,truncation=True ,return_tensors="pt", pad_to_max_length=True)
        y = tokenizer.encode(self.y[index], max_length=150, truncation=True,return_tensors="pt", pad_to_max_length=True)
        return x['input_ids'].view(-1), x['attention_mask'].view(-1), y.view(-1)
        
    def __len__(self):
        return len(self.x)

In [10]:
df_train,df_test = train_test_split(df,test_size = 0.1)
df_val ,df_test = train_test_split(df_test,test_size =0.5)

train_dataset = SummaryDataset(articles = df_train.cleaned_stories.values, highlights = df_train.summary.values)
train_data_loader = torch.utils.data.DataLoader(train_dataset,batch_size=BATCH_SIZE)

val_dataset = SummaryDataset(articles = df_val.cleaned_stories.values, highlights = df_val.summary.values)
val_data_loader = torch.utils.data.DataLoader(val_dataset,batch_size=BATCH_SIZE)

test_dataset = SummaryDataset(articles = df_test.cleaned_stories.values, highlights = df_test.summary.values)
test_data_loader = torch.utils.data.DataLoader(test_dataset,batch_size=BATCH_SIZE)

In [11]:
pad_token_id = tokenizer.pad_token_id
def step(inputs_ids, attention_mask, y):
    y_ids = y[:, :-1].contiguous()
    lm_labels = y[:, 1:].clone()
    lm_labels[y[:, 1:] == pad_token_id] = -100
    output = model(inputs_ids, attention_mask=attention_mask, decoder_input_ids=y_ids, lm_labels=lm_labels)
    return output[0] # loss

## Fine Tuning / Training T5 Model

In [12]:
EPOCHS = 2
log_interval = 200
train_loss = []
val_loss = []
for epoch in range(EPOCHS):
    model.train() 
    start_time = time.time()
    for i, (inputs_ids, attention_mask, y) in enumerate(train_data_loader):
        inputs_ids = inputs_ids.to(device)
        attention_mask = attention_mask.to(device)
        y = y.to(device)
        
        
        optimizer.zero_grad()
        loss = step(inputs_ids, attention_mask, y)
        train_loss.append(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
            
        if (i + 1) % log_interval == 0:
            with torch.no_grad():
                x, x_mask, y = next(iter(val_data_loader))
                x = x.to(device)
                x_mask = x_mask.to(device)
                y = y.to(device)
                
                v_loss = step(x, x_mask, y)
                v_loss = v_loss.item()
                
                
                elapsed = time.time() - start_time
                print('| epoch {:3d} | [{:5d}/{:5d}] | '
                  'ms/batch {:5.2f} | '
                  'loss {:5.2f} | val loss {:5.2f}'.format(
                    epoch+1, i, len(train_data_loader),
                    elapsed * 1000 / log_interval,
                    loss.item(), v_loss))
                start_time = time.time()
                val_loss.append(v_loss)

| epoch   1 | [  199/ 5207] | ms/batch 488.12 | loss  2.65 | val loss  2.81
| epoch   1 | [  399/ 5207] | ms/batch 486.10 | loss  2.24 | val loss  2.63
| epoch   1 | [  599/ 5207] | ms/batch 487.06 | loss  2.42 | val loss  2.49
| epoch   1 | [  799/ 5207] | ms/batch 486.50 | loss  2.20 | val loss  2.52
| epoch   1 | [  999/ 5207] | ms/batch 486.41 | loss  2.36 | val loss  2.48
| epoch   1 | [ 1199/ 5207] | ms/batch 487.33 | loss  2.38 | val loss  2.47
| epoch   1 | [ 1399/ 5207] | ms/batch 486.71 | loss  2.26 | val loss  2.45
| epoch   1 | [ 1599/ 5207] | ms/batch 486.44 | loss  2.34 | val loss  2.53
| epoch   1 | [ 1799/ 5207] | ms/batch 486.69 | loss  2.33 | val loss  2.47
| epoch   1 | [ 1999/ 5207] | ms/batch 487.36 | loss  1.99 | val loss  2.49
| epoch   1 | [ 2199/ 5207] | ms/batch 486.96 | loss  2.34 | val loss  2.51
| epoch   1 | [ 2399/ 5207] | ms/batch 486.67 | loss  2.54 | val loss  2.52
| epoch   1 | [ 2599/ 5207] | ms/batch 488.49 | loss  2.39 | val loss  2.55
| epoch   1 

In [13]:
pip install rouge-score

Collecting rouge-score
  Downloading https://files.pythonhosted.org/packages/1f/56/a81022436c08b9405a5247b71635394d44fe7e1dbedc4b28c740e09c2840/rouge_score-0.0.4-py2.py3-none-any.whl
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


## Check Rouge Scores for Summarization

In [14]:
from rouge_score import rouge_scorer
from rouge_score import scoring

class RougeScore:
    '''
    mostly from https://github.com/google-research/text-to-text-transfer-transformer/blob/master/t5/evaluation/metrics.py 
    '''
    
    def __init__(self, score_keys=None)-> None:
        super().__init__()
        if score_keys is None:  
            self.score_keys = ["rouge1", "rouge2", "rougeLsum"]
        
        self.scorer = rouge_scorer.RougeScorer(self.score_keys)
        self.aggregator = scoring.BootstrapAggregator()
        
        
    @staticmethod
    def prepare_summary(summary):
            # Make sure the summary is not bytes-type
            # Add newlines between sentences so that rougeLsum is computed correctly.
            summary = summary.replace(" . ", " .\n")
            return summary
    
    def __call__(self, target, prediction):
        """Computes rouge score.''
        Args:
        targets: string
        predictions: string
        """

        target = self.prepare_summary(target)
        prediction = self.prepare_summary(prediction)
        
        self.aggregator.add_scores(self.scorer.score(target=target, prediction=prediction))

        return 
    
    def reset_states(self):
        self.rouge_list = []

    def result(self):
        result = self.aggregator.aggregate()
        
        for key in self.score_keys:
            score_text = "%s = %.2f, 95%% confidence [%.2f, %.2f]"%(
                key,
                result[key].mid.fmeasure*100,
                result[key].low.fmeasure*100,
                result[key].high.fmeasure*100
            )
            print(score_text)
        
        return {key: result[key].mid.fmeasure*100 for key in self.score_keys}

In [15]:
rouge_score = RougeScore()
predictions = []
for i, (input_ids, attention_mask, y) in enumerate(test_data_loader):
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    y = y.to(device)
        
    summaries = model.generate(input_ids=input_ids, attention_mask=attention_mask)
    pred = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]
    real = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in y]
    for pred_sent, real_sent in zip(pred, real):
        rouge_score(pred_sent, real_sent)
        predictions.append(str("pred sentence: " + pred_sent + "\n\n real sentence: " + real_sent))
    if i > 40:
        break
    
rouge_score.result()

rouge1 = 28.06, 95% confidence [27.37, 28.76]
rouge2 = 8.18, 95% confidence [7.65, 8.75]
rougeLsum = 18.41, 95% confidence [17.88, 19.00]


{'rouge1': 28.056943591224226,
 'rouge2': 8.181835123056894,
 'rougeLsum': 18.405928023838808}

## Get some Sample Predictions

In [16]:
for pred in predictions[:10]:
    print("------")
    print(pred)
    print("------")  

------
pred sentence: 'States Gallo fired all the teachers at Central Falls high school after years', 'President Obama got involved and supported the firing, saying if a teacher is responsible', "He says teachers are failing to follow the curriculum's failure, he says", 'He says he's not the answer because the teachers don't want to do a bad job']

 real sentence: ['Rhode Island school district fired teachers at a failing school', "Teacher Esther Wojcicki says that's not the answer to poorly performing schools", 'She says parents, administrators need to share blame, help solve problem', '"No teacher can effectively educate a child without support from the parents," she says']
------
------
pred sentence: ', 'A listers are now scratching at casting directors doors in their eagerness to voice voice', "The movie's a roll call of big name actors has followed from tom Hanks as Woody's wiseccs", 'The movie has become a big advantage']

 real sentence: ['From Johnny Depp to Angelina Jolie, A-

## Save Model State 

In [17]:
torch.save(model.state_dict(), '/content/drive/My Drive/Colab Files/t5_summarization_model_2_Epochs.pt')