In [None]:
!pip install transformers -q
!pip install wandb -q

[K     |████████████████████████████████| 4.0 MB 5.0 MB/s 
[K     |████████████████████████████████| 77 kB 8.2 MB/s 
[K     |████████████████████████████████| 6.6 MB 57.1 MB/s 
[K     |████████████████████████████████| 880 kB 55.9 MB/s 
[K     |████████████████████████████████| 596 kB 62.1 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.8 MB 5.2 MB/s 
[K     |████████████████████████████████| 181 kB 73.7 MB/s 
[K     |████████████████████████████████| 144 kB 71.9 MB/s 
[K     |████████████████████████████████| 63 kB 2.2 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [None]:
# Importing stock libraries
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

# WandB – Import the wandb library
import wandb

import transformers
from datasets import load_dataset
from torch.optim import Adam
import torch
from torch.utils.data import DataLoader,Dataset,RandomSampler,SequentialSampler
from transformers import T5ForConditionalGeneration,T5Tokenizer,T5PreTrainedModel # SentencePiece library is required to download pretrained t5tokenizer
# Let's try T5TokenizerFast
from transformers.models.t5 import T5TokenizerFast

In [None]:
!pip install rouge-score
!pip install datasets

Collecting rouge-score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4
Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 5.1 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 65.8 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 71.3 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 68.3 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urlli

In [None]:
import pandas as pd
import numpy as np
from datasets import load_metric
# from rouge import Rouge
metric = load_metric("rouge")

In [None]:
!nvidia-smi

Thu May  5 15:37:03 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mshaivals[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.article
        self.ctext = self.data.highlights

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], max_length= self.summ_len, pad_to_max_length=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100 #Here, we make sure that padding token id’s of the labels are not taken into account by the loss function, by replacing them with -100.
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=lm_labels)
        # print(outputs[0]) #This just prints the tensor values of the NllLoss at each iteration in an epoch.
        loss = outputs[0]
        
        if _%10 == 0:
            wandb.log({"Training Loss": loss.item()})

        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=100, 
                num_beams=4,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if _%100==0:
                print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [None]:
def main():
    # WandB – Initialize a new run
    wandb.init(project="Summarization")

    # WandB – Config is a variable that holds and saves hyperparameters and inputs
    # Defining some key variables that will be used later on in the training  
    config = wandb.config          # Initialize config
    config.TRAIN_BATCH_SIZE = 2    # input batch size for training
    config.VALID_BATCH_SIZE = 2   # input batch size for testing
    config.TRAIN_EPOCHS = 2        
    config.VAL_EPOCHS = 1 
    config.LEARNING_RATE = 1e-4    # learning rate (default: 0.01) This is generally high for T5 when using Adam optimizers(in the range of 1e-4 to 3e-4)
    config.SEED = 42              
    config.MAX_LEN = 900
    config.SUMMARY_LEN = 100 

    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(config.SEED) # pytorch random seed
    np.random.seed(config.SEED) # numpy random seed
    torch.backends.cudnn.deterministic = True

    # tokenzier for encoding the text
    tokenizer = T5TokenizerFast.from_pretrained('t5-base') #Here, we used the FastTokenizer in order to focus only on Unigrams. It is faster than the normal one
    

 
    # Selecting the needed columns only. 
    # Adding the summarzie text in front of the text. This is to format the dataset similar to how T5 model was trained for summarization task. 
    df = pd.read_csv('/content/drive/MyDrive/DL Project/train.csv',encoding='latin-1')
    df = df.sample(frac=0.001)
    df = df.drop(columns=['id'])
    df = df[['article','highlights']]
    df.highlights = 'summarize: ' + df.highlights
    # print(df.head())

    
    # Creation of Dataset and Dataloader
    # Defining the train size. So 80% of the data will be used for training and the rest will be used for validation. 
    train_size = 0.8
    train_dataset=df.sample(frac=train_size,random_state = config.SEED)
    val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    print("FULL Dataset: {}".format(df.shape))
    print("TRAIN Dataset: {}".format(train_dataset.shape))
    print("TEST Dataset: {}".format(val_dataset.shape))


    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = CustomDataset(train_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
    val_set = CustomDataset(val_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

    # Defining the parameters for creation of dataloaders
    train_params = {
        'batch_size': config.TRAIN_BATCH_SIZE,
        'shuffle': True,
        'num_workers': 0
        }

    val_params = {
        'batch_size': config.VALID_BATCH_SIZE,
        'shuffle': False,
        'num_workers': 0
        }

    # Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)


    
    # Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
    # Further this model is sent to device (GPU/TPU) for using the hardware.
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
    model = model.to(device)

    # Defining the optimizer that will be used to tune the weights of the network in the training session. 
    optimizer = torch.optim.Adam(params =  model.parameters(), lr=config.LEARNING_RATE)

    # Log metrics with wandb
    wandb.watch(model, log="all")
    # Training loop
    print('Initiating Fine-Tuning for the model on our dataset')

    for epoch in range(config.TRAIN_EPOCHS):
        train(epoch, tokenizer, model, device, training_loader, optimizer)


    # Validation loop and saving the resulting file with predictions and acutals in a dataframe.
    # Saving the dataframe as predictions.csv
    print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
    for epoch in range(config.VAL_EPOCHS):
        predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
        final_df.to_csv('/content/drive/MyDrive/DL Project/predictions_T5.csv')
        print('Output Files generated for review')

if __name__ == '__main__':
    main()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Training Loss,█▅▆▇▅▃▆▃▅▆▄▅▅▄▅▄▄▅▆▇▁▁▂▄▁▃▃▄▄▃▃▂▄▂▃▂▃▄▂▅

0,1
Training Loss,2.97591


FULL Dataset: (287, 2)
TRAIN Dataset: (230, 2)
TEST Dataset: (57, 2)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Initiating Fine-Tuning for the model on our dataset
tensor(6.9332, device='cuda:0', grad_fn=<NllLossBackward0>)
Epoch: 0, Loss:  6.933244705200195




tensor(6.4807, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(5.6229, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(5.8004, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(5.5671, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(5.4323, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(5.1789, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(4.6875, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(4.2747, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(4.5077, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(4.8763, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.5678, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(4.6654, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(4.2479, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(4.2592, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.6002, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(4.3436, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(4.1319, device='cuda:0', grad_fn=

In [None]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 22.1 MB/s eta 0:00:01[K     |▌                               | 20 kB 21.6 MB/s eta 0:00:01[K     |▉                               | 30 kB 11.1 MB/s eta 0:00:01[K     |█                               | 40 kB 8.7 MB/s eta 0:00:01[K     |█▍                              | 51 kB 7.1 MB/s eta 0:00:01[K     |█▋                              | 61 kB 8.3 MB/s eta 0:00:01[K     |██                              | 71 kB 8.4 MB/s eta 0:00:01[K     |██▏                             | 81 kB 9.3 MB/s eta 0:00:01[K     |██▍                             | 92 kB 8.4 MB/s eta 0:00:01[K     |██▊                             | 102 kB 8.6 MB/s eta 0:00:01[K     |███                             | 112 kB 8.6 MB/s eta 0:00:01[K     |███▎                            | 122 kB 8.6 MB/s eta 0:00:01[K     |███▌       

In [41]:
df = pd.read_csv("/content/drive/MyDrive/DL Project/predictions_T5.csv")

In [42]:
df.head()

Unnamed: 0.1,Unnamed: 0,Generated Text,Actual Text
0,0,"PUBLISHED:. 06:56 EST, 30 May 2013. |. UPDATED...",(CNN) -- PepsiCo is ending its relationship wi...
1,1,"PUBLISHED:. 08:56 EST, 30 September 2013. |. U...","By. Tara Brady. PUBLISHED:. 00:21 EST, 3 March..."
2,2,Morrisons is to sell ready-peeled fruits in th...,"By. Daily Mail Reporter. UPDATED:. 07:24 EST, ..."
3,3,Hundreds of Egyptians have died on election da...,Cairo (CNN) -- Violence marked the beginning o...
4,4,"'Selfie', 'twerk' and 'Twittersphere' are amon...",'Selfie' may have been named Oxford. Dictionar...


In [43]:
df.shape

(5742, 3)

In [44]:
predicted_text = df["Generated Text"].tolist()
actual_text = df["Actual Text"].tolist()

In [None]:
!pip install rouge-score



In [None]:
from datasets import load_metric

In [45]:
rouge_scores = load_metric("rouge")

###Calculating Average Rouge-1, Rouge-2 and Rouge-L scores

In [46]:
results = rouge_scores.compute(predictions=predicted_text, references=actual_text)

In [48]:
print("Rouge-1 Score of T5 Model:", results["rouge1"].high.fmeasure, "\nRouge-2 Score of T5 Model:", results["rouge2"].high.fmeasure, "\nRouge-L Score of T5 Model:", results["rougeL"].high.fmeasure)

Rouge-1 Score of T5 Model: 0.3460494998534182 
Rouge-2 Score of T5 Model: 0.12486014352479728 
Rouge-L Score of T5 Model: 0.2153263670523677
