<a href="https://colab.research.google.com/github/sbassam/nub-summarizer/blob/master/fine_tuning_T_5_on_CNN%2Bdaily_mail_%2B_ML4T.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab only) so the notebook can
# read and write files stored on Drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd /content/drive/My Drive/summarizer/nub-training-evaluation

In [None]:
# One-time data download: uncomment and run these lines on the first run only.
# !wget https://s3.amazonaws.com/datasets.huggingface.co/summarization/cnn_dm.tgz
# !tar -xzvf cnn_dm.tgz

# !export CNN_DIR=${PWD}/cnn_dm

In [None]:
!pip install transformers -q
!pip install wandb -q
!pip install rouge-score

In [None]:
import shutil
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
import wandb
from torch import cuda

In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
# NOTE: `device` is read as a global inside `main`, so this cell must run
# before `main` is called.
device = 'cuda' if cuda.is_available() else 'cpu'
# Print the GPU status report for this runtime.
!nvidia-smi

In [None]:
# Log in to Weights & Biases through its CLI; `main` starts a wandb run
# (`wandb.init`) and logs the training loss to it.
!wandb login

In [None]:
def _read_stripped_lines(path):
    """Return every line of the text file at ``path`` with surrounding whitespace removed.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.
    """
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f]


# Each file becomes a single-column DataFrame (column label 0), one row per line.
train_source = pd.DataFrame(_read_stripped_lines('cnn_dm/train.source'))
train_target = pd.DataFrame(_read_stripped_lines('cnn_dm/train.target'))

# The sampling cell indexes both frames with the same row positions, so they
# must line up one-to-one.
assert len(train_source) == len(train_target), "train.source and train.target differ in length"


In [None]:
# Number of CNN/DailyMail training pairs to sample for fine-tuning.
# Placeholder default so the notebook runs end to end (the original line was a
# syntax error); adjust to your compute budget. Must not exceed the number of
# rows in the training files because sampling is done without replacement.
TRAIN_SIZE = 1000

In [None]:
# Draw TRAIN_SIZE distinct row positions. The same positions index both frames
# so every article stays paired with its own summary.
# A dedicated seeded generator makes the sample reproducible across kernel
# restarts (the seeds set inside `main` are applied only after this cell runs).
SAMPLE_SEED = 42
sample_ids = np.random.default_rng(SAMPLE_SEED).choice(
    len(train_source),  # row count; DataFrame.size would count cells, not rows
    size=TRAIN_SIZE,
    replace=False,
)
cnn_dm_train = pd.concat([train_source.iloc[sample_ids], train_target.iloc[sample_ids]], axis=1)
cnn_dm_train.columns = ['full_text', 'summary']
# Prepend the 'summarize: ' task prefix to every input article.
cnn_dm_train['full_text'] = 'summarize: ' + cnn_dm_train['full_text']

In [None]:
# Load the lesson summaries, keep only the two columns used for training, and
# prepend the same 'summarize: ' task prefix applied to the CNN/DailyMail rows.
# Building the frame in one chain (instead of assigning into a column subset)
# avoids pandas' SettingWithCopyWarning.
df_lessons = (
    pd.read_csv('/content/drive/My Drive/summarizer/nub-training-evaluation/lesson_summary.csv')
    .loc[:, ['summary', 'full_text']]
    .assign(full_text=lambda frame: 'summarize: ' + frame['full_text'])
)

In [None]:
# Stack the lesson rows under the sampled CNN/DailyMail rows to form the final
# training frame (columns are aligned by name, so column order does not matter).
# NOTE(review): this rebinds `cnn_dm_train` from its own previous value, so
# re-running this cell appends `df_lessons` again; re-run the sampling cell first.
cnn_dm_train = pd.concat([cnn_dm_train, df_lessons], axis = 0)

In [None]:
# Sanity check: (rows, columns) of the combined training frame.
cnn_dm_train.shape

In [None]:
# Class and functions from github.com/abhimishra91/transformers-tutorials


class CustomDataset(Dataset):
    """Dataset that tokenizes (full_text, summary) pairs from a DataFrame.

    Args:
        dataframe: frame with ``full_text`` and ``summary`` columns.
        tokenizer: object exposing ``batch_encode_plus`` that returns a mapping
            with ``input_ids`` and ``attention_mask`` tensors.
        source_len: length every encoded ``full_text`` is padded/truncated to.
        summ_len: length every encoded ``summary`` is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.summary = self.data.summary
        self.full_text = self.data.full_text

    def __len__(self):
        """Return the number of rows (training pairs)."""
        return len(self.summary)

    def _encode(self, text, max_length):
        """Tokenize one string, padded and truncated to exactly ``max_length``."""
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        return self.tokenizer.batch_encode_plus(
            [text],
            max_length=max_length,
            padding='max_length',
            return_tensors='pt',
            truncation=True,
        )

    def __getitem__(self, index):
        """Return the encoded tensors for the row at *position* ``index``.

        Returns:
            dict with ``source_ids``, ``source_mask``, ``target_ids`` and
            ``target_ids_y`` (the latter two hold the same token ids), all
            ``torch.long``.
        """
        # Positional lookup: a DataLoader hands out positions 0..len-1, which
        # label-based `series[index]` only honours on a default RangeIndex.
        # str() + split/join collapses every whitespace run to a single space.
        full_text = ' '.join(str(self.full_text.iloc[index]).split())
        summary = ' '.join(str(self.summary.iloc[index]).split())

        source = self._encode(full_text, self.source_len)
        target = self._encode(summary, self.summ_len)

        # Drop only the leading batch dimension of size 1; a bare squeeze()
        # would also collapse a sequence dimension of length 1.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }


def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one pass over ``loader``, updating ``model`` after every batch.

    Args:
        epoch: epoch number, used only in the progress printout.
        tokenizer: provides ``pad_token_id`` for masking padded label positions.
        model: sequence-to-sequence model whose first output is the loss when
            ``labels`` are supplied.
        device: device the batch tensors are moved to.
        loader: iterable of batches with ``source_ids``, ``source_mask`` and
            ``target_ids`` tensors.
        optimizer: optimizer stepping the model parameters.

    Side effects:
        Logs the loss to wandb every 10 steps and prints it every 500 steps.
    """
    model.train()
    for step, data in enumerate(loader):
        y = data['target_ids'].to(device, dtype=torch.long)
        # Shift by one: the decoder is fed tokens [0, n-1) and is trained to
        # predict tokens [1, n).
        decoder_input_ids = y[:, :-1].contiguous()
        labels = y[:, 1:].clone().detach()
        # -100 is the label value the cross-entropy loss ignores, so padded
        # positions do not contribute to the loss.
        labels[labels == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype=torch.long)
        mask = data['source_mask'].to(device, dtype=torch.long)

        # `labels` is the current keyword; the former `lm_labels` was removed
        # from transformers and raises a TypeError on recent versions.
        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=decoder_input_ids,
            labels=labels,
        )
        loss = outputs[0]

        if step % 10 == 0:
            wandb.log({"Training Loss": loss.item()})

        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()



def main(train_dataset, model_output_dir):
    """Fine-tune ``t5-base`` on ``train_dataset`` and save the result.

    Args:
        train_dataset: DataFrame with ``full_text`` and ``summary`` columns.
        model_output_dir: directory the fine-tuned model and tokenizer are
            written to with ``save_pretrained``.

    Notes:
        Starts a wandb run, stores the hyperparameters on ``wandb.config`` and
        reads the module-level ``device`` variable.
    """
    # Start a wandb run and record the hyperparameters on its config object.
    wandb.init(project="transformers_tutorials_summarization")
    config = wandb.config
    hyperparameters = {
        'TRAIN_BATCH_SIZE': 1,   # batch size used by the training DataLoader
        'TRAIN_EPOCHS': 2,       # number of passes over the training set
        'LEARNING_RATE': 1e-4,   # Adam learning rate
        'SEED': 42,              # seed for torch and numpy
        'MAX_LEN': 1024,         # length the input text is padded/truncated to
        'SUMMARY_LEN': 256,      # length the summary is padded/truncated to
    }
    for name, value in hyperparameters.items():
        setattr(config, name, value)

    # Seed torch and numpy and ask cuDNN for deterministic kernels.
    torch.manual_seed(config.SEED)
    np.random.seed(config.SEED)
    torch.backends.cudnn.deterministic = True

    # Tokenizer used to encode both inputs and summaries.
    tokenizer = T5Tokenizer.from_pretrained("t5-base")

    # Reset the index so the dataset sees rows labelled 0..n-1.
    training_frame = train_dataset.reset_index(drop=True)
    training_set = CustomDataset(training_frame, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

    # Shuffled training batches, loaded in the main process.
    training_loader = DataLoader(
        training_set,
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=True,
        num_workers=0,
    )

    # Pretrained t5-base, moved to the selected device.
    model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

    # Adam over all model parameters; wandb.watch is called with log="all".
    optimizer = torch.optim.Adam(params=model.parameters(), lr=config.LEARNING_RATE)
    wandb.watch(model, log="all")

    for epoch in range(config.TRAIN_EPOCHS):
        train(epoch, tokenizer, model, device, training_loader, optimizer)

    # Save the wrapped `.module` if the model has one, otherwise the model itself.
    model_to_save = getattr(model, 'module', model)
    model_to_save.save_pretrained(model_output_dir)
    tokenizer.save_pretrained(model_output_dir)





In [None]:
# Output directory for the fine-tuned model and tokenizer.
# NOTE: the name `dir` shadows Python's builtin; it is kept because the cell
# that calls `main` passes it by this name.
dir = './model'

# Always start from an empty directory: anything left from a previous run is
# deleted before the folder is recreated.
stale_output_present = os.path.exists(dir)
if stale_output_present:
    shutil.rmtree(dir)
os.makedirs(dir)

In [None]:
# Fine-tune on the combined training frame and write the model and tokenizer
# into the output directory prepared above.
main(cnn_dm_train, dir)

In [None]:
# TODO: register the model in the Hugging Face model hub
