#### Cedits: https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb

In [1]:
# !pip install wandb -q

In [2]:
# Importing stock libraries
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import BartTokenizer, BartForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration

# WandB – Import the wandb library
# import wandb

In [3]:
# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device = cpu
# Preparing for TPU usage
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()

In [4]:
# !wandb login 288b8f7697201744f07825b641619180aa4f47dd

In [5]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], max_length= self.summ_len, pad_to_max_length=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [6]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids,labels=lm_labels)
        loss = outputs[0]
        
#         if _%10 == 0:
#             wandb.log({"Training Loss": loss.item()})

#         if _%500==0:
#             print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # xm.optimizer_step(optimizer)
        # xm.mark_step()

In [7]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=512, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if _%100==0:
                print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [8]:
# # WandB – Initialize a new run
# wandb.init(project="transformers_tutorials_summarization")

# WandB – Config is a variable that holds and saves hyperparameters and inputs
# Defining some key variables that will be used later on in the training  
# config = wandb.config          # Initialize config
# config.TRAIN_BATCH_SIZE = 2    # input batch size for training (default: 64)
# config.VALID_BATCH_SIZE = 2    # input batch size for testing (default: 1000)
# config.TRAIN_EPOCHS = 2        # number of epochs to train (default: 10)
# config.VAL_EPOCHS = 1 
# config.LEARNING_RATE = 1e-4    # learning rate (default: 0.01)
# config.SEED = 42               # random seed (default: 42)
# config.MAX_LEN = 512
# config.SUMMARY_LEN = 512

TRAIN_BATCH_SIZE = 2    # input batch size for training (default: 64)
VALID_BATCH_SIZE = 2    # input batch size for testing (default: 1000)
TRAIN_EPOCHS = 2        # number of epochs to train (default: 10)
VAL_EPOCHS = 1 
LEARNING_RATE = 1e-4    # learning rate (default: 0.01)
SEED = 42               # random seed (default: 42)
MAX_LEN = 512
SUMMARY_LEN = 512

# Set random seeds and deterministic pytorch for reproducibility
torch.manual_seed(SEED) # pytorch random seed
np.random.seed(SEED) # numpy random seed
torch.backends.cudnn.deterministic = True

# tokenzier for encoding the text
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")#T5Tokenizer.from_pretrained("t5-base")


# Importing and Pre-Processing the domain data
# Selecting the needed columns only. 
# Adding the summarzie text in front of the text. This is to format the dataset similar to how T5 model was trained for summarization task. 
df = pd.read_csv('data.csv',encoding='utf-8')
df = df[['text','ctext']]
df.ctext = 'summarize: ' + df.ctext
print(df.head())
df = df.head(100)


# Creation of Dataset and Dataloader
# Defining the train size. So 80% of the data will be used for training and the rest will be used for validation. 
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state = SEED)
val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))


# Creating the Training and Validation dataset for further creation of Dataloader
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)
val_set = CustomDataset(val_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)

# Defining the parameters for creation of dataloaders
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
    }

val_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
    }

# Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)



# Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
# Further this model is sent to device (GPU/TPU) for using the hardware.
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")#T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

# Defining the optimizer that will be used to tune the weights of the network in the training session. 
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

# Log metrics with wandb
# wandb.watch(model, log="all")
# Training loop
print('Initiating Fine-Tuning for the model on our dataset')

for epoch in range(TRAIN_EPOCHS):
    train(epoch, tokenizer, model, device, training_loader, optimizer)


# Validation loop and saving the resulting file with predictions and acutals in a dataframe.
# Saving the dataframe as predictions.csv
print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
for epoch in range(VAL_EPOCHS):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    final_df.to_csv('./models/predictions.csv')
    print('Output Files generated for review')

                                                text  \
0  It's official: U.S. President Barack Obama wan...   
1  (CNN) -- Usain Bolt rounded off the world cham...   
2  Kansas City, Missouri (CNN) -- The General Ser...   
3  Los Angeles (CNN) -- A medical doctor in Vanco...   
4  (CNN) -- Police arrested another teen Thursday...   

                                               ctext  
0  summarize: Syrian official: Obama climbed to t...  
1  summarize: Usain Bolt wins third gold of world...  
2  summarize: The employee in agency's Kansas Cit...  
3  summarize: NEW: A Canadian doctor says she was...  
4  summarize: Another arrest made in gang rape ou...  
FULL Dataset: (100, 2)
TRAIN Dataset: (80, 2)
TEST Dataset: (20, 2)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Initiating Fine-Tuning for the model on our dataset




RuntimeError: CUDA out of memory. Tried to allocate 32.00 MiB (GPU 0; 11.00 GiB total capacity; 9.58 GiB already allocated; 0 bytes free; 9.71 GiB reserved in total by PyTorch)

In [None]:
import os
output_dir = './abstractive/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
sample = '''(CNN) -- Loud music pumps through huge speakers, front row guests cheer and a parade of stunning models electrifies the gleaming catwalk -- welcome to Africa Fashion Week London.

Currently in its third year, the glamorous event saw dozens of big names and up-and-coming designers from across the continent descending on the UK capital's hip district of Shoreditch to unveil their latest stylish creations.

"Our platform is about promoting emerging and established Africa-inspired designers," said Ronke Ademiluyi, founder of the event, held from August 1 to 3.

"The main thing is to bring attention to them, to showcase their creativity to the world so they get more global recognition for what they do and more appreciations for their brands as well."

Vibrant colors

The runways featured designs from countries such as Nigeria, Ghana, South Africa, Congo, Zambia, Zimbabwe, Botswana, Kenya and Morocco -- but also from the diaspora, including Britain and the Caribbean.

In many ways, the event reaffirmed why Africa-inspired designs are fast catching the eye of the fashion world.

Mixing current trends with traditional patterns, more than 60 designers graced the catwalk with a wide array of colorful creations -- everything from show-stopping evening gowns and modern urban casualwear to bold textured prints and chic accessory lines.

Glorious colors in the desert: Darfur's fashionable women

Ademiluyi says the continent's fashion today "represents a fusion of contemporary and African designs" awash with "a lot of vibrant colors and tribal trends."

Amongst those giving a modern twist to traditional styles is Nigerian designer Fashola Olayinka with her Lagos-based label "MOOFA Designs." Her latest collection, "Ashake" is celebrating the "very powerful and strong women" who "turn heads wherever they go to."

"That's basically what's the collection is about," says Olayinka, who started the label about four years ago. "Women who are very feminine and sexy."

Read this: Congo's designer dandies

The young designer says that despite the existing challenges, such as frequent power cuts, it's very exciting being part of Nigeria's fashion scene right now.

"We work hard and we party hard, so it's been really fun and it's a growing process in Nigeria," she says. "Nigerians like to dress up and a lot of people in Nigeria are now wearing their own fabrics."

Profile boost

But despite the growing interest in African designs, Ademiluyi says that many of the continent's promising talents still find it difficult to break into the mainstream international shows.

She says that for many of them, the week is a chance to shine on the international stage.

"A lot of them are talented but they're struggling," she says. "They don't have support from anywhere, so what we do is we support them -- it's an affordable platform for the designers to showcase their talents to the world."

Read this: Taking African colors to America's Deep South

South African fashion artist Steve Mandy agrees. He says that events like this help participants boost their profile both internationally and at home.

"You can meet some really important people here and I have already met some great people here that I think I'll do business with," says Durban-based Mandy, known for hand-painting on dresses and t-shirts.

"The other thing is the spin-off in terms of your own image, in terms of our audience back in South Africa -- the fact that you can say that you did African Fashion Week it promotes you and helps your product to gain integrity."

Looking ahead, Ademiluyi says the goal is to hold the event twice a year and also establish a supply platform that would make the designers' creations more accessible to the world.

"For a lot of them, after Africa Fashion Week, that's it," she says. "The clients don't know where to get their brand, so we hope to support the designers a lot more by opening up a distribution outlet."


'''

answer = summarizer(sample)[0]['summary_text']
answer