In [1]:
import os
import sys
import importlib
import argparse
import nltk
import pandas as pd
import numpy as np
import random
import torch

In [2]:
sys.path.append('../')

In [3]:
from loader import ArticleLoader
from tokenizer import TokenizerOptimization
from preprocessor import Preprocessor

import wandb
from dotenv import load_dotenv

In [4]:
from datasets import load_metric
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSeq2SeqLM, 
    DataCollatorForSeq2Seq, 
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer
)

## Model pipeline

In [5]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [6]:
model_checkpoint = 't5-small'

In [7]:
print('\nLoading Article Data')
article_loader = ArticleLoader('../Data', '../theguardians_article_info.csv', 24000, 1000)
datasets = article_loader.load_data()
print(datasets)


Loading Article Data
DatasetDict({
    train: Dataset({
        features: ['summary', 'document', '__index_level_0__'],
        num_rows: 158355
    })
    validation: Dataset({
        features: ['summary', 'document', '__index_level_0__'],
        num_rows: 39552
    })
})


In [9]:
print('\nPreprocessing Data')
preprocessor = Preprocessor()
datasets = datasets.map(preprocessor.preprocess4train, load_from_cache_file=True)


Preprocessing Data


HBox(children=(FloatProgress(value=0.0, max=158355.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39552.0), HTML(value='')))




In [11]:
print('\nOptimizing Tokenizer')
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
unk_token_data = pd.read_csv('/opt/ml/project/Summarization/Tokenizer/extra_tokens.csv')
tokenizer_opimizer = TokenizerOptimization(tokenizer, './Tokenizer', unk_token_data)
tokenizer = tokenizer_opimizer.optimize()
print('Length of Tokenizer : %d' %len(tokenizer))


Optimizing Tokenizer


  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


Length of Tokenizer : 32100


In [12]:
max_input_length = 768
max_target_length = 128

def tokenize_function(examples):
    inputs = ['summarize: ' + doc for doc in examples['document']]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [13]:
tokenized_datasets = datasets.map(tokenize_function, batched=True, load_from_cache_file=True)
print(tokenized_datasets)
train_data = tokenized_datasets['train']
val_data = tokenized_datasets['validation']

HBox(children=(FloatProgress(value=0.0, max=159.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))


DatasetDict({
    train: Dataset({
        features: ['__index_level_0__', 'attention_mask', 'document', 'input_ids', 'labels', 'summary'],
        num_rows: 158355
    })
    validation: Dataset({
        features: ['__index_level_0__', 'attention_mask', 'document', 'input_ids', 'labels', 'summary'],
        num_rows: 39552
    })
})


## Model

In [14]:
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F

from dataclasses import dataclass
from typing import Optional, Tuple

from transformers.models.t5.modeling_t5 import T5Stack, T5PreTrainedModel
from transformers.modeling_outputs import (
    ModelOutput,
    BaseModelOutput,
    Seq2SeqLMOutput,
)

from transformers.utils.model_parallel_utils import assert_device_map, get_device_map

In [15]:
@dataclass
class Seq2SeqLMOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None


In [117]:
class BaseT5ForConditionalGeneration(T5PreTrainedModel):

    def __init__(self, config):
        super().__init__(config)
        self.model_dim = config.d_model

        self.shared = nn.Embedding(config.vocab_size, config.d_model) # embedding layer

        # encoder
        encoder_config = copy.deepcopy(config) 
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = T5Stack(encoder_config, self.shared)

        # decoder
        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = T5Stack(decoder_config, self.shared)

        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) # output layer

        self.init_weights()

        # Model parallel
        self.model_parallel = False
        self.device_map = None

    def parallelize(self, device_map=None):
        self.device_map = (
            get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
            if device_map is None
            else device_map
        )
        assert_device_map(self.device_map, len(self.encoder.block))
        self.encoder.parallelize(self.device_map)
        self.decoder.parallelize(self.device_map)
        self.lm_head = self.lm_head.to(self.decoder.first_device)
        self.model_parallel = True


    def deparallelize(self):
        self.encoder.deparallelize()
        self.decoder.deparallelize()
        self.encoder = self.encoder.to("cpu")
        self.decoder = self.decoder.to("cpu")
        self.lm_head = self.lm_head.to("cpu")
        self.model_parallel = False
        self.device_map = None
        torch.cuda.empty_cache()


    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def get_output_embeddings(self):
        return self.lm_head

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def forward(
        self,
        input_ids=None,                 # input ids of tokenized document
        attention_mask=None,            # attention mask of tokenized document
        decoder_input_ids=None,         # input id of tokenized summaries
        decoder_attention_mask=None,    
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        encoder_outputs=None,
        past_key_values=None,
        inputs_embeds=None,
        decoder_inputs_embeds=None,
        labels=None,                    # label id of tokenized summaries
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):

        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                decoder_head_mask = head_mask

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            # Convert encoder inputs in embeddings if needed 
            # Generate encoder output using input ids and attention mask
            encoder_outputs = self.encoder(
                input_ids=input_ids,                        # input ids of tokenized document
                attention_mask=attention_mask,              # attention mask of tokenized document
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # output of encoder
        hidden_states = encoder_outputs[0] # (batch_size, input_length, feature_size)
        print('Output od encoder : {}'.format(hidden_states.shape))

        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)

        if labels is not None :
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                # get decoder inputs from shifting lm labels to the right
                decoder_input_ids = self._shift_right(labels)
            if decoder_attention_mask is None :
                one_tensor, zero_tensor = torch.ones(labels.shape), torch.zeros(labels.shape)
                if torch.cuda.is_available() :
                    one_tensor, zero_tensor  = one_tensor.cuda(), zero_tensor.cuda()

                decoder_attention_mask = torch.where(self._shift_right(labels) == 0, zero_tensor, one_tensor)
        

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)
            hidden_states = hidden_states.to(self.decoder.first_device)
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
            if attention_mask is not None:
                attention_mask = attention_mask.to(self.decoder.first_device)
            if decoder_attention_mask is not None:
                decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,                      
            attention_mask=decoder_attention_mask,      
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,        
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # output of decoder
        sequence_output = decoder_outputs[0] # (batch_size, target_length, feature_size)
        print('Output of decoder : {}'.format(sequence_output.shape))
        

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.encoder.first_device)
            self.lm_head = self.lm_head.to(self.encoder.first_device)
            sequence_output = sequence_output.to(self.lm_head.weight.device)

        if self.config.tie_word_embeddings:
            sequence_output = sequence_output * (self.model_dim ** -0.5)

        # output of model
        lm_logits = self.lm_head(sequence_output) # (batch_size , target_length, vocab_size)
        print('Output of model : {}'.format(lm_logits.shape))

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
            
        if not return_dict:
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,                                                  # loss
            logits=lm_logits,                                           # output of model
            past_key_values=decoder_outputs.past_key_values,            
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,        # encoder output
            encoder_attentions=encoder_outputs.attentions,
        )


    # DataCollatorForSeq2Seq
    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return self._shift_right(labels)



In [118]:
model = BaseT5ForConditionalGeneration.from_pretrained(model_checkpoint).to(device)

Some weights of the model checkpoint at t5-small were not used when initializing BaseT5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing BaseT5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BaseT5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BaseT5ForConditionalGeneration were not initialized from the model checkpoint at t5-small and are newly initialized: ['lm_head.weight', 'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inf

## Training Data Input

In [119]:
train_data = tokenized_datasets['train']

In [120]:
train_data = train_data.remove_columns(['__index_level_0__', 'document', 'summary'])

In [121]:
train_data

Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 158355
})

In [122]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [123]:
from torch.utils.data import DataLoader

In [124]:
train_dataloader = DataLoader(
    train_data, shuffle=True, batch_size=8, collate_fn=data_collator
)

In [125]:
for batch in train_dataloader:
    break

{k: v.shape for k, v in batch.items()}

{'attention_mask': torch.Size([8, 768]),
 'input_ids': torch.Size([8, 768]),
 'labels': torch.Size([8, 30]),
 'decoder_input_ids': torch.Size([8, 30])}

## Check Pipeline

In [126]:
for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    break

Output od encoder : torch.Size([8, 768, 512])
Output of decoder : torch.Size([8, 25, 512])
Output of model : torch.Size([8, 25, 32128])


In [127]:
outputs.keys()

odict_keys(['loss', 'logits', 'past_key_values', 'encoder_last_hidden_state'])

In [128]:
outputs['loss']

tensor(9.2562, device='cuda:0', grad_fn=<NllLossBackward>)

In [129]:
outputs['logits'].shape # (batch_size, target_length, feature_size)

torch.Size([8, 25, 32128])

In [130]:
outputs['encoder_last_hidden_state'].shape # (batch_size, input_length, feature_size)

torch.Size([8, 768, 512])

In [131]:
batch['decoder_input_ids'].shape

torch.Size([8, 25])