<a href="https://colab.research.google.com/github/s276842/masked-language-model-text-generation/blob/test/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 45.0 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 39.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 41.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 503 kB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, tr

# Nvidia Apex

In [None]:
%%capture
# Install the pinned CUDA 10.1 (+cu101) builds of torch and torchvision, plus torchaudio,
# from the PyTorch wheel index; %%capture hides the installer output.
!pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:

%%writefile setup.sh
# Build and install NVIDIA Apex with its C++ and CUDA extensions.
# Stop at the first failing command so a broken clone is not followed by a doomed build.
set -e
export CUDA_HOME=/usr/local/cuda-10.1
# Re-running the notebook must not fail just because the checkout already exists.
[ -d apex ] || git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

Writing setup.sh


In [None]:
# Run the Apex build script written to setup.sh by the %%writefile cell.
!sh setup.sh

# Provisional flag: it is set unconditionally, whether or not the build above succeeded.
APEX_AVAILABLE = True

Cloning into 'apex'...
remote: Enumerating objects: 8815, done.[K
remote: Counting objects: 100% (48/48), done.[K
remote: Compressing objects: 100% (44/44), done.[K
remote: Total 8815 (delta 20), reused 20 (delta 4), pack-reused 8767[K
Receiving objects: 100% (8815/8815), 14.48 MiB | 16.88 MiB/s, done.
Resolving deltas: 100% (6001/6001), done.
  cmdoptions.check_install_build_global(options)
Using pip 21.1.3 from /usr/local/lib/python3.7/dist-packages/pip (python 3.7)
Value for scheme.platlib does not match. Please report this to <https://github.com/pypa/pip/issues/9617>
distutils: /usr/local/lib/python3.7/dist-packages
sysconfig: /usr/lib/python3.7/site-packages
Value for scheme.purelib does not match. Please report this to <https://github.com/pypa/pip/issues/9617>
distutils: /usr/local/lib/python3.7/dist-packages
sysconfig: /usr/lib/python3.7/site-packages
Value for scheme.headers does not match. Please report this to <https://github.com/pypa/pip/issues/9617>
distutils: /usr/loca

# Dataset

In [None]:
import time


In [8]:
# Fetch the `test` branch of the project repository; the next cell imports its
# `dataset` and `model` modules from the resulting directory.
!git clone --branch test https://github.com/s276842/masked-language-model-text-generation.git

Cloning into 'masked-language-model-text-generation'...
remote: Enumerating objects: 38, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 38 (delta 0), reused 1 (delta 0), pack-reused 35[K
Unpacking objects: 100% (38/38), done.


In [9]:
import os
# Switch into the cloned repository before importing its `dataset` and `model` modules.
os.chdir('/content/masked-language-model-text-generation')

# NOTE(review): star imports hide which names these modules contribute (presumably
# MultiWOZDataset and MaskedLMGenerator, which later cells use) — confirm and
# consider importing them explicitly.
from dataset import *
from model import *

# Go back to the Colab working directory.
os.chdir('/content')

In [10]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
import os
from transformers import AdamW, get_linear_schedule_with_warmup

use_apex = True
try:
    # The import is attempted from inside the Apex checkout (presumably the
    # directory cloned by setup.sh).
    os.chdir('/content/apex')
    from apex import amp

    APEX_AVAILABLE = True
except (ImportError, OSError):
    # ImportError (which includes ModuleNotFoundError): apex cannot be imported.
    # OSError (e.g. FileNotFoundError): the checkout directory does not exist.
    # In both cases fall back to plain training instead of aborting the cell.
    APEX_AVAILABLE = False


def truncate_lef_input_embeddings(tokenizer, input_embeddings):
    """Truncate a tokenized batch on the left, keeping the right-most tokens.

    Only the last ``tokenizer.model_max_length`` columns of ``input_ids``,
    ``attention_mask`` and ``token_type_ids`` are kept. Every row whose first
    kept token is not padding gets that token overwritten with the BOS id
    (or the CLS id when the tokenizer defines no BOS token).

    Args:
        tokenizer: object exposing ``bos_token_id``, ``cls_token_id``,
            ``pad_token_id`` and ``model_max_length``.
        input_embeddings: mapping of 2-D tensors containing at least
            ``'input_ids'``; ``'attention_mask'`` and ``'token_type_ids'`` are
            truncated when present.

    Returns:
        The same mapping object, updated in place.

    Note:
        The slice of ``input_ids`` is a view, so the start-token write also
        modifies the tensor that was originally stored under ``'input_ids'``.
    """
    # Explicit ``is None`` test: a legitimate BOS id of 0 is falsy, and
    # ``bos or cls`` would silently replace it with the CLS id.
    bos_token_id = tokenizer.bos_token_id
    if bos_token_id is None:
        bos_token_id = tokenizer.cls_token_id
    pad_token_id = tokenizer.pad_token_id
    max_length = tokenizer.model_max_length

    input_ids = input_embeddings['input_ids'][:, -max_length:]
    # Rows that do not begin with padding must start with the BOS/CLS token again.
    input_ids[input_ids[:, 0] != pad_token_id, 0] = bos_token_id
    input_embeddings['input_ids'] = input_ids

    # Keep the auxiliary tensors aligned with the truncated ids; tokenizers that
    # do not emit one of them are tolerated.
    for key in ('attention_mask', 'token_type_ids'):
        if key in input_embeddings:
            input_embeddings[key] = input_embeddings[key][:, -max_length:]

    return input_embeddings


def init_batch(tokenizer, batch, generation_max_len=40):
    """Build the model input and the target token ids for one batch.

    Every context in ``batch[0]`` is paired with ``generation_max_len`` mask
    tokens that act as placeholders for the response to generate. Every
    response in ``batch[1]`` is encoded (with ``' <eos> '`` appended), cut to
    ``generation_max_len`` tokens and right-padded with the pad id.

    Args:
        tokenizer: tokenizer used to encode contexts and responses.
        batch: pair ``(contexts, responses)`` of equally long string sequences.
        generation_max_len: number of response slots per sample.

    Returns:
        ``(input_embeddings, target)`` where ``target`` is an integer tensor
        with one row of exactly ``generation_max_len`` ids per response.
    """
    seed = batch[0]
    batch_size = len(seed)
    text = [' '.join([tokenizer.mask_token] * generation_max_len)] * batch_size

    # note that the input_embeddings must be truncated manually since there is no option to truncate on left side.
    # Passing truncation=True, will truncate on the right-side of the dialog context removing the most recent utterances
    # todo suppress warning for exceeding tokens
    input_embeddings = tokenizer(seed, text, return_tensors='pt', padding=True, return_token_type_ids = True)
    input_embeddings = truncate_lef_input_embeddings(tokenizer, input_embeddings)

    target = []
    for response in batch[1]:
        # Truncate to the same length the rows are padded to. Cutting at
        # generation_max_len + 1 produced rows of unequal length whenever a
        # response was long, which made torch.tensor() below raise.
        # NOTE: for such over-long responses the trailing <eos> is cut off.
        response_tokens = tokenizer.encode(response + ' <eos> ', add_special_tokens=False)[:generation_max_len]
        response_tokens += [tokenizer.pad_token_id] * (generation_max_len - len(response_tokens))
        target.append(response_tokens)

    target = torch.tensor(target)

    return input_embeddings, target

def negative_attention(attentions, counter):
    """Average ``attentions`` on the CPU over axes 0 and 2.

    NOTE(review): this looks unfinished — the averaged tensor is discarded,
    ``counter`` is never read and the call implicitly returns ``None``, so
    callers that assign the result receive ``None``.
    """
    attentions_on_cpu = attentions.cpu()
    avg_attentions = attentions_on_cpu.mean(axis=(0, 2))




def train(textgenerator, train_dataloader, optimizer, scheduler, log_interval: int = 15, generation_max_len: int = 40, device: str ='cpu', use_apex = False, num_iter: int = 150):
    """Run one pass over ``train_dataloader`` with ``num_iter`` update steps per batch.

    For every batch, ``init_batch`` builds the input and the target ids. Then,
    ``num_iter`` times, ``textgenerator`` is called and returns logits together
    with one position per sample; the cross-entropy against the ground-truth
    token at that position is back-propagated and the corresponding input slot
    is overwritten with a token sampled from the logits.

    Args:
        textgenerator: callable generator exposing ``.tokenizer`` and ``.model``.
        train_dataloader: iterable of ``(contexts, responses)`` batches.
        optimizer: optimizer stepped once per iteration.
        scheduler: learning-rate scheduler stepped once per iteration.
        log_interval: a progress line is printed whenever ``j`` is a positive
            multiple of this value.
        generation_max_len: number of response slots per sample.
        device: device for the model and the inputs.
        use_apex: scale the loss through ``amp`` (which must have been imported
            and initialised by the caller).
        num_iter: update steps per batch. It used to be read from a global
            variable, which ignored the value handed to ``finetune`` and
            disagreed with the scheduler's total step count.
    """
    tokenizer = textgenerator.tokenizer
    model = textgenerator.model.to(device)

    model.train()
    num_batches = len(train_dataloader)

    for i, batch in enumerate(train_dataloader):
        input_embeddings, response = init_batch(tokenizer, batch, generation_max_len=generation_max_len)
        input_embeddings = input_embeddings.to(device)

        batch_size, num_tokens = input_embeddings['input_ids'].shape
        context_offset = num_tokens - generation_max_len - 1
        rows = torch.arange(batch_size)

        # Initialize probability distribution and counter
        p = torch.tensor([[1] + [0] * (num_tokens - 1)] * batch_size)
        counter = torch.ones((batch_size, generation_max_len))

        # Statistics of the current logging window. They restart with every
        # batch so that a printed average never mixes in leftover iterations
        # of the previous batch.
        total_loss = 0.
        steps_since_log = 0
        start_time = time.time()

        for j in range(num_iter):
            logits, positions, attentions = textgenerator(input_embeddings, p=p,
                                                          context_offset=context_offset,
                                                          generation_max_len=generation_max_len)

            counter[rows, positions] += 1

            # Retrieve target tokens from ground-truth. ``response`` is created on
            # the CPU by init_batch, so bring the targets to the logits' device.
            target = response[rows, positions].to(logits.device)

            optimizer.zero_grad()
            loss = F.cross_entropy(logits, target)

            if use_apex:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                # With amp the optimizer steps on the master params, so those are
                # the gradients that have to be clipped.
                params_to_clip = amp.master_params(optimizer)
            else:
                loss.backward()
                params_to_clip = model.parameters()

            torch.nn.utils.clip_grad_norm_(params_to_clip, 1.0)
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
            steps_since_log += 1

            if j % log_interval == 0 and j > 0:
                lr = scheduler.get_last_lr()[0]
                # Average over the steps actually accumulated in this window.
                ms_per_batch = (time.time() - start_time) * 1000 / steps_since_log
                cur_loss = total_loss / steps_since_log

                print(f'| {i:5d}/{num_batches:5d} batches | {j:5d}/{num_iter:5d} iter | '
                      f'lr {lr:04.4f} | ms/batch {ms_per_batch:5.2f} | '
                      f'loss {cur_loss:5.2f} ')

                print_a_sentence(response, input_embeddings['input_ids'][0], tokenizer, generation_max_len)

                total_loss = 0.
                steps_since_log = 0
                start_time = time.time()

            # NOTE(review): negative_attention currently returns None, so from the
            # second iteration on ``p`` is None — confirm that textgenerator
            # handles this as intended.
            p = negative_attention(attentions, counter)

            # Replace tokens
            dist = torch.distributions.categorical.Categorical(logits=logits)
            tokens = dist.sample()
            input_embeddings['input_ids'][rows, positions + context_offset] = tokens.to(device)


def evaluate(textgenerator, val_dataloader, device='cpu', generation_max_len: int = 40, num_iter: int = 100) -> float:
    """Return the mean cross-entropy per generation step over ``val_dataloader``.

    Mirrors the inner loop of ``train`` without any parameter update: for each
    batch, ``num_iter`` steps are run, the loss of each step is accumulated and
    the addressed input slot is replaced by a token sampled from the logits.

    Args:
        textgenerator: callable generator exposing ``.tokenizer`` and ``.model``.
        val_dataloader: iterable of ``(contexts, responses)`` batches.
        device: device for the model and the inputs.
        generation_max_len: number of response slots per sample.
        num_iter: generation steps per batch.

    Returns:
        The accumulated loss divided by the number of evaluated steps, or
        ``nan`` when no step was evaluated.
    """
    # Use the generator's own components instead of notebook-level globals.
    tokenizer = textgenerator.tokenizer
    model = textgenerator.model.to(device)

    model.eval()  # turn on evaluation mode
    total_loss = 0.
    num_steps = 0

    with torch.no_grad():
        for batch in val_dataloader:
            input_embeddings, response = init_batch(tokenizer, batch, generation_max_len=generation_max_len)
            input_embeddings = input_embeddings.to(device)

            batch_size, num_tokens = input_embeddings['input_ids'].shape
            # Same offset as in train(); it was previously never defined here.
            context_offset = num_tokens - generation_max_len - 1
            rows = torch.arange(batch_size)
            p = torch.tensor([[1] + [0] * (num_tokens - 1)] * batch_size)

            for _ in range(num_iter):
                # Mask and compute predictions
                logits, positions, _attentions = textgenerator(input_embeddings, p=p,
                                                               context_offset=context_offset,
                                                               generation_max_len=generation_max_len)

                # Retrieve target tokens from ground-truth:
                target = response[rows, positions].to(logits.device)

                loss = F.cross_entropy(logits, target)
                total_loss += loss.item()
                num_steps += 1

                # Replace tokens
                dist = torch.distributions.categorical.Categorical(logits=logits)
                tokens = dist.sample()
                input_embeddings['input_ids'][rows, positions + context_offset] = tokens.to(device)

            print_a_sentence(response, input_embeddings['input_ids'][0], tokenizer, generation_max_len)

    if num_steps == 0:
        return float('nan')
    # Average over every accumulated loss term; dividing by
    # ``len(val_dataloader) - 1`` was neither a mean nor safe for one batch.
    return total_loss / num_steps


def print_a_sentence(response, generated, tokenizer, generation_max_len):
    """Print the first ground-truth response next to a generated sequence.

    Args:
        response: batch of target token ids; only ``response[0]`` is decoded.
        generated: token ids of one sample; the ``generation_max_len`` ids that
            precede its last element are decoded.
        tokenizer: object providing ``decode``.
        generation_max_len: number of generated ids to show.
    """
    print()
    print(f'GROUND TRUTH: \t{tokenizer.decode(response[0])}')
    print(f'GENERATED:    \t{tokenizer.decode(generated[-(generation_max_len + 1):-1])}\n')


def finetune(textgenerator, train_dataloader, val_dataloader = None, generation_max_len=40, num_epochs=8, num_iter=150, log_interval=15, use_apex=False, device = 'cpu', save_dir='/content/drive/MyDrive/experiments/dstc9/model'):
    """Fine-tune ``textgenerator.model`` for ``num_epochs`` epochs.

    Each epoch calls ``train`` and, when ``val_dataloader`` is given,
    ``evaluate``. The model weights are saved whenever the validation loss
    improves on the best value seen so far.

    Args:
        textgenerator: callable generator exposing ``.tokenizer`` and ``.model``.
        train_dataloader: training batches.
        val_dataloader: validation batches, or ``None`` to skip validation (in
            that case no checkpoint is ever written).
        generation_max_len: number of response slots per sample.
        num_epochs: number of passes over ``train_dataloader``.
        num_iter: update steps per batch.
        log_interval: logging period forwarded to ``train``.
        use_apex: enable Apex mixed precision (opt level O2); forces
            ``device='cuda'`` and requires ``amp`` to be importable.
        device: device to train on.
        save_dir: directory receiving the checkpoints; created if missing.
    """
    # Imported locally because the notebook's import cells do not provide them.
    import datetime
    import math

    # NOTE(review): anomaly detection is a debugging aid that slows down the
    # backward pass; kept because it was enabled originally.
    torch.autograd.set_detect_anomaly(True)
    num_batches = len(train_dataloader)
    # One scheduler step is taken per iteration of every batch of every epoch.
    total_steps = num_batches * num_epochs * num_iter

    if use_apex:
        device = 'cuda'

    # The model has to live on its final device before the optimizer is built
    # and, above all, before amp.initialize, which rejects CPU parameters.
    textgenerator.model.to(device)
    optimizer = AdamW(textgenerator.model.parameters(), lr=2e-5, eps=1e-8)

    if use_apex:
        textgenerator.model, optimizer = amp.initialize(textgenerator.model,
                                                        optimizer,
                                                        opt_level="O2",
                                                        keep_batchnorm_fp32=True,
                                                        loss_scale="dynamic")

    scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=total_steps, num_warmup_steps=0)
    best_val_loss = float('inf')
    val_loss = np.nan
    val_ppl = np.nan

    for epoch in range(1, num_epochs + 1):
        print(f'| epoch {epoch:3d} |')
        print('-' * 89)
        epoch_start_time = time.time()

        # Forward the caller's settings instead of hard-coded or global values.
        train(textgenerator, train_dataloader, optimizer, scheduler,
              log_interval=log_interval, generation_max_len=generation_max_len,
              device=device, use_apex=use_apex, num_iter=num_iter)

        if val_dataloader is not None:
            val_loss = evaluate(textgenerator, val_dataloader, device=device,
                                generation_max_len=generation_max_len)
            val_ppl = math.exp(val_loss)

        elapsed = time.time() - epoch_start_time
        print('-' * 89)
        print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
        print('-' * 89)

        # A nan validation loss (no validation data) never compares as smaller.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            os.makedirs(save_dir, exist_ok=True)
            torch.save(textgenerator.model.state_dict(), os.path.join(save_dir, f'bert-{datetime.datetime.now().__str__()}'))


In [12]:
import os

import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForMaskedLM

# ---- Settings ---------------------------------------------------------------
DEFAULT_DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
PATH = '/content/masked-language-model-text-generation/data'

num_epochs = 8
num_iter = 150
generation_max_len = 40

# ---- Generator, tokenizer and model -----------------------------------------
textgenerator = MaskedLMGenerator('bert-base-uncased')
tokenizer, model = textgenerator.tokenizer, textgenerator.model

# ---- Dataset files: dialogue logs, labels and the shared knowledge base -----
train_log, train_lab, train_kb = (
    os.path.join(PATH, relative_path)
    for relative_path in ('train/logs.json', 'train/labels.json', 'knowledge.json')
)
val_log, val_lab, val_kb = (
    os.path.join(PATH, relative_path)
    for relative_path in ('val/logs.json', 'val/labels.json', 'knowledge.json')
)

train_set = MultiWOZDataset(train_log, train_lab, train_kb)
val_set = MultiWOZDataset(val_log, val_lab, val_kb)

# Register the dataset's special tokens plus '<eos>' with the tokenizer, then
# resize the model's token embeddings to the enlarged vocabulary.
extra_tokens = list(train_set.special_tokens.values()) + ['<eos>']
tokenizer.add_special_tokens({'additional_special_tokens': extra_tokens})
model.resize_token_embeddings(len(tokenizer))

# ---- Batching ---------------------------------------------------------------
train_dataloader = DataLoader(train_set, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_set, batch_size=8, shuffle=True)



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# Launch fine-tuning. With val_dataloader=None the validation step inside
# finetune is skipped, and use_apex=True makes finetune set the device to 'cuda'.
# NOTE(review): num_iter=80 here differs from the notebook-level `num_iter = 150`
# defined in the setup cell — confirm which value is intended.
finetune(textgenerator, 
         train_dataloader, 
         val_dataloader = None, 
         generation_max_len=40, 
         num_epochs=8, 
         num_iter=80, 
         log_interval=5,
         use_apex=True,
         device = 'cuda')

Selected optimization level O2:  FP16 training with FP32 batchnorm and FP32 master weights.

Defaults for this optimization level are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O2
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : True
master_weights         : True
loss_scale             : dynamic


RuntimeError: ignored