In [None]:
#default_exp models

In [None]:
#hide
%load_ext autoreload
%autoreload 2
%load_ext line_profiler



In [None]:
#export
from types import MethodType

import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from transformers import (AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, AutoTokenizer)
from undecorated import undecorated

from travis_attack.config import Config


In [None]:
#hide 
from IPython.core.debugger import set_trace

# Models

In [None]:
#export
def _prepare_pp_tokenizer_and_model(cfg): 
    """Load the pp tokenizer and seq2seq model named by `cfg.pp_name`.

    The model is switched to training mode, has its layers frozen via `pp_model_freeze_layers`, and is given 
    an extra `generate_with_grad` method (the undecorated `generate`) so that we can backprop when generating.
    Returns the tuple `(pp_tokenizer, pp_model)`."""
    tokenizer = AutoTokenizer.from_pretrained(cfg.pp_name)
    # Takes about 3GB memory space up on the GPU.
    # Only locally cached files are used: change `local_files_only` if changing the model name.
    model = AutoModelForSeq2SeqLM.from_pretrained(cfg.pp_name, local_files_only=True)
    model.train()
    # Freezing is dictated by cfg.unfreeze_last_n_layers; set it to "all" to do no freezing.
    pp_model_freeze_layers(cfg, model)
    # Strip the @no_grad decorator from `generate` and bind the bare function back onto the model.
    model.generate_with_grad = MethodType(undecorated(model.generate), model)
    return tokenizer, model
    
def _prepare_vm_tokenizer_and_model(cfg): 
    """Load the vm tokenizer and sequence-classification model named by `cfg.vm_name`.

    The model is moved to `cfg.device` and put in eval mode. Returns the tuple `(vm_tokenizer, vm_model)`."""
    tokenizer = AutoTokenizer.from_pretrained(cfg.vm_name)
    # Only locally cached files are used: change `local_files_only` if changing the model name.
    model = AutoModelForSequenceClassification.from_pretrained(cfg.vm_name, local_files_only=True)
    #TODO: do I need the .to(device) for the vm model when using accelerate?
    model = model.to(cfg.device)
    model.eval()
    return tokenizer, model
    
def _pad_model_token_embeddings(cfg, pp_model, vm_model, sts_model): 
    """Resize first/embedding layer of all models to be a multiple of cfg.embedding_padding_multiple. 
    Good for tensor core efficiency when using fp16.
    Makes changes to models in-place."""
    def pad_token_embeddings_to_multiple_of_n(model, n):
        def get_new_vocab_size(model): return int((np.floor(model.config.vocab_size / n) + 1) * n)
        model.resize_token_embeddings(get_new_vocab_size(model))
    pad_token_embeddings_to_multiple_of_n(pp_model, cfg.embedding_padding_multiple)
    pad_token_embeddings_to_multiple_of_n(vm_model, cfg.embedding_padding_multiple)
    # sts_model is from SentenceTransformers so needs a bit of unwrapping to access the base huggingface model 
    pad_token_embeddings_to_multiple_of_n(sts_model._first_module().auto_model, cfg.embedding_padding_multiple) 

def _update_config(cfg, vm_model, pp_model): 
    cfg.vm_num_labels = vm_model.num_labels
    cfg.vocab_size = pp_model.get_input_embeddings().num_embeddings   # unlike pp_tokenizer.vocab_size this includes the padding 
    return cfg

def prepare_models(cfg): 
    """Load the tokenizers and models for vm, pp and sts, and update the config. 

    * vm and pp tokenizer/model pairs come from the `_prepare_*_tokenizer_and_model` helpers. 
    * The sts model is a `SentenceTransformer` built from `cfg.sts_name`. 
    * If `cfg.pad_token_embeddings` is truthy, the embedding layer of all three models is padded in-place. 
    * The config is updated with some model-specific variables. 

    Returns `(vm_tokenizer, vm_model, pp_tokenizer, pp_model, sts_model, cfg)`.
    """
    vm_tokenizer, vm_model = _prepare_vm_tokenizer_and_model(cfg)
    pp_tokenizer, pp_model = _prepare_pp_tokenizer_and_model(cfg)
    sts_model = SentenceTransformer(cfg.sts_name)

    if cfg.pad_token_embeddings:
        _pad_model_token_embeddings(cfg, pp_model, vm_model, sts_model)

    updated_cfg = _update_config(cfg, vm_model, pp_model)
    return vm_tokenizer, vm_model, pp_tokenizer, pp_model, sts_model, updated_cfg

def _get_layers_to_unfreeze(cfg): 
    """Return a list that determines which layers should be kept unfrozen"""
    if cfg.pp_name   == "tuner007/pegasus_paraphrase":               
        unfreeze_layer_list,last_layer_num = ['decoder.layer_norm'],          15
    elif cfg.pp_name == "tdopierre/ProtAugment-ParaphraseGenerator": 
        unfreeze_layer_list,last_layer_num = ['decoder.layernorm_embedding'],  5
    elif cfg.pp_name == "eugenesiow/bart-paraphrase":                
        unfreeze_layer_list,last_layer_num = ['decoder.layernorm_embedding'], 11
    for i in range(last_layer_num, last_layer_num-cfg.unfreeze_last_n_layers, -1): 
        unfreeze_layer_list.append(f'decoder.layers.{i}')
    # self.lm_head is tied (the same parameter as) to self.encoder.embed_tokens and self.decoder.embed_tokens.
    # and this is given by shared.weight
    # From here: https://github.com/huggingface/transformers/issues/10479#issuecomment-788964822
    unfreeze_layer_list.append('shared.weight')
    return unfreeze_layer_list

def pp_model_freeze_layers(cfg, pp_model): 
    """Set `requires_grad` on every parameter of `pp_model.base_model` and return `pp_model`.

    With `cfg.unfreeze_last_n_layers == "all"` every parameter is made trainable. Otherwise a parameter stays 
    trainable only if its name contains one of the entries returned by `_get_layers_to_unfreeze(cfg)` 
    (the last few decoder layers, a decoder layernorm entry and the shared embedding weight that the 
    linear head is tied to); all other parameters are frozen."""
    if cfg.unfreeze_last_n_layers == "all":
        def keep_trainable(param_name): return True
    else:
        unfreeze_layer_list = _get_layers_to_unfreeze(cfg)
        def keep_trainable(param_name): return any(fragment in param_name for fragment in unfreeze_layer_list)
    for param_name, param in pp_model.base_model.named_parameters():
        param.requires_grad = keep_trainable(param_name)
    return pp_model


def save_pp_model(pp_model, optimizer, path_run, epoch): 
    """Save training state (for both pp_model and optimiser) as a checkpoint at a given epoch. 

    The checkpoint is written with `torch.save` to `f"{path_run}model_{epoch}"`. `path_run` is used as a plain 
    string prefix, so it should end with a path separator when it names a directory. 
    The saved dict has the keys `epoch`, `pp_model_state_dict` and `optimizer_state_dict`."""
    path = f"{path_run}model_{epoch}"
    checkpoint = {
        'epoch': epoch,
        'pp_model_state_dict': pp_model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    torch.save(checkpoint, path)
    
def resume_pp_model(pp_model, optimizer, path, map_location=None): 
    """Replace the training state with a saved checkpoint. Reinitialises both pp_model and optimiser state. 

    `path` is a checkpoint holding the keys `pp_model_state_dict` and `optimizer_state_dict` (as written by 
    `save_pp_model`). `map_location` is passed straight to `torch.load`, e.g. `"cpu"` to load a checkpoint 
    on a machine without the device it was saved from; the default `None` keeps `torch.load`'s usual behaviour. 
    Returns the tuple `(pp_model, optimizer)`, both updated in-place."""
    state = torch.load(path, map_location=map_location)
    pp_model.load_state_dict(state['pp_model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])
    return pp_model, optimizer

## Usage

### Basics

The main function is `prepare_models(cfg)`. This gives all the models and tokenizers needed for the other sections and also updates + returns the config. It takes care of embedding layer resizing and layer freezing too. 

In [None]:
# Build a config, then load all tokenizers and models; prepare_models also hands back the updated config.
cfg = Config()
vm_tokenizer, vm_model, pp_tokenizer, pp_model, sts_model, cfg = prepare_models(cfg)

### Loading and saving model checkpoints  

If needed you can save and reload models from a checkpoint with `save_pp_model()` and `resume_pp_model()`.

### Layer freezing

To do layer freezing on the pp model use the `pp_model_freeze_layers()` function. This will freeze all layers except:  

* the last `cfg.unfreeze_last_n_layers` layers of the decoder.
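* the decoder layernorm entry for the model (`decoder.layer_norm` for `tuner007/pegasus_paraphrase`, `decoder.layernorm_embedding` for the other two supported models) 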
* the LM head (which is tied to the input embeddings) which is given by `shared.weight`. 

At the moment I'm not sure whether the linear head and the layernorm layers should be unfrozen or left frozen. For now I am erring on the side of unfreezing them (i.e. making them trainable).  

To freeze all layers except the last few, set `cfg.unfreeze_last_n_layers` to an int giving the number of decoder layers to leave unfrozen. To unfreeze the whole model, set `cfg.unfreeze_last_n_layers = "all"`. 


Let's test it out. 

In [None]:
# Load the Pegasus paraphraser with only the last 3 decoder layers requested as unfrozen.
cfg = Config()
cfg.pp_name  = "tuner007/pegasus_paraphrase"
cfg.unfreeze_last_n_layers = 3
_, pp_model = _prepare_pp_tokenizer_and_model(cfg)
# NOTE(review): _prepare_pp_tokenizer_and_model already calls pp_model_freeze_layers; it is repeated here to show direct usage.
pp_model = pp_model_freeze_layers(cfg, pp_model)
print("Number of decoder layers to unfreeze:", cfg.unfreeze_last_n_layers)

Number of decoder layers to unfreeze: 3


We can check which layers are frozen and unfrozen: 

In [None]:
# Print index, name and requires_grad flag for every parameter of the base model.
for i, (name, param) in enumerate(pp_model.base_model.named_parameters()): print(i, name, param.requires_grad)

0 shared.weight True
1 encoder.embed_positions.weight False
2 encoder.layers.0.self_attn.k_proj.weight False
3 encoder.layers.0.self_attn.k_proj.bias False
4 encoder.layers.0.self_attn.v_proj.weight False
5 encoder.layers.0.self_attn.v_proj.bias False
6 encoder.layers.0.self_attn.q_proj.weight False
7 encoder.layers.0.self_attn.q_proj.bias False
8 encoder.layers.0.self_attn.out_proj.weight False
9 encoder.layers.0.self_attn.out_proj.bias False
10 encoder.layers.0.self_attn_layer_norm.weight False
11 encoder.layers.0.self_attn_layer_norm.bias False
12 encoder.layers.0.fc1.weight False
13 encoder.layers.0.fc1.bias False
14 encoder.layers.0.fc2.weight False
15 encoder.layers.0.fc2.bias False
16 encoder.layers.0.final_layer_norm.weight False
17 encoder.layers.0.final_layer_norm.bias False
18 encoder.layers.1.self_attn.k_proj.weight False
19 encoder.layers.1.self_attn.k_proj.bias False
20 encoder.layers.1.self_attn.v_proj.weight False
21 encoder.layers.1.self_attn.v_proj.bias False
22 encod

We can test too that we leave the LM head unfrozen. 

In [None]:
# Every parameter reported by the LM head must still be trainable after freezing.
for i, (name, param) in enumerate(pp_model.lm_head.named_parameters()):     assert param.requires_grad 

Let's try with no layer freezing.

In [None]:
# With "all", pp_model_freeze_layers marks every base-model parameter as trainable; check that, plus the LM head.
cfg.unfreeze_last_n_layers = "all"
pp_model = pp_model_freeze_layers(cfg, pp_model)
for i, (name, param) in enumerate(pp_model.base_model.named_parameters()): assert param.requires_grad
for i, (name, param) in enumerate(pp_model.lm_head.named_parameters()):    assert param.requires_grad 

Let's also test this works with the other models

In [None]:
# Repeat the freezing check for the other supported paraphrase models.
for pp_name in ["tdopierre/ProtAugment-ParaphraseGenerator", "eugenesiow/bart-paraphrase"]: 
    cfg = Config()
    cfg.pp_name  = pp_name
    # Set this on the fresh config and before loading, because loading the pp model already applies the freezing.
    cfg.unfreeze_last_n_layers = 2
    _, pp_model = _prepare_pp_tokenizer_and_model(cfg)
    pp_model = pp_model_freeze_layers(cfg, pp_model)
    print("Number of decoder layers to unfreeze:", cfg.unfreeze_last_n_layers)
    # Distinct names for the inner loops so the model name of the outer loop is not overwritten.
    for i, (param_name, param) in enumerate(pp_model.base_model.named_parameters()): print(i, param_name, param.requires_grad)
    for _, param in pp_model.lm_head.named_parameters():     assert param.requires_grad 
    print("\n#################################################\n")

Number of decoder layers to unfreeze: 2
0 shared.weight True
1 encoder.embed_positions.weight False
2 encoder.layers.0.self_attn.k_proj.weight False
3 encoder.layers.0.self_attn.k_proj.bias False
4 encoder.layers.0.self_attn.v_proj.weight False
5 encoder.layers.0.self_attn.v_proj.bias False
6 encoder.layers.0.self_attn.q_proj.weight False
7 encoder.layers.0.self_attn.q_proj.bias False
8 encoder.layers.0.self_attn.out_proj.weight False
9 encoder.layers.0.self_attn.out_proj.bias False
10 encoder.layers.0.self_attn_layer_norm.weight False
11 encoder.layers.0.self_attn_layer_norm.bias False
12 encoder.layers.0.fc1.weight False
13 encoder.layers.0.fc1.bias False
14 encoder.layers.0.fc2.weight False
15 encoder.layers.0.fc2.bias False
16 encoder.layers.0.final_layer_norm.weight False
17 encoder.layers.0.final_layer_norm.bias False
18 encoder.layers.1.self_attn.k_proj.weight False
19 encoder.layers.1.self_attn.k_proj.bias False
20 encoder.layers.1.self_attn.v_proj.weight False
21 encoder.layer

Number of decoder layers to unfreeze: 2
0 shared.weight True
1 encoder.embed_positions.weight False
2 encoder.layers.0.self_attn.k_proj.weight False
3 encoder.layers.0.self_attn.k_proj.bias False
4 encoder.layers.0.self_attn.v_proj.weight False
5 encoder.layers.0.self_attn.v_proj.bias False
6 encoder.layers.0.self_attn.q_proj.weight False
7 encoder.layers.0.self_attn.q_proj.bias False
8 encoder.layers.0.self_attn.out_proj.weight False
9 encoder.layers.0.self_attn.out_proj.bias False
10 encoder.layers.0.self_attn_layer_norm.weight False
11 encoder.layers.0.self_attn_layer_norm.bias False
12 encoder.layers.0.fc1.weight False
13 encoder.layers.0.fc1.bias False
14 encoder.layers.0.fc2.weight False
15 encoder.layers.0.fc2.bias False
16 encoder.layers.0.final_layer_norm.weight False
17 encoder.layers.0.final_layer_norm.bias False
18 encoder.layers.1.self_attn.k_proj.weight False
19 encoder.layers.1.self_attn.k_proj.bias False
20 encoder.layers.1.self_attn.v_proj.weight False
21 encoder.layer

#hide
## Export 

In [None]:
#hide
# Run nbdev's notebook-to-script conversion (the "Converted ..." lines below are its output).
from nbdev.export import notebook2script
notebook2script()

Converted 00_utils.ipynb.
Converted 03_config.ipynb.
Converted 07_models.ipynb.
Converted 10_data.ipynb.
Converted 20_trainer.ipynb.
Converted 30_logging.ipynb.
Converted index.ipynb.
Converted run.ipynb.
