In [1]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
from tokenizers import ByteLevelBPETokenizer
from fastai.text.all import *
import torch
import os

In [2]:
# Load the pre-trained English GPT-2 tokenizer and language model.
pretrained_weights = 'gpt2'
tokenizer_en = GPT2TokenizerFast.from_pretrained(pretrained_weights)
model_en = GPT2LMHeadModel.from_pretrained(pretrained_weights)
# Use the end-of-text token as padding token for the English tokenizer.
tokenizer_en.pad_token = tokenizer_en.eos_token

# 1. Train a byte-level BPE tokenizer on the Spanish Wikipedia text,
#    with the same vocabulary size as the English GPT-2 tokenizer.
ByteLevelBPE_tokenizer_es_vocab_size = tokenizer_en.vocab_size
ByteLevelBPE_tokenizer_es = ByteLevelBPETokenizer()
paths = ["es_wiki/all_texts_eswiki.txt"]
ByteLevelBPE_tokenizer_es.train(
    files=paths,
    vocab_size=ByteLevelBPE_tokenizer_es_vocab_size,
    min_frequency=2,
    special_tokens=["<|endoftext|>"])
ByteLevelBPE_tokenizer_es.enable_truncation(max_length=1024)

# 2. Save the trained tokenizer files.
#    `exist_ok=True` makes the cell safe to re-run and avoids the
#    check-then-create race of `os.path.exists` + `os.mkdir`.
ByteLevelBPE_tokenizer_es_rep = 'ByteLevelBPE_tokenizer_es'
path_to_ByteLevelBPE_tokenizer_es_rep = "es_wiki/" + ByteLevelBPE_tokenizer_es_rep
os.makedirs(path_to_ByteLevelBPE_tokenizer_es_rep, exist_ok=True)
ByteLevelBPE_tokenizer_es.save_model(path_to_ByteLevelBPE_tokenizer_es_rep)

# 3. Import the Spanish tokenizer config files into a GPT2TokenizerFast,
#    again using the end-of-text token for padding.
tokenizer_es = GPT2TokenizerFast.from_pretrained(
    path_to_ByteLevelBPE_tokenizer_es_rep,
    pad_token='<|endoftext|>')
# Cap the sequence length at 1024 tokens.
tokenizer_es.model_max_length = 1024






In [3]:
class TransformersTokenizer(Transform):
    "fastai `Transform` wrapping a Hugging Face tokenizer: encodes text to token ids, decodes ids to text."

    def __init__(self, tokenizer):
        # The wrapped tokenizer; it must offer `tokenize`, `convert_tokens_to_ids` and `decode`.
        self.tokenizer = tokenizer

    def encodes(self, x):
        "Tokenize `x` and return the matching token ids as a tensor."
        token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(x))
        return tensor(token_ids)

    def decodes(self, x):
        "Turn the tensor of ids `x` back into text, wrapped in a `TitledStr`."
        ids = x.cpu().numpy()
        return TitledStr(self.tokenizer.decode(ids))


In [4]:
# Wrap the English and Spanish tokenizers in the `TransformersTokenizer`
# transform so they can be used inside fastai pipelines.
tokenizer_fastai_en = TransformersTokenizer(tokenizer_en)
tokenizer_fastai_es = TransformersTokenizer(tokenizer_es)

In [5]:
# Re-load the pre-trained English GPT-2 tokenizer and language model.
# GPT2TokenizerFast and GPT2LMHeadModel are already imported in the first cell,
# so the import is not repeated here.
pretrained_weights = 'gpt2'
tokenizer_en = GPT2TokenizerFast.from_pretrained(pretrained_weights)
model_en = GPT2LMHeadModel.from_pretrained(pretrained_weights)

# To silence the warning about the missing pad token (GPT2TokenizerFast),
# reuse the end-of-text token as padding token.
# source: https://github.com/huggingface/transformers/issues/2648#issuecomment-616177044
tokenizer_en.pad_token = tokenizer_en.eos_token

In [6]:
# Directory holding the saved Spanish byte-level BPE tokenizer files.
ByteLevelBPE_tokenizer_es_rep = 'ByteLevelBPE_tokenizer_es'
path_to_ByteLevelBPE_tokenizer_es_rep = f"es_wiki/{ByteLevelBPE_tokenizer_es_rep}"

# Build a GPT2TokenizerFast from those Spanish files, padding with the
# end-of-text token.
tokenizer_es = GPT2TokenizerFast.from_pretrained(path_to_ByteLevelBPE_tokenizer_es_rep, pad_token='<|endoftext|>')

# Limit sequences to 1024 tokens.
tokenizer_es.model_max_length = 1024

In [7]:
# Wrap both tokenizers for fastai.
tokenizer_fastai_en = TransformersTokenizer(tokenizer_en)
tokenizer_fastai_es = TransformersTokenizer(tokenizer_es)

# Vocabulary sizes of the English (old) and Spanish (new) tokenizers.
old_vocab_size = tokenizer_fastai_en.tokenizer.vocab_size
new_vocab_size = tokenizer_fastai_es.tokenizer.vocab_size

# Display (old size, new size, difference).
(old_vocab_size, new_vocab_size, old_vocab_size - new_vocab_size)

(50257, 50257, 0)

In [8]:
# Detached copy of the English input-embedding matrix (the old wte).
old_wgts = model_en.transformer.get_input_embeddings().weight.detach().clone()

# Mean embedding vector of the old wte (average over the vocabulary axis).
wgts_m = old_wgts.mean(dim=0)

# Zero-filled matrix for the new wte: one row per Spanish token, same
# embedding width, dtype and device as the old weights.
new_vocab_size = tokenizer_fastai_es.tokenizer.vocab_size
new_wgts = old_wgts.new_zeros(new_vocab_size, old_wgts.shape[1])

In [9]:
# Build the new wte, keeping the embedding vectors of tokens present in both vocabs.
# A token present in the new vocab but not in the old one gets the mean embedding vector of the old wte.
old_vocab = tokenizer_fastai_en.tokenizer.get_vocab()
new_vocab = tokenizer_fastai_es.tokenizer.get_vocab()
same_tokens_list = list()
different_tokens_list = list()

for w,idx_new in new_vocab.items():
    idx_old = old_vocab.get(w, -1)
    if idx_old>=0:
        # Shared token: reuse the English embedding row.
        new_wgts[idx_new] = old_wgts[idx_old]
        same_tokens_list.append((w,idx_new))
    else:
        # Spanish-only token: start from the mean English embedding.
        new_wgts[idx_new] = wgts_m
        different_tokens_list.append((w,idx_new))

# Install the new wte in the model.
new_wte = nn.Embedding(new_vocab_size,old_wgts.size(1))
new_wte.weight.data = new_wgts
model_en.transformer.set_input_embeddings(new_wte)
print(f'Spanish wte matrix setup done!\n\nWe kept {len(same_tokens_list)} embeddings vectors from the English one.\nWe did not kept {len(different_tokens_list)} embeddings vectors from the English one (instead, we used the old wte mean vector).\n')

# Check identical tokens between the 2 vocabs
num = 15
print(f'{num} first tokens IN common between the 2 vocabs:\n{same_tokens_list[:num]}\n')
print(f'{num} first tokens NOT in common between the 2 vocabs:\n{different_tokens_list[:num]}')

# The output directory is not created anywhere else in the notebook, so make
# sure it exists before saving (no-op when it is already there).
os.makedirs("ByteLevelBPE_tokenizer_es", exist_ok=True)

# save new_wgts
torch.save(new_wgts, "ByteLevelBPE_tokenizer_es/new_wte_wgts.pt")
# save same_tokens_list and different_tokens_list
torch.save(same_tokens_list, "ByteLevelBPE_tokenizer_es/same_tokens_list.pt")
torch.save(different_tokens_list, 'ByteLevelBPE_tokenizer_es/different_tokens_list.pt')

Spanish wte matrix setup done!

We kept 11804 embeddings vectors from the English one.
We did not kept 38453 embeddings vectors from the English one (instead, we used the old wte mean vector).

15 first tokens IN common between the 2 vocabs:
[('ĠPRO', 29654), ('cott', 31077), ('pat', 36506), ('ĠAud', 36442), ('bos', 4001), ('001', 46539), ('tell', 9109), ('Ġti', 467), ('Russ', 49643), ('jo', 473), ('ĠVision', 34988), ('ĠVic', 2417), ('ner', 2717), ('ĠPrice', 23716), ('ading', 28422)]

15 first tokens NOT in common between the 2 vocabs:
[('Ġdisputan', 26419), ('crip', 6661), ('ĠabarcÃ³', 38367), ('Ġarreglista', 39012), ('ĠChacarita', 41312), ('ĠTormes', 41773), ('Ġnotas', 10256), ('ĠYuca', 12397), ('Ġmara', 9540), ('Ġvulnera', 12630), ('ĠdocumentaciÃ³n', 13212), ('ĠLeopoldo', 15383), ('Ġpatrocina', 18089), ('Ġdroga', 20668), ('Ġopa', 27065)]


In [10]:
# Restore the saved Spanish wte weights and the two token lists from disk.
# This cell relies on `new_vocab_size`, `old_wgts` and `model_en` from earlier cells.
new_wgts = torch.load('ByteLevelBPE_tokenizer_es/new_wte_wgts.pt')
same_tokens_list = torch.load('ByteLevelBPE_tokenizer_es/same_tokens_list.pt')
different_tokens_list = torch.load('ByteLevelBPE_tokenizer_es/different_tokens_list.pt')

# Install a fresh embedding layer carrying the restored weights.
new_wte = nn.Embedding(new_vocab_size, old_wgts.shape[1])
new_wte.weight.data = new_wgts
model_en.transformer.set_input_embeddings(new_wte)

# Summary of how many embedding rows were reused vs. mean-initialised.
print(
    f'Spanish wte matrix setup done!',
    f'We kept {len(same_tokens_list)} embeddings vectors from the English one.',
    f'We did not kept {len(different_tokens_list)} embeddings vectors from the English one (instead, we used the old wte mean vector).\n',
    sep='\n',
)

# Peek at the first few tokens of each list.
num = 15
print(f'{num} first tokens IN common between the 2 vocabs:\n{same_tokens_list[:num]}\n')
print(f'{num} first tokens NOT in common between the 2 vocabs:\n{different_tokens_list[:num]}')

Spanish wte matrix setup done!
We kept 11804 embeddings vectors from the English one.
We did not kept 38453 embeddings vectors from the English one (instead, we used the old wte mean vector).

15 first tokens IN common between the 2 vocabs:
[('ĠPRO', 29654), ('cott', 31077), ('pat', 36506), ('ĠAud', 36442), ('bos', 4001), ('001', 46539), ('tell', 9109), ('Ġti', 467), ('Russ', 49643), ('jo', 473), ('ĠVision', 34988), ('ĠVic', 2417), ('ner', 2717), ('ĠPrice', 23716), ('ading', 28422)]

15 first tokens NOT in common between the 2 vocabs:
[('Ġdisputan', 26419), ('crip', 6661), ('ĠabarcÃ³', 38367), ('Ġarreglista', 39012), ('ĠChacarita', 41312), ('ĠTormes', 41773), ('Ġnotas', 10256), ('ĠYuca', 12397), ('Ġmara', 9540), ('Ġvulnera', 12630), ('ĠdocumentaciÃ³n', 13212), ('ĠLeopoldo', 15383), ('Ġpatrocina', 18089), ('Ġdroga', 20668), ('Ġopa', 27065)]


In [11]:
# Point the language-model head at the input-embedding weight so that both
# layers use the very same parameter (the wte was replaced in an earlier cell).
model_en.lm_head.weight = model_en.transformer.wte.weight
# Display the head to check its dimensions.
model_en.lm_head

Linear(in_features=768, out_features=50257, bias=False)

In [12]:
import pandas as pd
import numpy as np

In [13]:
# Read the Spanish Wikipedia articles CSV and report how many rows it has.
lang = 'es'
fname = f'all_texts_{lang}wiki.csv'
df = pd.read_csv(f"es_wiki/{fname}")
print(df.shape[0])

200000


In [14]:
# Small subset used for quick experiments.
df_sample = df[:1000]

# 80% of the rows go to training, the rest to validation.
num = int(0.8*len(df_sample))

# Shuffle the row positions WITHOUT replacement. `np.random.randint` draws with
# replacement, which duplicates rows, lets the same row land in both train and
# validation, and leaves other rows unused.
idxs = np.random.permutation(len(df_sample))
idxs_train = idxs[:num]
idxs_val = idxs[num:]

In [15]:
# Training texts first, validation texts after them.
all_texts = np.concatenate([df_sample.iloc[idxs_train].text.values, df_sample.iloc[idxs_val].text.values])
# `all_texts` is already reordered, so the splits must be POSITIONS in
# `all_texts` (train block, then validation block), not the original
# dataframe row indices.
splits = [list(range(len(idxs_train))), list(range(len(idxs_train), len(all_texts)))]
tls = TfmdLists(all_texts, TransformersTokenizer(tokenizer_es), splits=splits, dl_type=LMDataLoader)

In [17]:
# 80% of the rows go to training, the rest to validation.
num = int(0.8*len(df))

# Shuffle the row positions WITHOUT replacement. `np.random.randint` draws with
# replacement, which duplicates rows, lets the same row land in both train and
# validation, and leaves other rows unused.
idxs = np.random.permutation(len(df))
idxs_train = idxs[:num]
idxs_val = idxs[num:]

# Make sure the output directory exists (no-op when it is already there).
os.makedirs('ByteLevelBPE_tokenizer_es', exist_ok=True)

# Save the train and validation indices so the split can be reloaded later.
torch.save(idxs_train, 'ByteLevelBPE_tokenizer_es/idxs_train.pt')
torch.save(idxs_val, 'ByteLevelBPE_tokenizer_es/idxs_val.pt')

In [None]:
# Reload the previously saved train / validation indices.
# NOTE(review): these files hold NumPy arrays written with `torch.save`; PyTorch
# releases where `torch.load` defaults to `weights_only=True` may refuse to
# unpickle them — confirm against the installed version.
idxs_train = torch.load('ByteLevelBPE_tokenizer_es/idxs_train.pt')
idxs_val = torch.load('ByteLevelBPE_tokenizer_es/idxs_val.pt')

In [18]:
# Gather the article texts in split order: training rows first, validation rows after.
all_texts = np.concatenate([df.iloc[idxs_train].text.values, df.iloc[idxs_val].text.values])

In [20]:
# `all_texts` is already ordered as [train texts..., validation texts...], so
# the splits must be POSITIONS in `all_texts`, not the original dataframe row
# indices.
splits = [list(range(len(idxs_train))), list(range(len(idxs_train), len(all_texts)))]
tls = TfmdLists(all_texts, TransformersTokenizer(tokenizer_es), splits=splits, dl_type=LMDataLoader)

In [22]:
# Batch size and sequence length. The sequence length must not exceed 1024,
# the maximum length configured on the tokenizers above and the number of
# positions pre-trained GPT-2 supports; 1082 would overflow the position
# embeddings.
bs,sl = 8, 1024
dls = tls.dataloaders(bs=bs, seq_len=sl)

KeyboardInterrupt: 

In [None]:
class DropOutput(Callback):
    "Callback that replaces the learner's prediction with its first element."

    def after_pred(self):
        # Only element 0 of the model output is kept as the prediction.
        first_output = self.pred[0]
        self.learn.pred = first_output

In [None]:
def splitter(model):
    "Split a GPT2 `model` into 4 parameter groups for differential learning rates."
    decoder_blocks = model.transformer.h

    def block_group(first, last):
        # Bundle decoder blocks `first` .. `last - 1` into a single module.
        return nn.Sequential(*[decoder_blocks[i] for i in range(first, last)])

    # Groups 1-3: decoder blocks 0-3, 4-7 and 8-11.
    layer_groups = [block_group(0, 4), block_group(4, 8), block_group(8, 12)]

    # Group 4: embedding matrices wte and wpe plus the LayerNorm at the model output.
    layer_groups.append(
        nn.Sequential(model.transformer.wte, model.transformer.wpe, model.transformer.ln_f)
    )

    # One list of parameters per group.
    return L(layer_groups).map(params)

In [23]:
# Assemble the fastai Learner: flattened cross-entropy loss, the layer-group
# splitter, the DropOutput callback, accuracy and perplexity as metrics,
# then switch to mixed-precision (fp16) training.
learn = Learner(
    dls,
    model_en,
    loss_func=CrossEntropyLossFlat(),
    splitter=splitter,
    cbs=[DropOutput],
    metrics=[accuracy, Perplexity()],
).to_fp16()

NameError: name 'splitter' is not defined

In [None]:
# Evaluate the freshly built learner on the validation set.
learn.validate()