In [1]:
import torch

import os

from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

#from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from tokenizers.processors import RobertaProcessing

from transformers import RobertaConfig
from transformers import RobertaTokenizerFast
from transformers import RobertaForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline



from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

  from pandas.core import (


In [3]:
MODELS = {'fr_10M_10K_wiki': {'file':'./data/fr_10M_wiki/wiki_fr_10M.txt', 'vocab_size' : 10000, 'min_freq': 2},
          'fr_10M_4K_wiki': {'file':'./data/fr_10M_wiki/wiki_fr_10M.txt', 'vocab_size' : 4000, 'min_freq': 2},
          'fr_10M_52K_wiki': {'file':'./data/fr_10M_wiki/wiki_fr_10M.txt', 'vocab_size' : 52000, 'min_freq': 2},
          'fr_10M_10K_conv': {'file':'./data/fr_10M_conv/fr_10M_conv.txt', 'vocab_size' : 10000, 'min_freq': 2},
          'fr_10M_4K_conv': {'file':'./data/fr_10M_conv/fr_10M_conv.txt', 'vocab_size' : 4000, 'min_freq': 2},
          'fr_10M_52K_conv': {'file':'./data/fr_10M_conv/fr_10M_conv.txt', 'vocab_size' : 52000, 'min_freq': 2},
          'fr_10M_30K_wiki': {'file':'./data/fr_10M_wiki/wiki_fr_10M.txt', 'vocab_size' : 30000, 'min_freq': 2},
          'fr_10M_30K_conv': {'file':'./data/fr_10M_conv/fr_10M_conv.txt', 'vocab_size' : 30000, 'min_freq': 2},

         }

FOLDER_MODELS = './models/'

# Train a tokenizer on the data

In [32]:
def train_tokenizer(folder_models, model_name):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.normalizer = normalizers.Sequence([normalizers.Replace('-',''),normalizers.BertNormalizer(lowercase=True)])
    tokenizer.train(files=MODELS[model_name]['file'], 
                    vocab_size=MODELS[model_name]['vocab_size'],
                    min_frequency=MODELS[model_name]['min_freq'], 
                    special_tokens=["<s>","<pad>","</s>","<unk>","<mask>"])

    tokenizer.save_model(folder_models+model_name)
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
        )
    
    tokenizer.enable_truncation(max_length=512)
                           
    tokenizer = RobertaTokenizerFast.from_pretrained(folder_models+model_name, max_len=512)
    return tokenizer


In [33]:
def create_trainer(model_name,tokenizer):
    config = RobertaConfig(
        vocab_size=MODELS[model_name]['vocab_size'],
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )
    
    model = RobertaForMaskedLM(config=config)
    #model.num_parameters()   
 
    # Create Dataset
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=MODELS[model_name]['file'],
        block_size=128,
    )
    print(dataset)
    
    # Create Data collator
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    
    # Define other args
    training_args = TrainingArguments(
    output_dir=FOLDER_MODELS+model_name,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_steps=10000,
    save_total_limit=2,
    prediction_loss_only=True,
    )
    
    # Define Trainer
    return Trainer(model=model,args=training_args,data_collator=data_collator,train_dataset=dataset)
    

In [27]:
def create_and_save_model(folder_models, model_name):
    os.mkdir(folder_models+model_name)
    print('Training Tokenizer')
    tokenizer = train_tokenizer(FOLDER_MODELS,model_name)
    print('Training Model')
    trainer = create_trainer(model_name,tokenizer)
    trainer.train()
    trainer.save_model(FOLDER_MODELS+model_name)
    return 0
    

In [28]:
%%time

create_and_save_model(FOLDER_MODELS,'fr_10M_10K_wiki')


Training Tokenizer



Training Model
<transformers.data.datasets.language_modeling.LineByLineTextDataset object at 0x7d6b6518e210>


Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Step,Training Loss
500,6.1246
1000,5.3438
1500,4.7938
2000,4.4602
2500,4.176
3000,4.0014
3500,3.8466
4000,3.7322
4500,3.629
5000,3.5331


CPU times: user 43min 54s, sys: 9.06 s, total: 44min 3s
Wall time: 42min 9s


0

In [35]:
%%time

for mod in MODELS.keys():
    create_and_save_model(FOLDER_MODELS,mod)


Training Tokenizer



Training Model


Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


<transformers.data.datasets.language_modeling.LineByLineTextDataset object at 0x7d6a54c47650>


Step,Training Loss
500,5.8795
1000,5.209
1500,4.6518
2000,4.2834
2500,4.0085
3000,3.8111
3500,3.6627
4000,3.5446
4500,3.446
5000,3.3718


Training Tokenizer







Training Model


Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


<transformers.data.datasets.language_modeling.LineByLineTextDataset object at 0x7d6a4a143610>


Step,Training Loss
500,6.4754
1000,5.4066
1500,4.8121
2000,4.5118
2500,4.2434
3000,4.0496
3500,3.9237
4000,3.8012
4500,3.718
5000,3.6444


Training Tokenizer







Training Model


Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


<transformers.data.datasets.language_modeling.LineByLineTextDataset object at 0x7d6a4520b610>


Step,Training Loss
500,5.7506
1000,4.6114
1500,4.0986
2000,3.7831
2500,3.5767
3000,3.4478
3500,3.3054
4000,3.2195
4500,3.1551
5000,3.1135


Training Tokenizer



Training Model


Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


<transformers.data.datasets.language_modeling.LineByLineTextDataset object at 0x7d6a2f77fad0>


Step,Training Loss
500,5.4813
1000,4.4198
1500,3.9345
2000,3.6393
2500,3.437
3000,3.2902
3500,3.182
4000,3.0921
4500,3.0037
5000,2.9409


Training Tokenizer







Training Model


Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


<transformers.data.datasets.language_modeling.LineByLineTextDataset object at 0x7d6a4f8edf50>


Step,Training Loss
500,6.1348
1000,4.7499
1500,4.2526
2000,3.8999
2500,3.7006
3000,3.5538
3500,3.4732
4000,3.3373
4500,3.2676
5000,3.1764


CPU times: user 4h 21min 17s, sys: 1min 9s, total: 4h 22min 27s
Wall time: 4h 3min 59s


# Test the model

In [4]:
for mod in MODELS.keys():
    print(mod)
    fill_mask = pipeline("fill-mask",model=FOLDER_MODELS+mod,tokenizer=FOLDER_MODELS+mod)
    print(fill_mask("la petite <mask> dort"))
    print(fill_mask("ah d' <mask> oui"))
    


fr_10M_10K_wiki


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'score': 0.011491399258375168, 'token': 654, 'token_str': ' aussi', 'sequence': 'la petite aussi dort'}, {'score': 0.011219908483326435, 'token': 268, 'token_str': ' de', 'sequence': 'la petite de dort'}, {'score': 0.010336298495531082, 'token': 406, 'token_str': ' plus', 'sequence': 'la petite plus dort'}, {'score': 0.00953746773302555, 'token': 927, 'token_str': ' partie', 'sequence': 'la petite partie dort'}, {'score': 0.0091394716873765, 'token': 586, 'token_str': ' pays', 'sequence': 'la petite pays dort'}]
[{'score': 0.08441455662250519, 'token': 311, 'token_str': 'un', 'sequence': "ah d'un oui"}, {'score': 0.037614110857248306, 'token': 330, 'token_str': 'est', 'sequence': "ah d'est oui"}, {'score': 0.037533558905124664, 'token': 1214, 'token_str': 'origine', 'sequence': "ah d'origine oui"}, {'score': 0.03541676327586174, 'token': 347, 'token_str': 'une', 'sequence': "ah d'une oui"}, {'score': 0.022666728124022484, 'token': 1057, 'token_str': 'art', 'sequence': "ah d'art oui"}

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'score': 0.020219974219799042, 'token': 268, 'token_str': ' de', 'sequence': 'la petite de dort'}, {'score': 0.013238407671451569, 'token': 569, 'token_str': ' deux', 'sequence': 'la petite deux dort'}, {'score': 0.012288931757211685, 'token': 301, 'token_str': ' des', 'sequence': 'la petite des dort'}, {'score': 0.01215873472392559, 'token': 1838, 'token_str': ' premiers', 'sequence': 'la petite premiers dort'}, {'score': 0.008466473780572414, 'token': 406, 'token_str': ' plus', 'sequence': 'la petite plus dort'}]
[{'score': 0.3145544230937958, 'token': 311, 'token_str': 'un', 'sequence': "ah d'un oui"}, {'score': 0.2018461674451828, 'token': 347, 'token_str': 'une', 'sequence': "ah d'une oui"}, {'score': 0.031564027070999146, 'token': 1058, 'token_str': 'autres', 'sequence': "ah d'autres oui"}, {'score': 0.029208842664957047, 'token': 1214, 'token_str': 'origine', 'sequence': "ah d'origine oui"}, {'score': 0.015505088493227959, 'token': 294, 'token_str': 'il', 'sequence': "ah d'il 

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'score': 0.17534729838371277, 'token': 1212, 'token_str': ' fille', 'sequence': 'la petite fille dort'}, {'score': 0.08288508653640747, 'token': 575, 'token_str': ' maman', 'sequence': 'la petite maman dort'}, {'score': 0.04977859929203987, 'token': 897, 'token_str': ' petite', 'sequence': 'la petite petite dort'}, {'score': 0.04546881094574928, 'token': 2039, 'token_str': ' dame', 'sequence': 'la petite dame dort'}, {'score': 0.022174082696437836, 'token': 361, 'token_str': ' qui', 'sequence': 'la petite qui dort'}]
[{'score': 0.8731542825698853, 'token': 606, 'token_str': 'accord', 'sequence': "ah d'accord oui"}, {'score': 0.03843608498573303, 'token': 1450, 'token_str': 'abord', 'sequence': "ah d'abord oui"}, {'score': 0.019755449146032333, 'token': 1647, 'token_str': 'ailleurs', 'sequence': "ah d'ailleurs oui"}, {'score': 0.00864246767014265, 'token': 1415, 'token_str': 'autres', 'sequence': "ah d'autres oui"}, {'score': 0.003066019853577018, 'token': 832, 'token_str': 'autre', '