In [1]:
import torch

from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

#from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from tokenizers.processors import RobertaProcessing

from transformers import RobertaConfig
from transformers import RobertaTokenizerFast
from transformers import RobertaForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline



from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

  from pandas.core import (


In [9]:
# Plain-text corpus used both to fit the tokenizer and to pre-train the model.
# Alternative corpus (conversational): './data/fr_10M_conv/fr_10M_conv.txt'
PRETRAINING_CORPUS_FILE = "./data_raw_txt/fr_10M_wiki/wiki_fr_10M.txt"

#CORPUS_FOLDER = './data/fr_10M_wiki/'

# Sub-directory of ./models/ that receives the tokenizer files and the
# model checkpoints of this run.
# Alternative run name: 'fr_10M_4K_wiki'
MODEL_NAME = "fr_10M_10K_wiki"

# Train a tokenizer on the data

In [10]:
%%time 

#paths = [str(x) for x in Path(CORPUS_FOLDER).glob("**/*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

tokenizer.normalizer = normalizers.Sequence([normalizers.Replace('-',''),normalizers.BertNormalizer(lowercase=True)])

# Customize training
tokenizer.train(files=[PRETRAINING_CORPUS_FILE], vocab_size=10000, min_frequency=5, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

tokenizer.save_model('./models/'+MODEL_NAME)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

tokenizer.enable_truncation(max_length=512)

tokenizer = RobertaTokenizerFast.from_pretrained("./models/"+MODEL_NAME, max_len=512)






CPU times: user 50 s, sys: 3.02 s, total: 53 s
Wall time: 3.14 s




# Train model

In [11]:
# Architecture of the masked language model, trained from scratch
# (no pretrained weights are loaded).
config = RobertaConfig(
    # Embedding tables
    vocab_size=10000,             # same value as the tokenizer's vocab_size
    max_position_embeddings=514,
    type_vocab_size=1,
    # Transformer stack
    num_hidden_layers=6,
    num_attention_heads=12,
)

# Randomly initialised RoBERTa with a masked-LM head.
model = RobertaForMaskedLM(config)

# Displayed as the cell result: total number of parameters.
model.num_parameters()

51206416

In [12]:
%%time

# Create Dataset
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=PRETRAINING_CORPUS_FILE,
    block_size=128,
)

# Create Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Define other args
training_args = TrainingArguments(
    output_dir="./models/"+MODEL_NAME,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_steps=10000,
    save_total_limit=2,
    prediction_loss_only=True,
)


# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)


Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


CPU times: user 1min 7s, sys: 616 ms, total: 1min 8s
Wall time: 9.48 s


# Run Trainer

In [13]:
%%time
trainer.train()

# Save Models
trainer.save_model("./models/"+MODEL_NAME)

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Step,Training Loss
500,6.1212
1000,5.3054
1500,4.7488
2000,4.4253
2500,4.1538
3000,3.984
3500,3.8375
4000,3.7185
4500,3.617
5000,3.5311


CPU times: user 40min 25s, sys: 2.67 s, total: 40min 28s
Wall time: 40min 27s


# Test the model

In [25]:
# Load the trained model and its tokenizer from the run directory into a
# fill-mask pipeline for quick qualitative checks.
fill_mask = pipeline(
    "fill-mask",
    model="./models/"+MODEL_NAME,
    tokenizer="./models/"+MODEL_NAME,
    # Without an explicit device the pipeline stays on CPU even when a GPU is
    # present; use the first GPU if there is one, otherwise CPU (-1).
    device=0 if torch.cuda.is_available() else -1,
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [27]:
# Qualitative check: ask the model to fill the <mask> slot in a short French
# sentence. The cell result lists the candidate tokens with their scores.
fill_mask("la petite <mask> dort")

[{'score': 0.2275107204914093,
  'token': 451,
  'token_str': ' elle',
  'sequence': 'la petite elle dort'},
 {'score': 0.14483419060707092,
  'token': 1212,
  'token_str': ' fille',
  'sequence': 'la petite fille dort'},
 {'score': 0.10206688940525055,
  'token': 897,
  'token_str': ' petite',
  'sequence': 'la petite petite dort'},
 {'score': 0.08099895715713501,
  'token': 575,
  'token_str': ' maman',
  'sequence': 'la petite maman dort'},
 {'score': 0.019143374636769295,
  'token': 912,
  'token_str': ' voiture',
  'sequence': 'la petite voiture dort'}]

In [28]:
# Qualitative check on an elided form: the <mask> slot directly follows "d'".
# The cell result lists the candidate tokens with their scores.
fill_mask("ah d' <mask> oui")

[{'score': 0.8823479413986206,
  'token': 606,
  'token_str': 'accord',
  'sequence': "ah d'accord oui"},
 {'score': 0.03516211733222008,
  'token': 1450,
  'token_str': 'abord',
  'sequence': "ah d'abord oui"},
 {'score': 0.023529747501015663,
  'token': 1647,
  'token_str': 'ailleurs',
  'sequence': "ah d'ailleurs oui"},
 {'score': 0.011113138869404793,
  'token': 1415,
  'token_str': 'autres',
  'sequence': "ah d'autres oui"},
 {'score': 0.006881808862090111,
  'token': 832,
  'token_str': 'autre',
  'sequence': "ah d'autre oui"}]