# 10 - Pretraining CamemBERT

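Run the script below with:
- train_file: the local absolute path to the pre-training text file. In this notebook it is `/tmp/pretraining_data.txt`, the filtered version of paper-ner-bench-das22/dataset/unsupervised_pretraining/10-normalized/all.txt.gz created in section 11
- output_dir: where to save the pretrained model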

In [1]:
""" Loads the configuration """

# Set to 1/true to set the logging level of nerlogger to DEBUG
# and save the spaCy datasets as TXT along with the .spacy file,
# for easier debugging of the training set generation.
%env DEBUG=1

# If True, activates a set of assertions in the notebooks to ensure
# that the scripts run with the parameters used in the paper.
%env AS_IN_THE_PAPER = True

# NOTE(review): presumably util.config reads the two environment variables
# set above, which is why they are defined before this import — confirm.
import util.config as config

# Log the resolved configuration (base directories, debug mode, random seed,
# reproducibility checks).
config.show()

env: DEBUG=1
env: AS_IN_THE_PAPER=True


18/05/2022 03:32:38 ; INFO ; BASEDIR: /home/bertrand/paper-ner-bench-das22
18/05/2022 03:32:38 ; INFO ; Input datasets will be loaded from DATASETDIR /home/bertrand/paper-ner-bench-das22/dataset
18/05/2022 03:32:38 ; INFO ; Training data and models will be saved to NERDIR /home/bertrand/paper-ner-bench-das22/src/ner
18/05/2022 03:32:38 ; INFO ; Debug mode is ON
18/05/2022 03:32:38 ; INFO ; Random seed: 42
18/05/2022 03:32:38 ; INFO ; Enable reproducibility checks: True


In [2]:
""" Import all modules at once """

# General imports
import nltk
import gzip

# NER imports
from util.as_in_the_paper import assert_expected


## 11 - Creation of the pre-training set from raw entries extracted with OCR

In [15]:
import nltk

pretraining_set_path = config.DATASETDIR / "unsupervised_pretraining/10-normalized"

# Filter the normalized pre-training set:
# keep only the entries containing at least MIN_WORDS_PER_ENTRY words.
MIN_WORDS_PER_ENTRY = 7 # Keeps simple entries like "Morel abbé ,Tournon, 14."

assert_expected(actual=MIN_WORDS_PER_ENTRY, expected=7)

nltk.download("punkt")
number_of_entries = 0
valid_entries = []

# The input holds one entry per line; every line is counted, only the long
# enough ones are kept.
with gzip.open(pretraining_set_path / "all.txt.gz", "rt") as all_txt:
    for entry in all_txt:
        entry = entry.strip() # Sanitize
        # NOTE(review): with preserve_line=True NLTK should skip sentence
        # splitting, so the `language` argument is likely unused here — confirm
        # before changing "fr".
        words = nltk.word_tokenize(entry, language="fr", preserve_line=True)
        if len(words) >= MIN_WORDS_PER_ENTRY:
            valid_entries.append(entry)
        number_of_entries += 1

# One entry per line; this is the file passed as --train_file to the
# pretraining command in section 12.
with open("/tmp/pretraining_data.txt", "w") as wfp:
    wfp.write("\n".join(valid_entries))

# Share of kept entries, as a real percentage; guarded so that an empty input
# file does not raise ZeroDivisionError.
valid_percent = 100 * len(valid_entries) / number_of_entries if number_of_entries else 0.0

print(len(valid_entries))
print("Valid entries:%d, %.2f percent of the total" % (len(valid_entries), valid_percent))

# Reproducibility checks: the measured values are `actual`, the values
# reported in the paper are `expected` (same convention as the
# MIN_WORDS_PER_ENTRY check above).
assert_expected(actual=len(valid_entries), expected=845014)
assert_expected(actual=number_of_entries, expected=1045674)

[nltk_data] Downloading package punkt to /home/bertrand/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


845014
Valid entries:845014, 0.808105.2 percent of the total


## 12 - Pretraining CamemBERT on pretraining_data.txt

In [16]:
!python pretrain_camembert.py --model_name_or_path "Jean-Baptiste/camembert-ner" --train_file "/tmp/pretraining_data.txt" --do_train --do_eval --line_by_line --output_dir "${config.BASEDIR}/10-camembert_pretrained_model" --save_total_limit 1 --load_best_model_at_end True --evaluation_strategy "steps" --save_strategy "steps"

05/18/2022 16:05:46 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=500,
evaluation_strategy=IntervalStrategy.STEPS,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=False,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_met