# M2 - 210 - Experiment #1 - Joint-Labelling

This notebook fine-tunes an NER model on a specialized dataset.

In [1]:
import os
# Select GPU index 1 through CUDA_VISIBLE_DEVICES, then echo the value as a sanity check.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
print(os.getenv("CUDA_VISIBLE_DEVICES"))

1


In [2]:
import torch
# Release cached GPU memory that may be left over from a previous run in this kernel.
# A single call is enough: empty_cache() does not depend on autograd state, so
# wrapping it in torch.no_grad() adds nothing.
torch.cuda.empty_cache()

# Keep this as the last expression so the notebook displays whether a GPU is usable.
torch.cuda.is_available()

In [4]:
import os, sys
from pathlib import Path

# BASE is the current working directory of the kernel, i.e. the notebook's directory
# when Jupyter is started from there. (The previous realpath("__file__") form used a
# string literal, so it also resolved to the working directory.)
BASE = Path.cwd().resolve()

# Project root. Set the DAN_CADASTRE_ROOT environment variable to run on another machine;
# the default keeps the original location.
PROJECT_ROOT = Path(os.environ.get("DAN_CADASTRE_ROOT", "/home/STual/DAN-cadastre"))
DATASETS = (PROJECT_ROOT / "data").resolve()
OUT_BASE = (PROJECT_ROOT / "outputs" / "NER_inference").resolve()

print(sys.path)
print(BASE)
print(DATASETS)
print(OUT_BASE)

['/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/home/STual/DAN-cadastre/.venv_dan/lib/python3.10/site-packages']
/home/STual/DAN-cadastre/scripts/NER
/home/STual/DAN-cadastre/data
/home/STual/DAN-cadastre/outputs/NER_inference


## Parameters

**Don't forget to check that the tokenizer name in the *model_util_IO.py* file is the same as the model name!**

In [5]:
# Switch for the fine-tuning with IO labels (set to False to skip it)
RUN_CAMEMBERT_IO = True

# Number of times a model is trained and evaluated on each dataset
N_RUNS = 5

In [6]:
# COMMON CONSTANTS

# Training settings shared by every run. The notebook copies this mapping and adds an
# "output_dir" entry before handing it to init_model.
# NOTE(review): presumably forwarded to the Hugging Face TrainingArguments - confirm in init_model.
TRAINING_CONFIG = dict(
    evaluation_strategy="steps",
    eval_steps=100,
    max_steps=5000,
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=1e-5,
    load_best_model_at_end=True,
    greater_is_better=True,
    metric_for_best_model="f1",
    save_strategy="steps",
    save_steps=100,
    save_total_limit=1,
)

In [7]:
import pandas as pd
from datasets import load_from_disk

# Print examples from datasets
def loadExample(INPUT_DIR,set_length:int,i:int,subset:str):
    """Print the tokens and NER tags of one example as a two-column table.

    The dataset is read from ``INPUT_DIR / f"huggingface_{set_length}"`` with
    ``load_from_disk``.

    Args:
        INPUT_DIR: Directory containing the ``huggingface_<set_length>`` datasets;
            must support the ``/`` operator.
        set_length: Number used in the dataset directory name.
        i: Index of the example inside the chosen subset.
        subset: Key of the subset to read from (the notebook uses "train").

    Returns:
        None. The table is printed, not returned.
    """
    example = load_from_disk(INPUT_DIR / f"huggingface_{set_length}")[subset][i]
    columns = {"tokens": example["tokens"], "labels": example["ner_tags"]}
    print(pd.DataFrame.from_dict(columns))

  from .autonotebook import tqdm as notebook_tqdm


# 1. Train & eval

In [1]:
import os
from pathlib import Path
from config import logger
import time
import datetime
import torch

In [9]:
# Short model identifier, used in the name of the prepared-dataset directory
MODEL_NAME = "camembert_ner"
# Checkpoint identifier given to init_model and compared with the tokenizer name
MODEL = "Jean-Baptiste/camembert-ner"
# Labelling-scheme tag, also used in the name of the prepared-dataset directory
LABEL = "io"
# Sub-directory name for this experiment's metrics and temporary model outputs
FOLDER = "ner-joint-labelling-io-1534"

### 1.1 Load IO dataset

In [10]:
# Prepared datasets are stored per training-set size (number of training examples).
SIZE = 1534
TRAINSETS_SIZES = [SIZE]  # train only on the biggest dataset

# Input (prepared datasets) and output (metrics) directories
INPUT_DIR = OUT_BASE.joinpath(f"m2-experiment_1_prepared_dataset_ref_{LABEL}_{MODEL_NAME}")
METRICS_OUTPUT_DIR = OUT_BASE.joinpath("m2-210-experiment_1_metrics")
(INPUT_DIR, METRICS_OUTPUT_DIR)

(PosixPath('/home/STual/DAN-cadastre/outputs/NER_inference/m2-experiment_1_prepared_dataset_ref_io_camembert_ner'),
 PosixPath('/home/STual/DAN-cadastre/outputs/NER_inference/m2-210-experiment_1_metrics'))

### Example

In [11]:
# Show training example number 10 of the largest configured training set
loadExample(INPUT_DIR, set_length=TRAINSETS_SIZES[-1], i=10, subset="train")

   tokens            labels
0    ▁Vie          I-name+O
1     nos          I-name+O
2      ▁V  I-familystatus+O
3   <unk>  I-familystatus+O
4       e  I-familystatus+O
5   <unk>  I-familystatus+O
6      ▁J    I-firstnames+O
7   <unk>    I-firstnames+O
8       n    I-firstnames+O
9   <unk>    I-firstnames+O
10     ▁l    I-firstnames+O
11    oui    I-firstnames+O
12      s    I-firstnames+O
13      ,               O+O
14  ▁fils               O+O
15  ▁Jean               O+O
16    ▁ba               O+O
17     pt               O+O
18  <unk>               O+O
19      e               O+O
20  <unk>               O+O
21      à               O+O
22   ▁vin       I-address+O
23      c       I-address+O


In [12]:
# Load the prepared dataset for the chosen size and display its splits
set_ = load_from_disk(INPUT_DIR.joinpath(f"huggingface_{SIZE}"))
set_

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1534
    })
    dev: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 171
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 427
    })
})

### 1.2 Fine-tuning with IO labels - train & eval

In [13]:
from config import logger
from datasets import load_from_disk
import json
from camembert_utils.util_IO import init_model, train_eval_loop, _convert_tokenizer

def train_bert(metrics_output_directory):
    """Fine-tune and evaluate the model ``N_RUNS`` times on every training-set size.

    For each run and each size in ``TRAINSETS_SIZES``, a model is created with
    ``init_model``, then trained and evaluated with ``train_eval_loop`` on the dataset
    stored in ``INPUT_DIR / f"huggingface_{size}"``. The first two items of the returned
    metrics are written as JSON files into ``metrics_output_directory / f"run_{run}"``.

    Relies on notebook-level names that must exist before the call: ``N_RUNS``,
    ``TRAINSETS_SIZES``, ``INPUT_DIR``, ``MODEL``, ``local_config``, ``logger``,
    ``json`` and ``torch``.

    Args:
        metrics_output_directory: Root directory for the metrics; must support the
            ``/`` operator. One ``run_<n>`` sub-directory is created per run.

    Returns:
        None. Results are written to disk.
    """
    import gc

    # Train & evaluate loop
    for run in range(1, N_RUNS + 1):
        output_dir = metrics_output_directory / f"run_{run}"
        # parents=True so the function also works when the metrics root does not exist yet
        output_dir.mkdir(exist_ok=True, parents=True)

        for trainset_size in TRAINSETS_SIZES:
            datasetdir = INPUT_DIR / f"huggingface_{trainset_size}"
            logger.info(f"Running on datasets in {datasetdir}")
            logger.info(f"Metrics will be saved in {output_dir}")

            # A fresh model, tokenizer and training arguments are built for every run and size
            model, tokenizer, training_args = init_model(MODEL,local_config,run)
            # NOTE(review): this interpolates the model object itself, not the MODEL
            # identifier - confirm that this is the intended log content.
            logger.info(f"{model} #{run}, will save in {output_dir}")

            train_dev_test = load_from_disk(datasetdir)
            metrics = train_eval_loop(model,
                                      training_args,
                                      tokenizer,
                                      train_dev_test["train"],
                                      train_dev_test["dev"],
                                      train_dev_test["test"])

            # Save the test and dev metrics.
            # Assumes train_eval_loop returns the test metrics first and the dev
            # metrics second - TODO confirm against camembert_utils.util_IO.
            for position, split in enumerate(("test", "dev")):
                metrics_file = output_dir / f"{split}_{trainset_size}.json"
                with open(metrics_file, "w", encoding="utf-8") as o:
                    json.dump(metrics[position], o)

            # Drop the references first: empty_cache() can only release memory that is
            # no longer occupied by live tensors.
            del model, tokenizer, training_args, metrics, train_dev_test
            gc.collect()
            torch.cuda.empty_cache()

[nltk_data] Downloading package punkt to /home/STual/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
if RUN_CAMEMBERT_IO:
    # The tokenizer loaded by camembert_utils.util_IO must be the one of the model being fine-tuned
    assert _convert_tokenizer.name_or_path == MODEL, (
        f"Tokenizer {_convert_tokenizer.name_or_path!r} does not match model {MODEL!r}"
    )

    # MODEL CONSTS
    MODEL_METRICS_DIR = METRICS_OUTPUT_DIR / FOLDER
    MODEL_METRICS_DIR.mkdir(exist_ok=True, parents=True)
    MODEL_OUTPUT_MODEL_PATH = OUT_BASE / f"tmp/{FOLDER}"

    # Set config output dir (work on a copy so TRAINING_CONFIG stays untouched)
    local_config = TRAINING_CONFIG.copy()
    local_config["output_dir"]=MODEL_OUTPUT_MODEL_PATH

    # Run the main loop, timed with a monotonic clock
    h = time.perf_counter()
    train_bert(MODEL_METRICS_DIR)
    # Mean duration of one run; max(..., 1) avoids a division by zero when N_RUNS is 0
    runtime = (time.perf_counter() - h)/max(N_RUNS, 1)
    print(f"Run-time is equal to {str(datetime.timedelta(seconds=runtime))}")
    torch.cuda.empty_cache()
else:
    print("Skipped finetuning model for IO labels")

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at Jean-Baptiste/camembert-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([8]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,No log,0.496916,0.801418,0.809069,0.805226,0.888527
200,No log,0.306686,0.875895,0.875895,0.875895,0.924329
300,No log,0.283455,0.873508,0.873508,0.873508,0.921074
400,No log,0.31381,0.852459,0.868735,0.86052,0.917819
500,0.409500,0.270599,0.872941,0.885442,0.879147,0.92677
600,0.409500,0.30183,0.872642,0.883055,0.877817,0.928397
700,0.409500,0.326238,0.885167,0.883055,0.88411,0.923515
800,0.409500,0.312103,0.889157,0.880668,0.884892,0.930024
900,0.409500,0.29706,0.888095,0.890215,0.889154,0.934093
1000,0.095200,0.293919,0.890777,0.875895,0.883273,0.924329


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at Jean-Baptiste/camembert-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([8]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,No log,0.50104,0.765258,0.778043,0.771598,0.868999
200,No log,0.373625,0.827103,0.844869,0.835891,0.901546
300,No log,0.288022,0.881235,0.885442,0.883333,0.918633
400,No log,0.286652,0.888361,0.892601,0.890476,0.929211
500,0.419300,0.287214,0.901914,0.899761,0.900836,0.931652
600,0.419300,0.350325,0.869048,0.871122,0.870083,0.921074
700,0.419300,0.235889,0.881517,0.887828,0.884661,0.936534
800,0.419300,0.278993,0.879433,0.887828,0.88361,0.938161
900,0.419300,0.322305,0.870588,0.883055,0.876777,0.933279
1000,0.098100,0.325348,0.885986,0.890215,0.888095,0.92677


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at Jean-Baptiste/camembert-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([8]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,No log,0.549373,0.761905,0.763723,0.762813,0.864931
200,No log,0.330268,0.847255,0.847255,0.847255,0.910496
300,No log,0.307824,0.846868,0.871122,0.858824,0.914565
400,No log,0.302199,0.867788,0.861575,0.864671,0.924329
500,0.435000,0.26531,0.870283,0.880668,0.875445,0.93572
600,0.435000,0.253597,0.896226,0.906921,0.901542,0.941416
700,0.435000,0.222136,0.895487,0.899761,0.897619,0.943857
800,0.435000,0.25069,0.883886,0.890215,0.887039,0.936534
900,0.435000,0.316228,0.883886,0.890215,0.887039,0.92677
1000,0.099100,0.278449,0.897375,0.897375,0.897375,0.940602


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at Jean-Baptiste/camembert-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([8]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,No log,0.516179,0.738717,0.742243,0.740476,0.873881
200,No log,0.368334,0.836105,0.840095,0.838095,0.90236
300,No log,0.308492,0.882494,0.878282,0.880383,0.908055
400,No log,0.284796,0.877698,0.873508,0.875598,0.922701
500,0.429100,0.288824,0.901679,0.897375,0.899522,0.934906
600,0.429100,0.29225,0.895238,0.897375,0.896305,0.938975
700,0.429100,0.307045,0.888095,0.890215,0.889154,0.928397
800,0.429100,0.314974,0.888889,0.897375,0.893112,0.931652
900,0.429100,0.337019,0.884615,0.878282,0.881437,0.928397
1000,0.094000,0.328618,0.889952,0.887828,0.888889,0.928397


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at Jean-Baptiste/camembert-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([8]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,No log,0.512632,0.769231,0.787589,0.778302,0.876322
200,No log,0.393257,0.825581,0.847255,0.836278,0.897478
300,No log,0.327684,0.840278,0.866348,0.853114,0.912937
400,No log,0.288785,0.854118,0.866348,0.86019,0.925956
500,0.402400,0.272125,0.873832,0.892601,0.883117,0.932465
600,0.402400,0.328633,0.886525,0.894988,0.890736,0.92677
700,0.402400,0.237849,0.889151,0.899761,0.894425,0.938975
800,0.402400,0.287955,0.882075,0.892601,0.887307,0.928397
900,0.402400,0.287911,0.913669,0.909308,0.911483,0.937347
1000,0.091300,0.35518,0.883055,0.883055,0.883055,0.92677


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))




Run-time is equal to 0:02:25.519071
