# 40 - Experiment #2

## Initialization

In [1]:
import os

# Colab detection: the flag is set when the string form of the running
# IPython shell mentions "google.colab".
ENV_IS_GOOGLE_COLAB = 'google.colab' in str(get_ipython())
# Mirror the flag, as text, into the process environment.
os.environ["ENV_IS_GOOGLE_COLAB"] = str(ENV_IS_GOOGLE_COLAB)
ENV_IS_GOOGLE_COLAB

False

In [2]:
# Install the third-party packages with pip, but only when the Colab flag is set;
# outside Colab nothing is installed by this cell.
# NOTE(review): no versions are pinned, so the installed releases depend on the day
# the cell is run — consider pinning for reproducibility.
if ENV_IS_GOOGLE_COLAB:
  !pip install transformers datasets spacy transformers[sentencepiece] seqeval

In [3]:
import sys, os
from pathlib import Path

# Resolve BASE, the directory every path of this notebook is derived from.
# If on GColab, workdir is expected to be in Google Drive
# Adapt the value BASE according to your situation

if ENV_IS_GOOGLE_COLAB:
  from google.colab import drive
  mountpoint = Path("/content/drive")
  # Join a path *relative* to the mount point: joining an absolute path would
  # silently discard `mountpoint` and ignore any change made to it.
  BASE = mountpoint / "MyDrive/SODUCO/article_das_2022"
  drive.mount(str(mountpoint)) # Mount gdrive immediately
  # Add BASE to Python Path (once, so re-running the cell does not pile up duplicates)
  if str(BASE) not in sys.path:
    sys.path.append(str(BASE))
else:
  # Expect all dependencies to be in the current working directory, i.e. the
  # directory the notebook is run from.
  BASE = Path.cwd().resolve()

BASE

PosixPath('/home/bertrand/dev/paper-ner-bench-das22/src/ner')

## 41 - Fine-tune CamemBERT-pretrained on the gold REFerence


In [4]:
# PATHS (all relative to BASE)
# - input: the prepared gold REF dataset and the pretrained CamemBERT model
# - output: where the fine-tuned model of section 41 is written
REF_GOLD_DATASET = BASE / "02-experiment_2_prepared_datasets/huggingface_ref"
MODEL_NAME = BASE / "10-camembert_pretrained_model"
FINETUNED_MODEL_OUTPUT_DIR = BASE / "41-experiment_2_finetuned_camembert_pretrained"

# Same training configuration as in 20-experiment_1
TRAINING_CONFIG = dict(
    # Evaluate every 100 steps, for at most 5000 training steps.
    evaluation_strategy="steps",
    eval_steps=100,
    max_steps=5000,
    # Optimisation settings.
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=1e-5,
    # Best-model selection: a higher "f1" is better.
    load_best_model_at_end=True,
    greater_is_better=True,
    metric_for_best_model="f1",
    # Checkpointing, on the same step interval as the evaluation.
    save_strategy="steps",
    # NOTE(review): the original author wondered whether this value makes the
    # early-stopping callback misbehave — still unconfirmed.
    save_steps=100,
    save_total_limit=1,
    output_dir=FINETUNED_MODEL_OUTPUT_DIR,
)

In [5]:
from datasets import load_from_disk
from camembert_util import train_eval_loop, init_model

# Read the prepared gold REF dataset back from disk; its entries are forwarded
# below to the training loop as keyword arguments.
train_dev_test = load_from_disk(REF_GOLD_DATASET)

# Build the model, its tokenizer and the training arguments from the configuration
model, tokenizer, training_args = init_model(str(MODEL_NAME), TRAINING_CONFIG)

# Fine-tune and evaluate, then show the resulting metrics
# (patience=5: presumably the early-stopping patience — confirm in camembert_util)
metrics = train_eval_loop(model, training_args, tokenizer, patience=5, **train_dev_test)
metrics

RuntimeError: KeyboardInterrupt: 

Run the following if you want to download the model from GColab. It might take a while.

In [None]:
# Colab only: zip the fine-tuned model directory and offer it for download.
if ENV_IS_GOOGLE_COLAB:
  import shutil
  from google.colab import files

  # Build the archive in pure Python instead of an unquoted `!zip ... $VAR` shell
  # line, which breaks as soon as the path contains a space. The archive is named
  # after the model directory, so the name is written only once.
  # Entries are stored relative to the model directory's parent, i.e. the archive
  # contains a single top-level folder named like the model directory.
  archive_path = shutil.make_archive(
      base_name=f"/tmp/{FINETUNED_MODEL_OUTPUT_DIR.name}",
      format="zip",
      root_dir=str(FINETUNED_MODEL_OUTPUT_DIR.parent),
      base_dir=FINETUNED_MODEL_OUTPUT_DIR.name,
  )

  files.download(archive_path)

## 42 - Fine-tune Camembert on the gold REFerence

In [None]:
# PATHS

REF_GOLD_DATASET = BASE / "02-experiment_2_prepared_datasets/huggingface_ref"

# Hugging Face Hub identifier of the base model fine-tuned in this section
MODEL_NAME = "Jean-Baptiste/camembert-ner"

# Same training configuration as in 41...
assert TRAINING_CONFIG, "TRAINING_CONFIG is empty: run the configuration cell of section 41 first"
# ... but change the output directory.
# The directory name must be the one the evaluation section loads the model from
# ("42-experiment_2_finetuned_camembert"); writing elsewhere leaves that section
# without a model to load.
# NOTE: this updates the shared TRAINING_CONFIG dict in place, so re-running
# section 41 after this cell would write into the section 42 directory.
TRAINING_CONFIG["output_dir"] = BASE / "42-experiment_2_finetuned_camembert"

In [None]:
from datasets import load_from_disk
from camembert_util import train_eval_loop, init_model

# Read the prepared gold REF dataset back from disk; its entries are forwarded
# below to the training loop as keyword arguments.
train_dev_test = load_from_disk(REF_GOLD_DATASET)

# Build the model, its tokenizer and the training arguments from the configuration
model, tokenizer, training_args = init_model(str(MODEL_NAME), TRAINING_CONFIG)

# Fine-tune and evaluate, then show the resulting metrics
# (patience=5: presumably the early-stopping patience — confirm in camembert_util)
metrics = train_eval_loop(model, training_args, tokenizer, patience=5, **train_dev_test)
metrics

## 43 - Evaluate CamemBERT-pretrained and camembert (simple) on the REF, PERO and TESS test datasets

Load the model fine-tuned on the ref data from `BASE/41-experiment_2_finetuned_camembert_pretrained`.

If you prefer to use the model shared on huggingface.co, replace `model_path` with `HueyNemud/das2022-41-berties-pretrained-finetuned-ref`.


In [5]:
# Locations of the three prepared gold datasets (REF, PERO, TESS) under BASE
REF_GOLD_DATASET, PERO_GOLD_DATASET, TESS_GOLD_DATASET = (
    BASE / "02-experiment_2_prepared_datasets/huggingface_ref",
    BASE / "02-experiment_2_prepared_datasets/huggingface_pero",
    BASE / "02-experiment_2_prepared_datasets/huggingface_tess",
)

# Display the resolved paths as a sanity check
REF_GOLD_DATASET, PERO_GOLD_DATASET, TESS_GOLD_DATASET

(PosixPath('/home/bertrand/dev/paper-ner-bench-das22/src/ner/02-experiment_2_prepared_datasets/huggingface_ref'),
 PosixPath('/home/bertrand/dev/paper-ner-bench-das22/src/ner/02-experiment_2_prepared_datasets/huggingface_pero'),
 PosixPath('/home/bertrand/dev/paper-ner-bench-das22/src/ner/02-experiment_2_prepared_datasets/huggingface_tess'))

In [6]:
# Directory receiving the computed metrics. It is created on the spot, missing
# parents included, and an already existing directory is not an error.
METRICS_OUTPUT = BASE / "43-experiment_2_metrics"
METRICS_OUTPUT.mkdir(parents=True, exist_ok=True)
METRICS_OUTPUT

PosixPath('/home/bertrand/dev/paper-ner-bench-das22/src/ner/43-experiment_2_metrics')

In [14]:
#Helper func to predict & save the computed metrics
import json
from datasets import load_from_disk

def eval_save(dataset_path, name):
    """Run the notebook-level `trainer` on a dataset's test split and save the metrics.

    The dataset is read with `load_from_disk` and its "test" entry is passed to
    `trainer.predict`. The metrics of the prediction output are written as JSON
    to `METRICS_OUTPUT / "camembert_<name>.json"`, replacing any previous file.

    Args:
        dataset_path: Location of the dataset to load; the loaded object must
            expose a "test" entry.
        name: Suffix identifying the run in the output file name.

    Returns:
        The metrics of the prediction output, i.e. the object that was serialized.
    """
    train_dev_test = load_from_disk(dataset_path)
    # `trainer` is a notebook global looked up at call time: whichever Trainer
    # was built last is the one being evaluated.
    predictions = trainer.predict(train_dev_test["test"])
    metrics = predictions.metrics

    with open(METRICS_OUTPUT / f"camembert_{name}.json", "w", encoding="utf-8") as metrics_file:
        json.dump(metrics, metrics_file)

    return metrics

In [None]:
# Load the model fine-tuned in section 41 and wrap it in a Trainer for prediction
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer
from camembert_util import compute_metrics, init_model

# Nothing is trained here, so the configuration only names an output directory
dummy_config = {"output_dir": "/tmp"}

model_path = BASE / "41-experiment_2_finetuned_camembert_pretrained"
model, tokenizer, training_args = init_model(str(model_path), dummy_config)

# Put the model in evaluation mode
model.eval()

data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [16]:
# Evaluate the currently loaded model on the REF, PERO and TESS test sets,
# writing one metrics file per dataset.
for dataset_path, run_name in (
    (REF_GOLD_DATASET, "pretrained_ref"),
    (PERO_GOLD_DATASET, "pretrained_pero"),
    (TESS_GOLD_DATASET, "pretrained_tess"),
):
    eval_save(dataset_path, run_name)

The following columns in the test set  don't have a corresponding argument in `CamembertForTokenClassification.forward` and have been ignored: tokens, ner_tags.
***** Running Prediction *****
  Num examples = 1669
  Batch size = 8


  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the test set  don't have a corresponding argument in `CamembertForTokenClassification.forward` and have been ignored: tokens, ner_tags.
***** Running Prediction *****
  Num examples = 1669
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `CamembertForTokenClassification.forward` and have been ignored: tokens, ner_tags.
***** Running Prediction *****
  Num examples = 1669
  Batch size = 8


In [20]:
# Load the fine-tuned model
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer
# `init_model` is imported here as well: the cell calls it, and must not depend
# on another cell having imported it beforehand.
from camembert_util import compute_metrics, init_model

# We won't train so keep it minimalistic
dummy_config = {"output_dir": "/tmp"}

model_path = BASE / "42-experiment_2_finetuned_camembert"
model, tokenizer, training_args = init_model(str(model_path), dummy_config)

model.eval() # Switch to evaluation mode

# Rebinding `trainer` makes the evaluation calls that follow use this model.
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(
    model,
    training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

16/01/2022 11:29:13 ; INFO ; Model /home/bertrand/dev/paper-ner-bench-das22/src/ner/42-experiment_2_finetuned_camembert
Didn't find file /home/bertrand/dev/paper-ner-bench-das22/src/ner/42-experiment_2_finetuned_camembert/added_tokens.json. We won't load it.
loading file /home/bertrand/dev/paper-ner-bench-das22/src/ner/42-experiment_2_finetuned_camembert/sentencepiece.bpe.model
loading file /home/bertrand/dev/paper-ner-bench-das22/src/ner/42-experiment_2_finetuned_camembert/tokenizer.json
loading file None
loading file /home/bertrand/dev/paper-ner-bench-das22/src/ner/42-experiment_2_finetuned_camembert/special_tokens_map.json
loading file /home/bertrand/dev/paper-ner-bench-das22/src/ner/42-experiment_2_finetuned_camembert/tokenizer_config.json
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start upd

In [21]:
# Evaluate the currently loaded model on the REF, PERO and TESS test sets,
# writing one metrics file per dataset.
for dataset_path, run_name in (
    (REF_GOLD_DATASET, "simple_ref"),
    (PERO_GOLD_DATASET, "simple_pero"),
    (TESS_GOLD_DATASET, "simple_tess"),
):
    eval_save(dataset_path, run_name)

The following columns in the test set  don't have a corresponding argument in `CamembertForTokenClassification.forward` and have been ignored: tokens, ner_tags.
***** Running Prediction *****
  Num examples = 1669
  Batch size = 8


The following columns in the test set  don't have a corresponding argument in `CamembertForTokenClassification.forward` and have been ignored: tokens, ner_tags.
***** Running Prediction *****
  Num examples = 1669
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `CamembertForTokenClassification.forward` and have been ignored: tokens, ner_tags.
***** Running Prediction *****
  Num examples = 1669
  Batch size = 8
