In [1]:
import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForTokenClassification
import scml
from scml import pandasx as pdx
from mylib.ner import NerDataset, evaluation
# Notebook-wide wall-clock timer; it is stopped and reported in the last cell.
tim = scml.Timer()
tim.start()

# Turn off parallelism in the tokenizers library via its environment variable.
# Must be set before any tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Percentile grid (presumably intended for DataFrame.describe); it is not used
# by any cell visible in this notebook.
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]

# Effectively remove pandas display truncation. Fully qualified option names are
# used because abbreviated keys rely on pattern matching and stop working if a
# later pandas release adds another option that matches the same pattern.
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

# Register tqdm's progress_apply / progress_map on pandas objects.
tqdm.pandas()

# Seed the random number generators through the scml helper, using its default
# seed (NOTE(review): confirm which libraries and which seed value it covers).
scml.seed_everything()

# Show the int16 value range for reference.
info = np.iinfo(np.int16)
print(f"int16, min={info.min}, max={info.max}")

int16, min=-32768, max=32767


In [2]:
# --- Inputs -----------------------------------------------------------------
# Directory holding the trained model (also used as the tokenizer directory)
# and the JSON file with the validation data.
model_dir = Path("models/ner/deberta_v3_large/20240324_153756")
validation_data_file = Path("input/val_240102.json")

# --- Windowing --------------------------------------------------------------
# Both lengths are 768 in this run; the stride between windows is 256.
model_max_length = window_length = 768
window_stride = 256

# --- Inference --------------------------------------------------------------
# Default batch size; the device-selection cell replaces it with 128 on MPS.
batch_size = 32

In [3]:
# Choose the compute device: Apple MPS if available, otherwise CUDA, otherwise CPU.
device = torch.device("cpu")
if torch.backends.mps.is_available():
    device = torch.device("mps")
    # On MPS the configured batch size is replaced with 128.
    batch_size = 128
    print("mps")
elif torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    # Keep preferring the second GPU as before, but fall back to GPU 0 on a
    # single-GPU host: "cuda:1" is an invalid ordinal there and would only fail
    # later, when the model or tensors are moved to the device.
    device = torch.device("cuda:1" if gpu_count > 1 else "cuda:0")
    # Report every visible GPU with its currently allocated / reserved memory.
    for i in range(gpu_count):
        print(f"device={i}, {torch.cuda.get_device_name(i)}")
        print('Mem Allocated:', round(torch.cuda.memory_allocated(i)/1024**3,1), 'GB')
        print('Mem Cached:   ', round(torch.cuda.memory_reserved(i)/1024**3,1), 'GB')
else:
    print("cpu")

device=0, NVIDIA GeForce RTX 4070 Ti SUPER
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB
device=1, NVIDIA GeForce RTX 4070 Ti SUPER
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB


In [4]:
# Collect the dataset arguments from the configuration cell, then build the
# validation dataset using the tokenizer stored in the model directory.
dataset_kwargs = {
    "filepath": str(validation_data_file),
    "tokenizer_directory": model_dir,
    "model_max_length": model_max_length,
    "window_length": window_length,
    "window_stride": window_stride,
}
ds = NerDataset.from_json(**dataset_kwargs)

# Sanity check: number of items in the dataset and the first item.
print(f"len(ds)={len(ds):,}\nds[0]={ds[0]}")

len(ds)=1,303
ds[0]={'input_ids': tensor([    1,  2169, 12103,   270,  8432, 63632,   608,  3365, 26097, 50209,
          358,   689,  1374,   366, 16789,  6738,   573,  1719,   264,  3634,
          269,   262,   735,   265,   266,   483,   272,   269,   497,  4172,
         1909,   264,   676,   263,  1044,   359,   451,  2746,  1696,   366,
          263,   295,   298,  8563,   275,   262,  1479,  1935,   272,   306,
          286,   330,   267,   262,   437,   375,   740,   323,   279,   483,
          269,   267,   266, 14920,   366,   283,   278,   303,   375,  1068,
         2848,  2600,   279,   362,  7031,   826,   262,   906,   265,  1909,
         1241,   270,   262,  2820,   265,   262,  1696,   366,   267,  1067,
          313,  5387,   264, 16902,   267,   262,   568,   263,   365,   991,
         1028,   267,   262,  1061,   714,   515,   313,  5470,   272,   262,
          781,   313,   303,   303,  1569,   397,   793,   267,  1404,   388,
          263,   264,   286,  



In [5]:
%%time
# Load the token-classification model saved under model_dir, then print its
# module tree so the loaded architecture can be checked by eye.
model = AutoModelForTokenClassification.from_pretrained(model_dir)
print(model)

DebertaV2ForTokenClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwis

In [6]:
%%time
# Evaluate the model on the validation dataset with the batch size and device
# chosen in the earlier cells. `evaluation` is imported from mylib.ner and its
# internals are not visible here; the result is dumped as JSON in the next cell.
# NOTE(review): the model is not moved to `device` in this notebook — presumably
# `evaluation` does that; confirm.
res = evaluation(
    ds=ds,
    model=model,
    batch_size=batch_size,
    device=device,
)

predict ner: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41/41 [01:24<00:00,  2.07s/it]


CPU times: user 1min 23s, sys: 5.75 s, total: 1min 29s
Wall time: 1min 29s


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# Pretty-print the evaluation result as indented JSON (res must be JSON-serialisable).
print(json.dumps(res, indent=2))

{
  "micro_f5": 0.9930982759082507,
  "recall": 0.992920970175235,
  "precision": 0.9975515914655474,
  "labels": {
    "I-URL_PERSONAL": {
      "micro_f5": 0.0,
      "recall": 0.0,
      "precision": 0.0
    },
    "I-EMAIL": {
      "micro_f5": 0.0,
      "recall": 0.0,
      "precision": 0.0
    },
    "I-USERNAME": {
      "micro_f5": 0.0,
      "recall": 0.0,
      "precision": 0.0
    },
    "B-STREET_ADDRESS": {
      "micro_f5": 0.9654677549556012,
      "recall": 0.965034965034965,
      "precision": 0.9764150943396226
    },
    "B-NAME_STUDENT": {
      "micro_f5": 0.9856202178506225,
      "recall": 0.9850746268656716,
      "precision": 0.9994591671173607
    },
    "I-ID_NUM": {
      "micro_f5": 0.990234375,
      "recall": 0.9898477157360406,
      "precision": 1.0
    },
    "B-ID_NUM": {
      "micro_f5": 0.9910104279036318,
      "recall": 0.9906542056074766,
      "precision": 1.0
    },
    "I-NAME_STUDENT": {
      "micro_f5": 0.9911846582121868,
      "recall":

In [8]:
# Stop the notebook-wide timer started in the first cell and report the elapsed time.
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")

Total time taken 0:01:42.792263
