In [1]:
import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForTokenClassification
import scml
from scml import pandasx as pdx
from mylib.ner import NerDataset, evaluation
# Notebook-wide stopwatch: started here, stopped and reported in the last cell.
tim = scml.Timer()
tim.start()

# Switch off tokenizers parallelism through its environment flag.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Percentile cut points from 1% to 99%.
percentiles = [
    0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5,
    0.6, 0.7, 0.8, 0.9, 0.95, 0.99,
]

# Raise each pandas display limit to the same very large value.
for _option in (
    "max_info_columns",
    "display.max_columns",
    "display.max_rows",
    "max_colwidth",
):
    pd.set_option(_option, 9999)

tqdm.pandas()  # hook tqdm progress bars into pandas
# Seeding is delegated to scml; no explicit seed is passed here.
scml.seed_everything()

# Show the representable int16 range.
info = np.iinfo(np.int16)
print(f"int16, min={info.min}, max={info.max}")

int16, min=-32768, max=32767


In [2]:
# --- Inputs -----------------------------------------------------------------
# Directory of the saved model; also passed as the tokenizer directory below.
model_dir = Path("models/ner/deberta_v3_base/20240325_143353")
# JSON file holding the validation examples.
validation_data_file = Path("input/val_240102.json")

# --- Windowing / batching ---------------------------------------------------
# These values are forwarded unchanged to NerDataset.from_json.
model_max_length = 1024
window_length, window_stride = 1024, 256
# Default batch size; the device-selection cell may override it.
batch_size = 32

In [3]:
# Pick the compute device: Apple MPS first, then CUDA, otherwise CPU.
device = torch.device("cpu")
if torch.backends.mps.is_available():
    device = torch.device("mps")
    # Overrides the batch size from the configuration cell when running on MPS.
    batch_size = 128
    print("mps")
elif torch.cuda.is_available():
    cuda_device_count = torch.cuda.device_count()
    # Keep the original preference for GPU 1, but fall back to GPU 0 on
    # single-GPU hosts, where "cuda:1" is an invalid device ordinal.
    cuda_index = 1 if cuda_device_count > 1 else 0
    device = torch.device(f"cuda:{cuda_index}")
    # Report name and current memory usage of every visible GPU.
    for i in range(cuda_device_count):
        print(f"device={i}, {torch.cuda.get_device_name(i)}")
        print('Mem Allocated:', round(torch.cuda.memory_allocated(i)/1024**3,1), 'GB')
        print('Mem Cached:   ', round(torch.cuda.memory_reserved(i)/1024**3,1), 'GB')
else:
    print("cpu")

mps


In [4]:
# Gather the loader arguments first so the call itself stays short.
dataset_kwargs = {
    "filepath": str(validation_data_file),
    "tokenizer_directory": model_dir,
    "model_max_length": model_max_length,
    "window_length": window_length,
    "window_stride": window_stride,
}
ds = NerDataset.from_json(**dataset_kwargs)

# Sanity check: dataset size and the first item.
print(f"len(ds)={len(ds):,}\nds[0]={ds[0]}")

len(ds)=1,011
ds[0]={'input_ids': tensor([    1,  2169, 12103,  ...,     0,     0,     0]), 'token_type_ids': tensor([0, 0, 0,  ..., 0, 0, 0]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]), 'labels': tensor([-100,    0,    0,  ..., -100, -100, -100])}




In [5]:
%%time
# Load the token-classification model saved under model_dir.
model = AutoModelForTokenClassification.from_pretrained(
    pretrained_model_name_or_path=model_dir,
)
# Print the module tree for reference.
print(model)

DebertaV2ForTokenClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=Tr

In [6]:
%%time
# Run the project's evaluation routine with the dataset, model, batch size
# and device prepared in the cells above.
evaluation_kwargs = {
    "ds": ds,
    "model": model,
    "batch_size": batch_size,
    "device": device,
}
res = evaluation(**evaluation_kwargs)

predict ner: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [12:27<00:00, 93.48s/it]


CPU times: user 5.55 s, sys: 6min 46s, total: 6min 52s
Wall time: 12min 30s


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# Serialise the evaluation result as indented JSON, then print it.
res_json = json.dumps(res, indent=2)
print(res_json)

{
  "micro_f5": 0.0837859183494379,
  "recall": 0.08098505268316589,
  "precision": 0.6189138576779026,
  "labels": {
    "I-URL_PERSONAL": {
      "micro_f5": 0.0,
      "recall": 0.0,
      "precision": 0.0
    },
    "I-EMAIL": {
      "micro_f5": 0.0,
      "recall": 0.0,
      "precision": 0.0
    },
    "I-USERNAME": {
      "micro_f5": 0.0,
      "recall": 0.0,
      "precision": 0.0
    },
    "B-STREET_ADDRESS": {
      "micro_f5": 0.03596837944664032,
      "recall": 0.034739454094292806,
      "precision": 0.3111111111111111
    },
    "I-STREET_ADDRESS": {
      "micro_f5": 0.04551938788743353,
      "recall": 0.0438489646772229,
      "precision": 0.9557522123893806
    },
    "B-PHONE_NUM": {
      "micro_f5": 0.04711567502265177,
      "recall": 0.045454545454545456,
      "precision": 0.5454545454545454
    },
    "B-EMAIL": {
      "micro_f5": 0.049429657794676805,
      "recall": 0.047619047619047616,
      "precision": 1.0
    },
    "B-ID_NUM": {
      "micro_f5": 0

In [8]:
# Stop the stopwatch started in the first cell and report the elapsed time.
tim.stop()
print(f"Total time taken {tim.elapsed!s}")

Total time taken 0:12:31.047169
