In [1]:
import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForTokenClassification
import scml
from scml import pandasx as pdx
from mylib.ner import NerDataset, evaluation, CustomDebertaV2ForTokenClassification

In [2]:
# Notebook-wide setup: wall-clock timer, display options, seeding and device selection.
tim = scml.Timer()
tim.start()
# Must be set before any tokenizer is created in this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
# Fully qualified option names: abbreviated names are resolved by pattern matching
# and stop working as soon as they become ambiguous.
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)
tqdm.pandas()
scml.seed_everything()
info = np.iinfo(np.int16)
print(f"int16, min={info.min}, max={info.max}")

# Device selection: MPS first, then CUDA, otherwise CPU.
# The second GPU is preferred when present; on a single-GPU host we fall back to
# cuda:0 instead of naming a device that does not exist.
preferred_cuda_index = 1
device = torch.device("cpu")
if torch.backends.mps.is_available():
    device = torch.device("mps")
    # NOTE: the configuration cell assigns batch_size again, so this value only
    # survives if that cell is not run afterwards.
    batch_size = 128
    print("mps")
elif torch.cuda.is_available():
    cuda_device_count = torch.cuda.device_count()
    cuda_index = preferred_cuda_index if preferred_cuda_index < cuda_device_count else 0
    device = torch.device(f"cuda:{cuda_index}")
    # Report every visible GPU with its current memory usage in GB.
    for i in range(cuda_device_count):
        print(f"device={i}, {torch.cuda.get_device_name(i)}")
        print('Mem Allocated:', round(torch.cuda.memory_allocated(i)/1024**3,1), 'GB')
        print('Mem Cached:   ', round(torch.cuda.memory_reserved(i)/1024**3,1), 'GB')
else:
    print("cpu")

int16, min=-32768, max=32767
device=0, NVIDIA GeForce RTX 4070 Ti SUPER
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB
device=1, NVIDIA GeForce RTX 4070 Ti SUPER
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB


In [3]:
# --- Evaluation configuration -------------------------------------------------
# Which class loads the checkpoint. Any value other than
# "CustomDebertaV2ForTokenClassification" (e.g. "auto") makes the loading cell
# use AutoModelForTokenClassification instead.
model_class = "CustomDebertaV2ForTokenClassification"

# Checkpoint directory (also passed as the tokenizer directory) and validation set.
model_dir = Path("models/ner/deberta_v3_large/20240423_131925")
validation_data_file = Path("input/val_v020.json")

# Inference batch size handed to `evaluation`.
batch_size = 32

# Length limits and sliding-window settings handed to NerDataset.from_json.
model_max_length = 512
window_length = 512
window_stride = 256

In [4]:
# Build the validation dataset from the JSON file, using the tokenizer stored
# alongside the model checkpoint and the windowing settings from the config cell.
ds = NerDataset.from_json(
    tokenizer_directory=model_dir,
    filepath=str(validation_data_file),
    window_stride=window_stride,
    window_length=window_length,
    model_max_length=model_max_length,
)
# Sanity check: number of items and the first encoded example.
print(f"len(ds)={len(ds):,}\nds[0]={ds[0]}")



len(ds)=8,105
ds[0]={'input_ids': tensor([    1,  7181,   877,  1393,   288,  4882,   804,  3805, 42621,   366,
         1749, 20379,   366,  6292, 12461,  4649,  4150,   288,  4882,   804,
         3805, 42621,   267,   262, 19789,   263, 13717,   648,   265,  1749,
        20379,   366,  1993,   366,   269,   299,   517,   272,   273, 18770,
          469,   406,   323, 38993, 17159, 11351, 25673,   263, 19392,  8402,
          366,   291,   990,   826,   266,   988,  4404,   265,  2293,   366,
         2352,   366,   263,  4904,   272,   273,   286,  3069,   264,  2685,
          323,   279,  1250,   265,   312,  4513,   269,   311,   265,   359,
          370,  1318,  5228,   323, 63356,   557,   292,   262, 20953,   263,
        25747,   265,   707,   432,   366,  1749, 20379,   888,   266, 20574,
         1192,   399,   273,   295, 20170,   263, 32283,   323,   279, 13457,
         2163,   265,  1417,   341,  4109, 63726,   366,  2423, 67819,   267,
          262,  2450,   366,  

In [5]:
%%time
if model_class=="CustomDebertaV2ForTokenClassification":
    model = CustomDebertaV2ForTokenClassification.from_pretrained(model_dir)
else:
    model = AutoModelForTokenClassification.from_pretrained(model_dir)
print(model)

CustomDebertaV2ForTokenClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, elem

In [6]:
%%time
res = evaluation(
    ds=ds,
    model=model,
    batch_size=batch_size,
    device=device,
)

predict ner: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 254/254 [04:45<00:00,  1.12s/it]


CPU times: user 4min 35s, sys: 20.8 s, total: 4min 56s
Wall time: 4min 56s


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# Pretty-print the evaluation result `res` as JSON with two-space indentation.
print(json.dumps(res, indent=2))

{
  "micro_f5": 0.7952193707605854,
  "recall": 0.7893735765948394,
  "precision": 0.9758972772277228,
  "labels": {
    "I-URL_PERSONAL": {
      "micro_f5": 0.0,
      "recall": 0.0,
      "precision": 0.0
    },
    "I-ID_NUM": {
      "micro_f5": 0.0,
      "recall": 0.0,
      "precision": 0.0
    },
    "I-EMAIL": {
      "micro_f5": 0.0,
      "recall": 0.0,
      "precision": 0.0
    },
    "I-USERNAME": {
      "micro_f5": 0.0,
      "recall": 0.0,
      "precision": 0.0
    },
    "B-URL_PERSONAL": {
      "micro_f5": 0.46452476572958495,
      "recall": 0.45478374836173,
      "precision": 1.0
    },
    "B-NAME_STUDENT": {
      "micro_f5": 0.5121968267281751,
      "recall": 0.502813229038013,
      "precision": 0.960167714884696
    },
    "I-NAME_STUDENT": {
      "micro_f5": 0.766745998234233,
      "recall": 0.7600791491466733,
      "precision": 0.9821029082774049
    },
    "B-STREET_ADDRESS": {
      "micro_f5": 0.8062787136294026,
      "recall": 0.8007604562737642

In [8]:
# Stop the timer started in the setup cell and report the notebook's total elapsed time.
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")

Total time taken 0:05:09.995752
