In [1]:
from pathlib import Path
import os
import sys
import gc
import shutil
import json
import math
from collections import defaultdict
import numpy as np
import pandas as pd
from spacy.lang.en import English
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification
from tqdm import tqdm
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple, NamedTuple, Callable, Iterable, Set, Optional, Any

In [2]:
class ModelConf(NamedTuple):
    """Settings for one fine-tuned checkpoint that takes part in the ensemble.

    Fields, in positional order:
        name: label printed while the model is being run.
        directory: folder handed to `from_pretrained` for tokenizer and weights.
        model_max_length: forwarded to the tokenizer as `model_max_length`.
        batch_size: forwarded to `predict_ner_proba`.
        weight: this model's entry in the blending weight vector.
        model_class: "CustomDebertaV2ForTokenClassification" selects the custom
            class; any other value falls back to AutoModelForTokenClassification.
    """

    name: str
    directory: Path
    model_max_length: int = 512
    batch_size: int = 16
    weight: float = 1
    model_class: str = "auto"
        

class Conf(NamedTuple):
    """Notebook-wide settings: filesystem layout, ensemble members, windowing
    and post-processing switches.

    Path defaults derived from another field (e.g. `comp_dir` from `input_dir`)
    are evaluated once, while this class body runs. Passing a different parent
    path to the constructor does NOT recompute the derived defaults.
    """

    # --- run mode ---
    # True: read train.json instead of test.json.
    debug: bool = False

    # --- filesystem layout ---
    input_dir: Path = Path("/kaggle/input")
    comp_dir: Path = input_dir / "pii-detection-removal-from-educational-data"
    temp_dir: Path = Path("/kaggle/temp")
    # Up to 20GB may be written to the current directory (/kaggle/working/);
    # it is preserved as output when a version is created with "Save & Run All".
    working_dir: Path = Path("/kaggle/working")
    resource_dir: Path = input_dir / "libtlalpii/tlal-pii-0.1"
    data_dir: Path = resource_dir / "input"

    # --- ensemble members (one ModelConf per checkpoint) ---
    models: List[ModelConf] = [
        ModelConf(
            name="deberta_v3_base",
            directory=resource_dir / "models/ner/deberta_v3_base/20240424_062422",
            model_max_length=512,
            batch_size=32,
            weight=1,
            model_class="CustomDebertaV2ForTokenClassification",
        ),
    ]

    # --- sliding window handed to NerDataset ---
    window_length: int = 512
    window_stride: int = 256

    # --- blending ---
    # Forwarded to blend_predictions as `outside_label_threshold`.
    outside_label_threshold: float = 0.70
    # How repeated (document, word) predictions are merged:
    # "first", "mean" or "min_outside_proba".
    duplicate_dt_strategy: str = "first"

    # --- optional regex post-processing passes ---
    postprocess_email: bool = False
    postprocess_url: bool = False
    postprocess_phone: bool = False


conf = Conf()
print(conf)

Conf(debug=False, input_dir=PosixPath('/kaggle/input'), comp_dir=PosixPath('/kaggle/input/pii-detection-removal-from-educational-data'), temp_dir=PosixPath('/kaggle/temp'), working_dir=PosixPath('/kaggle/working'), resource_dir=PosixPath('/kaggle/input/libtlalpii/tlal-pii-0.1'), data_dir=PosixPath('/kaggle/input/libtlalpii/tlal-pii-0.1/input'), models=[ModelConf(name='deberta_v3_base', directory=PosixPath('/kaggle/input/libtlalpii/tlal-pii-0.1/models/ner/deberta_v3_base/20240424_062422'), model_max_length=512, batch_size=32, weight=1, model_class='CustomDebertaV2ForTokenClassification')], window_length=512, window_stride=256, outside_label_threshold=0.7, duplicate_dt_strategy='first', postprocess_email=False, postprocess_url=False, postprocess_phone=False)


In [3]:
# Pick the compute device: CUDA when available (listing every visible GPU and
# its current memory use), otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    bytes_per_gb = 1024**3
    for gpu_idx in range(torch.cuda.device_count()):
        allocated_gb = round(torch.cuda.memory_allocated(gpu_idx) / bytes_per_gb, 1)
        reserved_gb = round(torch.cuda.memory_reserved(gpu_idx) / bytes_per_gb, 1)
        print(f"device={gpu_idx}, {torch.cuda.get_device_name(gpu_idx)}")
        print('Mem Allocated:', allocated_gb, 'GB')
        print('Mem Cached:   ', reserved_gb, 'GB')
else:
    device = torch.device("cpu")
    print("cpu")

device=0, Tesla P100-PCIE-16GB
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB


In [4]:
# Quantile grid kept available for ad-hoc `describe()` calls.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# Environment flag read by the HF tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Effectively remove pandas' display truncation limits.
for option_name, option_value in (
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ('max_colwidth', 9999),
):
    pd.set_option(option_name, option_value)

# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()

# The helper libraries ship as Kaggle datasets, so their source folders must be
# on sys.path BEFORE the imports below can resolve.
sys.path.extend(
    str(src_dir)
    for src_dir in (conf.input_dir / "sgcharts-ml/src", conf.resource_dir / "src")
)
import scml
from scml import nlp as snlp
from scml import pandasx as pdx
from mylib.ner import predict_ner_proba, NerDataset, blend_predictions, CustomDebertaV2ForTokenClassification
from warnings import simplefilter

# Silence pandas PerformanceWarning noise.
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Seed via the scml helper with its default arguments
# (presumably seeds the common RNGs — confirm in scml).
scml.seed_everything()

In [5]:
# Load the competition documents: test.json normally, train.json in debug mode.
fp = conf.comp_dir / ("train.json" if conf.debug else "test.json")
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale encoding (the default when `encoding` is omitted).
with open(fp, encoding="utf-8") as f:
    data = json.load(f)
print(f"len(data)={len(data)}")

# Parallel lists, one entry per document: its word tokens and its id as a string.
texts: List[List[str]] = [row["tokens"] for row in data]
dids: List[str] = [str(row["document"]) for row in data]

len(data)=10


# Model Inference

In [6]:
# A single dataset object serves every model; only its tokenizer is swapped in
# before each model runs.
ds = NerDataset(
    tokenizer=None,
    texts=texts,
    document_ids=dids,
    window_length=conf.window_length,
    window_stride=conf.window_stride,
)

# (document, word, model index) -> list of class-probability vectors.
dwm_map = defaultdict(list)
keep_first_only = conf.duplicate_dt_strategy == "first"

for model_idx, mc in enumerate(conf.models):
    print(mc.name)
    model_dir = str(mc.directory)
    tokenizer = AutoTokenizer.from_pretrained(
        model_dir,
        model_max_length=mc.model_max_length,
    )
    ds.tokenizer = tokenizer

    model_cls = (
        CustomDebertaV2ForTokenClassification
        if mc.model_class == "CustomDebertaV2ForTokenClassification"
        else AutoModelForTokenClassification
    )
    model = model_cls.from_pretrained(model_dir)

    # (seqs, sequence length in tokens, classes)
    y_proba = predict_ner_proba(
        ds=ds,
        model=model,
        batch_size=mc.batch_size,
        device=device,
    )
    print(f"{y_proba.shape} y_proba[0][0]={y_proba[0][0]}")

    # Token-to-word mapping: file every token's probabilities under the
    # document-level word it belongs to.
    for seq_idx, seq_proba in enumerate(y_proba):
        doc_id = int(ds.document_ids[seq_idx])
        for tok_idx, tok_proba in enumerate(seq_proba):
            word_idx = ds.word_ids[seq_idx][tok_idx]
            if word_idx is None:
                # Token is not mapped to any word; nothing to record.
                continue
            # Shift the window-local word index by the window's first word.
            word_idx = word_idx + ds.word_ranges[seq_idx][0]
            key = (doc_id, word_idx, model_idx)
            # "first" keeps only the earliest prediction per key; every other
            # strategy collects all duplicates for aggregation below.
            if keep_first_only and dwm_map[key]:
                continue
            dwm_map[key].append(tok_proba)

# Collapse duplicates: (document, word) -> one probability list per model.
dw_map = defaultdict(list)
pick_min_outside = conf.duplicate_dt_strategy == "min_outside_proba"
for (doc_id, word_idx, _model_idx), candidates in dwm_map.items():
    if pick_min_outside:
        # `Outside` label at index 0: keep the candidate least confident in it.
        best = np.argmin(candidates, axis=0)[0]
        merged = candidates[best]
    else:
        merged = np.mean(candidates, axis=0)
    dw_map[(doc_id, word_idx)].append(merged.flatten().tolist())

deberta_v3_base




(38, 512, 15) y_proba[0][0]=[0.0730754  0.06579716 0.08677855 0.06226376 0.05844414 0.07046641
 0.07404029 0.05362616 0.06816988 0.04697222 0.08053344 0.0478453
 0.10761833 0.05604679 0.0483222 ]


In [7]:
# Drop the inputs, dataset, model, tokenizer and the per-model map; only dw_map
# is needed by the blending step that follows.
del texts, dids, ds, model, tokenizer, dwm_map
# Force a collection pass; as the cell's last expression, the number of
# unreachable objects found is displayed as the cell output.
gc.collect()

120

In [8]:
# Weights as a (1, n_models) float32 array, ordered like conf.models.
model_weights = np.array(
    [[mc.weight for mc in conf.models]],
    dtype=np.float32,
)
# Combine the per-model probabilities into the submission frame.
sub = blend_predictions(
    dw_map=dw_map,
    weights=model_weights,
    outside_label_threshold=conf.outside_label_threshold,
)

# Postprocess

In [9]:
if conf.postprocess_phone or conf.postprocess_email or conf.postprocess_url:
    # Re-key the blended predictions as (document, token) -> label so the regex
    # passes can tell which tokens the model has already labelled.
    # NOTE: this rebinds `dw_map`, which previously held probability lists.
    dw_map = {
        (int(t.document), int(t.token)): str(t.label)
        for t in sub.itertuples()
    }
    sp_tokenizer = English().tokenizer

    def postprocess_regex(label, fn) -> None:
        """Add B-/I- tags for `label` to `dw_map` for tokens inside spans found by `fn`.

        Args:
            label: entity name without the B-/I- prefix, e.g. "EMAIL".
            fn: callable applied to each document's full text. It must return a
                sized collection of matches exposing `start` and `end`
                attributes, compared against spaCy character offsets as a
                half-open range [start, end).

        Model predictions are never overwritten; only (document, token) pairs
        missing from `dw_map` are added.

        Assumes the spaCy token positions line up with the competition's
        `token` indices — TODO confirm against how `tokens` was produced.
        """
        for row in data:
            # dw_map keys are built with int(); cast so lookups cannot miss
            # because of an int/str mismatch.
            did = int(row["document"])
            text = row["full_text"]
            ms = fn(text)
            if len(ms) == 0:
                continue
            sp_tokens = sp_tokenizer(text)
            for m in ms:
                beginning = True
                for j, sp_token in enumerate(sp_tokens):
                    if sp_token.idx >= m.end:
                        # Tokens come in text order, so no later token can
                        # start inside this match.
                        break
                    if sp_token.idx < m.start:
                        continue
                    k = (did, j)
                    if k not in dw_map:
                        dw_map[k] = f"B-{label}" if beginning else f"I-{label}"
                    # The first token of the span has now been seen, whether it
                    # was added here or already labelled by the model; every
                    # later token added for this span continues it with "I-".
                    beginning = False

    if conf.postprocess_phone:
        print("postprocess_phone")
        postprocess_regex(label="PHONE_NUM", fn=snlp.find_phone_number)
    if conf.postprocess_email:
        print("postprocess_email")
        postprocess_regex(label="EMAIL", fn=snlp.find_email)
    if conf.postprocess_url:
        print("postprocess_url")
        postprocess_regex(label="URL_PERSONAL", fn=snlp.find_url)

    # Rebuild the submission frame from the augmented mapping. Naming the
    # columns explicitly keeps the schema intact even when there are no rows.
    rows = [
        {"document": doc, "token": tok, "label": lab}
        for (doc, tok), lab in dw_map.items()
    ]
    sub = pd.DataFrame(rows, columns=["document", "token", "label"])
    sub["row_id"] = sub.index

In [10]:
# Write the predictions to submission.csv in the current working directory,
# without the DataFrame index as an extra column.
sub.to_csv("submission.csv", index=False)
# Print column names, non-null counts and dtypes as a quick sanity check.
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   document  53 non-null     int64 
 1   token     53 non-null     int64 
 2   label     53 non-null     object
 3   row_id    53 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 1.8+ KB


In [11]:
# Preview the first ten submission rows.
sub.head(10)

Unnamed: 0,document,token,label,row_id
0,7,6,B-STREET_ADDRESS,0
1,7,9,B-NAME_STUDENT,1
2,7,10,I-NAME_STUDENT,2
3,7,479,B-STREET_ADDRESS,3
4,7,482,B-NAME_STUDENT,4
5,7,483,I-NAME_STUDENT,5
6,7,610,I-PHONE_NUM,6
7,7,738,B-USERNAME,7
8,7,741,B-NAME_STUDENT,8
9,7,742,I-NAME_STUDENT,9


# Debug

In [12]:
#!python -V && which python
#!pip list