In [31]:
import os
import json
import gc
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from transformers import AutoTokenizer
from typing import Dict, List, Set, Tuple, NamedTuple, Callable
import scipy
import scml
from scml import pandasx as pdx
tim = scml.Timer()
tim.start()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()
scml.seed_everything()
info = np.iinfo(np.int16)
print(f"int16, min={info.min}, max={info.max}")

int16, min=-32768, max=32767


In [32]:
tokenizer = AutoTokenizer.from_pretrained("huggingface/microsoft/deberta-v3-base", is_fast=True)
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")

DebertaV2TokenizerFast(name_or_path='huggingface/microsoft/deberta-v3-base', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	128000: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
model_input_names=['input_ids', '



In [33]:
with open("input/train.json") as f:
    data = json.load(f)
print(data[0])

{'document': 7, 'full_text': "Design Thinking for innovation reflexion-Avril 2021-Nathalie Sylla\n\nChallenge & selection\n\nThe tool I use to help all stakeholders finding their way through the complexity of a project is the  mind map.\n\nWhat exactly is a mind map? According to the definition of Buzan T. and Buzan B. (1999, Dessine-moi  l'intelligence. Paris: Les Éditions d'Organisation.), the mind map (or heuristic diagram) is a graphic  representation technique that follows the natural functioning of the mind and allows the brain's  potential to be released. Cf Annex1\n\nThis tool has many advantages:\n\n•  It is accessible to all and does not require significant material investment and can be done  quickly\n\n•  It is scalable\n\n•  It allows categorization and linking of information\n\n•  It can be applied to any type of situation: notetaking, problem solving, analysis, creation of  new ideas\n\n•  It is suitable for all people and is easy to learn\n\n•  It is fun and encourages 

In [34]:
texts = []
label_examples = []
for row in tqdm(data):
    did = int(row["document"])
    tokens = row["tokens"]
    labels = row["labels"]
    assert len(tokens)==len(labels)
    ts=[]
    la=""
    for i in range(len(tokens)):
        if labels[i]=="O":
            if len(ts)!=0:
                label_examples.append({"tokens": " ".join(ts), "label": la, "did": did})
                ts=[]
                la=""
            continue
        if len(ts)==0:
            la=labels[i][2:]
        ts.append(tokens[i])
    # remember to save the last entity!
    if len(ts)!=0:
        label_examples.append({"tokens": " ".join(ts), "label": la, "did": did})
    texts.append(str(row["full_text"]))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6807/6807 [00:00<00:00, 19389.19it/s]


In [35]:
x = tokenizer(texts, truncation=False, add_special_tokens=False)
deberta_tokens = [len(tokens) for tokens in x["input_ids"]]
pd.Series(deberta_tokens).describe(percentiles=percentiles)

count    6807.000000
mean      690.068753
std       293.549286
min        65.000000
1%        164.000000
5%        285.000000
10%       359.000000
20%       453.000000
30%       523.000000
40%       592.000000
50%       655.000000
60%       720.000000
70%       799.000000
80%       901.000000
90%      1051.400000
95%      1214.000000
99%      1590.700000
max      3074.000000
dtype: float64

In [36]:
df = pd.DataFrame.from_records(label_examples)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1603 entries, 0 to 1602
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tokens  1603 non-null   object
 1   label   1603 non-null   object
 2   did     1603 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 37.7+ KB


In [37]:
df.head()

Unnamed: 0,tokens,label,did
0,Nathalie Sylla,NAME_STUDENT,7
1,Nathalie Sylla,NAME_STUDENT,7
2,Nathalie Sylla,NAME_STUDENT,7
3,Diego Estrada,NAME_STUDENT,10
4,Diego Estrada,NAME_STUDENT,10


In [38]:
pdx.value_counts(df["label"])

Unnamed: 0_level_0,count,percent
label,Unnamed: 1_level_1,Unnamed: 2_level_1
NAME_STUDENT,1365,0.851528
URL_PERSONAL,109,0.067998
ID_NUM,76,0.047411
EMAIL,39,0.024329
USERNAME,6,0.003743
PHONE_NUM,6,0.003743
STREET_ADDRESS,2,0.001248


In [39]:
%%time
df.to_csv(f"output/label_examples.csv", index=False)
assert df.notna().all(axis=None)

CPU times: user 1.34 ms, sys: 551 µs, total: 1.89 ms
Wall time: 1.5 ms


In [40]:
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")

Total time taken 0:00:06.832944
