In [1]:
import os
import json
import gc
from datetime import datetime
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from typing import Dict, List, Set, Tuple, NamedTuple, Callable
import scml
from scml import pandasx as pdx
from mylib.ner import NerDataset
# Start the timer first so it covers everything that runs after this point.
tim = scml.Timer()
tim.start()

# NOTE(review): presumably read by the tokenizers library to disable its
# internal parallelism - confirm against the library that consumes it.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Percentile cut points kept available for later summary statistics.
percentiles = [
    0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99,
]

# Raise every pandas display limit to the same large value so that frames and
# long text columns are shown without truncation.
for option_name in (
    "max_info_columns",
    "display.max_columns",
    "display.max_rows",
    "max_colwidth",
):
    pd.set_option(option_name, 9999)

# Register tqdm's progress-bar methods on pandas objects.
tqdm.pandas()
scml.seed_everything()

# Print the representable range of int16 for reference.
info = np.iinfo(np.int16)
print(f"int16, min={info.min}, max={info.max}")

int16, min=-32768, max=32767


In [2]:
# Input files whose records are concatenated into the working dataset.
files = [Path("input/train.json")]

# Label classes grouped into three difficulty tiers. Each entity type appears
# with both its B- and I- tag.
easy_classes = {
    "B-NAME_STUDENT",
    "I-NAME_STUDENT",
}
medium_classes = {
    "B-URL_PERSONAL",
    "I-URL_PERSONAL",
    "B-ID_NUM",
    "I-ID_NUM",
    "B-EMAIL",
    "I-EMAIL",
}
hard_classes = {
    "B-USERNAME",
    "I-USERNAME",
    "B-PHONE_NUM",
    "I-PHONE_NUM",
    "B-STREET_ADDRESS",
    "I-STREET_ADDRESS",
}

# Fraction of documents held out for validation; the number of folds is its
# reciprocal, truncated to an integer.
validation_frac = 0.05
n_splits = int(1 / validation_frac)

# Date stamp (yymmdd) used to version the output file names.
version = datetime.now().strftime("%y%m%d")
print(f"n_splits={n_splits}\nversion={version}")

n_splits=20
version=240221


In [3]:
# Load every input file and concatenate the records into one list.
data = []
for filepath in files:
    # The documents contain non-ASCII characters, so decode as UTF-8 explicitly
    # instead of relying on the platform's default locale encoding.
    with open(filepath, encoding="utf-8") as f:
        data += json.load(f)
# NOTE(review): this prints the entire first record, including its full text;
# consider clearing the cell output before sharing the notebook.
print(f"len(data)={len(data):,}\ndata[0]={data[0]}")

len(data)=6,807
data[0]={'document': 7, 'full_text': "Design Thinking for innovation reflexion-Avril 2021-Nathalie Sylla\n\nChallenge & selection\n\nThe tool I use to help all stakeholders finding their way through the complexity of a project is the  mind map.\n\nWhat exactly is a mind map? According to the definition of Buzan T. and Buzan B. (1999, Dessine-moi  l'intelligence. Paris: Les Éditions d'Organisation.), the mind map (or heuristic diagram) is a graphic  representation technique that follows the natural functioning of the mind and allows the brain's  potential to be released. Cf Annex1\n\nThis tool has many advantages:\n\n•  It is accessible to all and does not require significant material investment and can be done  quickly\n\n•  It is scalable\n\n•  It allows categorization and linking of information\n\n•  It can be applied to any type of situation: notetaking, problem solving, analysis, creation of  new ideas\n\n•  It is suitable for all people and is easy to learn\n\n•  I

In [4]:
# One stratification group per document, derived from which difficulty tiers
# (easy / medium / hard label classes) occur among the document's labels.
# A tier counts as present as soon as any label of the document belongs to it.
stratification_groups = [
    NerDataset.stratification_group(
        easy=not easy_classes.isdisjoint(row["labels"]),
        medium=not medium_classes.isdisjoint(row["labels"]),
        hard=not hard_classes.isdisjoint(row["labels"]),
    )
    for row in data
]

In [5]:
# Hold out a single stratified fold (1/n_splits of the documents) for
# validation and keep the remaining documents for training.
splitter = StratifiedKFold(n_splits=n_splits, shuffle=False)
# The folds are built from `y`; X is only a placeholder of the right length.
dummy = np.zeros(len(data))
# Only the first of the n_splits folds is needed. With shuffle=False the split
# is deterministic for a given ordering of `data`.
ti, vi = next(iter(splitter.split(dummy, y=stratification_groups)))
tra = [data[i] for i in ti]
val = [data[i] for i in vi]

# Create the output directory up front so a fresh checkout does not fail with
# FileNotFoundError on the first write.
Path("output").mkdir(parents=True, exist_ok=True)
with open(f"output/tra_{version}.json", "w", encoding="utf-8") as f:
    json.dump(tra, f)
with open(f"output/val_{version}.json", "w", encoding="utf-8") as f:
    json.dump(val, f)
print(f"len(tra)={len(tra):,}, len(val)={len(val):,}")



len(tra)=6,466, len(val)=341


In [6]:
# Stop the timer that was started in the first cell and report the elapsed time.
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")

Total time taken 0:00:02.486474
