In [1]:
import os
import json
import gc
from datetime import datetime
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from typing import Dict, List, Set, Tuple, NamedTuple, Callable
import scml
from scml import pandasx as pdx
from mylib.ner import NerDataset
# Wall-clock timer for the whole notebook run; it is stopped in the final cell.
tim = scml.Timer()
tim.start()

# Environment switch read by the tokenizers library; "false" disables its parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Quantiles kept at hand for describe()-style summaries.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# Lift pandas' display limits so wide frames and long strings are shown in full.
for option_name in (
    "max_info_columns",
    "display.max_columns",
    "display.max_rows",
    "max_colwidth",
):
    pd.set_option(option_name, 9999)

# Register tqdm's progress_* methods on pandas objects.
tqdm.pandas()
scml.seed_everything()

# Print the int16 value range for reference.
info = np.iinfo(np.int16)
print(f"int16, min={info.min}, max={info.max}")

int16, min=-32768, max=32767


In [2]:
# Version tag embedded in the names of the files this notebook writes.
version = "v026"

# Training corpora; the file stem of each path becomes the record's "source" tag.
tra_files = list(
    map(
        Path,
        (
            "input/train.json",
            "input/nicholasbroad/nb01.json",
            "input/valentinwerner/pjm12.json",  # PJM v12
            "input/valentinwerner/moth12.json",  # Moth v12
            # "input/mpware/mpware02.json",  # mpware v2 (currently excluded)
            "input/tonyarobertson/tonrob01.json",  # tonyarobertson v1
            "input/mandrilator/mandri02.json",  # mandrilator v2
        ),
    )
)

# Validation corpora, kept separate from the training sources above.
val_files = list(
    map(
        Path,
        ("input/valentinwerner/vw03.json",),
    )
)
# Selects how PII label classes are grouped into difficulty tiers for
# stratification:
#   "hme" -> three tiers: hard / medium / easy
#   "hes" -> two tiers: hard / easy (medium is left empty); the group is later
#            combined with the record's data source name.
stratification_strategy = "hes"
if stratification_strategy == "hme":
    easy_classes = {"B-NAME_STUDENT", "I-NAME_STUDENT"}
    medium_classes = {
        "B-URL_PERSONAL", "I-URL_PERSONAL",
        "B-ID_NUM", "I-ID_NUM",
        "B-EMAIL", "I-EMAIL",
    }
    hard_classes = {
        "B-USERNAME", "I-USERNAME",
        "B-PHONE_NUM", "I-PHONE_NUM",
        "B-STREET_ADDRESS", "I-STREET_ADDRESS",
    }
elif stratification_strategy == "hes":
    easy_classes = {
        "B-URL_PERSONAL", "I-URL_PERSONAL",
        "B-USERNAME", "I-USERNAME",
        "B-PHONE_NUM", "I-PHONE_NUM",
        "B-EMAIL", "I-EMAIL",
    }
    medium_classes = set()
    # Every label needs its own trailing comma: adjacent string literals are
    # silently concatenated, which would merge two labels into one bogus entry
    # and drop both real labels from the set.
    hard_classes = {
        "B-NAME_STUDENT", "I-NAME_STUDENT",
        "B-STREET_ADDRESS", "I-STREET_ADDRESS",
        "B-ID_NUM", "I-ID_NUM",
    }
else:
    # Fail here rather than with a NameError in a later cell.
    raise ValueError(
        f"Unknown stratification_strategy: {stratification_strategy!r} (expected 'hme' or 'hes')"
    )

In [3]:
# Load every training corpus into one flat list of records, tagging each
# record with the stem of the file it came from under the "source" key.
tra = []
for filepath in tra_files:
    # JSON interchange text is UTF-8; state it explicitly so non-ASCII
    # characters decode the same way regardless of the platform's locale default.
    with filepath.open(encoding="utf-8") as f:
        records = json.load(f)
    source_name = filepath.stem
    for row in records:
        row["source"] = source_name
    tra.extend(records)
    # Per-file record count.
    print(f"{source_name}={len(records):,}")
print(f"len(tra)={len(tra):,}\ntra[0]={tra[0]}")

train=6,807
nb01=2,355
pjm12=2,000
moth12=4,434
tonrob01=1,850
mandri02=2,000
len(tra)=19,446
tra[0]={'document': 7, 'full_text': "Design Thinking for innovation reflexion-Avril 2021-Nathalie Sylla\n\nChallenge & selection\n\nThe tool I use to help all stakeholders finding their way through the complexity of a project is the  mind map.\n\nWhat exactly is a mind map? According to the definition of Buzan T. and Buzan B. (1999, Dessine-moi  l'intelligence. Paris: Les Éditions d'Organisation.), the mind map (or heuristic diagram) is a graphic  representation technique that follows the natural functioning of the mind and allows the brain's  potential to be released. Cf Annex1\n\nThis tool has many advantages:\n\n•  It is accessible to all and does not require significant material investment and can be done  quickly\n\n•  It is scalable\n\n•  It allows categorization and linking of information\n\n•  It can be applied to any type of situation: notetaking, problem solving, analysis, creation o

In [4]:
# Load every validation corpus into one flat list of records, tagging each
# record with the stem of the file it came from under the "source" key.
val = []
for filepath in val_files:
    # JSON interchange text is UTF-8; state it explicitly so non-ASCII
    # characters decode the same way regardless of the platform's locale default.
    with filepath.open(encoding="utf-8") as f:
        records = json.load(f)
    source_name = filepath.stem
    for row in records:
        row["source"] = source_name
    val.extend(records)
    # Per-file record count.
    print(f"{source_name}={len(records):,}")
print(f"len(val)={len(val):,}\nval[0]={val[0]}")

vw03=4,367
len(val)=4,367
val[0]={'tokens': [' ', 'Title', ':', 'Home', 'at', '958', 'Ryan', 'Ports', ',', 'Jamesstad', ',', 'FL', '88807', '\r\n\r\n', 'Living', 'at', '958', 'Ryan', 'Ports', 'in', 'the', 'quaint', 'and', 'picturesque', 'community', 'of', 'Jamesstad', ',', 'Florida', ',', 'is', 'an', 'experience', 'that', 'I', 'cherish', 'every', 'day', '.', 'Nestled', 'amidst', 'lush', 'greenery', 'and', 'tranquil', 'surroundings', ',', 'this', 'address', 'offers', 'a', 'unique', 'blend', 'of', 'peace', ',', 'comfort', ',', 'and', 'convenience', 'that', 'I', 'have', 'grown', 'to', 'appreciate', '.', '\r\n\r\n', 'The', 'location', 'of', 'my', 'residence', 'is', 'one', 'of', 'its', 'most', 'significant', 'advantages', '.', 'Tucked', 'away', 'from', 'the', 'hustle', 'and', 'bustle', 'of', 'city', 'life', ',', 'Jamesstad', 'provides', 'a', 'serene', 'environment', 'where', 'I', 'can', 'unwind', 'and', 'rejuvenate', '.', 'The', 'soothing', 'sounds', 'of', 'nature', '-', 'birds', 'chirping'

In [5]:
# Sanity check: after merging all training sources, every record must carry a
# distinct document id (compared as strings).
dids = {str(record["document"]) for record in tra}
print(f"len(dids)={len(dids):,}, len(tra)={len(tra):,}")
assert len(tra) == len(dids)

len(dids)=19,446, len(tra)=19,446


In [6]:
# For each split, assign every document a stratification group based on which
# label tiers occur in it, then display how often each group appears.
for fold in [tra, val]:
    stratification_groups = []
    for row in fold:
        present = set(row["labels"])
        # A label is credited to the first tier containing it, checked in the
        # order easy -> medium -> hard.
        beyond_easy = present - easy_classes
        easy = bool(present & easy_classes)
        # Under "hes" the medium flag is always on.
        medium = stratification_strategy == "hes" or bool(beyond_easy & medium_classes)
        hard = bool((beyond_easy - medium_classes) & hard_classes)
        if stratification_strategy == "hme":
            group = NerDataset.stratification_group(easy=easy, medium=medium, hard=hard)
            stratification_groups.append(group)
        elif stratification_strategy == "hes":
            group = NerDataset.stratification_group_hes(
                easy=easy, hard=hard, data_source_name=row["source"]
            )
            stratification_groups.append(group)
    group_counts = pdx.value_counts(pd.Series(stratification_groups))
    display(group_counts)

Unnamed: 0,count,percent
0_train,5862,0.30145
3_moth12,4202,0.216086
3_nb01,2333,0.119973
3_pjm12,1995,0.102592
3_mandri02,1669,0.085827
2_tonrob01,1109,0.05703
2_train,850,0.043711
3_tonrob01,741,0.038106
2_moth12,188,0.009668
1_mandri02,161,0.008279


Unnamed: 0,count,percent
2_vw03,1925,0.440806
3_vw03,989,0.226471
1_vw03,987,0.226013
0_vw03,466,0.106709


In [7]:
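# Stratification group legend: each code reads as 4*hard + 2*medium + easy.
# NOTE(review): this looks like the three-tier "hme" numbering — confirm against
# NerDataset.stratification_group.
# 1. hard=0, medium=0, easy=1
# 2. hard=0, medium=1, easy=0
# 3. hard=0, medium=1, easy=1
# 4. hard=1, medium=0, easy=0
# 5. hard=1, medium=0, easy=1
# 6. hard=1, medium=1, easy=0
# 7. hard=1, medium=1, easy=1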
#pdx.value_counts(pd.Series(stratification_groups))

In [8]:
# Persist both splits as JSON, with the version tag in each file name.
outputs = (
    (f"output/tra_{version}.json", tra),
    (f"output/val_{version}.json", val),
)
for out_path, records in outputs:
    with open(out_path, "w") as out_file:
        json.dump(records, out_file)
print(f"len(tra)={len(tra):,}, len(val)={len(val):,}")

len(tra)=19,446, len(val)=4,367


In [9]:
# Stop the timer started in the first cell and report the elapsed time.
tim.stop()
elapsed_message = f"Total time taken {str(tim.elapsed)}"
print(elapsed_message)

Total time taken 0:00:06.709409
