In [1]:
import html
from pathlib import Path

import pandas as pd

In [2]:
# Collect every processed JSON file below ./data/processed/ (recursively).
# The result is sorted because glob order depends on the filesystem, and
# later cells refer to entries of this list; sorting keeps the listing
# reproducible across machines.
file_paths = sorted(Path("./data/processed/").rglob("*.json"))
file_paths

[WindowsPath('data/processed/bn/test.json'),
 WindowsPath('data/processed/bn/train.json'),
 WindowsPath('data/processed/en/test.json'),
 WindowsPath('data/processed/en/train.json'),
 WindowsPath('data/processed/hn/test.json'),
 WindowsPath('data/processed/hn/train.json'),
 WindowsPath('data/processed/ma/test.json'),
 WindowsPath('data/processed/ma/train.json'),
 WindowsPath('data/processed/tm/test.json'),
 WindowsPath('data/processed/tm/train.json')]

In [3]:
# Inspect the path components of the first file; the last two components are
# the language directory and the split file name (see the output below).
file_paths[0].parts

('data', 'processed', 'bn', 'test.json')

In [4]:
# Number of documents per (language, split) pair.
# Language and split are read from the file's own location (parent directory
# name and file stem) rather than by unpacking `.parts` into exactly four
# names: that unpacking raises ValueError as soon as the data directory sits
# at a different depth, and it also assigns to `_`, which IPython uses for the
# last cell output.
counts = {}
for file_path in file_paths:
    lang = file_path.parent.name
    split = file_path.stem
    # Each file is JSON Lines: one record (document) per line.
    counts[(lang, split)] = pd.read_json(file_path, orient="records", lines=True).shape[0]

In [5]:
# Long-format table of the counts: one row per (language, split) pair.
df_t = pd.DataFrame(
    [(*lang_and_split, n_docs) for lang_and_split, n_docs in counts.items()],
    columns=["lang", "split", "#docs"],
)
df_t

Unnamed: 0,lang,split,#docs
0,bn,test,204
1,bn,train,800
2,en,test,206
3,en,train,828
4,hn,test,160
5,hn,train,677
6,ma,test,265
7,ma,train,1030
8,tm,test,257
9,tm,train,1013


In [6]:
# Wide view of the same counts: one row per split, one column per language.
df_t.pivot(index="split", columns="lang", values="#docs")

lang,bn,en,hn,ma,tm
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
test,204,206,160,265,257
train,800,828,677,1030,1013


In [7]:
# Emit the split-by-language table as LaTeX source; print() is intentional
# here so the raw LaTeX text can be copied from the cell output.
print(df_t.pivot(index="split", columns="lang", values="#docs").to_latex())

\begin{tabular}{lrrrrr}
\toprule
lang &   bn &   en &   hn &    ma &    tm \\
split &      &      &      &       &       \\
\midrule
test  &  204 &  206 &  160 &   265 &   257 \\
train &  800 &  828 &  677 &  1030 &  1013 \\
\bottomrule
\end{tabular}



In [8]:
# Load the first document of the English training split as a worked example.
# The file is selected by language/split instead of by position in
# `file_paths`, so adding files or a different listing order cannot silently
# switch the example to another file.
sample_path = next(
    path for path in file_paths
    if path.parent.name == "en" and path.stem == "train"
)
row = pd.read_json(sample_path, orient="records", lines=True).loc[0, ["tokens", "tags"]]
row

tokens    [1, dead,, 18, hurt, in, explosion, at, natura...
tags      [B-CASUALTIES-ARG, I-CASUALTIES-ARG, I-CASUALT...
Name: 0, dtype: object

In [9]:
# Show the example document as space-separated token/tag pairs
# (print's default separator is a single space).
print(*(f"{token}/{tag}" for token, tag in zip(row.tokens, row.tags)))

1/B-CASUALTIES-ARG dead,/I-CASUALTIES-ARG 18/I-CASUALTIES-ARG hurt/I-CASUALTIES-ARG in/O explosion/B-MAN_MADE_EVENT.INDUSTRIAL_ACCIDENT at/O natural/B-PLACE-ARG gas/I-PLACE-ARG plant/I-PLACE-ARG An/O explosion/B-MAN_MADE_EVENT.INDUSTRIAL_ACCIDENT on/O Tuesday/B-TIME-ARG at/O a/O natural/B-PLACE-ARG gas/I-PLACE-ARG facility/I-PLACE-ARG near/I-PLACE-ARG Austria’s/I-PLACE-ARG border/I-PLACE-ARG with/I-PLACE-ARG Slovakia/I-PLACE-ARG left/O one/B-CASUALTIES-ARG person/I-CASUALTIES-ARG dead,/I-CASUALTIES-ARG authorities/O said./O A/O further/O 18/B-CASUALTIES-ARG people/I-CASUALTIES-ARG were/I-CASUALTIES-ARG injured/I-CASUALTIES-ARG in/B-TIME-ARG the/I-TIME-ARG morning/I-TIME-ARG blast/B-MAN_MADE_EVENT.INDUSTRIAL_ACCIDENT at/O the/O plant/B-PLACE-ARG in/I-PLACE-ARG Baumgarten/I-PLACE-ARG an/I-PLACE-ARG der/I-PLACE-ARG March,/I-PLACE-ARG east/I-PLACE-ARG of/I-PLACE-ARG Vienna,/I-PLACE-ARG regional/O Red/O Cross/O official/O Sonja/O Kellner/O said./O Two/O medical/O helicopters/O were/O sent/O

In [10]:
from spacy import displacy

In [18]:
def split_tag(tag):
    """Split a tag string into its boundary marker and label.

    The outside tag "O" yields ("O", None). Any other tag is split at its
    first "-" only, so labels that themselves contain "-" stay intact
    (e.g. "B-PLACE-ARG" -> ("B", "PLACE-ARG")).
    """
    if tag == "O":
        return (tag, None)
    return tuple(tag.split("-", 1))

def extract_entities(tags):
    """Collect entity spans from a sequence of B/I/O tags.

    Args:
        tags: iterable of tag strings such as "B-PLACE-ARG", "I-PLACE-ARG", "O".

    Returns:
        A list of (label, start, end) tuples in order of appearance, where
        start/end are token positions and end is exclusive.

    An entity opens on a "B" tag and is extended by following "I" tags that
    carry the same label. It is closed by a "B" tag, an "O" tag, or a tag with
    a different label. An "I" tag met while no entity is open is ignored.
    """
    # A trailing sentinel "O" guarantees that an entity still open at the end
    # of the sequence gets flushed into the result.
    padded_tags = [*tags, "O"]
    found = []
    open_span = []  # (boundary, label) pairs of the entity currently being read
    for position, raw_tag in enumerate(padded_tags):
        boundary, label = split_tag(raw_tag)
        if open_span:
            open_label = open_span[-1][1]
            if boundary in {"B", "O"} or label != open_label:
                # The open entity ended just before this position.
                found.append((open_label, position - len(open_span), position))
                open_span = []
            elif boundary == "I":
                open_span.append((boundary, label))
        if boundary == "B":
            # Any previously open entity was closed above, so a new one starts here.
            assert not open_span, f"Entity should be empty. Found: {open_span}"
            open_span.append((boundary, label))
    return found


def get_entity_info(bio_labels, tokens, text=None, spans=None):
    """Describe every entity found in `bio_labels` as a dict.

    Args:
        bio_labels: tag sequence passed to `extract_entities`.
        tokens: token sequence; sliced with each entity's token range.
        text: optional full text the character offsets refer to.
        spans: optional per-token (start_char, end_char) pairs into `text`.

    Returns:
        A list of dicts with keys tokens, label, start, start_char_idx,
        end_char_idx, end and entity_phrase. The character offsets and the
        phrase are None unless both `text` and `spans` are given (truthy).
    """
    records = []
    for label, start, end in extract_entities(bio_labels):
        if text and spans:
            # Character range runs from the first token's start to the last
            # token's end (`end` is exclusive, hence end - 1).
            char_from = spans[start][0]
            char_to = spans[end - 1][1]
            phrase = text[char_from:char_to]
        else:
            char_from = char_to = phrase = None
        records.append({
            "tokens": tokens[start:end],
            "label": label,
            "start": start,
            "start_char_idx": char_from,
            "end_char_idx": char_to,
            "end": end,
            "entity_phrase": phrase,
        })
    return records

In [37]:
from IPython.display import display, display_html, HTML

In [42]:
def render(tokens, tags):
    """Display a tagged document as highlighted entities plus raw BIO text.

    Args:
        tokens: sequence of token strings.
        tags: sequence of tag strings aligned with `tokens`.

    The text is rebuilt by joining the tokens with single spaces, entity
    character offsets are derived from that same joined text, and the result
    is shown with displaCy's manual entity renderer, followed by the
    token/tag listing.
    """
    # Build everything from the arguments (not from notebook globals) so the
    # function renders whatever document it is given.
    text = " ".join(tokens)

    # Character span of each token inside `text`.
    spans = []
    start_char_idx = 0
    for token in tokens:
        spans.append((start_char_idx, start_char_idx + len(token)))
        start_char_idx += len(token) + 1  # +1 for the joining space

    entity_info = get_entity_info(tags, tokens, text, spans)

    # Sorted so that the label -> colour assignment is the same on every run
    # (iteration order of a set of strings is not stable across sessions).
    unique_labels = sorted({e["label"] for e in entity_info})
    pallet = "#48cd4a,#cb4f78,#e97e0b,#48f061,#8ff1df,#9ead18,#27e57b,#e051c0,#7add51,#dab304,#1dfafb".split(",")
    # zip stops at the shorter input: labels beyond the palette length get no
    # explicit colour here.
    colors = dict(zip(unique_labels, pallet))

    doc = {
        "text": text,
        "ents": [
            {"label": e["label"], "start": e["start_char_idx"], "end": e["end_char_idx"]}
            for e in entity_info
        ]
    }
    bio_text = " ".join(f"{token}/{tag}" for token, tag in zip(tokens, tags))
    display(HTML("<h3>Highlighted Entities</h3>"))
    displacy.render(doc, style="ent", manual=True, options=dict(colors=colors))
    # Escape the listing: tokens may contain characters such as "<" or "&"
    # that would otherwise be interpreted as markup inside the <pre> element.
    display(HTML(f"<h3>BIO Format</h3><pre>{html.escape(bio_text)}</pre>"))

In [43]:
# Render the example document loaded above.
render(row.tokens, row.tags)