In [18]:
from pathlib import Path
import csv
import spacy
import json
from collections import Counter

In [13]:
# Components named here are switched off when the pretrained pipeline is loaded.
disable = [
    'tok2vec',
    'tagger',
    'attribute_ruler',
    'lemmatizer',
    'parser',
    'ner',
]
nlp = spacy.load("en_core_sci_sm", disable=disable)

# Sentence boundaries come from the "sentencizer" component added here.
# NOTE(review): the previous inline comment read "use senter" -- presumably a
# reminder to try a different sentence segmenter; confirm whether that is still planned.
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7fb95b2740a0>

In [4]:
# Dataset layout: <data_path>/train and <data_path>/dev.
data_path = Path("n2c2Track1TrainingData/data")
train_path = data_path / "train"
dev_path = data_path / "dev"


def _files_by_name(directory, pattern):
    """Recursively collect paths under `directory` matching `pattern`, ordered by file name."""
    return sorted(directory.rglob(pattern), key=lambda found: found.name)


# Text (*.txt) and annotation (*.json) files for each split.
train_data_files = _files_by_name(train_path, "*.txt")
train_ann_files = _files_by_name(train_path, "*.json")
dev_data_files = _files_by_name(dev_path, "*.txt")
dev_ann_files = _files_by_name(dev_path, "*.json")

# Each split must contain as many annotation files as text files.
assert len(train_data_files) == len(train_ann_files)
assert len(dev_data_files) == len(dev_ann_files)

print(f"Training samples: {len(train_data_files)}")
print(f"Dev samples: {len(dev_data_files)}")

Training samples: 350
Dev samples: 50


In [11]:
# Tally structure for the training split: one running total plus, for every
# task, a {label: count} table that starts at zero.
# NOTE(review): the key "certainity" is kept exactly as spelled; it is used as
# a lookup key into the annotations, so presumably it matches the JSON -- confirm
# before renaming.
train_label_counts = {
    "num_medications": 0,
    "event": dict.fromkeys(["Disposition", "NoDisposition", "Undetermined"], 0),
    "action": dict.fromkeys(
        ["Start", "Stop", "Increase", "Decrease", "UniqueDose", "OtherChange", "Unknown"], 0
    ),
    "temporality": dict.fromkeys(["Past", "Present", "Future", "Unknown"], 0),
    "certainity": dict.fromkeys(["Certain", "Hypothetical", "Conditional", "Unknown"], 0),
    "actor": dict.fromkeys(["Physician", "Patient", "Unknown"], 0),
}

In [12]:
# Tally how often each label occurs in the training annotations.
#
# Every annotation file is parsed as JSON and iterated as a sequence of
# annotations; each annotation is looked up by task name ("event", "action",
# ...). A task that is absent from an annotation is skipped.

# Start from zero so that re-running this cell does not add a second pass on
# top of the totals already stored in train_label_counts.
train_label_counts["num_medications"] = 0
train_tasks = [task for task in train_label_counts if task != "num_medications"]
for task in train_tasks:
    for label in train_label_counts[task]:
        train_label_counts[task][label] = 0

# Values outside the predefined label set used to be dropped without a trace;
# they are collected here, keyed by (task, value), so that gaps in the
# per-task totals can be explained.
train_unexpected_labels = Counter()

for train_ann_path in train_ann_files:
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    anns = json.loads(train_ann_path.read_text(encoding="utf-8"))
    for ann in anns:
        train_label_counts["num_medications"] += 1
        for task in train_tasks:
            if task not in ann:
                continue
            label = ann[task]
            if label in train_label_counts[task]:
                train_label_counts[task][label] += 1
            else:
                train_unexpected_labels[(task, label)] += 1

for task in train_label_counts:
    print(train_label_counts[task])

# Only shown when something was left out of the tables printed above.
if train_unexpected_labels:
    print(f"Unexpected labels (not counted above): {dict(train_unexpected_labels)}")

6188
{'Disposition': 1183, 'NoDisposition': 4535, 'Undetermined': 470}
{'Start': 466, 'Stop': 280, 'Increase': 103, 'Decrease': 41, 'UniqueDose': 263, 'OtherChange': 0, 'Unknown': 0}
{'Past': 605, 'Present': 440, 'Future': 112, 'Unknown': 0}
{'Certain': 993, 'Hypothetical': 105, 'Conditional': 83, 'Unknown': 0}
{'Physician': 1077, 'Patient': 88, 'Unknown': 0}


In [21]:
# Tally structure for the dev split: one running total plus, for every task,
# a {label: count} table that starts at zero.
# NOTE(review): the key "certainity" is kept exactly as spelled; it is used as
# a lookup key into the annotations, so presumably it matches the JSON -- confirm
# before renaming.
dev_label_counts = {
    "num_medications": 0,
    "event": dict.fromkeys(["Disposition", "NoDisposition", "Undetermined"], 0),
    "action": dict.fromkeys(
        ["Start", "Stop", "Increase", "Decrease", "UniqueDose", "OtherChange", "Unknown"], 0
    ),
    "temporality": dict.fromkeys(["Past", "Present", "Future", "Unknown"], 0),
    "certainity": dict.fromkeys(["Certain", "Hypothetical", "Conditional", "Unknown"], 0),
    "actor": dict.fromkeys(["Physician", "Patient", "Unknown"], 0),
}

In [22]:
# Tally how often each label occurs in the dev annotations.
#
# Every annotation file is parsed as JSON and iterated as a sequence of
# annotations; each annotation is looked up by task name ("event", "action",
# ...). A task that is absent from an annotation is skipped.

# Start from zero so that re-running this cell does not add a second pass on
# top of the totals already stored in dev_label_counts.
dev_label_counts["num_medications"] = 0
dev_tasks = [task for task in dev_label_counts if task != "num_medications"]
for task in dev_tasks:
    for label in dev_label_counts[task]:
        dev_label_counts[task][label] = 0

# Values outside the predefined label set used to be dropped without a trace;
# they are collected here, keyed by (task, value), so that gaps in the
# per-task totals can be explained.
dev_unexpected_labels = Counter()

for dev_ann_path in dev_ann_files:
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    anns = json.loads(dev_ann_path.read_text(encoding="utf-8"))
    for ann in anns:
        dev_label_counts["num_medications"] += 1
        for task in dev_tasks:
            if task not in ann:
                continue
            label = ann[task]
            if label in dev_label_counts[task]:
                dev_label_counts[task][label] += 1
            else:
                dev_unexpected_labels[(task, label)] += 1

for task in dev_label_counts:
    print(dev_label_counts[task])

# Only shown when something was left out of the tables printed above.
if dev_unexpected_labels:
    print(f"Unexpected labels (not counted above): {dict(dev_unexpected_labels)}")

1031
{'Disposition': 219, 'NoDisposition': 725, 'Undetermined': 87}
{'Start': 95, 'Stop': 60, 'Increase': 23, 'Decrease': 13, 'UniqueDose': 22, 'OtherChange': 0, 'Unknown': 0}
{'Past': 129, 'Present': 54, 'Future': 33, 'Unknown': 0}
{'Certain': 173, 'Hypothetical': 29, 'Conditional': 17, 'Unknown': 0}
{'Physician': 192, 'Patient': 17, 'Unknown': 0}


In [19]:
# Corpus statistics for the training split. Every token the pipeline emits is
# counted (no filtering is applied here), and "types" are distinct token texts.
counter = Counter()
num_tokens, num_sents = 0, 0

for train_data_path in train_data_files:
    doc = nlp(train_data_path.read_text())
    num_tokens += len(doc)
    # doc.sents is consumed once just to count the sentences.
    num_sents += sum(1 for _ in doc.sents)
    counter.update(token.text for token in doc)

print(f"Training tokens: {num_tokens}")
print(f"Training types: {len(counter)}")
print(f"Training sentences: {num_sents}")

Training tokens: 314674
Training types: 23275
Training sentences: 15228


In [20]:
# Corpus statistics for the dev split. Every token the pipeline emits is
# counted (no filtering is applied here), and "types" are distinct token texts.
# The accumulators are re-created here, replacing any earlier values.
counter = Counter()
num_tokens, num_sents = 0, 0

for dev_data_path in dev_data_files:
    doc = nlp(dev_data_path.read_text())
    num_tokens += len(doc)
    # doc.sents is consumed once just to count the sentences.
    num_sents += sum(1 for _ in doc.sents)
    counter.update(token.text for token in doc)

print(f"Dev tokens: {num_tokens}")
print(f"Dev types: {len(counter)}")
print(f"Dev sentences: {num_sents}")

Dev tokens: 48987
Dev types: 8224
Dev sentences: 2574
