In [1]:
from pathlib import Path
import csv
import spacy

In [2]:
# Pipeline components of the loaded model that are switched off; after this
# only tokenization plus the sentence splitter added below remain active.
disable = [
    "tok2vec",
    "tagger",
    "attribute_ruler",
    "lemmatizer",
    "parser",
    "ner",
]
nlp = spacy.load("en_core_sci_sm", disable=disable)
# NOTE(review): the original inline note here read "use senter" -- presumably a
# reminder to try spaCy's trained "senter" component instead of the rule-based
# sentencizer; confirm before acting on it.
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7f7dcde53140>

In [3]:
# paths to data directories
data_path = Path("n2c2Track1TrainingData/data")
train_path = data_path / "train"
dev_path = data_path / "dev"


def list_split_files(split_dir):
    """Return the note and annotation files found under ``split_dir``.

    Parameters
    ----------
    split_dir : pathlib.Path
        Directory searched recursively for ``*.txt`` and ``*.ann`` files.

    Returns
    -------
    tuple[list[pathlib.Path], list[pathlib.Path]]
        ``(data_files, ann_files)``, each sorted by file name, so that the
        i-th note belongs to the i-th annotation file.

    Raises
    ------
    AssertionError
        If the two lists differ in length, or if any positional pair does
        not share the same file stem.
    """
    data_files = sorted(split_dir.rglob("*.txt"), key=lambda p: p.name)
    ann_files = sorted(split_dir.rglob("*.ann"), key=lambda p: p.name)

    assert len(data_files) == len(ann_files), (
        f"{split_dir}: {len(data_files)} .txt files but {len(ann_files)} .ann files"
    )
    # Equal counts alone do not guarantee alignment: later cells index both
    # lists with the same position, so every pair must share its stem.
    mismatched = [
        (txt.name, ann.name)
        for txt, ann in zip(data_files, ann_files)
        if txt.stem != ann.stem
    ]
    assert not mismatched, f"{split_dir}: unpaired files, e.g. {mismatched[0]}"
    return data_files, ann_files


# paths to data and annotation files
train_data_files, train_ann_files = list_split_files(train_path)
dev_data_files, dev_ann_files = list_split_files(dev_path)

print(f"Training samples: {len(train_data_files)}")
print(f"Dev samples: {len(dev_data_files)}")

Training samples: 350
Dev samples: 50


In [4]:
# Read the training note at index 2 and the annotation file at the same
# index as plain text (note first, annotations second).
example_data, example_anns = (
    file_list[2].read_text() for file_list in (train_data_files, train_ann_files)
)

In [5]:
# Run the example note through the `nlp` pipeline built above (listed
# components disabled, "sentencizer" added); `doc` is what later cells query
# for token spans and sentence boundaries.
doc = nlp(example_data)

In [18]:
# Show the raw text of the example note.
# NOTE(review): the saved output contains clinical note text (names, record
# number, address) -- confirm the dataset's terms allow keeping it in a
# shared notebook, otherwise clear this output before committing.
print(example_data)




Record date: 2079-08-11




MERCY CARE CENTER
MERCY Internal Medicine Associates
07 Newburgh Street
Terrell, AR  72985

Russell, Donna
10418061
08/11/79


HISTORY OF PRESENT ILLNESS:  Ms. Russell returns for further evaluation.  
She has been doing very well.  She has been exercising and has lost from 
200 to 170 pounds.  She is feeling very well.

Notably, Dr. Lu sent her for a follow-up ETT.  He did not find the 
first ETT acceptable.  The second ETT was grossly positive.  As a result 
of this, I think it is reasonable for us in addition to having her on 
atenolol to stop the hydrochlorothiazide, put her on ramipril and a 
nitrate.  She is having once every two weeks feeling a slight twinge of 
pain that she was having before when she went up steps.

She did have hyperlipidemia.  We have put her on Lipitor, which has 
provided some control.  However, her HCL is still 36 and LDL 118, which 
is not an excellent ratio.  Nonetheless, her CK has been within normal 
limits.

She also ha

In [19]:
# Show the raw annotation file. As the output shows, every line is
# tab-separated and starts with an id whose first letter gives the row kind:
#   T<n>  "<label> <start> <end>"   <mention text>
#   E<n>  "<label>:T<n> "
#   A<n>  "<attribute> E<n> <value>"
print(example_anns)

T2	NoDisposition 567 575	atenolol
E2	NoDisposition:T2 
T3	Disposition 620 628	ramipril
E3	Disposition:T3 
T4	Disposition 815 822	Lipitor
E4	Disposition:T4 
T6	Disposition 1707 1726	long acting nitrate
E6	Disposition:T6 
T7	Undetermined 1746 1760	nitroglycerine
E7	Undetermined:T7 
T8	Disposition 1780 1799	hydrochlorothiazide
E8	Disposition:T8 
T9	NoDisposition 1818 1826	atenolol
E9	NoDisposition:T9 
T10	NoDisposition 1848 1855	aspirin
E10	NoDisposition:T10 
T11	NoDisposition 1874 1882	Premarin
E11	NoDisposition:T11 
T12	NoDisposition 1887 1896	Synthroid
E12	NoDisposition:T12 
T13	Disposition 588 607	hydrochlorothiazide
E13	Disposition:T13 
A1	Certainity E13 Certain
A2	Actor E13 Physician
A3	Action E13 Stop
A4	Temporality E13 Present
A5	Certainity E3 Certain
A6	Actor E3 Physician
A7	Action E3 Start
A8	Temporality E3 Present
T14	Disposition 636 643	nitrate
E14	Disposition:T14 
A9	Certainity E14 Certain
A10	Actor E14 Physician
A11	Action E14 Start
A12	Temporality E14 Present
A13	Certainity

In [10]:
# Split every annotation line into its tab-separated fields.
# The file is plain tab-separated text, not quoted CSV, so quote handling is
# turned off: with the csv defaults a field starting with '"' would swallow
# the following tabs (and possibly the following lines) into one field.
parsed_anns = list(
    csv.reader(example_anns.splitlines(), delimiter="\t", quoting=csv.QUOTE_NONE)
)
parsed_anns

[['T2', 'NoDisposition 567 575', 'atenolol'],
 ['E2', 'NoDisposition:T2 '],
 ['T3', 'Disposition 620 628', 'ramipril'],
 ['E3', 'Disposition:T3 '],
 ['T4', 'Disposition 815 822', 'Lipitor'],
 ['E4', 'Disposition:T4 '],
 ['T6', 'Disposition 1707 1726', 'long acting nitrate'],
 ['E6', 'Disposition:T6 '],
 ['T7', 'Undetermined 1746 1760', 'nitroglycerine'],
 ['E7', 'Undetermined:T7 '],
 ['T8', 'Disposition 1780 1799', 'hydrochlorothiazide'],
 ['E8', 'Disposition:T8 '],
 ['T9', 'NoDisposition 1818 1826', 'atenolol'],
 ['E9', 'NoDisposition:T9 '],
 ['T10', 'NoDisposition 1848 1855', 'aspirin'],
 ['E10', 'NoDisposition:T10 '],
 ['T11', 'NoDisposition 1874 1882', 'Premarin'],
 ['E11', 'NoDisposition:T11 '],
 ['T12', 'NoDisposition 1887 1896', 'Synthroid'],
 ['E12', 'NoDisposition:T12 '],
 ['T13', 'Disposition 588 607', 'hydrochlorothiazide'],
 ['E13', 'Disposition:T13 '],
 ['A1', 'Certainity E13 Certain'],
 ['A2', 'Actor E13 Physician'],
 ['A3', 'Action E13 Stop'],
 ['A4', 'Temporality E13 Pr

In [11]:
def parse_annotations(rows):
    """Assemble one dict per medication mention from parsed annotation rows.

    Parameters
    ----------
    rows : list[list[str]]
        Tab-split annotation lines (the shape of ``parsed_anns``). The first
        character of the first field selects how a row is used:

        * ``T`` rows ``[id, "<label> <start> <end>", text]`` create a mention
          with ``med_id``, ``char_span`` (ints) and ``med_name``.
        * ``E`` rows ``[id, "<event>:<T id> "]`` add ``event_id`` and
          ``event`` to the mention with that T id.
        * ``A`` rows ``[id, "<name> <E id> <value>"]`` add ``<name>``
          (lower-cased) -> ``<value>`` to the mention with that event id.

        Empty rows and rows of any other kind are ignored.

    Returns
    -------
    list[dict]
        Mentions in the order of their ``T`` rows. A mention without a
        matching ``E`` row simply has no ``event_id``/``event`` keys.
    """
    mentions = []
    by_med_id = {}
    for row in rows:
        # `row and ...` skips the empty lists csv.reader yields for blank lines.
        if row and row[0].startswith("T"):
            fields = row[1].split()
            mention = {
                "med_id": row[0],
                "char_span": (int(fields[1]), int(fields[2])),
                "med_name": row[2],
            }
            mentions.append(mention)
            by_med_id.setdefault(row[0], []).append(mention)

    for row in rows:
        if row and row[0].startswith("E"):
            # Only the first "<event>:<T id>" token is the trigger; taking it
            # explicitly keeps the match working if further tokens follow.
            event, _, med_id = row[1].split()[0].partition(":")
            for mention in by_med_id.get(med_id, []):
                mention["event_id"] = row[0]
                mention["event"] = event

    # Index by event id only after all E rows are applied, so a later E row
    # for the same mention wins, and mentions without an event are skipped
    # instead of raising KeyError.
    by_event_id = {}
    for mention in mentions:
        if "event_id" in mention:
            by_event_id.setdefault(mention["event_id"], []).append(mention)

    for row in rows:
        if row and row[0].startswith("A"):
            fields = row[1].split()
            for mention in by_event_id.get(fields[1], []):
                mention[fields[0].lower()] = fields[2]

    return mentions


annotations = parse_annotations(parsed_anns)

for ann in annotations:
    print(ann)

{'med_id': 'T2', 'char_span': (567, 575), 'med_name': 'atenolol', 'event_id': 'E2', 'event': 'NoDisposition'}
{'med_id': 'T3', 'char_span': (620, 628), 'med_name': 'ramipril', 'event_id': 'E3', 'event': 'Disposition', 'certainity': 'Certain', 'actor': 'Physician', 'action': 'Start', 'temporality': 'Present'}
{'med_id': 'T4', 'char_span': (815, 822), 'med_name': 'Lipitor', 'event_id': 'E4', 'event': 'Disposition', 'certainity': 'Certain', 'actor': 'Physician', 'action': 'Start', 'temporality': 'Past'}
{'med_id': 'T6', 'char_span': (1707, 1726), 'med_name': 'long acting nitrate', 'event_id': 'E6', 'event': 'Disposition', 'certainity': 'Certain', 'actor': 'Physician', 'action': 'Start', 'temporality': 'Present'}
{'med_id': 'T7', 'char_span': (1746, 1760), 'med_name': 'nitroglycerine', 'event_id': 'E7', 'event': 'Undetermined'}
{'med_id': 'T8', 'char_span': (1780, 1799), 'med_name': 'hydrochlorothiazide', 'event_id': 'E8', 'event': 'Disposition', 'certainity': 'Certain', 'actor': 'Physicia

In [8]:
# Dump the parsed annotation rows on one line.
# NOTE(review): the saved output holds only "Actor"/"Temporality" attribute
# rows, while the cell that assigns parsed_anns keeps every row -- this output
# looks like it came from an earlier, filtered version of the list. Re-run
# from a fresh kernel or delete this cell.
print(parsed_anns)

[['A2', 'Actor E13 Physician'], ['A4', 'Temporality E13 Present'], ['A6', 'Actor E3 Physician'], ['A8', 'Temporality E3 Present'], ['A10', 'Actor E14 Physician'], ['A12', 'Temporality E14 Present'], ['A14', 'Actor E4 Physician'], ['A16', 'Temporality E4 Past'], ['A18', 'Actor E15 Physician'], ['A20', 'Temporality E15 Present'], ['A22', 'Actor E6 Physician'], ['A24', 'Temporality E6 Present'], ['A27', 'Actor E8 Physician'], ['A29', 'Temporality E8 Present']]


In [9]:
# Show how the second column of the first row splits on whitespace.
# NOTE(review): the saved output is an attribute row ("Actor E13 Physician"),
# but the first row of parsed_anns as assigned in this notebook is a T row --
# the output appears stale; re-run from a fresh kernel or delete this cell.
print(parsed_anns[0][1].split())

['Actor', 'E13', 'Physician']


In [16]:
# Add the token-index span of every mention to its annotation dict
# (mutates the dicts in `annotations` in place).
for ann in annotations:
    start_char, end_char = ann["char_span"]
    # The default alignment is strict: char_span returns None when an offset
    # falls inside a token. "expand" snaps outwards to the enclosing tokens
    # and yields the same span whenever the offsets already align.
    span = doc.char_span(
        start_char, end_char, label=ann["med_id"], alignment_mode="expand"
    )
    if span is None:
        raise ValueError(
            f"{ann['med_id']}: characters {start_char}-{end_char} "
            "cannot be mapped to tokens of the document"
        )
    ann["token_span"] = (span.start, span.end)

In [17]:
# Print one annotation dict per line, now including "token_span".
# NOTE(review): the saved output lacks the "actor" and "temporality" keys
# that the earlier printout of `annotations` shows -- it seems to come from
# a different kernel state; re-run from a fresh kernel to confirm.
for annotation in annotations:
    print(annotation)

{'med_id': 'T2', 'char_span': (567, 575), 'med_name': 'atenolol', 'event_id': 'E2', 'event': 'NoDisposition', 'token_span': (133, 134)}
{'med_id': 'T3', 'char_span': (620, 628), 'med_name': 'ramipril', 'event_id': 'E3', 'event': 'Disposition', 'certainity': 'Certain', 'action': 'Start', 'token_span': (142, 143)}
{'med_id': 'T4', 'char_span': (815, 822), 'med_name': 'Lipitor', 'event_id': 'E4', 'event': 'Disposition', 'certainity': 'Certain', 'action': 'Start', 'token_span': (186, 187)}
{'med_id': 'T6', 'char_span': (1707, 1726), 'med_name': 'long acting nitrate', 'event_id': 'E6', 'event': 'Disposition', 'certainity': 'Certain', 'action': 'Start', 'token_span': (391, 394)}
{'med_id': 'T7', 'char_span': (1746, 1760), 'med_name': 'nitroglycerine', 'event_id': 'E7', 'event': 'Undetermined', 'token_span': (401, 402)}
{'med_id': 'T8', 'char_span': (1780, 1799), 'med_name': 'hydrochlorothiazide', 'event_id': 'E8', 'event': 'Disposition', 'certainity': 'Certain', 'action': 'Stop', 'token_span

In [21]:
# Parse the same annotation file straight from disk.
# newline="" is what the csv module requires of file objects it reads, and
# quoting is disabled because the file is plain tab-separated text in which
# a '"' character has no special meaning.
with train_ann_files[2].open(newline="") as f:
    anns = list(csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE))
print(anns)

[['T2', 'NoDisposition 567 575', 'atenolol'], ['E2', 'NoDisposition:T2 '], ['T3', 'Disposition 620 628', 'ramipril'], ['E3', 'Disposition:T3 '], ['T4', 'Disposition 815 822', 'Lipitor'], ['E4', 'Disposition:T4 '], ['T6', 'Disposition 1707 1726', 'long acting nitrate'], ['E6', 'Disposition:T6 '], ['T7', 'Undetermined 1746 1760', 'nitroglycerine'], ['E7', 'Undetermined:T7 '], ['T8', 'Disposition 1780 1799', 'hydrochlorothiazide'], ['E8', 'Disposition:T8 '], ['T9', 'NoDisposition 1818 1826', 'atenolol'], ['E9', 'NoDisposition:T9 '], ['T10', 'NoDisposition 1848 1855', 'aspirin'], ['E10', 'NoDisposition:T10 '], ['T11', 'NoDisposition 1874 1882', 'Premarin'], ['E11', 'NoDisposition:T11 '], ['T12', 'NoDisposition 1887 1896', 'Synthroid'], ['E12', 'NoDisposition:T12 '], ['T13', 'Disposition 588 607', 'hydrochlorothiazide'], ['E13', 'Disposition:T13 '], ['A1', 'Certainity E13 Certain'], ['A2', 'Actor E13 Physician'], ['A3', 'Action E13 Stop'], ['A4', 'Temporality E13 Present'], ['A5', 'Certaini

In [18]:
# Length of every sentence found by the sentencizer, in tokens
# (each value equals sent.end - sent.start).
for sent in doc.sents:
    print(len(sent))

50
8
15
7
12
11
8
37
26
6
15
20
12
8
13
17
27
5
7
9
15
7
6
12
8
19
3
12
8
12
16
3
4
7
12
5
3
5
14
22
28
57


In [25]:
# Total number of tokens in the example document.
len(doc)

591

In [26]:
# Token-index boundaries of every sentence; as the output shows, each end
# equals the next start, i.e. the end index is exclusive.
for sent in doc.sents:
    print(sent.start, sent.end)

0 50
50 58
58 73
73 80
80 92
92 103
103 111
111 148
148 174
174 180
180 195
195 215
215 227
227 235
235 248
248 265
265 292
292 297
297 304
304 313
313 328
328 335
335 341
341 353
353 361
361 380
380 383
383 395
395 403
403 415
415 431
431 434
434 438
438 445
445 457
457 462
462 465
465 470
470 484
484 506
506 534
534 591
