# Learning to Extract Pain Outcomes from Clinical Text without Labeled Data
## Supplement 1: Annotating Clinical Notes

This notebook shows how to use Snorkel's internal annotation utility for generating gold labels for model evaluation.

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to helper code on disk are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import re
import sys
import bz2
import glob
import codecs
import random
import shutil
import datetime
import numpy as np

from snorkel import SnorkelSession
from snorkel.models import candidate_subclass, Candidate, Document, Label, StableLabel
from snorkel.viewer import *
from extractlib.utils import *
from extractlib.corpora import ClefCorpus

# Single session object that every query, viewer and export in this notebook goes through.
session = SnorkelSession()

In [4]:
# Define a candidate space: a PainLocation relation made of a 'pain' span and
# an 'anatomy' span.
try:
    PainLocation = candidate_subclass('PainLocation', ['pain', 'anatomy'])
except Exception as err:
    # Presumably raised when this cell is re-run in a live kernel and the
    # subclass is already registered; in that case the PainLocation name bound
    # by the earlier run is still usable. Only Exception is caught so that
    # KeyboardInterrupt / SystemExit are not swallowed, and the real error is
    # shown because on a fresh kernel a failure here leaves PainLocation
    # undefined for the cells below.
    print("candidate subclass already exists, skipping... (%r)" % (err,))

## 1. Label Train Set Candidates

In [5]:
# Split 0 holds the training candidates; fetch them all through the generic
# Candidate class and report how many there are.
train_cands = (
    session.query(Candidate)
    .filter(Candidate.split == 0)
    .all()
)
print(len(train_cands))

225


In [6]:
# Show one candidate to check what the generic Candidate query returned.
print(train_cands[0])

PainLocation(Span("pain", sentence=6426, chars=[35,38], words=[4,4]), Span("chest", sentence=6426, chars=[29,33], words=[3,3]))


In [7]:
# Same training split (split 0), this time queried through the PainLocation
# subclass instead of the generic Candidate class.
train_cands = (
    session.query(PainLocation)
    .filter(PainLocation.split == 0)
    .all()
)
print(len(train_cands))

# First candidate, for comparison with the result of the generic query.
print(train_cands[0])

225
PainLocation(Span("pain", sentence=6426, chars=[35,38], words=[4,4]), Span("chest", sentence=6426, chars=[29,33], words=[3,3]))


In [9]:
from snorkel.viewer import SentenceNgramViewer

# Annotation widget over the training candidates, one candidate per page,
# created under the 'gold' annotator name (the name the export step filters on).
sv = SentenceNgramViewer(
    train_cands,
    session=session,
    n_per_page=1,
    height=225,
    annotator_name='gold',
)
# Leaving the viewer as the cell's last expression renders the widget.
sv

<IPython.core.display.Javascript object>

The installed widget Javascript is the wrong version. It must satisfy the semver range ~2.1.4.


## 2. Label Dev Set Candidates

In [10]:
# Split 1 holds the development candidates.
dev_cands = (
    session.query(Candidate)
    .filter(Candidate.split == 1)
    .all()
)
print(len(dev_cands))

73


In [11]:
# Annotation widget over the dev candidates, one candidate per page, created
# under the "gold" annotator name.
sv = SentenceNgramViewer(
    dev_cands,
    session=session,
    n_per_page=1,
    annotator_name="gold",
)
# Leaving the viewer as the cell's last expression renders the widget.
sv

<IPython.core.display.Javascript object>

The installed widget Javascript is the wrong version. It must satisfy the semver range ~2.1.4.


## 3. Label Test Set Candidates

In [10]:
# Split 2 holds the test candidates.
test_cands = (
    session.query(Candidate)
    .filter(Candidate.split == 2)
    .all()
)
print(len(test_cands))

137


In [11]:
# Annotation widget over the test candidates, one candidate per page, created
# under the "gold" annotator name.
sv = SentenceNgramViewer(
    test_cands,
    session=session,
    n_per_page=1,
    annotator_name="gold",
)
# Leaving the viewer as the cell's last expression renders the widget.
sv

<IPython.core.display.Javascript object>

## 4. Export Annotator Generated Labels

In [25]:
def export_labels(session, annotator_name, filename, delimitter="\t"):
    '''Write all stable labels created by one annotator to a delimited text file.

    The file starts with a header row (``context_stable_ids``, ``label``)
    followed by one row per matching StableLabel. It is always written as
    UTF-8, so stable ids containing non-ASCII characters export cleanly on
    both Python 2 and Python 3.

    :param session: session object used to query StableLabel rows
    :param annotator_name: only labels whose ``annotator_name`` equals this
        value are written
    :param filename: path of the output file; an existing file is overwritten
    :param delimitter: column separator (tab by default)
    '''
    # Imported locally so this cell does not depend on the notebook's import cell.
    import io

    text_type = type(u"")  # `unicode` on Python 2, `str` on Python 3
    sep = text_type(delimitter)

    with io.open(filename, "w", encoding="utf-8") as fp:
        fp.write(sep.join([u"context_stable_ids", u"label"]) + u"\n")
        for label in session.query(StableLabel).all():
            # Labels from every annotator share one table; keep only ours.
            if label.annotator_name != annotator_name:
                continue
            row = [label.context_stable_ids, label.value]
            fp.write(sep.join(text_type(field) for field in row) + u"\n")

In [26]:
# Name the export after the annotator and today's date (year.month.day).
ts = datetime.datetime.now()
params = ("gold", ts.year, ts.month, ts.day)
export_labels(
    session,
    "gold",
    "../data/annotations/clef.{}.{}.{}.{}.tsv".format(*params)
)

In [22]:
# Total number of stable labels stored, across all annotators. The count is
# done in the database instead of loading every StableLabel row just to take
# the length of the resulting list.
print(session.query(StableLabel).count())

462
