# Learning to Extract Pain Outcomes from Clinical Text without Labeled Data
## II: Discriminative Model

We show two standard discriminative models:

- Bidirectional Long Short Term Memory (LSTM)
- Sparse Logistic Regression


In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import sys

import numba
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

from snorkel import SnorkelSession
from snorkel.learning.disc_models.rnn import *
from snorkel.annotations import LabelAnnotator
from snorkel.annotations import load_gold_labels
from snorkel.models import candidate_subclass, Document, Sentence, Candidate, Span
from snorkel.learning import GenerativeModel
from rwe.extractlib.labelers import *

In [3]:
# Open the Snorkel database session used by every query below.
session = SnorkelSession()

# Declare the (pain, anatomy) candidate type. The declaration is best-effort:
# presumably it raises when the cell is re-run in a live kernel and the
# subclass already exists (TODO confirm). Only ordinary exceptions are
# tolerated, so Ctrl-C / SystemExit are no longer swallowed.
try:
    PainLocation = candidate_subclass('PainLocation', ['pain', 'anatomy'])
except Exception:
    pass

# Relation helper that exposes the labeling functions via `.lfs`.
relation = PainLocationRelation(dict_root="../data/")
print("Labeling Functions n={}".format(len(relation.lfs)))

Labeling Functions n=24


### Load Candidates and Gold Labels

In [4]:
# Candidates per split, ordered by id. Split ids follow the variable names:
# 0 -> train, 1 -> dev, 2 -> test.
train_cands, dev_cands, test_cands = [
    session.query(Candidate)
           .filter(Candidate.split == split_id)
           .order_by(Candidate.id)
           .all()
    for split_id in (0, 1, 2)
]

# Gold label matrices written by the 'gold' annotator, one per split.
L_gold_train, L_gold_dev, L_gold_test = [
    load_gold_labels(session, split=split_id, annotator_name='gold')
    for split_id in (0, 1, 2)
]

# Report the `.size` of each gold matrix.
for tag, gold in (("Gold [TRAIN]", L_gold_train),
                  ("Gold [DEV]  ", L_gold_dev),
                  ("Gold [TEST] ", L_gold_test)):
    print("{} {}".format(tag, gold.size))

Gold [TRAIN] 224
Gold [DEV]   63
Gold [TEST]  165


In [5]:
# Load every Document row from the database.
# NOTE(review): `documents` is not referenced again in the visible cells.
documents = session.query(Document).all()

In [6]:
# Number of candidates in the train, dev and test splits (in that order).
for split_cands in (train_cands, dev_cands, test_cands):
    print(len(split_cands))

# One flat list holding all candidates: train first, then dev, then test.
candidates = list(train_cands)
candidates.extend(dev_cands)
candidates.extend(test_cands)

225
63
168


In [18]:
from snorkel.annotations import LabelAnnotator
from rwe.extractlib.relations.anatomy_pain import *

# Labeling functions for the anatomy/pain relation over all candidates.
relations = AnatomyPainRelation(candidates, data_root="../data/")
lfs = relations.lfs

# Load the stored label matrices for the train (0) and dev (1) splits.
labeler = LabelAnnotator(lfs=lfs)
L_train, L_dev = [labeler.load_matrix(session, split=split_id) for split_id in (0, 1)]

for label_matrix in (L_train, L_dev):
    print(label_matrix.shape)

(225, 28)
(63, 28)


## Sparse Logistic Regression

### Create Features
This uses a standard NLP feature generation library using lemmatization, POS tags, and sentence dependency parsing to generate candidate features.

In [10]:
from snorkel.annotations import FeatureAnnotator
from rwe.extractlib.features import hybrid_span_mention_ftrs

featurizer = FeatureAnnotator(hybrid_span_mention_ftrs)

# Read whatever feature matrices are already stored. All three loads happen
# before any of them is rebuilt.
F_train, F_dev, F_test = [
    featurizer.load_matrix(session, split=split_id) for split_id in (0, 1, 2)
]

# Rebuild only the matrices that came back empty. Train uses `apply`; dev and
# test use `apply_existing` (presumably restricted to the features already
# generated for train -- TODO confirm against the Snorkel docs).
if F_train.size == 0:
    F_train = featurizer.apply(split=0)
if F_dev.size == 0:
    F_dev = featurizer.apply_existing(split=1)
if F_test.size == 0:
    F_test = featurizer.apply_existing(split=2)

Clearing existing...
Running UDF...

Clearing existing...
Running UDF...

Clearing existing...
Running UDF...



## Supervised Labels

In [11]:
from sklearn.metrics import f1_score,recall_score,precision_score

In [14]:
from snorkel.annotations import load_gold_labels
from snorkel.learning.utils import MentionScorer
from snorkel.learning import RandomSearch, ListParameter, RangeParameter

# Reload the dev-split gold labels (same annotator and split as the load in the
# "Load Candidates and Gold Labels" section).
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)

## Train Discriminative Model

In [15]:
from snorkel.learning import SparseLogisticRegression, LogisticRegression
# Discriminative model trained in this section; the dense LogisticRegression
# variant is kept below as a commented-out alternative.
disc_model = SparseLogisticRegression()
#disc_model = LogisticRegression()

In [None]:

use_supervised = False

# NOTE(review): `train_marginals` and `gold_train_marginals` were read here but
# never assigned in this notebook, so the cell raised NameError after
# Restart & Run All. Both are reconstructed below from objects loaded earlier;
# confirm the training settings against the generative-model notebook (part I).

# Probabilistic training labels: fit the generative model on the training label
# matrix and read back its marginals for that same matrix.
gen_model = GenerativeModel()
gen_model.train(L_train)
train_marginals = gen_model.marginals(L_train)

# Supervised alternative: rescale gold labels from -1/+1 to 0/1. Candidates
# with no stored gold label (0 in the matrix) land on 0.5.
gold_train_marginals = (L_gold_train.toarray().ravel() + 1) / 2.0

t_marginals = gold_train_marginals if use_supervised else train_marginals

# Search space: learning rate and L1/L2 penalties, each on a log10 grid from
# 1e-6 to 1e-2.
rate_param = RangeParameter('lr', 1e-6, 1e-2, step=1, log_base=10)
l1_param   = RangeParameter('l1_penalty', 1e-6, 1e-2, step=1, log_base=10)
l2_param   = RangeParameter('l2_penalty', 1e-6, 1e-2, step=1, log_base=10)

param_grid = [rate_param, l1_param, l2_param]

# Random search over the grid (n=5), training `disc_model` on the training
# features against the selected marginals.
searcher = RandomSearch(session, disc_model, F_train, t_marginals,
                        param_grid, n=5)

In [None]:
# Seed NumPy's global RNG, then run the search, scoring each configuration on
# the dev features against the dev gold labels.
# NOTE(review): the seed is set after the RandomSearch object was constructed;
# if configurations are sampled at construction time it has no effect on them.
np.random.seed(123456)
searcher.fit(F_dev, L_gold_dev, n_epochs=2000, rebalance=0.5, print_freq=250)

In [None]:
from snorkel.models import GoldLabel, GoldLabelKey, Label, LabelKey, Feature, FeatureKey, Candidate

In [None]:
# Compare the logistic-regression model with the test gold labels; the four
# returned values are unpacked as tp / fp / tn / fn.
# b=0.5 is presumably the decision threshold -- TODO confirm.
tp, fp, tn, fn = disc_model.error_analysis(session, F_test, L_gold_test, b=0.5)

In [None]:
# Histogram of the logistic-regression marginals on the test split
# (20 bins over [0, 1]). Needs `plt` and `pd` from the import cell.
m = disc_model.marginals(F_test)
fig, ax = plt.subplots()
df = pd.DataFrame(data=m, columns=['marginals'])
df.hist(range=(0.0, 1.0), bins=20, ax=ax)
# Label the figure so it can be read on its own.
ax.set_title("Sparse logistic regression: test marginals")
ax.set_xlabel("marginal")
ax.set_ylabel("number of candidates");

In [None]:
def dump_marginals(cands, marginals):
    """Render candidates and their marginals as a tab-separated table.

    The first line is the header ``DOC_NAME, CID, SID, PAIN, ANATOMY,
    MARGINAL``; each following line describes one candidate: the name of the
    document of its parent, the candidate id, the parent id, the pain span
    text, the anatomy span text and ``marginals[i]`` for the candidate's
    position ``i`` in ``cands``.

    Returns the table as a single newline-joined string (no trailing newline).
    """
    header = ["DOC_NAME", "CID", "SID", "PAIN", "ANATOMY", "MARGINAL"]
    lines = ["\t".join(header)]
    for idx, cand in enumerate(cands):
        fields = [
            cand.get_parent().document.name,
            cand.id,
            cand.get_parent().id,
            cand.pain.get_span(),
            cand.anatomy.get_span(),
            marginals[idx],
        ]
        lines.append("\t".join(str(field) for field in fields))
    return "\n".join(lines)

# Marginals of the logistic-regression model on the test features; the TSV
# export call is left disabled.
test_marginals = disc_model.marginals(F_test)
#dump_marginals(test_cands, test_marginals)


In [None]:
# Labeling-function statistics on the dev split, using the dev gold labels
# flattened to a 1-D array. The train-split variant is kept commented out.
#L_train.lf_stats(session, labels=L_gold_train.toarray().ravel())
L_dev.lf_stats(session, labels=L_gold_dev.toarray().ravel())

## LSTM

Long Short Term Memory (LSTM) models can achieve state-of-the-art performance on many text classification tasks. We'll train a simple bidirectional LSTM model below.

In deep learning, hyperparameter tuning is a very important and computationally expensive step in training models. For purposes of this tutorial, we've pre-selected some settings so that you can train a model in under 10 minutes. Advanced users can look at our Grid Search Tutorial for more details on choosing these parameters.

In [None]:
# Threshold the training marginals at 0.5: 1 when strictly above, otherwise 0.
hard_train_marginals = np.array([1 if marginal > 0.5 else 0 for marginal in train_marginals])

In [None]:
# Show the thresholded 0/1 training labels.
hard_train_marginals

In [None]:
# Seed NumPy's global RNG before setting up the LSTM search.
np.random.seed(0)

def get_max_seq_len(cands):
    """Return the largest sentence length, in words, among ``cands``.

    For every candidate ``c`` the length is ``len(c[0].sentence.words)``, i.e.
    the sentence of the candidate's first element. The result is also printed
    as ``max seq len <n>``. An empty ``cands`` gives 0.
    """
    longest = max([0] + [len(c[0].sentence.words) for c in cands])
    # Single-argument print call: same output on Python 2 and Python 3.
    print("max seq len {}".format(longest))
    return longest

# LSTM search space.
# attn_window is pinned to 0; get_max_seq_len(dev_cands) is noted as an
# alternative value but is not used.
attn_window      = ListParameter('attn_window', [0]) # get_max_seq_len(dev_cands)
batch_size_param = ListParameter('batch_size', [32, 64])
# NOTE: `rate_param`, `param_grid` and `searcher` reuse the names from the
# logistic-regression search above and overwrite them.
rate_param       = RangeParameter('lr', 1e-4, 1e-2, step=1, log_base=10)
dropout_param    = RangeParameter('dropout', 0.0, 0.5, step=0.25)
dim_param        = ListParameter('dim', [50, 100])

param_grid = [attn_window, rate_param, dropout_param, dim_param, batch_size_param]

# The LSTM trains directly on the candidate objects (not on a feature matrix)
# against the probabilistic training labels in `train_marginals`.
lstm = reRNN()
searcher = RandomSearch(session, lstm, train_cands, train_marginals, param_grid, n=5)

In [None]:
# Run the LSTM search, scoring on the dev candidates against the dev gold
# labels. Unlike the logistic-regression search, rebalance is 0.0 here.
searcher.fit(dev_cands, L_gold_dev, n_epochs=400, rebalance=0.0, print_freq=25)

In [None]:
# Histogram of the LSTM marginals on the test candidates (20 bins over [0, 1]).
# Needs `plt` and `pd` from the import cell.
m = lstm.marginals(test_cands)
fig, ax = plt.subplots()
df = pd.DataFrame(data=m, columns=['marginals'])
df.hist(range=(0.0, 1.0), bins=20, ax=ax)
# Label the figure so it can be read on its own.
ax.set_title("LSTM: test marginals")
ax.set_xlabel("marginal")
ax.set_ylabel("number of candidates");

In [None]:
# Compare the LSTM with the test gold labels; this overwrites the
# tp / fp / tn / fn produced by the logistic-regression analysis.
# b=0.5 is presumably the decision threshold -- TODO confirm.
tp, fp, tn, fn = lstm.error_analysis(session, test_cands, L_gold_test, b=0.5)

In [None]:
# Share of positive (+1) labels among the +1 / -1 gold labels of the test split.
gold_test_rows = list(L_gold_test)
pos = gold_test_rows.count(1)
neg = gold_test_rows.count(-1)

print(pos / (float(neg) + float(pos)))

import sys
def save_model(model, out_dir):
    """Save ``model`` to ``<out_dir>/model``, creating ``out_dir`` if needed.

    model   -- object exposing ``save(path)``.
    out_dir -- target directory; missing parent directories are created too.

    If ``out_dir`` already exists a warning is written to stderr and the model
    is saved anyway.
    """
    if os.path.exists(out_dir):
        # Same message and stream as before, written portably for Python 2/3.
        sys.stderr.write("warning, model already exists\n")
    else:
        # makedirs (not mkdir) so a missing parent directory is not an error.
        os.makedirs(out_dir)

    # os.path.join avoids the doubled separator when out_dir ends with "/".
    model.save(os.path.join(out_dir, "model"))

# Output directory for the trained LSTM. Set the LSTM_MODEL_DIR environment
# variable to override it; the default is the original hard-coded location.
LSTM_MODEL_DIR = os.environ.get("LSTM_MODEL_DIR", "/users/fries/desktop/foobar/")
save_model(lstm, LSTM_MODEL_DIR)

In [None]:
# Histogram of the LSTM marginals on the test candidates, re-plotted after the
# model was saved (20 bins over [0, 1]). Needs `plt` and `pd` from the import
# cell.
m = lstm.marginals(test_cands)
fig, ax = plt.subplots()
df = pd.DataFrame(data=m, columns=['marginals'])
df.hist(range=(0.0, 1.0), bins=20, ax=ax)
# Label the figure so it can be read on its own.
ax.set_title("LSTM: test marginals (after save)")
ax.set_xlabel("marginal")
ax.set_ylabel("number of candidates");

## Evaluation

In [None]:
# Reload the test-split gold labels. Every other load_gold_labels call in this
# notebook selects the split with `split=`; the former `cand_gen=2` keyword did
# not match them.
L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)

In [None]:
# Final evaluation of the logistic-regression model on the test split. Same
# call as the earlier test-split analysis but without an explicit `b`; it
# overwrites tp / fp / tn / fn once more.
tp, fp, tn, fn = disc_model.error_analysis(session, F_test, L_gold_test)