In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass

Disease = candidate_subclass('Disease', ['disease'])

In [2]:
from snorkel.models import CandidateSet

train = session.query(CandidateSet).filter(CandidateSet.name == 'CDR Training Candidates').one()
print len(train)
dev = session.query(CandidateSet).filter(CandidateSet.name == 'CDR Development Candidates').one()
print len(dev)

28087
27896


In [3]:
from snorkel.annotations import LabelManager

label_manager = LabelManager()

L_gold_train = label_manager.load(session, train, "CDR Training Label Set")
print L_gold_train.shape
L_gold_dev = label_manager.load(session, dev, "CDR Development Label Set")
print L_gold_dev.shape

(28087, 1)
(27896, 1)


# Load `L_train` and `L_dev`

In [4]:
%%time
from snorkel.annotations import merge_annotations
from utils import binarize_LF_matrix, get_binarized_score

L_TRAIN_BLOCKS = ['1', '2', '3', '4.1', '4 T2', '4 N', '5', '6']
L_DEV_BLOCKS   = ['1', '2', '3.1', '4', '4 T2', '4 N', '5', '6']

L_train_blocks = [label_manager.load(session, train, 'LF Training Labels %s' % lfn) for lfn in L_TRAIN_BLOCKS]
L_dev_blocks   = [label_manager.load(session, dev, 'LF Development Labels %s' % lfn) for lfn in L_DEV_BLOCKS]

L_train   = merge_annotations(L_train_blocks)
L_train_b = binarize_LF_matrix(L_train)
L_dev     = merge_annotations(L_dev_blocks)
L_dev_b   = binarize_LF_matrix(L_dev)
print L_train.shape
print L_dev.shape

(28087, 1645)
(27896, 1557)
CPU times: user 20.5 s, sys: 740 ms, total: 21.2 s
Wall time: 21.1 s


# Load `F_train` and `F_dev`

In [5]:
%%time
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

F_train = feature_manager.load(session, train, 'Train Features')
F_dev   = feature_manager.load(session, dev, 'Train Features')
print F_train.shape
print F_dev.shape

(28087, 69885)
(27896, 69885)
CPU times: user 32.6 s, sys: 1.55 s, total: 34.1 s
Wall time: 33.7 s


# Load canonical & secondary dictionaries

In [7]:
from cPickle import load
cd = load(open('cd.pkl', 'rb'))

# Setup 2: DP LR Pre-filtering

In [8]:
from snorkel.learning import LogReg, NaiveBayes

gen_model_b = NaiveBayes()
%time gen_model_b.train(L_train_b, n_iter=10000, rate=1e-1, verbose=False)

train_marginals = gen_model_b.marginals(L_train_b)

disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=2000, rate=1e-3, mu=1e-6)

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



Training marginals (!= 0.5):	28087
Features:			1645
CPU times: user 28 s, sys: 12 ms, total: 28.1 s
Wall time: 28.1 s
Training marginals (!= 0.5):	20038
Features:			69885
Using gradient descent...
	Learning epoch = 0	Step size = 0.001
	Loss = 13889.283204	Gradient magnitude = 9832.035530
	Learning epoch = 100	Step size = 0.000904792147114
	Loss = 3078.241784	Gradient magnitude = 379.275662
	Learning epoch = 200	Step size = 0.000818648829479
	Loss = 2894.743086	Gradient magnitude = 1296.907839
	Learning epoch = 300	Step size = 0.000740707032156
	Loss = 2494.032915	Gradient magnitude = 297.457588
	Learning epoch = 400	Step size = 0.000670185906007
	Loss = 2394.454863	Gradient magnitude = 31.977057
	Learning epoch = 500	Step size = 0.000606378944861
	Loss = 2337.136536	Gradient magnitude = 28.324998
	Learning epoch = 600	Step size = 0.000548646907485
	Loss = 2295.414531	Gradient magnitude = 25.754725
	Learning epoch = 700	Step size = 0.000496411413431
	Loss = 2263.718238	Gradient magnitud

In [19]:
yp = disc_model.predict(F_dev, b=0.5)
get_binarized_score(yp, L_gold_dev)

P :	0.838600797791
R :	0.687374245473
F1:	0.755494125777


# Setup 2.i: Exact Match

In [10]:
correct   = 0
predicted = 0
total_pos = 0
for i,c in enumerate(dev):
    if L_gold_dev[i,0] > 0:
        total_pos += 1
    
    # Check for exact *positive* matches to the canonical dictionary
    if yp[i] > 0:
        p    = c.disease.get_span().lower()
        sids = cd.term_to_sids[p]
        cids = set([cd.sid_to_cid[sid] for sid in sids if sid in cd.sid_to_cid])
        if len(cids) > 0:
            predicted += 1
            cid = list(cids)[0]
            if cid == L_gold_dev[i,0]:
                correct += 1

prec   = correct / float(predicted)
recall = correct / float(total_pos)

print "P:\t", prec
print "R:\t", recall
print "F1:\t", (2*prec*recall) / (prec+recall)

P:	0.880952380952
R:	0.279175050302
F1:	0.423987776929


# Setup 2.ii: TF-IDF

In [11]:
%%time
from entity_norm import CanonDictVectorizer 

# Create a vectorizer based around this 
cd_vectorizer = CanonDictVectorizer(cd.term_to_sids, other_phrases=[])

# Vectorize the dictionary
D_pos   = cd_vectorizer.vectorize_phrases(cd.pos_terms)
D_pos_t = D_pos.T
D_pos_t

|V| = 271630
CPU times: user 40.7 s, sys: 864 ms, total: 41.6 s
Wall time: 41 s


In [14]:
from collections import defaultdict

def get_pos_vector_matches(candidates):
    best_match = defaultdict(lambda : (0.0, -1))
    for c in candidates:
        p  = c.disease.get_span().lower()
        cx = cd_vectorizer.vectorize_phrases([p])
        m  = cx * D_pos_t
        m  = m.tocoo()
        for i, s in enumerate(m.data):
            j    = m.col[i]
            t    = cd.pos_terms[j]
            sids = cd.term_to_sids[t]
            cid  = list(set([cd.sid_to_cid[sid] for sid in sids if sid in cd.sid_to_cid]))[0]
            if s > best_match[c.id][0]:
                best_match[c.id] = (s, cid)
    return best_match

In [15]:
%time best_match = get_pos_vector_matches(dev)

CPU times: user 3min 2s, sys: 2.52 s, total: 3min 4s
Wall time: 3min 4s


In [17]:
correct   = 0
predicted = 0
total_pos = 0
for i,c in enumerate(dev):
    if L_gold_dev[i,0] > 0:
        total_pos += 1
        
    if yp[i] > 0:
        s, cid = best_match[c.id]
        predicted += 1
        if cid == L_gold_dev[i,0]:
            correct += 1
    
prec   = correct / float(predicted)
recall = correct / float(total_pos)
f1     = (2*prec*recall) / (prec+recall)

print "P:\t", prec
print "R:\t", recall
print "F1:\t", f1

P:	0.67873580853
R:	0.556338028169
F1:	0.611472011057


# Setup 2.iii: Using the Gen. Model directly

Only consider the points predicted positive by the discriminative model:

In [20]:
from scipy.sparse import lil_matrix

N, M        = L_dev.shape
mn_maps     = []
mn_inv_maps = []
nz_idxs     = []
Xs          = []
for i in range(N):
    if yp[i] > 0:
        nz = L_dev.getrow(i).nonzero()[1]
        if len(nz) > 0:
            nz_idxs.append(i)
    
            # Construct the map from CID -> column index, and reverse
            mn_map     = {}
            mn_inv_map = []
            for j in nz:
                label = L_dev[i,j]
                if label not in mn_map:
                    mn_map[label] = len(mn_map)
                    mn_inv_map.append(label)
            mn_maps.append(mn_map)
            mn_inv_maps.append(mn_inv_map)
    
            # Construct the candidate label matrix
            X = np.zeros((M, len(mn_map)))
            for j in nz:
                k = mn_map[L_dev[i,j]]
                X[j, k] = 1
            Xs.append(X)

In [21]:
from snorkel.learning.learning_mn import LogReg

gen_model = LogReg()
gen_model.train(Xs, n_iter=100, rate=1e-2, w0=np.ones(M))

Begin training for rate=0.01, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.251152
Final gradient magnitude for rate=0.01, mu=1e-06: 0.275


In [23]:
N_pos_dev = sum([1 for i in range(L_gold_dev.shape[0]) if L_gold_dev[i,0] > 0])
marginals = gen_model.marginals(Xs)

predicted = 0
correct   = 0
total_pos = 0
for i,m in enumerate(marginals):
    cid = mn_inv_maps[i][np.argmax(m)]
    if cid > 0:
        predicted += 1
        if cid == L_gold_dev[nz_idxs][i,0]:
            correct += 1
            
prec   = correct / float(predicted)
recall = correct / float(N_pos_dev)

print "P:\t", prec
print "R:\t", recall
print "F1:\t", (2*prec*recall) / (prec+recall)

P:	0.7896
R:	0.496478873239
F1:	0.609635577517


# Setup 2.iv: SSI

### Step 1:

We used the discriminative model, trained using the training_marginals from the binary gen model over the training set, to filter both the training and test sets to predicted positives

In [26]:
yp_d_train = disc_model.predict(F_train, b=0.5)
get_binarized_score(yp_d_train, L_gold_train)

P :	0.845507246377
R :	0.753746770026
F1:	0.796994535519


In [27]:
yp_d_dev = disc_model.predict(F_dev, b=0.5)
get_binarized_score(yp_d_dev, L_gold_dev)

P :	0.838600797791
R :	0.687374245473
F1:	0.755494125777


### Step 2:

We construct a vectorizer for positive disease terms, and vectorize the canonical dictionary, training and test sets:

In [44]:
%%time
from entity_norm import CanonDictVectorizer 

pos_term_to_sids = dict((t, sids) for t, sids in cd.term_to_sids.iteritems() if len(sids) > 0 and any([sid in cd.sid_to_cid for sid in sids]))

# Create a vectorizer *only* based on positive terms
cd_vectorizer = CanonDictVectorizer(pos_term_to_sids, other_phrases=[])

# Vectorize the dictionary
D_pos   = cd_vectorizer.vectorize_phrases(cd.pos_terms)
D_pos_t = D_pos.T
D_pos_t

|V| = 19468
CPU times: user 9.86 s, sys: 380 ms, total: 10.2 s
Wall time: 9.94 s


In [45]:
%%time
# Vectorize the train & dev candidates that were predicted positive by the stage I LR model
X_train = cd_vectorizer.vectorize_phrases([c.disease.get_span().lower() for i,c in enumerate(train) if yp_d_train[i] > 0])
X_dev   = cd_vectorizer.vectorize_phrases([c.disease.get_span().lower() for i,c in enumerate(dev) if yp_d_dev[i] > 0])
print X_train.shape
print X_dev.shape

(3450, 19469)
(3259, 19469)
CPU times: user 24.8 s, sys: 476 ms, total: 25.3 s
Wall time: 25.2 s


### Step 3:

We run the multinomial generative model over the training set

In [47]:
from scipy.sparse import lil_matrix

N, M        = L_train.shape
mn_maps     = []
mn_inv_maps = []
nz_idxs     = []
Xs          = []
for i in range(N):
    if yp_d_train[i] > 0:
        nz = L_train.getrow(i).nonzero()[1]
        if len(nz) > 0:
            nz_idxs.append(i)
    
            # Construct the map from CID -> column index, and reverse
            mn_map     = {}
            mn_inv_map = []
            for j in nz:
                label = L_train[i,j]
                if label not in mn_map:
                    mn_map[label] = len(mn_map)
                    mn_inv_map.append(label)
            mn_maps.append(mn_map)
            mn_inv_maps.append(mn_inv_map)
    
            # Construct the candidate label matrix
            X = np.zeros((M, len(mn_map)))
            for j in nz:
                k = mn_map[L_train[i,j]]
                X[j, k] = 1
            Xs.append(X)

In [48]:
from snorkel.learning.learning_mn import LogReg

gen_model = LogReg()
gen_model.train(Xs, n_iter=100, rate=1e-2, w0=np.ones(M))

Begin training for rate=0.01, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.236760
Final gradient magnitude for rate=0.01, mu=1e-06: 0.258


In [49]:
N_pos_train = sum([1 for i in range(L_gold_train.shape[0]) if L_gold_train[i,0] > 0])
train_marginals = gen_model.marginals(Xs)

predicted = 0
correct   = 0
total_pos = 0
for i,m in enumerate(train_marginals):
    cid = mn_inv_maps[i][np.argmax(m)]
    if cid > 0:
        predicted += 1
        if cid == L_gold_train[nz_idxs][i,0]:
            correct += 1
            
prec   = correct / float(predicted)
recall = correct / float(N_pos_train)

print "P:\t", prec
print "R:\t", recall
print "F1:\t", (2*prec*recall) / (prec+recall)

P:	0.746175243394
R:	0.554521963824
F1:	0.636228876371


## TODO: Why such a big gap between len(tm) and N???

In [52]:
len(train_marginals)

2896

### Step 4:

We construct the training marginals as sparse matrix

In [51]:
train_marginals[0]

array([ 0.88301307,  0.11698693])

In [54]:
mn_inv_maps[0]

[61.0, 3163.0]

In [56]:
L_gold_train[nz_idxs][0,0]

61.0

In [70]:
Np = len(train_marginals)
K  = len(set(cd.sid_to_cid.values()))
Y  = lil_matrix((Np, K))

for i in range(Np):
    for j in range(len(train_marginals[i])):
        Y[i, int(mn_inv_maps[i][j])] = train_marginals[i][j]
Y = Y.tocsr()
Y

<2896x4790 sparse matrix of type '<type 'numpy.float64'>'
	with 4556 stored elements in Compressed Sparse Row format>

### Step 5:

We train the SSI model using the training marginals

In [71]:
D_pos

<63794x19469 sparse matrix of type '<type 'numpy.float64'>'
	with 211070 stored elements in Compressed Sparse Row format>

In [75]:
cids = [cd.sid_to_cid[list(sid for sid in cd.term_to_sids[t] if sid in cd.sid_to_cid)[0]] for t in cd.pos_terms]

In [76]:
from entity_norm import SSIModel

model = SSIModel(D_pos, cids)

In [78]:
X_train

<3450x19469 sparse matrix of type '<type 'numpy.float64'>'
	with 7396 stored elements in Compressed Sparse Row format>

In [77]:
model.train(X_train, Y)

Building close negatives dictionary...
Iteration: 0

IndexError: index out of bounds: 0 <= 2938 <= 2896, 0 <= 2939 <= 2896, 2938 <= 2939