In [1]:
# Notebook-wide setup: autoreload for edited modules and inline matplotlib output.
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
from snorkel import SnorkelSession
# Single session object passed to every query and annotation load in the cells below.
session = SnorkelSession()

from snorkel.models import candidate_subclass

# Candidate type with one argument named 'disease'; the evaluation cells below
# read `c.disease` on the dev candidates.
Disease = candidate_subclass('Disease', ['disease'])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from snorkel.models import CandidateSet

train = session.query(CandidateSet).filter(CandidateSet.name == 'CDR Training Candidates').one()
print len(train)
dev = session.query(CandidateSet).filter(CandidateSet.name == 'CDR Development Candidates').one()
print len(dev)

28087
27896


In [3]:
from snorkel.annotations import LabelManager

label_manager = LabelManager()

L_gold_train = label_manager.load(session, train, "CDR Training Label Set")
print L_gold_train.shape
L_gold_dev = label_manager.load(session, dev, "CDR Development Label Set")
print L_gold_dev.shape

(28087, 1)
(27896, 1)


# Load `L_train` and `L_dev`

In [4]:
%%time
from snorkel.annotations import merge_annotations
from utils import binarize_LF_matrix, get_binarized_score

L_TRAIN_BLOCKS = ['1', '2', '3', '4.1', '4 T2', '4 N', '5', '6']
L_DEV_BLOCKS   = ['1', '2', '3.1', '4', '4 T2', '4 N', '5', '6']

L_train_blocks = [label_manager.load(session, train, 'LF Training Labels %s' % lfn) for lfn in L_TRAIN_BLOCKS]
L_dev_blocks   = [label_manager.load(session, dev, 'LF Development Labels %s' % lfn) for lfn in L_DEV_BLOCKS]

L_train   = merge_annotations(L_train_blocks)
L_train_b = binarize_LF_matrix(L_train)
L_dev     = merge_annotations(L_dev_blocks)
L_dev_b   = binarize_LF_matrix(L_dev)
print L_train.shape
print L_dev.shape

(28087, 1645)
(27896, 1557)
CPU times: user 53.1 s, sys: 1.21 s, total: 54.3 s
Wall time: 54 s


# Load `F_train` and `F_dev`

In [5]:
%%time
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

# Load the feature matrices for both candidate sets.
# NOTE(review): the dev matrix is also loaded under the 'Train Features' name;
# the recorded output shows both matrices with the same number of columns, so
# this looks like a shared feature key set rather than a typo — confirm.
F_train = feature_manager.load(session, train, 'Train Features')
F_dev   = feature_manager.load(session, dev, 'Train Features')
print F_train.shape
print F_dev.shape

(28087, 69885)
(27896, 69885)
CPU times: user 4min 49s, sys: 3.7 s, total: 4min 52s
Wall time: 4min 52s


# Load canonical & secondary dictionaries

In [6]:
# Load the canonical dictionary object (`cd`) used by the Setup 2.i / 2.ii cells.
# No earlier cell imports a bare `load`, so the import is made explicit here;
# the .pkl extension indicates a pickle file.
# NOTE: unpickling can execute arbitrary code — only load a cd.pkl from a trusted source.
try:
    import cPickle as pickle  # Python 2: C-accelerated implementation
except ImportError:
    import pickle

# `with` guarantees the file handle is closed once the object is loaded.
with open('cd.pkl', 'rb') as cd_file:
    cd = pickle.load(cd_file)

# Setup 1.iii: Using the generative model directly

### Using the multinomial version

In [None]:
from scipy.sparse import lil_matrix

# For every dev candidate with at least one non-zero LF entry, build:
#   * mn_map      label value -> column index (in order of first appearance)
#   * mn_inv_map  column index -> label value
#   * X           (M x number-of-distinct-labels) 0/1 matrix with X[j, k] = 1
#                 when LF column j emitted the label assigned to column k
# `nz_idxs` records which dev rows were kept, aligned with `Xs`.
N, M        = L_dev.shape
mn_maps     = []
mn_inv_maps = []
nz_idxs     = []
Xs          = []
for i in range(N):
    nz = L_dev.getrow(i).nonzero()[1]

    # Candidates with no LF labels are skipped entirely.
    if len(nz) == 0:
        continue
    nz_idxs.append(i)

    # Assign a column to each distinct label and remember which
    # (LF column, label column) cells must be switched on.
    mn_map, mn_inv_map = {}, []
    cells = []
    for j in nz:
        label = L_dev[i,j]
        if label not in mn_map:
            mn_map[label] = len(mn_map)
            mn_inv_map.append(label)
        cells.append((j, mn_map[label]))
    mn_maps.append(mn_map)
    mn_inv_maps.append(mn_inv_map)

    # Fill in the candidate's label matrix.
    X = np.zeros((M, len(mn_map)))
    for j, k in cells:
        X[j, k] = 1
    Xs.append(X)

In [None]:
from snorkel.learning.learning_mn import LogReg

# Train the multinomial model on the per-candidate matrices in `Xs`
# (built from L_dev), starting from a length-M vector of ones as `w0`,
# where M is the number of columns of L_dev.
gen_model = LogReg()
gen_model.train(Xs, n_iter=100, rate=1e-2, w0=np.ones(M))

In [None]:
# Marginals for each matrix in `Xs`; the evaluation cell below takes the argmax of
# each entry and maps it back through mn_inv_maps — presumably one value per
# label column of the corresponding X, TODO confirm against learning_mn.
marginals = gen_model.marginals(Xs)

In [None]:
predicted = 0
correct   = 0
for i,m in enumerate(marginals):
    cid = mn_inv_maps[i][np.argmax(m)]
    if cid > 0:
        predicted += 1
        if cid == L_gold_dev[nz_idxs][i,0]:
            correct += 1
            
prec   = correct / float(predicted)
recall = correct / float(N_pos_dev)

print "P:\t", prec
print "R:\t", recall
print "F1:\t", (2*prec*recall) / (prec+recall)

# Setup 2: DP LR Pre-filtering

In [None]:
# NOTE: this import rebinds `LogReg`, replacing the
# `snorkel.learning.learning_mn.LogReg` imported for Setup 1.iii above.
from snorkel.learning import LogReg, NaiveBayes

# Fit the NaiveBayes model on the binarized training LF matrix.
gen_model_b = NaiveBayes()
%time gen_model_b.train(L_train_b, n_iter=10000, rate=1e-1, verbose=False)

# Its marginals over the training candidates are the targets for the model below.
train_marginals = gen_model_b.marginals(L_train_b)

# Fit the discriminative model on the training features against those marginals.
disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=2000, rate=1e-3, mu=1e-6)

In [None]:
# Predict on the dev features and score against the dev gold labels.
# `b=0.5` is presumably the decision threshold — TODO confirm against snorkel.learning.
# `yp` is reused by the Setup 2.i and 2.ii evaluation cells below.
yp = disc_model.predict(F_dev, b=0.5)
get_binarized_score(yp, L_gold_dev)

# Setup 2.i: Exact Match

In [None]:
correct   = 0
predicted = 0
total_pos = 0
for i,c in enumerate(dev):
    if L_gold_dev[i,0] > 0:
        total_pos += 1
    
    # Check for exact *positive* matches to the canonical dictionary
    if yp[i] > 0:
        p    = c.disease.get_span().lower()
        sids = cd.term_to_sids[p]
        cids = set([cd.sid_to_cid[sid] for sid in sids if sid in cd.sid_to_cid])
        if len(cids) > 0:
            predicted += 1
            cid = list(cids)[0]
            if cid == L_gold_dev[i,0]:
                correct += 1

prec   = correct / float(predicted)
recall = correct / float(total_pos)

print "P:\t", prec
print "R:\t", recall
print "F1:\t", (2*prec*recall) / (prec+recall)

# Setup 2.ii: TF-IDF

In [None]:
%%time
from entity_norm import CanonDictVectorizer 

# Build a vectorizer from the canonical dictionary's term -> SID mapping,
# with no additional phrases.
cd_vectorizer = CanonDictVectorizer(cd.term_to_sids, other_phrases=[])

# Vectorize the dictionary's `pos_terms` and keep the transpose as well;
# the transposed matrix is the cell's displayed value.
D_pos   = cd_vectorizer.vectorize_phrases(cd.pos_terms)
D_pos_t = D_pos.T
D_pos_t

In [None]:
# NOTE(review): `get_pos_vector_matches` is not defined or imported in any cell
# above — confirm where it comes from before running this on a fresh kernel.
# The evaluation cell below indexes `best_match` by candidate id and unpacks
# each value into two items, the second being compared against the gold label.
%time best_match = get_pos_vector_matches(dev)

In [None]:
correct   = 0
predicted = 0
for i,c in enumerate(dev):
    if yp[i] > 0:
        s, cid = best_match[c.id]
        predicted += 1
        if cid == L_gold_dev[i,0]:
            correct += 1
    
prec   = correct / float(predicted)
recall = correct / float(N_pos_dev)
f1     = (2*prec*recall) / (prec+recall)

print "P:\t", prec
print "R:\t", recall
print "F1:\t", f1

# Setup 2.iii: Using the generative model directly