# Disease Norm

In this example, we'll write an application to extract *mentions of* diseases from PubMed abstracts, using annotations from the [BioCreative CDR Challenge](http://www.biocreative.org/resources/corpora/biocreative-v-cdr-corpus/).  This tutorial, which has 5 parts, walks through the process of constructing a model to classify _candidate_ disease mentions as either true (i.e., the candidate really is a mention of a disease) or false.

# Loading Candidates + Annotations

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass

Disease = candidate_subclass('Disease', ['disease'])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from snorkel.models import CandidateSet

train = session.query(CandidateSet).filter(CandidateSet.name == 'CDR Training Candidates').one()
print len(train)
dev = session.query(CandidateSet).filter(CandidateSet.name == 'CDR Development Candidates').one()
print len(dev)

28087
27896


In [3]:
from snorkel.annotations import LabelManager

label_manager = LabelManager()

L_gold_train = label_manager.load(session, train, "CDR Training Label Set")
print L_gold_train.shape
L_gold_dev = label_manager.load(session, dev, "CDR Development Label Set")
print L_gold_dev.shape

(28087, 1)
(27896, 1)


# Process / Load Features

In [4]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

Process:

In [5]:
%time F_train = feature_manager.create(session, train, 'Train Features')


Loading sparse Feature matrix...
CPU times: user 17min 41s, sys: 21.1 s, total: 18min 2s
Wall time: 17min 49s


In [6]:
%time F_dev = feature_manager.update(session, dev, 'Train Features', False)


Loading sparse Feature matrix...
CPU times: user 12min 41s, sys: 22.3 s, total: 13min 3s
Wall time: 12min 50s


Or, load if already processed:

In [None]:
%time F_train = feature_manager.load(session, train, 'Train Features')

In [None]:
%time F_dev = feature_manager.load(session, dev, 'Train Features')

# Load dictionaries

### Load the MESH ID -> CID mapping

In [7]:
from cPickle import load

# Load the pickled MESH ID -> CID mapping.  The context manager closes the file
# handle as soon as unpickling finishes instead of leaving it open.
# NOTE(review): unpickling can execute arbitrary code -- only load a trusted file.
with open('MESH_to_CID.pkl', 'rb') as f:
    MESH_to_CID = load(f)

### Create a canonical dictionary (CD)

In [8]:
from utils import CanonicalDictionary
cd = CanonicalDictionary(MESH_to_CID)

### Add MESH to CD

In [9]:
# Load MESH
from utils import load_mesh_raw
mesh_entries = load_mesh_raw('data/desc2017.xml')

Loaded 28472 entries


In [10]:
# Register every MESH term in the canonical dictionary under its MESH ID
for mesh_id, tree_numbers, names in mesh_entries:
    # Split each tree number into [first character, dot-separated codes...]
    split_paths = []
    for number in tree_numbers:
        split_paths.append([number[0]] + number[1:].split('.'))
    for name in names:
        cd.add_term(name, mesh_id, tree_paths=split_paths)

len(cd.term_to_sids)

152581

### Add MEDIC to CD

Custom CTD diseases dictionary made from MESH category C + OMIM

In [11]:
from utils import load_MEDIC, load_mesh_raw
medic_entries, MEDIC_to_CID = load_MEDIC()

Loaded 11864 MEDIC entries


In [12]:
# Add MEDIC to cd
for medic in medic_entries:
    # Use the entry's own ID when it is a MESH ID, otherwise fall back to its
    # first parent ID (if that one is a MESH ID); anything else is an error.
    if medic.id.startswith("MESH"):
        source_id = medic.id
    elif len(medic.parent_ids) > 0 and medic.parent_ids[0].startswith("MESH"):
        source_id = medic.parent_ids[0]
    else:
        raise KeyError(medic)
    mesh_id = source_id.split(":")[1]

    # Keep only the part of each tree number before the first "/", then split
    # it into [first character, dot-separated codes...]
    heads = [tree_num.split("/")[0] for tree_num in medic.tree_nums]
    tree_paths = [[head[0]] + head[1:].split('.') for head in heads]

    # The primary name and every synonym map to the same MESH ID and paths
    for name in [medic.name] + medic.synonyms:
        cd.add_term(name, mesh_id, tree_paths=tree_paths)
        
len(cd.term_to_sids)

180265

### Add UMLS to CD

Note: this file (provided by Jason) may or may not cover all of the UMLS.

In [13]:
# Each row of the TSV holds three tab-separated fields: term, CUI, MESH ID.
# Only the term and the MESH ID are used, and no tree paths are supplied.
with open('cui2mesh.tsv', 'rb') as tsv:
    for row in tsv:
        fields = row.rstrip('\n').split('\t')
        term, _cui, mesh_id = fields
        cd.add_term(term, mesh_id)

len(cd.term_to_sids)

805576

# Writing some multinomial LFs

# NOTE: Beware of LF rollback bug!!!

## TYPE I LF: Subsets of MESH dictionary

In [14]:
from collections import defaultdict

# Maps an LF key (tree-path code) to the set of candidate IDs already labeled
# under that key.  The LF generators below share it so that a relaxed version of
# an LF never fires on a candidate a stricter version has already labeled.
seen_global = defaultdict(set)

## MESH exact match

In [15]:
def LFG_CD_match(c, p, key_mod=None):
    """
    Generate (LF key, label) pairs for candidate c from the canonical dictionary.

    The phrase p (some transformation of the candidate's text) is looked up in
    cd.term_to_sids.  Every tree path of every matching source ID produces one
    LF key: the path cut to POS_DEPTH levels when the source ID has a positive
    CID, or to NEG_DEPTH levels otherwise, joined with "-".  The label is the
    CID, or -1 when the source ID has no entry in cd.sid_to_cid.

    Each (key, candidate) pair is emitted at most once: it is recorded in
    seen_global *before* the optional key_mod suffix is appended, so relaxations
    of an LF do not overlap with each other on any given candidate.
    """
    if p not in cd.term_to_sids:
        return

    for source_id in cd.term_to_sids[p]:
        label = -1
        if source_id in cd.sid_to_cid:
            label = cd.sid_to_cid[source_id]

        for tree_path in cd.tree_paths[source_id]:
            # A path shorter than the cut depth (a node high up in the tree,
            # e.g. 'cancer', 'ischemia') is simply used whole.
            depth = POS_DEPTH if label > 0 else NEG_DEPTH
            base_key = "-".join(tree_path[:depth])

            # Skip keys that already labeled this candidate
            labeled_ids = seen_global[base_key]
            if c.id in labeled_ids:
                continue
            labeled_ids.add(c.id)

            if key_mod:
                emitted_key = base_key + "-" + key_mod
            else:
                emitted_key = base_key
            yield emitted_key, label

In [16]:
POS_DEPTH = 3
NEG_DEPTH = 3

def LFG_MESH_exact(c):
    """Exact-match LF generator: look up the lowercased disease span of c via LFG_CD_match."""
    return LFG_CD_match(c, c.disease.get_span().lower())

In [18]:
%time L_train = label_manager.create(session, train, 'LF Training Labels -- ALL 1', f=LFG_MESH_exact)
L_train


Loading sparse Label matrix...
CPU times: user 1min 24s, sys: 13.9 s, total: 1min 38s
Wall time: 1min 28s


<28087x557 sparse matrix of type '<type 'numpy.float64'>'
	with 12891 stored elements in Compressed Sparse Row format>

### Drop JJs

In [20]:
def drop_jjs(c):
    """
    Return the candidate's disease span, lowercased, with adjectives removed.

    Tokens whose part-of-speech tag starts with "JJ" are dropped; the remaining
    words are joined with single spaces.
    """
    span = c.disease
    words = span.get_attrib_tokens()
    tags = span.get_attrib_tokens('pos_tags')
    # str.startswith gives the same answer as re.match(r'JJ.*', tag) and keeps
    # this cell independent of `re`, which is only imported in a later cell.
    kept = [word for word, tag in zip(words, tags) if not tag.startswith('JJ')]
    return " ".join(kept).lower()

In [21]:
POS_DEPTH = 3
NEG_DEPTH = 3

def LFG_MESH_exact_drop_JJs(c):
    """Relaxed exact-match LF generator: match the adjective-free span; emitted keys get a "-DJ" suffix."""
    return LFG_CD_match(c, drop_jjs(c), key_mod="DJ")

In [22]:
%time L_train = label_manager.update(session, train, 'LF Training Labels -- ALL 1', True, LFG_MESH_exact_drop_JJs)
L_train


Loading sparse Label matrix...
CPU times: user 1min 22s, sys: 14.3 s, total: 1min 36s
Wall time: 1min 26s


<28087x898 sparse matrix of type '<type 'numpy.float64'>'
	with 16690 stored elements in Compressed Sparse Row format>

## MESH TF-IDF cosine match

In [23]:
%%time
from entity_norm import CanonDictVectorizer

# Create a vectorizer based around this 
cd_vectorizer = CanonDictVectorizer(cd.term_to_sids, other_phrases=[])

# Vectorize the dictionary
D  = cd_vectorizer.vectorize_phrases(cd.terms)
Dt = D.T
Dt

|V| = 271954
CPU times: user 6min 18s, sys: 1.82 s, total: 6min 20s
Wall time: 6min 19s


In [25]:
POS_DEPTH = 3
NEG_DEPTH = 3

THRESHs = [0.75]

def LFG_MESH_cosine(c):
    """
    Generate (LF key, label) pairs for candidate c from approximate dictionary matches.

    The lowercased disease span is vectorized with cd_vectorizer and multiplied
    against the transposed dictionary matrix Dt, giving one score per dictionary
    term (presumably TF-IDF cosine similarity, per the section title -- confirm
    against CanonDictVectorizer).  For every term scoring above min(THRESHs),
    each tree path of each of the term's source IDs can yield a key of the form
    "<truncated tree path>-c-<threshold>", labeled with the CID (or -1 when the
    source ID has no entry in cd.sid_to_cid).

    NOTE(review): a (key, candidate) pair is marked in seen_global by the first
    qualifying match in matrix order, not by the best-scoring one.  Because that
    gate admits each key only once per candidate, highest_score[key] appears to
    still be at its default 0.0 whenever it is compared, so the "highest score"
    bookkeeping looks ineffective -- verify before relying on it.
    """
    # Lowest configured threshold: anything at or below it can never fire
    mt = min(THRESHs)
    
    # Vectorize the phrase
    p  = c.disease.get_span().lower()
    cx = cd_vectorizer.vectorize_phrases([p])
    m  = cx * Dt
    
    # Keep track of the highest-score match so far _for each LF_
    highest_score = defaultdict(float)
    
    # Iterate over non-zero dictionary term matches > THRESH
    # Note: changing to COO and iterating over the data directly is roughly an
    # order of magnitude faster
    m = m.tocoo()
    for i, s in enumerate(m.data):
        if s > mt:
            # Column index of this score = index of the dictionary term in cd.terms
            j   = m.col[i]
            for sid in cd.term_to_sids[cd.terms[j]]:
                cid = cd.sid_to_cid[sid] if sid in cd.sid_to_cid else -1
                for path in cd.tree_paths[sid]:
                
                    # We define each LF by a tree path code
                    key  = "-".join(path[:POS_DEPTH]) if cid > 0 else "-".join(path[:NEG_DEPTH])
                        
                    # ADD TO GLOBAL SEEN SET!
                    # This is to make sure that relaxations of an LF don't overlap with each other on any
                    # given candidate
                    if c.id not in seen_global[key]:
                        seen_global[key].add(c.id)
                        # "-c" marks the key as coming from the cosine matcher
                        key += "-c"
                
                        # Only yield this value if higher than highest current emitted
                        # Note: This will just update the current value in the DB
                        if s > highest_score[key]:
                            # One key is emitted per threshold that the score exceeds
                            for t in THRESHs:
                                if s > t:
                                    highest_score[key] = s
                                    yield key + "-%s" % t, cid

In [26]:
%time L_train = label_manager.update(session, train, 'LF Training Labels -- ALL 1', True, LFG_MESH_cosine)
L_train


Loading sparse Label matrix...
CPU times: user 25min 31s, sys: 2min 50s, total: 28min 22s
Wall time: 28min 3s


<28087x1849 sparse matrix of type '<type 'numpy.float64'>'
	with 57658 stored elements in Compressed Sparse Row format>

# Putting in some negative LFs

In [27]:
import re
from lf_terms import *
from snorkel.lf_helpers import get_left_tokens, get_right_tokens
from utils import *
from Disease_Tagging_Tutorial_LFs import *
chemicals = load_chemdner_dictionary()

def LF_organs(c):
    """Vote -1 when the lowercased, space-joined tokens of c[0] appear in `organs`; abstain (0) otherwise."""
    tokens = c[0].get_attrib_tokens()
    if " ".join(tokens).lower() in organs:
        return -1
    return 0

def LF_chemical_name(c):
    """
    Vote -1 when the space-joined tokens of c[0] (case preserved) appear in
    `chemicals`, unless the phrase is all upper case; abstain (0) otherwise.
    """
    name = " ".join(c[0].get_attrib_tokens())
    if name not in chemicals or name.isupper():
        return 0
    return -1

def LF_bodysym(c):
    """Vote -1 when the lowercased, space-joined tokens of c[0] appear in `bodysym`; abstain (0) otherwise."""
    phrase = " ".join(c[0].get_attrib_tokens())
    return 0 if phrase.lower() not in bodysym else -1

def LF_protein_chemical_abbrv(c):
    '''
    Vote -1 when the space-joined lemmas of c[0] contain at least one digit
    (a heuristic for gene/protein/chemical names); abstain (0) otherwise.
    '''
    lemma = " ".join(c[0].get_attrib_tokens('lemmas'))
    # Raw string: "\d" inside a plain string literal is an invalid escape sequence
    return -1 if re.search(r"\d+", lemma) else 0

def LF_base_pair_seq(c):
    """
    Vote -1 when the space-joined lemmas of c[0] are a run of two or more of
    the upper-case letters G, A, C, T and nothing else; abstain (0) otherwise.
    """
    joined = " ".join(c[0].get_attrib_tokens('lemmas'))
    if re.search("^[GACT]{2,}$", joined) is None:
        return 0
    return -1

# Negative labeling functions, passed together to label_manager.update below.
# The first five are defined in this cell; LF_too_vague onward are not defined
# here and presumably come from the star imports at the top of this cell.
LFs_false = [LF_chemical_name,
             LF_organs,
             LF_bodysym,
             LF_protein_chemical_abbrv,
             LF_base_pair_seq,
             LF_too_vague,
             LF_neg_surfix,
             LF_non_common_disease,
             LF_non_disease_acronyms,
             LF_pos_in,
             LF_gene_chromosome_link,
             LF_right_window_incomplete,
             LF_negative_indicator
            ]

In [28]:
%time L_train = label_manager.update(session, train, 'LF Training Labels -- ALL 1', True, LFs_false)
L_train


Loading sparse Label matrix...
CPU times: user 5min 22s, sys: 16.2 s, total: 5min 39s
Wall time: 5min 27s


<28087x1862 sparse matrix of type '<type 'numpy.float64'>'
	with 71581 stored elements in Compressed Sparse Row format>

In [None]:
# LOAD if already computed
L_train = label_manager.load(session, train, 'LF Training Labels -- ALL 1')
L_train

# Running gen. model

In [29]:
from utils import binarize_LF_matrix, get_binarized_score
L_train_b = binarize_LF_matrix(L_train)
L_train_b

<28087x1862 sparse matrix of type '<type 'numpy.float64'>'
	with 71581 stored elements in Compressed Sparse Row format>

In [35]:
from snorkel.learning import NaiveBayes

gen_model = NaiveBayes()
%time gen_model.train(L_train_b, n_iter=5000, rate=1e-1, verbose=True)

Training marginals (!= 0.5):	28087
Features:			1862
Begin training for rate=0.1, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.175641
	Learning epoch = 250	Gradient mag. = 0.177829
	Learning epoch = 500	Gradient mag. = 0.165160
	Learning epoch = 750	Gradient mag. = 0.155768
	Learning epoch = 1000	Gradient mag. = 0.145635
	Learning epoch = 1250	Gradient mag. = 0.130910
	Learning epoch = 1500	Gradient mag. = 0.121831
	Learning epoch = 1750	Gradient mag. = 0.116028
	Learning epoch = 2000	Gradient mag. = 0.111482
	Learning epoch = 2250	Gradient mag. = 0.108365
	Learning epoch = 2500	Gradient mag. = 0.105589
	Learning epoch = 2750	Gradient mag. = 0.099150
	Learning epoch = 3000	Gradient mag. = 0.094544
	Learning epoch = 3250	Gradient mag. = 0.090845
	Learning epoch = 3500	Gradient mag. = 0.087891
	Learning epoch = 3750	Gradient mag. = 0.084424
	Learning epoch = 4000	Gradient mag. = 0.081310
	Learning epoch = 4250	Gradient mag. = 0.078674
	Learning epoch = 4500	Gradient mag. = 0.076279
	Lea

In [36]:
yp = gen_model.predict(L_train_b)
get_binarized_score(yp, L_gold_train)

P :	0.695609756098
R :	0.554719917012
F1:	0.617226951378


In [37]:
# Print LF stats...
from snorkel.learning import odds_to_prob
lfs = L_train.lf_stats(labels=L_gold_train, est_accs=odds_to_prob(gen_model.w))
lfs.nlargest(50, "coverage")

Unnamed: 0,j,coverage,overlaps,conflicts,accuracy,Learned Acc.
LF_neg_surfix,1855,0.157689,0.105565,0.028056,0.972228,0.956537
LF_chemical_name,1849,0.121373,0.112579,0.016484,0.993253,0.985392
LF_negative_indicator,1861,0.104746,0.08495,0.013779,0.934058,0.991268
D-12-776-c-0.75,906,0.068822,0.066116,0.01417,0.93223,0.993295
N-02-421-c-0.75,935,0.046178,0.046071,0.005163,0.946029,0.956691
LF_too_vague,1854,0.039769,0.029053,0.006765,0.976723,0.999085
LF_protein_chemical_abbrv,1852,0.036209,0.017802,0.000961,0.971485,0.991914
N-03-540-c-0.75,936,0.032079,0.031972,0.000463,0.992231,0.999948
N-05-300-c-0.75,920,0.031758,0.031758,0.001887,0.947309,0.999028
N-05-715-c-0.75,948,0.029444,0.029444,0.004059,0.997582,0.999952


# Error analysis

In [38]:
from random import shuffle

# Row indices where the prediction in yp disagrees in sign with the gold
# training label: false positives (predicted +, gold -) and false negatives
# (predicted -, gold +).
N_train = L_gold_train.shape[0]
fps = [i for i in range(N_train) if yp[i] > 0 and L_gold_train[i] < 0]
fns = [i for i in range(N_train) if yp[i] < 0 and L_gold_train[i] > 0]

# Randomize the order so that taking a prefix gives a random sample
shuffle(fps)
shuffle(fns)

print(len(fps))
print(len(fns))

936
846


In [39]:
from snorkel.viewer import SentenceNgramViewer
fn_cands = [L_train.get_candidate(i) for i in fns[:100]]
svn      = SentenceNgramViewer(fn_cands, session)
svn

<IPython.core.display.Javascript object>

In [50]:
c = svn.get_selected()
c

Disease(Span("hyperphosphatemia", parent=728, chars=[118,134], words=[25,25]))

In [51]:
from snorkel.models import Label
session.query(Label).filter(Label.candidate == c).all()

[Label (CDR Training Label = 5146),
 Label (C-18-452 = 5146),
 Label (C-16-614-c-0.75 = -1),
 Label (C-05-116-c-0.75 = -1)]

In [52]:
from snorkel.learning.gen_learning import odds_to_prob
i = L_train.get_row_index(c)

for j in L_train.getrow(i).nonzero()[1]:
    print L_train.get_key(j), odds_to_prob(gen_model.w[j]), int(L_train[i,j])

AnnotationKey (C-18-452) 0.799275428647 5146
AnnotationKey (C-16-614-c-0.75) 0.683247128473 -1
AnnotationKey (C-05-116-c-0.75) 0.832906032877 -1


In [53]:
p  = c.disease.get_span().lower()
cx = cd_vectorizer.vectorize_phrases([p])
m  = cx * Dt

In [54]:
m.nonzero()

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
 array([179102, 179101, 179100, 179098, 179097, 165775, 165774, 165708,
        165707, 165706, 136590, 136589], dtype=int32))

In [56]:
for j in m.nonzero()[1]:
    t = cd.terms[j]
    print t, cd.term_to_sids[t]

hyperphosphatemia tumoral calcinosis set(['C566870'])
hyperphosphatemia hyperostosis syndrome set(['C566870'])
hyperphosphatemia hyperostosis set(['C566870'])
with hyperphosphatemia tumoral calcinosis set(['C566870'])
tumoral with hyperphosphatemia calcinosis set(['C566870'])
and seizures polyuria hyperphosphatemia set(['C565494'])
polyuria and seizures hyperphosphatemia set(['C565494'])
hyperostosis with hyperphosphatemia set(['C538381', 'C566870'])
cortical hyperostosis with hyperphosphatemia set(['C538381', 'C566870'])
hyperostosis-hyperphosphatemia syndrome set(['C538381', 'C566870'])
hyperphosphatemias set(['D054559'])
hyperphosphatemia set(['D054559'])


In [57]:
cd.sid_to_cid['C566870']

KeyError: 'C566870'

In [None]:
[cd.sid_to_cid[sid] for sid in cd.term_to_sids['toxicity']]

In [43]:
cd.cid_to_sid[5593]

'D064420'

# TODO:

* Re-write the cosine matcher:
    - Currently it counts the highest-score match as _seen_, so it then labels with the next-highest one!
    - Either need to skip the no-deps check, or get the max first before emitting anything (this could be more efficient too!)

* Switch to MESH 2016
* Add in supplementary records!!!

# Training Discriminative model

In [32]:
from snorkel.learning import LogReg

train_marginals = gen_model.marginals(L_train_b)

disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=2000, rate=1e-3, mu=1e-6)

Training marginals (!= 0.5):	20878
Features:			69885
Using gradient descent...
	Learning epoch = 0	Step size = 0.001
	Loss = 14471.526836	Gradient magnitude = 11440.008285
	Learning epoch = 100	Step size = 0.000904792147114
	Loss = 5252.705794	Gradient magnitude = 2589.044600
	Learning epoch = 200	Step size = 0.000818648829479
	Loss = 3633.779540	Gradient magnitude = 1705.518976
	Learning epoch = 300	Step size = 0.000740707032156
	Loss = 2938.941213	Gradient magnitude = 942.095663
	Learning epoch = 400	Step size = 0.000670185906007
	Loss = 2667.006219	Gradient magnitude = 47.080719
	Learning epoch = 500	Step size = 0.000606378944861
	Loss = 2601.234768	Gradient magnitude = 30.086925
	Learning epoch = 600	Step size = 0.000548646907485
	Loss = 2554.348699	Gradient magnitude = 27.249469
	Learning epoch = 700	Step size = 0.000496411413431
	Loss = 2518.889631	Gradient magnitude = 25.184339
	Learning epoch = 800	Step size = 0.00044914914861
	Loss = 2491.162859	Gradient magnitude = 23.596542


In [33]:
yp = disc_model.predict(F_train)
get_binarized_score(yp, L_gold_train)

P :	0.744621141254
R :	0.619294605809
F1:	0.67619991505


In [34]:
yp = disc_model.predict(F_dev, b=0.5)
get_binarized_score(yp, L_gold_dev)

P :	0.747138047138
R :	0.561487854251
F1:	0.641144177983


In [None]:
plt.hist(disc_model.marginals(F_dev))

In [None]:
from snorkel.learning.gen_learning import odds_to_prob
plt.hist(odds_to_prob(gen_model.w))

In [None]:
L_train.lf_stats(labels=L_gold_train, est_accs=odds_to_prob(gen_model.w))

# Hard filtering

In [None]:
# Try overriding with any exact matches...
ype = np.zeros(L_gold_train.shape[0])
for i,c in enumerate(dev):
    if i % 5000 == 0:
        print i
    pos = 0
    neg = 0
    for lf_name, label in LFG_MESH_exact(c):
        if label > 0:
            pos += 1
        else:
            neg += 1
    
    for lf_name, label in LFG_MESH_exact_drop_JJs(c):
        if label > 0:
            pos += 1
        else:
            neg += 1
    
    if neg > 0 and pos == 0:
        ype[i] = -1
    elif pos > 0 and neg == 0:
        ype[i] = 1
    else:
        ype[i] = yp[i]

In [None]:
get_binarized_score(ype, L_gold_dev)

# Now, getting CIDs in simple heuristic way

In [None]:
N = L_gold_train.shape[0]

In [None]:
pp_idxs = [i for i in range(N) if yp[i] == 1 and L_gold_train[i] > 0]
pp_idxs[:5]

In [None]:
i = 32
for j in L_train.getrow(i).nonzero()[1]:
    print L_train.get_key(j), gen_model.w[j], L_train[i,j]

In [None]:
i = 53
for j in L_train.getrow(i).nonzero()[1]:
    print L_train.get_key(j), gen_model.w[j], L_train[i,j]

In [None]:
L_gold_train[53,0]

In [None]:
print L_train.get_key(558)
print gen_model.w[558]
print L_train[18,558]

In [None]:
correct   = 0
predicted = 0
total     = 0
for i in range(N):
    if L_gold_train[i,0] > 0:
        total += 1
    
    if yp[i] > 0:
        predicted += 1
        
        max_w = 0.0
        cid   = -1
        for j in L_train.getrow(i).nonzero()[1]:
            if gen_model.w[j] > max_w:
                max_w = gen_model.w[j]
                cid   = L_train[i,j]
        
        if cid == L_gold_train[i,0]:
            correct += 1

prec   = correct / float(predicted)
recall = correct / float(total)

print "P:\t", prec
print "R:\t", recall
print "F1:\t", (2*prec*recall) / (prec+recall)

In [None]:
correct   = 0
predicted = 0
total     = 0
for i in range(N):
    if L_gold_train[i,0] > 0:
        total += 1
        
    max_w = 0.0
    cid   = -1
    for j in L_train.getrow(i).nonzero()[1]:
        if gen_model.w[j] > max_w:
            max_w = gen_model.w[j]
            cid   = L_train[i,j]
        
    if cid > 0:
        predicted += 1  
        if cid == L_gold_train[i,0]:
            correct += 1

prec   = correct / float(predicted)
recall = correct / float(total)

print "P:\t", prec
print "R:\t", recall
print "F1:\t", (2*prec*recall) / (prec+recall)

# ETC

# Different cut levels:

Note: G = gen model on training set, D = disc. model on test set

* Pos: 1, Neg: 1, Pos-cosine: 1, Neg-cosine: 1, Thresh-cosine: 0.75 = 56 F1 G / 63 F1 D
* TODO...
* Pos: 3, Neg: 3, Pos-cosine: 3, Neg-cosine: 3, Thresh-cosine: 0.75 = 61 F1 G / 68 F1 D
* Pos: 3, Neg: 3, Pos-cosine: 3, Neg-cosine: 3, Thresh-cosine: [0.5, 0.75] = 63 F1 G / 65 F1 D
* Pos: 4, Neg: 4, Pos-cosine: 4, Neg-cosine: 4, Thresh-cosine: 0.75 = 60 F1 G / 64 F1 D

### Adding in drop_JJs + NEG LFs:

* Pos: 2, Neg: 2, Pos-cosine: 2, Neg-cosine: 2, Thresh-cosine: 0.75 = 69 F1 G / 71 F1 D
* Pos: 3, Neg: 3, Pos-cosine: 3, Neg-cosine: 3, Thresh-cosine: 0.75 = 70 F1 G / 73 F1 D


#### Note: we're not yet dealing with acronyms!!!

In [None]:
hf = np.ones(L_train_b.shape[0])

In [None]:
for i in range(L_train_b.shape[0]):
    

In [None]:
L_train_b.getrow(0)

In [None]:
L_train_b.sum(1).shape

In [None]:
from snorkel.utils import sparse_abs
sparse_abs(L_train_b).sum(1).shape

In [None]:
hf = np.where(L_train_b.sum(1) == sparse_abs(L_train_b).sum(1), np.sign(L_train_b.sum(1)), 0)

In [None]:
hf.shape

In [None]:
L_gold_train_b = np.sign(L_gold_train.todense())

In [None]:
L_gold_train_b

In [None]:
L_gold_train_b.T.dot(hf)

In [None]:
hf.sum

In [None]:
L_train_b_sum = L_train_b.sum(1)
L_train_b_abs_sum = sparse_abs(L_train_b).sum(1)
L_train_b_sum_abs = sparse_abs(L_train_b.sum(1))

In [None]:
L_train_b

In [None]:
correct = 0
wrong   = 0
for i in range(L_train_b.shape[0]):
    if L_train_b_sum[i] < 0 and L_train_b_sum_abs[i] == L_train_b_abs_sum[i]:
        if L_gold_train[i] < 0:
            correct += 1
        else:
            wrong += 1
            
print correct
print wrong
print correct / float(correct + wrong)

In [None]:
N = L_train.shape[0]
N

In [None]:
filtered = 0
correct  = 0
for i,c in enumerate(train):
    if i % 1000 == 0:
        if filtered > 0:
            print i, filtered, correct, correct / float(filtered)
        else:
            print i, filtered, correct
    
    pos = 0
    neg = 0
    for lf_name, label in LFG_MESH_exact(c):
        if label > 0:
            pos += 1
        else:
            neg += 1
    
    for lf_name, label in LFG_MESH_exact_drop_JJs(c):
        if label > 0:
            pos += 1
        else:
            neg += 1
    
    #if neg > 0 and pos == 0:
    if neg > pos:
        filtered += 1
        if L_gold_train[i] < 0:
            correct += 1

print filtered
print correct
print correct / float(filtered)

In [None]:
filtered = 0
correct  = 0
for i,c in enumerate(train):
    if i % 1000 == 0:
        if filtered > 0:
            print i, filtered, correct, correct / float(filtered)
        else:
            print i, filtered, correct
    
    if len(c.disease.get_attrib_tokens()) == 1 and c.disease.get_span().lower() not in cd.term_to_sids:
        filtered += 1
        if L_gold_train[i] < 0:
            correct += 1

print filtered
print correct
print correct / float(filtered)

# LF STATS ANALYSIS

In [None]:
lfs = L_train.lf_stats(labels=L_gold_train, est_accs=odds_to_prob(gen_model.w))

In [None]:
gen_model.w

In [None]:
BAD_LFs = [578, 627, 603, 687, 573, 579]
for i in BAD_LFs:
    gen_model.w[i] = 0.0

In [None]:
lfs.nsmallest(50, "accuracy")

In [None]:
for lf in lfs.iterrows():
    lf_name, s = lf
    

# ERROR ANALYSIS

In [None]:
from random import shuffle
N_dev = L_gold_dev.shape[0]

fps = []
fns = []
for i in range(N_dev):
    if yp[i] > 0 and L_gold_dev[i] < 0:
        fps.append(i)
    elif yp[i] < 0 and L_gold_dev[i] > 0:
        fns.append(i)

shuffle(fps)
shuffle(fns)

print len(fps)
print len(fns)

from snorkel.viewer import SentenceNgramViewer
fn_cands = [F_dev.get_candidate(i) for i in fns[:100]]
svn      = SentenceNgramViewer(fn_cands, session)
svn

In [None]:
exact_match = 0
for i in fns:
    c = F_dev.get_candidate(i)
    if c.disease.get_span() in mesh_tree:
        exact_match += 1

In [None]:
exact_match

In [None]:
c = svn.get_selected()

mesh_tree[c.disease.get_span()]

In [None]:
c.disease.get_attrib_tokens('pos_tags')

In [None]:
mesh_tree['alcohol abuse']

In [None]:
from snorkel.models import Label

l = session.query(Label).filter(Label.candidate == c).one()
CID_to_MESH[l.value]

In [None]:
i = F_dev.get_row_index(c)
[(F_dev.get_key(k), disc_model.w[k]) for k in F_dev.getrow(i).nonzero()[1]]

In [None]:
F_dev.get_key(1)

* Why is Parkinson's disease not caught?

In [None]:
from snorkel.viewer import SentenceNgramViewer
fp_cands = [F_dev.get_candidate(i) for i in fps[:100]]
sv       = SentenceNgramViewer(fp_cands, session)
sv

In [None]:
# Candidate phrases for a negative LF -- presumably collected while browsing the
# false positives in the viewer above (TODO confirm).
# NOTE(review): not referenced anywhere in the visible part of this file.
NEG_PHRASES = [
    'stenosis',
    'further attention',
    'presence',
    'absence',
    'syndrome',
    'association',
    'strain',
    'progression'
]

# Candidate final words for a negative LF -- presumably to be matched against
# the last token of a candidate span (TODO confirm).
# NOTE(review): not referenced anywhere in the visible part of this file.
NEG_END_WORDS = [
    'therapies',
    'muscles',
    'concentrations',
    'normal',
    'heart',
    'side',
    'sinus',
    'convulsants',
    'latencies',
    'findings',
    'doses',
    'remission'
]

def end_in_plural(c):
    """Unimplemented placeholder: does nothing and returns None."""
    pass

def body_part(c):
    """Unimplemented placeholder: does nothing and returns None."""
    pass

def not_exact_single_word(d):
    """Unimplemented placeholder: does nothing and returns None."""
    pass