# Disease Norm

In this example, we'll be writing an application to extract *mentions of* diseases from Pubmed abstracts, using annotations from the [BioCreative CDR Challenge](http://www.biocreative.org/resources/corpora/biocreative-v-cdr-corpus/).  This tutorial, which has 5 parts, walks through the process of constructing a model to classify each _candidate_ disease mention as either true (i.e., it really is a mention of a disease) or false.

# Loading Candidates + Annotations

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
from snorkel import SnorkelSession
# Database session handed to every query and label/feature manager call in this notebook.
session = SnorkelSession()

from snorkel.models import candidate_subclass

# Candidate type with a single argument named 'disease' (read below as `c.disease`).
Disease = candidate_subclass('Disease', ['disease'])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from snorkel.models import CandidateSet

# Look up the training and development candidate sets by name and print their sizes.
train = (session.query(CandidateSet)
                .filter(CandidateSet.name == 'CDR Training Candidates')
                .one())
print(len(train))
dev = (session.query(CandidateSet)
              .filter(CandidateSet.name == 'CDR Development Candidates')
              .one())
print(len(dev))

28087
27896


In [3]:
from snorkel.annotations import LabelManager

# Manager used for both the gold labels here and the LF label sets created later.
label_manager = LabelManager()

# Load the gold label matrix of each split by label-set name, then print their shapes.
L_gold_train = label_manager.load(session, train, "CDR Training Label Set")
L_gold_dev   = label_manager.load(session, dev, "CDR Development Label Set")
print(L_gold_train.shape)
print(L_gold_dev.shape)

(28087, 1)
(27896, 1)


# Process / Load Features

In [None]:
from snorkel.annotations import FeatureManager

# Manager used below to create, update, and load the candidate feature matrices.
feature_manager = FeatureManager()

Process:

In [None]:
# Build the training feature matrix under the name 'Train Features' (timed).
%time F_train = feature_manager.create(session, train, 'Train Features')

In [None]:
# Featurize the dev candidates against the same 'Train Features' name used for training.
# NOTE(review): the trailing False presumably prevents new feature keys from being added — confirm against FeatureManager.update.
%time F_dev = feature_manager.update(session, dev, 'Train Features', False)

Or, load if already processed:

In [None]:
# Alternative to the create cell: reload the previously stored training features by name.
%time F_train = feature_manager.load(session, train, 'Train Features')

In [None]:
# Alternative to the update cell: reload the dev features stored under 'Train Features'.
%time F_dev = feature_manager.load(session, dev, 'Train Features')

# Load dictionaries

### Load the MESH ID -> CID mapping

In [4]:
from cPickle import load

# Read the two pickled dictionaries, closing each file as soon as it has been loaded.
# `diseases` is iterated below as term -> MESH ID pairs.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('MESH_to_CID.pkl', 'rb') as f:
    MESH_to_CID = load(f)
with open('diseases.pkl', 'rb') as f:
    diseases = load(f)

In [5]:
# Imported here because no earlier cell in the notebook brings `defaultdict` into scope.
from collections import defaultdict

# Invert the term -> MESH ID dictionary into MESH ID -> set of terms.
mesh_to_terms = defaultdict(set)
for term, mid in diseases.iteritems():
    mesh_to_terms[mid].add(term)

### Create a canonical dictionary (CD)

In [23]:
from utils import CanonicalDictionary
# Canonical dictionary seeded with the MESH ID -> CID mapping; terms are added to it
# from MESH, MEDIC, and UMLS in the cells that follow.
cd = CanonicalDictionary(MESH_to_CID)

### Add MESH to CD

In [24]:
# Load MESH
# Each entry is unpacked in the next cell as (MESH ID, tree numbers, terms).
from utils import load_mesh_raw
mesh_entries = load_mesh_raw('data/desc2016.xml')

Loaded 27885 entries


In [25]:
# Register every MESH term in the canonical dictionary, together with its tree paths
for mid, tree_nums, terms in mesh_entries:
    # Split each tree number into its first character followed by the dot-separated
    # pieces of the remainder, e.g. 'C04.557' -> ['C', '04', '557']
    paths = []
    for tree_num in tree_nums:
        paths.append([tree_num[0]] + tree_num[1:].split('.'))
    for term in terms:
        cd.add_term(term, mid, tree_paths=paths)

len(cd.term_to_sids)

151006

### Add MEDIC to CD

Custom CTD diseases dictionary made from MESH category C + OMIM

In [26]:
from utils import load_MEDIC, load_mesh_raw
# Load the MEDIC entries; the second return value (MEDIC_to_CID) is not used in the cells shown here.
medic_entries, MEDIC_to_CID = load_MEDIC()

Loaded 11864 MEDIC entries


In [27]:
# Add MEDIC to cd
for entry in medic_entries:
    # File the entry under a MESH ID: its own ID when that is a MESH ID, otherwise
    # its first parent's ID when that one is; anything else is an error.
    if entry.id.startswith("MESH"):
        source_id = entry.id
    elif len(entry.parent_ids) > 0 and entry.parent_ids[0].startswith("MESH"):
        source_id = entry.parent_ids[0]
    else:
        raise KeyError(entry)
    mid = source_id.split(":")[1]

    # Keep only the part of each tree number before the first '/', then split it the
    # same way as the MESH tree numbers (first character + dot-separated remainder)
    paths = []
    for tree_num in entry.tree_nums:
        head = tree_num.split("/")[0]
        paths.append([head[0]] + head[1:].split('.'))

    # The preferred name and all synonyms map to the same MESH ID and paths
    for term in [entry.name] + entry.synonyms:
        cd.add_term(term, mid, tree_paths=paths)

len(cd.term_to_sids)

178666

### Add UMLS to CD

This file (provided by Jason) may or may not cover all of UMLS.

In [28]:
# Each line of the TSV holds three tab-separated fields: term, CUI, MESH ID.
# Only the term and the MESH ID are added to the dictionary (no tree paths).
with open('cui2mesh.tsv', 'rb') as tsv:
    for row in tsv:
        fields = row.rstrip('\n').split('\t')
        term, cui, mid = fields
        cd.add_term(term, mid)

len(cd.term_to_sids)

801566

## TODO: Add the MESH supplementary records?  (They have no tree paths, though.)

# Writing some multinomial LFs

# NOTE: Beware of LF rollback bug!!!

## TYPE I LF: Subsets of MESH dictionary

In [None]:
# key -> set of candidate IDs, in the shape the `seen_global` argument of the LF generators expects.
# NOTE(review): no cell shown in this notebook actually passes SEEN_GLOBAL to an LF — confirm it is still needed.
SEEN_GLOBAL = defaultdict(set)

## MESH exact match

In [12]:
POS_DEPTH = 3
NEG_DEPTH = 3
def LFG_CD_match(c, p, key_mod=None, seen_global=None, max_paths_per_sid=1):
    """
    Exact-match the phrase p against the canonical dictionary and generate
    (key, cid) pairs for candidate c.

    For every SID registered for p, up to max_paths_per_sid of its tree paths are
    turned into a key by joining the leading path elements with "-" (POS_DEPTH
    elements when the SID has a CID > 0, NEG_DEPTH otherwise).  The value is the
    SID's CID, or -1 when the SID has no CID.  If key_mod is given, "-" + key_mod
    is appended to the emitted key.

    If seen_global (a mapping key -> set of candidate ids) is given, a key is
    skipped when c.id is already recorded under it, and recorded otherwise.
    Nothing is generated when p is not in the dictionary.
    """
    if p not in cd.term_to_sids:
        return

    for sid in cd.term_to_sids[p]:
        cid = cd.sid_to_cid[sid] if sid in cd.sid_to_cid else -1
        for path in cd.tree_paths[sid][:max_paths_per_sid]:

            # NOTE: path may be shorter than max depth if higher up in the tree (e.g. 'cancer', 'ischemia')!
            depth = POS_DEPTH if cid > 0 else NEG_DEPTH
            key   = "-".join(path[:depth])

            # The seen-set is keyed on the unmodified key, so that relaxations of an LF
            # don't overlap with each other on any given candidate
            if seen_global is not None and c.id in seen_global[key]:
                continue
            if seen_global is not None:
                seen_global[key].add(c.id)

            emitted_key = key + "-" + key_mod if key_mod else key
            yield emitted_key, cid

In [13]:
def LFG_MESH_exact(c):
    """Generate the LFG_CD_match pairs for the candidate's lower-cased span text, unmodified."""
    return LFG_CD_match(c, c.disease.get_span().lower())

In [14]:
# Run LFG_MESH_exact over the training candidates and store the result as
# label set 'LF Training Labels 1' (timed); the last line displays the matrix.
%time L_train_1 = label_manager.create(session, train, 'LF Training Labels 1', f=LFG_MESH_exact)
L_train_1


Loading sparse Label matrix...
CPU times: user 1min 32s, sys: 17.9 s, total: 1min 50s
Wall time: 1min 37s


<28087x382 sparse matrix of type '<type 'numpy.float64'>'
	with 7688 stored elements in Compressed Sparse Row format>

In [None]:
# Or load if already computed:
# (reloads the label set created above by its name instead of re-running the LF)
%time L_train_1 = label_manager.load(session, train, 'LF Training Labels 1')
L_train_1

### Drop JJs

In [15]:
# Imported in this cell so the LF does not depend on a later cell having imported `re`.
import re

# POS tags treated as leading modifiers: adjectives (JJ*) and verbs (VB*).
MOD_RGX = r'JJ.*|VB.*'

def LFG_drop_leading_modifiers(c):
    """
    Strip leading modifier tokens from the candidate one at a time and, after each
    strip, generate the LFG_CD_match pairs for the remaining phrase.

    A token counts as a modifier when its POS tag matches MOD_RGX.  Emitted keys
    carry the "DJ" key modifier.  Nothing is generated when the first token is not
    a modifier.
    """
    words    = c.disease.get_attrib_tokens()
    pos_tags = c.disease.get_attrib_tokens('pos_tags')
    # Stop once every token has been stripped: a span made up entirely of modifiers
    # would otherwise index into an empty list and raise IndexError.
    while pos_tags and re.match(MOD_RGX, pos_tags[0]):
        words    = words[1:]
        pos_tags = pos_tags[1:]
        p   = " ".join(words)

        # NOTE(review): unlike LFG_MESH_exact, the phrase is not lower-cased before the
        # lookup — confirm whether that is intended.
        # NOTE(review): pairs are emitted for every shortened phrase that matches, not only
        # the first (longest) one — confirm against the error-analysis notes.
        for key, cid in LFG_CD_match(c, p, key_mod="DJ"):
            yield key, cid

In [16]:
# Run LFG_drop_leading_modifiers over the training candidates and store the result
# as label set 'LF Training Labels 2' (timed).
%time L_train_2 = label_manager.create(session, train, 'LF Training Labels 2', LFG_drop_leading_modifiers)
L_train_2


Loading sparse Label matrix...
CPU times: user 1min 34s, sys: 18.8 s, total: 1min 53s
Wall time: 1min 39s


<28087x279 sparse matrix of type '<type 'numpy.float64'>'
	with 3367 stored elements in Compressed Sparse Row format>

In [None]:
# Or load if already computed:
# (reloads 'LF Training Labels 2' by name instead of re-running the LF)
%time L_train_2 = label_manager.load(session, train, 'LF Training Labels 2')
L_train_2

### Remove common words

In [59]:
REMOVE_COMMON = r'.*induced|patient.*|drug|inhibitor|\d+|human|mouse|mice|rats?|with|syndrome|famil.*|s$|low(er)?|upper|left|right|top|bottom'
def remove_common(c):
    """
    Return the candidate's span text with every REMOVE_COMMON match deleted,
    runs of two or more whitespace characters collapsed to one space, and
    leading/trailing whitespace stripped.
    """
    span      = c.disease.get_span()
    deleted   = re.sub(REMOVE_COMMON, '', span)
    collapsed = re.sub(r'\s\s+', ' ', deleted)
    return collapsed.strip()

In [60]:
def LFG_MESH_exact_remove_common(c):
    """Generate the LFG_CD_match pairs (key modifier "RC") for the candidate's span after remove_common."""
    return LFG_CD_match(c, remove_common(c), key_mod="RC")

In [61]:
# Run LFG_MESH_exact_remove_common over the training candidates and store the result
# as label set 'LF Training Labels 3.1' (timed).
%time L_train_3 = label_manager.create(session, train, 'LF Training Labels 3.1', LFG_MESH_exact_remove_common)
L_train_3


Loading sparse Label matrix...
CPU times: user 1min 28s, sys: 16.7 s, total: 1min 44s
Wall time: 1min 32s


<28087x314 sparse matrix of type '<type 'numpy.float64'>'
	with 5130 stored elements in Compressed Sparse Row format>

In [None]:
# Or load if already computed:
# NOTE(review): the create cell above writes 'LF Training Labels 3.1' but this loads
# 'LF Training Labels 3' — confirm which label set is intended.
%time L_train_3 = label_manager.load(session, train, 'LF Training Labels 3')
L_train_3

## MESH TF-IDF cosine match: POSITIVE terms

Only cosine match against the _positive_ terms (i.e. the MESH category C and F03 disease terms) here!

In [29]:
%%time
from entity_norm import CanonDictVectorizer 

# Create a vectorizer based around this 
# (built from the canonical dictionary's term -> SIDs mapping, with no extra phrases)
cd_vectorizer = CanonDictVectorizer(cd.term_to_sids, other_phrases=[])

# Vectorize the dictionary
# D_pos holds the vectorized cd.pos_terms; its transpose is what the positive
# cosine matcher multiplies each candidate vector by.
D_pos   = cd_vectorizer.vectorize_phrases(cd.pos_terms)
D_pos_t = D_pos.T
D_pos_t

|V| = 271512
CPU times: user 1min 17s, sys: 1.19 s, total: 1min 18s
Wall time: 1min 18s


In [32]:
# Imported in this cell because no earlier cell brings `defaultdict` into scope.
from collections import defaultdict

POS_DEPTH         = 3
NEG_DEPTH         = 3
THRESH            = 0.75
seen_global       = None
max_paths_per_sid = 1

def LFG_CD_cosine_match_pos(c, thresh=THRESH):
    """
    Generate (key, cid) pairs for the terms in cd.pos_terms whose similarity score
    against the candidate's lower-cased span exceeds thresh.

    Scores are the entries of the product of the candidate's vector (from
    cd_vectorizer) with D_pos_t.  For every term above the threshold, each of its
    SIDs contributes keys built from the first POS_DEPTH (CID > 0) or NEG_DEPTH
    (otherwise) elements of up to max_paths_per_sid tree paths; per key only the
    CID of the highest-scoring term is kept.  Emitted keys carry a "-c" suffix;
    the value is the CID, or -1 when the SID has no CID.

    thresh defaults to the value THRESH has when this cell is run, so a later cell
    rebinding the global THRESH (the negative matcher sets it to 0.85) no longer
    changes this function's threshold.
    """
    p  = c.disease.get_span().lower()
    cx = cd_vectorizer.vectorize_phrases([p])
    m  = (cx * D_pos_t).tocoo()

    # key -> (best score seen so far, CID that achieved it)
    best_match = defaultdict(lambda : (0, None))
    for j, s in zip(m.col, m.data):
        if s > thresh:
            t = cd.pos_terms[j]
            for sid in cd.term_to_sids[t]:
                cid = cd.sid_to_cid[sid] if sid in cd.sid_to_cid else -1
                for path in cd.tree_paths[sid][:max_paths_per_sid]:
                    key  = "-".join(path[:POS_DEPTH]) if cid > 0 else "-".join(path[:NEG_DEPTH])
                    if s > best_match[key][0]:
                        best_match[key] = (s, cid)

    for key, x in best_match.iteritems():
        s, cid = x

        # CHECK AGAINST / ADD TO GLOBAL SEEN SET!
        # To make sure that relaxations of an LF don't overlap with each other on any given candidate
        if seen_global is None or c.id not in seen_global[key]:
            if seen_global is not None:
                seen_global[key].add(c.id)
            key += "-c"
            yield key, cid

In [34]:
# Run LFG_CD_cosine_match_pos over the training candidates and store the result
# as label set 'LF Training Labels 4.1' (timed).
%time L_train_4 = label_manager.create(session, train, 'LF Training Labels 4.1', LFG_CD_cosine_match_pos)
L_train_4


Loading sparse Label matrix...
CPU times: user 3min 18s, sys: 17.2 s, total: 3min 35s
Wall time: 3min 22s


<28087x150 sparse matrix of type '<type 'numpy.float64'>'
	with 5745 stored elements in Compressed Sparse Row format>

In [None]:
# Or load if already computed:
# NOTE(review): the create cell above writes 'LF Training Labels 4.1' but this loads
# 'LF Training Labels 4' — confirm which label set is intended.
%time L_train_4 = label_manager.load(session, train, 'LF Training Labels 4')
L_train_4

## Neg cosine match

In [40]:
# Vectorize the dictionary
# Same as D_pos / D_pos_t, but over cd.neg_terms; used by the negative cosine matcher.
D_neg   = cd_vectorizer.vectorize_phrases(cd.neg_terms)
D_neg_t = D_neg.T
D_neg_t

<271513x737772 sparse matrix of type '<type 'numpy.float64'>'
	with 2508478 stored elements in Compressed Sparse Column format>

In [41]:
# Imported in this cell because no earlier cell brings `defaultdict` into scope.
from collections import defaultdict

THRESH            = 0.85

def LFG_CD_cosine_match_neg(c, thresh=THRESH):
    """
    Generate (key, cid) pairs for the terms in cd.neg_terms whose similarity score
    against the candidate's lower-cased span exceeds thresh.

    Mirrors LFG_CD_cosine_match_pos, but multiplies by D_neg_t and indexes
    cd.neg_terms.  POS_DEPTH, NEG_DEPTH, seen_global and max_paths_per_sid are the
    globals set in the positive matcher's cell.  Emitted keys carry a "-c" suffix;
    the value is the CID, or -1 when the SID has no CID.

    thresh defaults to the value THRESH has when this cell is run (0.85), so a
    later cell rebinding the global THRESH no longer changes this function's
    threshold.
    """
    p  = c.disease.get_span().lower()
    cx = cd_vectorizer.vectorize_phrases([p])
    m  = (cx * D_neg_t).tocoo()

    # key -> (best score seen so far, CID that achieved it)
    best_match = defaultdict(lambda : (0, None))
    for j, s in zip(m.col, m.data):
        if s > thresh:
            t = cd.neg_terms[j]
            for sid in cd.term_to_sids[t]:
                cid = cd.sid_to_cid[sid] if sid in cd.sid_to_cid else -1
                for path in cd.tree_paths[sid][:max_paths_per_sid]:
                    key  = "-".join(path[:POS_DEPTH]) if cid > 0 else "-".join(path[:NEG_DEPTH])
                    if s > best_match[key][0]:
                        best_match[key] = (s, cid)

    for key, x in best_match.iteritems():
        s, cid = x

        # CHECK AGAINST / ADD TO GLOBAL SEEN SET!
        # To make sure that relaxations of an LF don't overlap with each other on any given candidate
        if seen_global is None or c.id not in seen_global[key]:
            if seen_global is not None:
                seen_global[key].add(c.id)
            key += "-c"
            yield key, cid

In [42]:
# Run LFG_CD_cosine_match_neg over the training candidates and store the result
# as label set 'LF Training Labels 4 N' (timed; the recorded run took about 28 minutes).
%time L_train_4_N = label_manager.create(session, train, 'LF Training Labels 4 N', LFG_CD_cosine_match_neg)
L_train_4_N


Loading sparse Label matrix...
CPU times: user 26min 1s, sys: 2min 36s, total: 28min 38s
Wall time: 28min 23s


<28087x406 sparse matrix of type '<type 'numpy.float64'>'
	with 11389 stored elements in Compressed Sparse Row format>

# Putting in some negative LFs

In [35]:
import re
from lf_terms import *
from snorkel.lf_helpers import get_left_tokens, get_right_tokens
from utils import *
from Disease_Tagging_Tutorial_LFs import *
# Chemical-name dictionary; LF_chemical_name below tests phrase membership in it.
chemicals = load_chemdner_dictionary()

def LF_organs(c):
    """Vote -1 when the lower-cased, space-joined candidate tokens are an entry of `organs`; abstain (0) otherwise."""
    tokens = c[0].get_attrib_tokens()
    if " ".join(tokens).lower() in organs:
        return -1
    return 0

def LF_chemical_name(c):
    """
    Vote -1 when the space-joined candidate tokens are an entry of `chemicals`,
    are not all upper-case, and their lower-cased form is not a term of the
    canonical dictionary; abstain (0) otherwise.
    """
    phrase = " ".join(c[0].get_attrib_tokens())
    if phrase not in chemicals:
        return 0
    if phrase.isupper():
        return 0
    if phrase.lower() in cd.term_to_sids:
        return 0
    return -1

def LF_bodypart(c):
    """Vote -1 when the lower-cased, space-joined candidate tokens are an entry of `bodypart`; abstain (0) otherwise."""
    tokens = c[0].get_attrib_tokens()
    if " ".join(tokens).lower() in bodypart:
        return -1
    return 0

def LF_protein_chemical_abbrv(c):
    '''Gene/protein/chemical name heuristic: vote -1 when the space-joined lemmas
    of the candidate contain at least one digit; abstain (0) otherwise.'''
    lemma = " ".join(c[0].get_attrib_tokens('lemmas'))
    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    return -1 if re.search(r"\d+", lemma) else 0

def LF_base_pair_seq(c):
    """
    Vote -1 when the space-joined lemmas of the candidate consist solely of two or
    more of the upper-case letters G, A, C, T; abstain (0) otherwise.
    """
    lemma = " ".join(c[0].get_attrib_tokens('lemmas'))
    if re.search("^[GACT]{2,}$", lemma):
        return -1
    return 0

# Negative labeling functions applied together as label set 'LF Training Labels 5'.
# The first five are defined in this cell; the remaining names are not defined in this
# notebook and presumably come from the star imports above — TODO confirm.
# Commented-out entries are LFs currently excluded from the set.
LFs_false = [LF_chemical_name,
             LF_organs,
             LF_bodypart,
             LF_protein_chemical_abbrv,
             LF_base_pair_seq,
             #LF_too_vague,
             #LF_neg_surfix,
             LF_non_common_disease,
             LF_non_disease_acronyms,
             #LF_pos_in,
             LF_gene_chromosome_link,
             LF_right_window_incomplete,
             #LF_negative_indicator
            ]

In [36]:
# Apply the LFs_false list to the training candidates and store the result
# as label set 'LF Training Labels 5' (timed).
%time L_train_5 = label_manager.create(session, train, 'LF Training Labels 5', LFs_false)
L_train_5


Loading sparse Label matrix...
CPU times: user 3min 52s, sys: 14.9 s, total: 4min 7s
Wall time: 3min 56s


<28087x9 sparse matrix of type '<type 'numpy.float64'>'
	with 2727 stored elements in Compressed Sparse Row format>

In [None]:
# Or load if already computed:
# NOTE(review): the create cell above writes 'LF Training Labels 5' but this loads
# 'LF Training Labels 5.1' — confirm which label set is intended.
%time L_train_5 = label_manager.load(session, train, 'LF Training Labels 5.1')
L_train_5

### More neg. LFs

In [94]:
# 'deficit' added next to the original misspelling 'defecit', which is kept so
# anything it matched before still matches.
NEG_COMMON_RGX = r'(finding|disease|syndrome|marker|deficit|defecit|.*event|mean|median|mg)s?'
def LF_common_neg_phrases_2(c):
    """
    Vote -1 when the candidate's lower-cased span starts with a match of
    NEG_COMMON_RGX; abstain (0) otherwise.

    re.match anchors only at the start of the span, so any span that begins with
    one of the listed words (or contains "event", via the `.*event` alternative)
    is voted negative.
    """
    p = c.disease.get_span().lower()
    return -1 if re.match(NEG_COMMON_RGX, p) else 0

NEG_AFTER_WORDS = frozenset(['of', 'to'])
def LF_neg_after_2(c):
    """
    Vote -1 when the first token returned by get_right_tokens(c, window=1,
    attrib='lemmas') is in NEG_AFTER_WORDS; abstain (0) otherwise, including when
    no token is returned.
    """
    right = get_right_tokens(c, window=1, attrib='lemmas')
    if len(right) > 0 and right[0] in NEG_AFTER_WORDS:
        return -1
    return 0

def LF_after_num_2(c):
    """
    Vote -1 when the first token returned by get_left_tokens(c, window=1,
    attrib='lemmas') starts with a digit; abstain (0) otherwise, including when no
    token is returned.
    """
    left = get_left_tokens(c, window=1, attrib='lemmas')
    if len(left) > 0 and re.match(r'\d+', left[0]):
        return -1
    return 0

def LF_too_short_2(c):
    """Vote -1 when the candidate's span is shorter than three characters; abstain (0) otherwise."""
    span = c.disease.get_span().lower()
    if len(span) < 3:
        return -1
    return 0

# Second batch of negative LFs (all defined in this cell), applied together as
# label set 'LF Training Labels 6.3'.
LFs_false_2 = [
    LF_common_neg_phrases_2,
    LF_neg_after_2,
    LF_after_num_2,
    LF_too_short_2
]

In [95]:
# Apply the LFs_false_2 list to the training candidates and store the result
# as label set 'LF Training Labels 6.3' (timed).
%time L_train_6 = label_manager.create(session, train, 'LF Training Labels 6.3', LFs_false_2)
L_train_6


Loading sparse Label matrix...
CPU times: user 2min 34s, sys: 16.8 s, total: 2min 51s
Wall time: 2min 38s


<28087x4 sparse matrix of type '<type 'numpy.float64'>'
	with 8708 stored elements in Compressed Sparse Row format>

# Running gen. model

In [96]:
from utils import binarize_LF_matrix, get_binarized_score
from snorkel.annotations import merge_annotations

# Merge the seven per-LF label matrices into one, in this fixed order.
L_train = merge_annotations([
    L_train_1,
    L_train_2,
    L_train_3,
    L_train_4,
    L_train_4_N,
    L_train_5,
    L_train_6,
])

# Binarized form of the merged matrix; this is what the generative model is trained on.
L_train_b = binarize_LF_matrix(L_train)
L_train_b

<28087x1544 sparse matrix of type '<type 'numpy.float64'>'
	with 44754 stored elements in Compressed Sparse Row format>

In [97]:
from snorkel.learning import NaiveBayes

# Fit the generative model on the binarized label matrix (5000 iterations, rate 0.1).
gen_model = NaiveBayes()
%time gen_model.train(L_train_b, n_iter=5000, rate=1e-1, verbose=True)

Training marginals (!= 0.5):	28087
Features:			1544
Begin training for rate=0.1, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.080097
	Learning epoch = 250	Gradient mag. = 0.099166
	Learning epoch = 500	Gradient mag. = 0.118439
	Learning epoch = 750	Gradient mag. = 0.145059
	Learning epoch = 1000	Gradient mag. = 0.144527
	Learning epoch = 1250	Gradient mag. = 0.130869
	Learning epoch = 1500	Gradient mag. = 0.129422
	Learning epoch = 1750	Gradient mag. = 0.129358
	Learning epoch = 2000	Gradient mag. = 0.128045
	Learning epoch = 2250	Gradient mag. = 0.125770
	Learning epoch = 2500	Gradient mag. = 0.122429
	Learning epoch = 2750	Gradient mag. = 0.118965
	Learning epoch = 3000	Gradient mag. = 0.115031
	Learning epoch = 3250	Gradient mag. = 0.110108
	Learning epoch = 3500	Gradient mag. = 0.106268
	Learning epoch = 3750	Gradient mag. = 0.103078
	Learning epoch = 4000	Gradient mag. = 0.099877
	Learning epoch = 4250	Gradient mag. = 0.095215
	Learning epoch = 4500	Gradient mag. = 0.092106
	Lea

In [98]:
# Predict on the training label matrix (b=0.5) and score against the gold training labels.
yp = gen_model.predict(L_train_b, b=0.5)
get_binarized_score(yp, L_gold_train)

P :	0.869129834254
R :	0.650387596899
F1:	0.74401418859


In [72]:
# Same prediction/scoring cell; the lower execution count and different recorded scores
# suggest it is a leftover from an earlier run, kept for comparison.
yp = gen_model.predict(L_train_b, b=0.5)
get_binarized_score(yp, L_gold_train)

P :	0.843413978495
R :	0.64857881137
F1:	0.733274905054


In [45]:
# Same prediction/scoring cell; the lower execution count and different recorded scores
# suggest it is a leftover from an earlier run, kept for comparison.
yp = gen_model.predict(L_train_b, b=0.5)
get_binarized_score(yp, L_gold_train)

P :	0.747949080622
R :	0.683204134367
F1:	0.714112086428


In [58]:
# F1 = 2PR / (P + R), evaluated by hand for P = 0.84 and R = 0.68.
(2*0.84*0.68) / (0.84+0.68)

0.7515789473684211

In [None]:
# Print LF stats...
# Per-LF statistics against the gold training labels, passing the model weights
# (converted with odds_to_prob) as estimated accuracies; show the 50 LFs with the
# largest coverage.
from snorkel.learning import odds_to_prob
lfs = L_train.lf_stats(labels=L_gold_train, est_accs=odds_to_prob(gen_model.w))
lfs.nlargest(50, "coverage")

# Error analysis
_F1=63_

ROUND 1:
* _DONE: Take out 'F' category? [6]_
* _DONE: Take out supplemental entries [2]_
* _DONE: Don't emit all tree paths?  **This essentially gives many more votes if it has multiple tree paths!!!** [3]_

**_--> F1=69_**

ROUND 2:
* _DONE: Take out LF-negative-indicator: no longer appropriate here! [1]_
* _DONE: Common transforms [3]_
* _DONE: Remove leading VB(N|D) also! [2]_
* _DONE: Iterate over all JJs removed; stop at first (longest) match [2]_
* _DONE: Try "no reinforcing deps" setting where we avoid the positive reinforcement of cascading overlap?_

**_--> F1=72_**

ROUND 3:
* _DONE: Negative cosine matches tend to have super-high weight... **take these out??**_
    - _**In particular, we match e.g. the disease + associated proteins, body parts, etc...**_
    - _We are missing some **exact matches**...!!!  Just override model here?_
    - _Or, add multiple threshs again for cosine match (e.g. milk fever case...)?_
* _DONE: Weird words in chemicals dictionary ("hepatitis", "leprosy")? Take chemicals - diseases instead? [2]_

**_--> F1=73_**

ROUND 4:
* _DONE: `LF-neg-after` is broken! [2]_
* **Use stemming?** (e.g. "spontaneously hypertensive rats" -> "hypertension")
* Try running transforms through vector matcher as well?
* "C-induced D", "C D": explicitly create LF for this case!

**_--> F1=74_**

In [99]:
from random import shuffle
N_train = L_gold_train.shape[0]

# Bucket the training errors by row index:
#   fps    - predicted positive, gold negative
#   fns    - predicted negative, gold positive
#   fns_na - prediction exactly 0, gold positive
fps, fns, fns_na = [], [], []
for i in range(N_train):
    if yp[i] > 0 and L_gold_train[i] < 0:
        fps.append(i)
    elif yp[i] <= 0 and L_gold_train[i] > 0:
        bucket = fns_na if yp[i] == 0 else fns
        bucket.append(i)

# Shuffle so that the slices taken below are random samples
shuffle(fps)
shuffle(fns)
shuffle(fns_na)

print(len(fps))
print(len(fns))
print(len(fns_na))

379
470
883


In [100]:
from snorkel.viewer import SentenceNgramViewer
# Open the first 100 (shuffled) false-negative candidates in the sentence viewer widget.
fn_cands = [L_train.get_candidate(i) for i in fns[:100]]
svn      = SentenceNgramViewer(fn_cands, session)
svn

<IPython.core.display.Javascript object>

In [92]:
# Take the candidate currently selected in the viewer and print it with its POS tags.
c = svn.get_selected()
print c
print c.disease.get_attrib_tokens('pos_tags')

Disease(Span("autonomic neuropathy", parent=740, chars=[41,60], words=[7,8]))
[u'JJ', u'NN']


In [93]:
from snorkel.models import Label
# List every Label row stored for the selected candidate.
session.query(Label).filter(Label.candidate == c).all()

[Label (CDR Training Label = 1922),
 Label (C-10-668-c = 732),
 Label (LF_neg_after = -1)]

In [53]:
from snorkel.learning.gen_learning import odds_to_prob
# Row of the merged label matrix that belongs to the selected candidate
i = L_train.get_row_index(c)

# For every LF that fired on this row: its key, its weight passed through odds_to_prob,
# and the value it emitted.
for j in L_train.getrow(i).nonzero()[1]:
    print L_train.get_key(j), odds_to_prob(gen_model.w[j]), int(L_train[i,j])

AnnotationKey (C-10-597-c) 0.810728840474 1936


In [None]:
THRESH            = 0.75
def vector_matches(p, Dt, vectorizer, thresh=THRESH):
    """
    Return the dictionary terms whose similarity score against phrase p exceeds thresh.

    p is vectorized with `vectorizer` and multiplied by Dt (a transposed dictionary
    matrix such as D_pos_t).  The result is a list of (score, column index, term,
    SIDs of the term) tuples, one per entry above the threshold.

    The `vectorizer` and `thresh` arguments are now honoured; previously the
    globals `cd_vectorizer` and `THRESH` were used regardless of what was passed.
    """
    cx  = vectorizer.vectorize_phrases([p])
    m   = (cx * Dt).tocoo()
    out = []
    for j, s in zip(m.col, m.data):
        if s > thresh:
            # NOTE(review): column j is looked up in cd.terms — this assumes Dt was built
            # from cd.terms; D_pos_t / D_neg_t are built from cd.pos_terms / cd.neg_terms.
            t    = cd.terms[j]
            sids = cd.term_to_sids[t]
            out.append((s, j, t, sids))
    return out

In [None]:
# Show the dictionary terms that vector-match the selected candidate's lower-cased span.
# NOTE(review): `Dt` is not defined in any cell of this notebook — presumably D_pos_t or D_neg_t was meant.
p = c.disease.get_span().lower()
vector_matches(p, Dt, cd_vectorizer)

In [None]:
cd.cid_to_sid[712]

In [None]:
from nltk.stem.porter import PorterStemmer
# Porter stemmer, used in the next cell to stem a word before looking it up in the vectorizer.
stemmer = PorterStemmer()

In [None]:
# Look up the vectorizer's word -> CIDs entry for the stem of "amiodarone".
cd_vectorizer.word_to_cids[stemmer.stem("amiodarone")]

In [None]:
# Print the SID stored for CID 2097, then show the terms recorded for that ID in mesh_to_terms.
print cd.cid_to_sid[2097]
mesh_to_terms[cd.cid_to_sid[2097]]

In [None]:
cd.sid_to_cid[list(cd.term_to_sids['hypomanic episode'])[0]]

In [None]:
L_train

In [None]:
cd.cid_to_sid[3624]

In [None]:
mesh_to_terms[cd.cid_to_sid[1419]]

In [None]:
cd.sid_to_cid[list(cd.term_to_sids["vein occlusion"])[0]]

In [None]:
cd.sid_to_cid['D012514']

In [None]:
# Find most common words
# Rank the vectorizer's words by how many CIDs each one maps to (descending; ties keep
# their original order) and show the top ten as (count, word) pairs.
word_fs = sorted(
    ((len(cids), word) for word, cids in cd_vectorizer.word_to_cids.iteritems()),
    key=lambda pair : -pair[0]
)
word_fs[:10]

In [None]:
remove_common(c)

In [None]:
# Print every (key, cid) pair the exact-match LF generates for the selected candidate.
for key, cid in LFG_MESH_exact(c):
    print key, cid

In [None]:
# Print every (key, cid) pair the cosine-match LF generates for the selected candidate.
# NOTE(review): no LFG_CD_cosine_match is defined in this notebook — presumably
# LFG_CD_cosine_match_pos or LFG_CD_cosine_match_neg was meant.
for key, cid in LFG_CD_cosine_match(c):
    print key, cid

In [None]:
# Dump every match scoring above mt: the term and score, then each SID with its CID
# (-1 if none), then the key built from each of the SID's tree paths.
# NOTE(review): `m` is only ever assigned inside functions in this notebook, so it must
# be defined at top level by hand before this cell runs; `cd.terms` likewise assumes
# `m` came from a matrix built over cd.terms — confirm.
mt = 0.5
m  = m.tocoo()
for i, s in enumerate(m.data):
    if s > mt:
        j = m.col[i]
        t = cd.terms[j]
        print t, s
        for sid in cd.term_to_sids[t]:
            cid = cd.sid_to_cid[sid] if sid in cd.sid_to_cid else -1
            print "\t", sid, cid
            for path in cd.tree_paths[sid]:
                key  = "-".join(path[:POS_DEPTH]) if cid > 0 else "-".join(path[:NEG_DEPTH])
                print "\t\t", key

In [None]:
# Print the term and SIDs for every non-zero column of `m`
# (same caveat as the previous cell: `m` must already exist at top level).
for j in m.nonzero()[1]:
    t = cd.terms[j]
    print t, cd.term_to_sids[t]

In [None]:
cd.sid_to_cid['C566870']

In [None]:
[cd.sid_to_cid[sid] for sid in cd.term_to_sids['toxicity']]

In [None]:
cd.cid_to_sid[5593]

# TODO:

* _DONE: Re-write the cosine matcher_
* _DONE: Switch to MESH 2016_
* _DONE: Add in supplementary records!_
* **Try labeling in cascading if-then fashion...?**

# Training Discriminative model

In [None]:
from snorkel.learning import LogReg

# Training targets for the discriminative model: the generative model's marginals on
# the binarized training label matrix.
train_marginals = gen_model.marginals(L_train_b)

# Fit logistic regression on the training features against those marginals.
disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=2000, rate=1e-3, mu=1e-6)

In [None]:
# Score the discriminative model on the training set (overwrites `yp`).
yp = disc_model.predict(F_train)
get_binarized_score(yp, L_gold_train)

In [None]:
# Score the discriminative model on the dev set with b=0.4 (overwrites `yp`, which the
# cells below then index by dev row).
yp = disc_model.predict(F_dev, b=0.4)
get_binarized_score(yp, L_gold_dev)

In [None]:
# `%matplotlib inline` does not bind `plt`, and no cell in this notebook imports it.
import matplotlib.pyplot as plt

# Histogram of the discriminative model's marginals on the dev features.
plt.hist(disc_model.marginals(F_dev))

In [None]:
# `%matplotlib inline` does not bind `plt`, and no cell in this notebook imports it.
import matplotlib.pyplot as plt
from snorkel.learning.gen_learning import odds_to_prob

# Histogram of the generative model's LF weights after odds_to_prob.
plt.hist(odds_to_prob(gen_model.w))

In [None]:
L_train.lf_stats(labels=L_gold_train, est_accs=odds_to_prob(gen_model.w))

# Hard filtering

In [None]:
# Try overriding with any exact matches...
# For every dev candidate, count the positive (label > 0) and other votes emitted by the
# exact-match LF and the drop-leading-modifiers LF.  Unanimous votes override the
# model's prediction; otherwise the prediction in `yp` is kept.
#
# `ype` is sized from the dev labels, since the loop runs over `dev` and the result is
# scored against L_gold_dev (it was previously sized from the training labels).
# LFG_drop_leading_modifiers replaces LFG_MESH_exact_drop_JJs, which is not defined
# anywhere in this notebook.
ype = np.zeros(L_gold_dev.shape[0])
for i,c in enumerate(dev):
    if i % 5000 == 0:
        print(i)
    pos = 0
    neg = 0
    for lfg in (LFG_MESH_exact, LFG_drop_leading_modifiers):
        for lf_name, label in lfg(c):
            if label > 0:
                pos += 1
            else:
                neg += 1

    if neg > 0 and pos == 0:
        ype[i] = -1
    elif pos > 0 and neg == 0:
        ype[i] = 1
    else:
        ype[i] = yp[i]

In [None]:
get_binarized_score(ype, L_gold_dev)

# Now, getting CIDs in simple heuristic way

In [None]:
N = L_gold_train.shape[0]

In [None]:
# Row indices where the prediction is exactly 1 and the gold training label is positive;
# show the first five.
pp_idxs = [i for i in range(N) if yp[i] == 1 and L_gold_train[i] > 0]
pp_idxs[:5]

In [None]:
# For row 32: print the key, model weight, and emitted value of every LF that fired.
i = 32
for j in L_train.getrow(i).nonzero()[1]:
    print L_train.get_key(j), gen_model.w[j], L_train[i,j]

In [None]:
# For row 53: print the key, model weight, and emitted value of every LF that fired.
i = 53
for j in L_train.getrow(i).nonzero()[1]:
    print L_train.get_key(j), gen_model.w[j], L_train[i,j]

In [None]:
L_gold_train[53,0]

In [None]:
# Inspect LF column 558: its key, its model weight, and the value it emitted on row 18.
print L_train.get_key(558)
print gen_model.w[558]
print L_train[18,558]

In [None]:
correct   = 0
predicted = 0
total     = 0
for i in range(N):
    if L_gold_train[i,0] > 0:
        total += 1
    
    if yp[i] > 0:
        predicted += 1
        
        max_w = 0.0
        cid   = -1
        for j in L_train.getrow(i).nonzero()[1]:
            if gen_model.w[j] > max_w:
                max_w = gen_model.w[j]
                cid   = L_train[i,j]
        
        if cid == L_gold_train[i,0]:
            correct += 1

prec   = correct / float(predicted)
recall = correct / float(total)

print "P:\t", prec
print "R:\t", recall
print "F1:\t", (2*prec*recall) / (prec+recall)

In [None]:
correct   = 0
predicted = 0
total     = 0
for i in range(N):
    if L_gold_train[i,0] > 0:
        total += 1
        
    max_w = 0.0
    cid   = -1
    for j in L_train.getrow(i).nonzero()[1]:
        if gen_model.w[j] > max_w:
            max_w = gen_model.w[j]
            cid   = L_train[i,j]
        
    if cid > 0:
        predicted += 1  
        if cid == L_gold_train[i,0]:
            correct += 1

prec   = correct / float(predicted)
recall = correct / float(total)

print "P:\t", prec
print "R:\t", recall
print "F1:\t", (2*prec*recall) / (prec+recall)

# ETC

# Different cut levels:

Note: G = gen model on training set, D = disc. model on test set

* Pos: 1, Neg: 1, Pos-cosine: 1, Neg-cosine: 1, Thresh-cosine: 0.75 = 56 F1 G / 63 F1 D
* TODO...
* Pos: 3, Neg: 3, Pos-cosine: 3, Neg-cosine: 3, Thresh-cosine: 0.75 = 61 F1 G / 68 F1 D
* Pos: 3, Neg: 3, Pos-cosine: 3, Neg-cosine: 3, Thresh-cosine: [0.5, 0.75] = 63 F1 G / 65 F1 D
* Pos: 4, Neg: 4, Pos-cosine: 4, Neg-cosine: 4, Thresh-cosine: 0.75 = 60 F1 G / 64 F1 D

### Adding in drop_JJs + NEG LFs:

* Pos: 2, Neg: 2, Pos-cosine: 2, Neg-cosine: 2, Thresh-cosine: 0.75 = 69 F1 G / 71 F1 D
* Pos: 3, Neg: 3, Pos-cosine: 3, Neg-cosine: 3, Thresh-cosine: 0.75 = 70 F1 G / 73 F1 D


#### Note: we're not yet dealing with acronyms!!!

In [None]:
# One entry per row of the binarized label matrix, initialised to 1
# (replaced by the np.where expression a few cells below).
hf = np.ones(L_train_b.shape[0])

In [None]:
# Abandoned per-row loop; `pass` keeps the cell syntactically valid until a body is written.
for i in range(L_train_b.shape[0]):
    pass

In [None]:
L_train_b.getrow(0)

In [None]:
L_train_b.sum(1).shape

In [None]:
from snorkel.utils import sparse_abs
sparse_abs(L_train_b).sum(1).shape

In [None]:
# Per row: if the sum of votes equals the sum of their absolute values (i.e. no vote is
# negative), take the sign of the sum; otherwise 0.
# NOTE(review): a row whose votes are all negative has sum != abs-sum, so it gets 0 here
# rather than -1 — confirm whether unanimous-negative rows were meant to be flagged.
hf = np.where(L_train_b.sum(1) == sparse_abs(L_train_b).sum(1), np.sign(L_train_b.sum(1)), 0)

In [None]:
hf.shape

In [None]:
L_gold_train_b = np.sign(L_gold_train.todense())

In [None]:
L_gold_train_b

In [None]:
L_gold_train_b.T.dot(hf)

In [None]:
# Call the method: the bare attribute `hf.sum` only displays the bound method object.
hf.sum()

In [None]:
# Per-row vote aggregates used by the unanimity check in the next loop:
#   L_train_b_sum     - signed sum of the votes
#   L_train_b_abs_sum - sum of the absolute values of the votes
#   L_train_b_sum_abs - absolute value of the signed sum
# The last two are equal exactly when all non-zero votes in a row share one sign.
L_train_b_sum = L_train_b.sum(1)
L_train_b_abs_sum = sparse_abs(L_train_b).sum(1)
L_train_b_sum_abs = sparse_abs(L_train_b.sum(1))

In [None]:
L_train_b

In [None]:
# How accurate is a hard filter that rejects every row whose votes are all negative?
correct = 0
wrong   = 0
for i in range(L_train_b.shape[0]):
    # Negative sum and |sum| == sum of |votes|  <=>  every non-zero vote is negative
    unanimous_negative = L_train_b_sum[i] < 0 and L_train_b_sum_abs[i] == L_train_b_abs_sum[i]
    if not unanimous_negative:
        continue
    if L_gold_train[i] < 0:
        correct += 1
    else:
        wrong += 1

print(correct)
print(wrong)
print(correct / float(correct + wrong))

In [None]:
N = L_train.shape[0]
N

In [None]:
filtered = 0
correct  = 0
for i,c in enumerate(train):
    if i % 1000 == 0:
        if filtered > 0:
            print i, filtered, correct, correct / float(filtered)
        else:
            print i, filtered, correct
    
    pos = 0
    neg = 0
    for lf_name, label in LFG_MESH_exact(c):
        if label > 0:
            pos += 1
        else:
            neg += 1
    
    for lf_name, label in LFG_MESH_exact_drop_JJs(c):
        if label > 0:
            pos += 1
        else:
            neg += 1
    
    #if neg > 0 and pos == 0:
    if neg > pos:
        filtered += 1
        if L_gold_train[i] < 0:
            correct += 1

print filtered
print correct
print correct / float(filtered)

In [None]:
# Evaluate a second hard filter on the training set: a candidate is filtered when it is a
# single token whose lower-cased span is not a term of the canonical dictionary;
# `correct` counts filtered candidates whose gold label is negative.
filtered = 0
correct  = 0
for i,c in enumerate(train):
    # Progress report every 1000 candidates (with running precision once available)
    if i % 1000 == 0:
        if filtered > 0:
            print i, filtered, correct, correct / float(filtered)
        else:
            print i, filtered, correct
    
    if len(c.disease.get_attrib_tokens()) == 1 and c.disease.get_span().lower() not in cd.term_to_sids:
        filtered += 1
        if L_gold_train[i] < 0:
            correct += 1

print filtered
print correct
# Raises ZeroDivisionError if nothing was filtered
print correct / float(filtered)

# LF STATS ANALYSIS

In [None]:
lfs = L_train.lf_stats(labels=L_gold_train, est_accs=odds_to_prob(gen_model.w))

In [None]:
gen_model.w

In [None]:
# Zero out, in place, the generative-model weights at these hand-picked LF column indices.
# NOTE(review): the indices refer to the column order of the current label matrix and go
# stale if the LF sets are re-created — confirm before re-running.
BAD_LFs = [578, 627, 603, 687, 573, 579]
for i in BAD_LFs:
    gen_model.w[i] = 0.0

In [None]:
lfs.nsmallest(50, "accuracy")

In [None]:
# Unfinished loop over the LF stats table: each row is unpacked into its index (the LF
# name) and its row of statistics, but nothing is done with them yet.
for lf in lfs.iterrows():
    lf_name, s = lf
    

# ERROR ANALYSIS

In [None]:
from random import shuffle
N_dev = L_gold_dev.shape[0]

# Bucket the dev errors by row index; `yp` must hold dev predictions at this point
# (e.g. from disc_model.predict(F_dev, ...)), since it is indexed by dev row.
fps = []
fns = []
for i in range(N_dev):
    if yp[i] > 0 and L_gold_dev[i] < 0:
        fps.append(i)
    elif yp[i] < 0 and L_gold_dev[i] > 0:
        fns.append(i)

# Shuffle so that the slices taken below are random samples
shuffle(fps)
shuffle(fns)

print len(fps)
print len(fns)

# Open the first 100 (shuffled) false negatives in the sentence viewer widget
from snorkel.viewer import SentenceNgramViewer
fn_cands = [F_dev.get_candidate(i) for i in fns[:100]]
svn      = SentenceNgramViewer(fn_cands, session)
svn

In [None]:
# Count the false negatives whose span text is itself a key of `mesh_tree`.
# NOTE(review): `mesh_tree` is not defined in any cell of this notebook — presumably a
# leftover from an earlier version; confirm or replace with the canonical dictionary.
exact_match = 0
for i in fns:
    c = F_dev.get_candidate(i)
    if c.disease.get_span() in mesh_tree:
        exact_match += 1

In [None]:
exact_match

In [None]:
# Look up the span of the candidate selected in the viewer in `mesh_tree`
# (`mesh_tree` is not defined in this notebook — see the note on the counting cell above it).
c = svn.get_selected()

mesh_tree[c.disease.get_span()]

In [None]:
c.disease.get_attrib_tokens('pos_tags')

In [None]:
mesh_tree['alcohol abuse']

In [None]:
from snorkel.models import Label

# Fetch the single Label row of the selected candidate and map its value back to a MESH ID.
# NOTE(review): `.one()` raises unless exactly one Label exists for the candidate, and
# `CID_to_MESH` is not defined in this notebook (only MESH_to_CID is loaded) — confirm.
l = session.query(Label).filter(Label.candidate == c).one()
CID_to_MESH[l.value]

In [None]:
# For the selected candidate: every feature that is non-zero on its dev row, paired with
# the discriminative model's weight for that feature.
i = F_dev.get_row_index(c)
[(F_dev.get_key(k), disc_model.w[k]) for k in F_dev.getrow(i).nonzero()[1]]

In [None]:
F_dev.get_key(1)

* Why is Parkinson's disease not caught?

In [None]:
from snorkel.viewer import SentenceNgramViewer
# Open the first 100 (shuffled) dev false positives in the sentence viewer widget.
fp_cands = [F_dev.get_candidate(i) for i in fps[:100]]
sv       = SentenceNgramViewer(fp_cands, session)
sv

In [None]:
# Phrases collected as negative indicators; not yet referenced by any LF in this notebook.
NEG_PHRASES = [
    'stenosis',
    'further attention',
    'presence',
    'absence',
    'syndrome',
    'association',
    'strain',
    'progression'
]

# Words collected as negative indicators when they end a candidate (going by the name);
# not yet referenced by any LF in this notebook.
NEG_END_WORDS = [
    'therapies',
    'muscles',
    'concentrations',
    'normal',
    'heart',
    'side',
    'sinus',
    'convulsants',
    'latencies',
    'findings',
    'doses',
    'remission'
]

def end_in_plural(c):
    """Placeholder labeling function; not implemented yet, so it always returns None."""
    return None

def body_part(c):
    """Placeholder labeling function; not implemented yet, so it always returns None."""
    return None

def not_exact_single_word(d):
    """Placeholder labeling function; not implemented yet, so it always returns None."""
    return None