# Disease Norm

In this example, we'll be writing an application to extract *mentions of* diseases from Pubmed abstracts, using annotations from the [BioCreative CDR Challenge](http://www.biocreative.org/resources/corpora/biocreative-v-cdr-corpus/).  This tutorial, which has 5 parts, walks through the process of constructing a model to classify _candidate_ disease mentions as either true (i.e., that it is truly a mention of a disease) or false.

# Loading Candidates + Annotations

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass

Disease = candidate_subclass('Disease', ['disease'])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from snorkel.models import CandidateSet

train = session.query(CandidateSet).filter(CandidateSet.name == 'CDR Training Candidates').one()
print len(train)
dev = session.query(CandidateSet).filter(CandidateSet.name == 'CDR Development Candidates').one()
print len(dev)

28087
27896


In [3]:
from snorkel.annotations import LabelManager

label_manager = LabelManager()

L_gold_train = label_manager.load(session, train, "CDR Training Label Set")
print L_gold_train.shape
L_gold_dev = label_manager.load(session, dev, "CDR Development Label Set")
print L_gold_dev.shape

(28087, 1)
(27896, 1)


# Process / Load Features

In [46]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

Process:

In [47]:
%time F_train = feature_manager.create(session, train, 'Train Features')


Loading sparse Feature matrix...
CPU times: user 18min 32s, sys: 18.3 s, total: 18min 50s
Wall time: 18min 38s


In [48]:
%time F_dev = feature_manager.update(session, dev, 'Train Features', False)


Loading sparse Feature matrix...
CPU times: user 13min 29s, sys: 21.1 s, total: 13min 50s
Wall time: 13min 38s


Or, load if already processed:

In [None]:
%time F_train = feature_manager.load(session, train, 'Train Features')

In [None]:
%time F_dev = feature_manager.load(session, dev, 'Train Features')

# Load dictionaries

### Load the MESH ID -> CID mapping

In [4]:
from cPickle import load
MESH_to_CID = load(open('MESH_to_CID.pkl', 'rb'))
diseases    = load(open('diseases.pkl', 'rb'))

In [5]:
mesh_to_terms = defaultdict(set)
for term, mid in diseases.iteritems():
    mesh_to_terms[mid].add(term)

### Create a canonical dictionary (CD)

In [6]:
from utils import CanonicalDictionary
cd = CanonicalDictionary(MESH_to_CID)

### Add MESH to CD

In [7]:
# Load MESH
from utils import load_mesh_raw
mesh_entries = load_mesh_raw('data/desc2016.xml')

Loaded 27885 entries


In [8]:
# Add MESH to cd
for entry in mesh_entries:
    mid, ps, terms = entry
    paths = [[p[0]] + p[1:].split('.') for p in ps]
    for term in terms:
        cd.add_term(term, mid, tree_paths=paths)

len(cd.term_to_sids)

151006

### Add MEDIC to CD

Custom CTD diseases dictionary made from MESH category C + OMIM

In [9]:
from utils import load_MEDIC, load_mesh_raw
medic_entries, MEDIC_to_CID = load_MEDIC()

Loaded 11864 MEDIC entries


In [10]:
# Add MESH to cd
for entry in medic_entries:
    if entry.id.startswith("MESH"):
        mid = entry.id.split(":")[1]
    elif len(entry.parent_ids) > 0 and entry.parent_ids[0].startswith("MESH"):
        mid = entry.parent_ids[0].split(":")[1]
    else:
        raise KeyError(entry)
    
    paths = []
    for p in entry.tree_nums:
        x = p.split("/")[0]
        paths.append([x[0]] + x[1:].split('.'))
    
    terms = [entry.name] + entry.synonyms
    for term in terms:
        cd.add_term(term, mid, tree_paths=paths)
        
len(cd.term_to_sids)

178666

### Add UMLS to CD

This may or may not be all of the UMLS... file from Jason

In [11]:
with open('cui2mesh.tsv', 'rb') as f:
    for line in f:
        term, cui, mid = line.rstrip('\n').split('\t')
        cd.add_term(term, mid)

len(cd.term_to_sids)

801566

In [140]:
from cPickle import dump
dump(cd, open('cd.pkl', 'wb'))

## TODO: Add MESH supp?  No paths though?

# Writing some multinomial LFs

# NOTE: Beware of LF rollback bug!!!

## TYPE I LF: Subsets of MESH dictionary

In [None]:
SEEN_GLOBAL = defaultdict(set)

## MESH exact match

In [12]:
POS_DEPTH = 3
NEG_DEPTH = 3
def LFG_CD_match(c, p, key_mod=None, seen_global=None, max_paths_per_sid=1):
    """
    Given a candidate c, some transformed candidate disease phrase p,
    and an optional key name modifier key_mod to be appended, return a generator
    of key, value pairs
    """
    if p in cd.term_to_sids:
        for sid in cd.term_to_sids[p]:
            cid   = cd.sid_to_cid[sid] if sid in cd.sid_to_cid else -1
            paths = cd.tree_paths[sid]
            for path in paths[:max_paths_per_sid]:
                    
                # NOTE: path may be shorter than max depth if higher up in the tree (e.g. 'cancer', 'ischemia')!
                key = "-".join(path[:POS_DEPTH]) if cid > 0 else "-".join(path[:NEG_DEPTH])
                        
                # CHECK AGAINST / ADD TO GLOBAL SEEN SET!
                # To make sure that relaxations of an LF don't overlap with each other on any given candidate
                if seen_global is None or c.id not in seen_global[key]:
                    if seen_global is not None:
                        seen_global[key].add(c.id)
                    if key_mod:
                        key += "-" + key_mod
                    yield key, cid

In [13]:
def LFG_MESH_exact(c):
    p = c.disease.get_span().lower()
    return LFG_CD_match(c, p)

In [14]:
%time L_train_1 = label_manager.create(session, train, 'LF Training Labels 1', f=LFG_MESH_exact)
L_train_1


Loading sparse Label matrix...
CPU times: user 1min 34s, sys: 18.3 s, total: 1min 52s
Wall time: 1min 38s


<28087x382 sparse matrix of type '<type 'numpy.float64'>'
	with 7688 stored elements in Compressed Sparse Row format>

In [80]:
L_dev_1 = label_manager.create(session, dev, 'LF Development Labels 1', f=LFG_MESH_exact)


Loading sparse Label matrix...


In [None]:
# Or load if already computed:
%time L_train_1 = label_manager.load(session, train, 'LF Training Labels 1')
L_train_1

### Drop JJs

In [15]:
MOD_RGX = r'JJ.*|VB.*|RB.*'

def LFG_drop_leading_modifiers(c):
    words    = c.disease.get_attrib_tokens()
    pos_tags = c.disease.get_attrib_tokens('pos_tags')
    while re.match(MOD_RGX, pos_tags[0]):
        words    = words[1:]
        pos_tags = pos_tags[1:]
        p   = " ".join(words)
        
        # Hackey, but works for now...
        g = LFG_CD_match(c, p, key_mod="DJ")
        if len(list(g)) > 0:
            for key, cid in LFG_CD_match(c, p, key_mod="DJ"):
                yield key, cid

In [16]:
%time L_train_2 = label_manager.create(session, train, 'LF Training Labels 2', LFG_drop_leading_modifiers)
L_train_2


Loading sparse Label matrix...
CPU times: user 1min 39s, sys: 19.6 s, total: 1min 59s
Wall time: 1min 44s


<28087x282 sparse matrix of type '<type 'numpy.float64'>'
	with 3490 stored elements in Compressed Sparse Row format>

In [81]:
L_dev_2 = label_manager.create(session, dev, 'LF Development Labels 2', LFG_drop_leading_modifiers)


Loading sparse Label matrix...


In [None]:
# Or load if already computed:
%time L_train_2 = label_manager.load(session, train, 'LF Training Labels 2')
L_train_2

### Remove common words

In [17]:
REMOVE_COMMON = r'.*induced|patient.*|drug|inhibitor|\d+|human|mouse|mice|rats?|with|syndrome|famil.*|s$|low(er)?|upper|left|right|top|bottom|subjects?'
def remove_common(c):
    p = re.sub(r'\s\s+', ' ', re.sub(REMOVE_COMMON, '', c.disease.get_span().lower())).strip()
    return p

In [18]:
def LFG_MESH_exact_remove_common(c):
    p = remove_common(c)
    return LFG_CD_match(c, p, key_mod="RC")

In [19]:
%time L_train_3 = label_manager.create(session, train, 'LF Training Labels 3', LFG_MESH_exact_remove_common)
L_train_3


Loading sparse Label matrix...
CPU times: user 1min 33s, sys: 17 s, total: 1min 50s
Wall time: 1min 38s


<28087x341 sparse matrix of type '<type 'numpy.float64'>'
	with 6033 stored elements in Compressed Sparse Row format>

In [83]:
L_dev_3 = label_manager.create(session, dev, 'LF Development Labels 3.1', LFG_MESH_exact_remove_common)


Loading sparse Label matrix...


In [None]:
# Or load if already computed:
%time L_train_3 = label_manager.load(session, train, 'LF Training Labels 3')
L_train_3

## MESH TF-IDF cosine match: POSITIVE terms

Only cosine match with _positive_ (i.e. C, F03 MESH disease terms) here!

**Note: we want to avoid positive reinforcement amongst these, so track seen / only emit one:**

In [84]:
SEEN_COSINE_POS = defaultdict(set)

In [21]:
%%time
from entity_norm import CanonDictVectorizer 

# Create a vectorizer based around this 
cd_vectorizer = CanonDictVectorizer(cd.term_to_sids, other_phrases=[])

# Vectorize the dictionary
D_pos   = cd_vectorizer.vectorize_phrases(cd.pos_terms)
D_pos_t = D_pos.T
D_pos_t

|V| = 271512
CPU times: user 1min 25s, sys: 1.66 s, total: 1min 27s
Wall time: 1min 26s


In [25]:
POS_DEPTH = 3
NEG_DEPTH = 3
THRESH    = 0.75
def LFG_CD_cosine_match(c, p, vectorizer, D_t, terms, thresh=THRESH, seen_global=None, max_paths_per_sid=1, key_mod=None):
    cx = vectorizer.vectorize_phrases([p])
    m  = cx * D_t
    m  = m.tocoo()

    best_match = defaultdict(lambda : (0, None))
    for i, s in enumerate(m.data):
        if s > thresh:
            j = m.col[i]
            t = terms[j]
            for sid in cd.term_to_sids[t]:
                cid = cd.sid_to_cid[sid] if sid in cd.sid_to_cid else -1
                for path in cd.tree_paths[sid][:max_paths_per_sid]:
                    key  = "-".join(path[:POS_DEPTH]) if cid > 0 else "-".join(path[:NEG_DEPTH])
                    if s > best_match[key][0]:
                        best_match[key] = (s, cid)

    for key, x in best_match.iteritems():
        s, cid = x
        
        # CHECK AGAINST / ADD TO GLOBAL SEEN SET!
        # To make sure that relaxations of an LF don't overlap with each other on any given candidate
        if seen_global is None or c.id not in seen_global[key]:
            if seen_global is not None:
                seen_global[key].add(c.id)
            key += "-c"
            if key_mod:
                key += key_mod
            yield key, cid

In [26]:
def LFG_CD_cosine_match_pos(c):
    p = c.disease.get_span().lower()
    return LFG_CD_cosine_match(c,p, cd_vectorizer, D_pos_t, cd.pos_terms, seen_global=SEEN_COSINE_POS)

In [27]:
%time L_train_4 = label_manager.create(session, train, 'LF Training Labels 4.1', LFG_CD_cosine_match_pos)
L_train_4


Loading sparse Label matrix...
CPU times: user 3min 47s, sys: 21.8 s, total: 4min 9s
Wall time: 3min 53s


<28087x150 sparse matrix of type '<type 'numpy.float64'>'
	with 5745 stored elements in Compressed Sparse Row format>

In [85]:
L_dev_4 = label_manager.create(session, dev, 'LF Development Labels 4', LFG_CD_cosine_match_pos)


Loading sparse Label matrix...


In [None]:
# Or load if already computed:
%time L_train_4 = label_manager.load(session, train, 'LF Training Labels 4')
L_train_4

## Transform -> pos. cosine match

In [28]:
def LFG_CD_cosine_match_DJ(c):
    words    = c.disease.get_attrib_tokens()
    pos_tags = c.disease.get_attrib_tokens('pos_tags')
    while re.match(MOD_RGX, pos_tags[0]):
        words    = words[1:]
        pos_tags = pos_tags[1:]
        p   = " ".join(words)
        
        # Very hackey, but works for now...
        g = LFG_CD_cosine_match(c, p, cd_vectorizer, D_pos_t, cd.pos_terms, key_mod="-DJ", seen_global=SEEN_COSINE_POS)
        if len(list(g)) > 0:
            for key, cid in LFG_CD_cosine_match(c, p, cd_vectorizer, D_pos_t, cd.pos_terms, key_mod="-DJ", seen_global=SEEN_COSINE_POS):
                yield key, cid

In [29]:
%time L_train_4_T1 = label_manager.create(session, train, 'LF Training Labels 4 T1', LFG_CD_cosine_match_DJ)
L_train_4_T1


Loading sparse Label matrix...
CPU times: user 2min 30s, sys: 17.4 s, total: 2min 48s
Wall time: 2min 35s


<28087x0 sparse matrix of type '<type 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [None]:
L_train_4_T1 = label_manager.create(session, train, 'LF Training Labels 4 T1', LFG_CD_cosine_match_DJ)

In [86]:
L_dev_4_T1 = label_manager.create(session, dev, 'LF Development Labels 4 T1', LFG_CD_cosine_match_DJ)


Loading sparse Label matrix...


In [30]:
def LFG_CD_cosine_match_RC(c):
    p = remove_common(c)
    if p != c.disease.get_span().lower():
        for key, cid in LFG_CD_cosine_match(c, p, cd_vectorizer, D_pos_t, cd.pos_terms, key_mod="-RC", seen_global=SEEN_COSINE_POS):
            yield key, cid

In [31]:
%time L_train_4_T2 = label_manager.create(session, train, 'LF Training Labels 4 T2', LFG_CD_cosine_match_RC)
L_train_4_T2


Loading sparse Label matrix...
CPU times: user 2min 13s, sys: 16.9 s, total: 2min 29s
Wall time: 2min 17s


<28087x69 sparse matrix of type '<type 'numpy.float64'>'
	with 451 stored elements in Compressed Sparse Row format>

In [87]:
L_dev_4_T2 = label_manager.create(session, dev, 'LF Development Labels 4 T2', LFG_CD_cosine_match_RC)


Loading sparse Label matrix...


## Neg cosine match

In [32]:
# Vectorize the dictionary
D_neg   = cd_vectorizer.vectorize_phrases(cd.neg_terms)
D_neg_t = D_neg.T
D_neg_t

<271513x737772 sparse matrix of type '<type 'numpy.float64'>'
	with 2508478 stored elements in Compressed Sparse Column format>

In [33]:
def LFG_CD_cosine_match_neg(c):
    p = c.disease.get_span().lower()
    return LFG_CD_cosine_match(c, p, cd_vectorizer, D_neg_t, cd.neg_terms, thresh=0.85)

In [34]:
%time L_train_4_N = label_manager.create(session, train, 'LF Training Labels 4 N', LFG_CD_cosine_match_neg)
L_train_4_N


Loading sparse Label matrix...
CPU times: user 25min 30s, sys: 2min 43s, total: 28min 13s
Wall time: 27min 58s


<28087x406 sparse matrix of type '<type 'numpy.float64'>'
	with 11389 stored elements in Compressed Sparse Row format>

In [88]:
L_dev_4_N = label_manager.create(session, dev, 'LF Development Labels 4 N', LFG_CD_cosine_match_neg)


Loading sparse Label matrix...


# Putting in some negative LFs

In [35]:
import re
from lf_terms import *
from snorkel.lf_helpers import get_left_tokens, get_right_tokens
from utils import *
from Disease_Tagging_Tutorial_LFs import *
chemicals = load_chemdner_dictionary()

def LF_organs(c):
    phrase = " ".join(c[0].get_attrib_tokens()).lower()
    return -1 if phrase in organs else 0      

def LF_chemical_name(c):
    phrase = " ".join(c[0].get_attrib_tokens())
    return -1 if phrase in chemicals and not phrase.isupper() and phrase.lower() not in cd.term_to_sids else 0

def LF_bodypart(c):
    phrase = re.sub(r's$', '', " ".join(c[0].get_attrib_tokens()).lower())
    return -1 if phrase in bodypart else 0  

def LF_protein_chemical_abbrv(c):
    '''Gene/protein/chemical name'''
    lemma = " ".join(c[0].get_attrib_tokens('lemmas'))
    return -1 if re.search("\d+",lemma) else 0

def LF_base_pair_seq(c): 
    lemma = " ".join(c[0].get_attrib_tokens('lemmas'))
    return -1 if re.search("^[GACT]{2,}$",lemma) else 0

LFs_false = [LF_chemical_name,
             LF_organs,
             LF_bodypart,
             LF_protein_chemical_abbrv,
             LF_base_pair_seq,
             #LF_too_vague,
             #LF_neg_surfix,
             LF_non_common_disease,
             LF_non_disease_acronyms,
             #LF_pos_in,
             LF_gene_chromosome_link,
             LF_right_window_incomplete,
             #LF_negative_indicator
            ]

In [36]:
%time L_train_5 = label_manager.create(session, train, 'LF Training Labels 5', LFs_false)
L_train_5


Loading sparse Label matrix...
CPU times: user 4min 12s, sys: 15.8 s, total: 4min 28s
Wall time: 4min 16s


<28087x9 sparse matrix of type '<type 'numpy.float64'>'
	with 2768 stored elements in Compressed Sparse Row format>

In [89]:
L_dev_5 = label_manager.create(session, dev, 'LF Development Labels 5', LFs_false)


Loading sparse Label matrix...


In [None]:
# Or load if already computed:
%time L_train_5 = label_manager.load(session, train, 'LF Training Labels 5.1')
L_train_5

### More neg. LFs

In [37]:
NEG_COMMON_RGX = r'(finding|disease|syndrome|marker|defecit|.*event|mean|median|mg)s?'
def LF_common_neg_phrases(c):
    p = c.disease.get_span().lower()
    return -1 if re.match(NEG_COMMON_RGX, p) else 0

NEG_AFTER_WORDS = frozenset(['of', 'to'])
def LF_neg_after(c):
    rw = get_right_tokens(c, window=1, attrib='lemmas')
    return -1 if len(rw) > 0 and rw[0] in NEG_AFTER_WORDS and c.disease.get_span().lower() not in cd.term_to_sids else 0

def LF_after_num(c):
    lw = get_left_tokens(c, window=1, attrib='lemmas')
    return -1 if len(lw) > 0 and re.match(r'\d+', lw[0]) else 0

def LF_too_short(c):
    p = c.disease.get_span().lower()
    return -1 if len(p) < 3 else 0

BAD_ENDINGS_RGX = r'(type|trait|cell)s?$'
def LF_bad_endings(c):
    p = c.disease.get_span().lower()
    return -1 if re.search(BAD_ENDINGS_RGX, p) else 0

BAD_MESH_TERMS = frozenset(['disease', 'diseases', 'conversion'])
def LF_bad_MESH_entries(c):
    p = c.disease.get_span().lower()
    return -1 if p in BAD_MESH_TERMS else 0

LFs_false_2 = [
    LF_common_neg_phrases,
    LF_neg_after,
    LF_after_num,
    LF_too_short,
    LF_bad_endings,
    LF_bad_MESH_entries
]

In [38]:
%time L_train_6 = label_manager.create(session, train, 'LF Training Labels 6', LFs_false_2)
L_train_6


Loading sparse Label matrix...
CPU times: user 3min 12s, sys: 15.6 s, total: 3min 28s
Wall time: 3min 16s


<28087x6 sparse matrix of type '<type 'numpy.float64'>'
	with 7946 stored elements in Compressed Sparse Row format>

In [90]:
L_dev_6 = label_manager.create(session, dev, 'LF Development Labels 6', LFs_false_2)


Loading sparse Label matrix...


# Running gen. model

In [95]:
from utils import binarize_LF_matrix, get_binarized_score
from snorkel.annotations import merge_annotations

L_train = merge_annotations([
        L_train_1
        , L_train_2
        , L_train_3
        , L_train_4
        #, L_train_4_T1
        , L_train_4_T2
        , L_train_4_N
        , L_train_5
        , L_train_6
    ])

L_train_b = binarize_LF_matrix(L_train)
L_train_b

<28087x1645 sparse matrix of type '<type 'numpy.float64'>'
	with 45510 stored elements in Compressed Sparse Row format>

In [123]:
from snorkel.learning import NaiveBayes

gen_model_train = NaiveBayes()
%time gen_model_train.train(L_train_b, n_iter=10000, rate=1e-1, verbose=True)

Training marginals (!= 0.5):	28087
Features:			1645
Begin training for rate=0.1, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.075126
	Learning epoch = 250	Gradient mag. = 0.095146
	Learning epoch = 500	Gradient mag. = 0.115617
	Learning epoch = 750	Gradient mag. = 0.142812
	Learning epoch = 1000	Gradient mag. = 0.148422
	Learning epoch = 1250	Gradient mag. = 0.137011
	Learning epoch = 1500	Gradient mag. = 0.136266
	Learning epoch = 1750	Gradient mag. = 0.137535
	Learning epoch = 2000	Gradient mag. = 0.133239
	Learning epoch = 2250	Gradient mag. = 0.129043
	Learning epoch = 2500	Gradient mag. = 0.124353
	Learning epoch = 2750	Gradient mag. = 0.120547
	Learning epoch = 3000	Gradient mag. = 0.117536
	Learning epoch = 3250	Gradient mag. = 0.113501
	Learning epoch = 3500	Gradient mag. = 0.109784
	Learning epoch = 3750	Gradient mag. = 0.105936
	Learning epoch = 4000	Gradient mag. = 0.102319
	Learning epoch = 4250	Gradient mag. = 0.099151
	Learning epoch = 4500	Gradient mag. = 0.094547
	Lea

In [124]:
yp_gt_train = gen_model_train.predict(L_train_b, b=0.5)
get_binarized_score(yp_gt_train, L_gold_train)

P :	0.862817089453
R :	0.667958656331
F1:	0.75298572677


In [None]:
# Print LF stats...
from snorkel.learning import odds_to_prob
lfs = L_train.lf_stats(labels=L_gold_train, est_accs=odds_to_prob(gen_model.w))
lfs.nlargest(50, "coverage")

# Error analysis
_F1=63_

ROUND 1:
* _DONE: Take out 'F' category? [6]_
* _DONE: Take out supplemental entries [2]_
* _DONE: Don't emit all tree paths?  **This essentially gives many more votes if it has multiple tree paths!!!** [3]_

**_--> F1=69_**

ROUND 2:
* _DONE: Take out LF-negative-indicator: no longer appropriate here! [1]_
* _DONE: Common transforms [3]_
* _DONE: Remove leading VB(N|D) also! [2]_
* _DONE: Iterate over all JJs removed; stop at first (longest) match [2]_
* _DONE: Try "no reinforcing deps" setting where we avoid the positive reinforcement of cascading overlap?_

**_--> F1=72_**

ROUND 3:
* _DONE: Negative cosine matches tend to have super-high weight... **take these out??**_
    - _**In particular, we match e.g. the disease + associated proteins, body parts, etc...**_
    - _We are missing some **exact matches**...!!!  Just override model here?_
    - _Or, add multiple threshs again for cosine match (e.g. milk fever case...)?_
* _DONE: Weird words in chemicals dictionary ("hepatitis", "leprosy")? Take chemicals - diseases instead? [2]_

**_--> F1=73_**

ROUND 4:
* _DONE: `LF-neg-after` is broken! [2]_
* _DONE: Try to avoid `LF-after-num` errors by checking for exact match [2]_
* _DONE: Try running transforms through vector matcher as well?_

**_--> F1=74_**

ROUND 5:
* Positive reinforcement between vector transform matches seems to be an issue... [2]
* Check for the word before "X (Y)" [1]


#### Other ideas:
* "C-induced D", "C D": explicitly create LF for this case?

In [None]:
from random import shuffle
N_train = L_gold_train.shape[0]

fps    = []
fns    = []
fns_na = []
for i in range(N_train):
    if yp[i] > 0 and L_gold_train[i] < 0:
        fps.append(i)
    elif yp[i] <= 0 and L_gold_train[i] > 0:
        if yp[i] == 0:
            fns_na.append(i)
        else:
            fns.append(i)

shuffle(fps)
shuffle(fns)
shuffle(fns_na)

print len(fps)
print len(fns)
print len(fns_na)

In [None]:
from snorkel.viewer import SentenceNgramViewer
fp_cands = [L_train.get_candidate(i) for i in fps[:100]]
svp      = SentenceNgramViewer(fp_cands, session)
svp

In [None]:
c = svp.get_selected()
print c
print c.disease.get_attrib_tokens('pos_tags')

In [None]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
[stemmer.stem(w) for w in c.disease.get_attrib_tokens()]

In [None]:
from snorkel.models import Label
session.query(Label).filter(Label.candidate == c).all()

In [None]:
from snorkel.learning.gen_learning import odds_to_prob
i = L_train.get_row_index(c)

for j in L_train.getrow(i).nonzero()[1]:
    print L_train.get_key(j), odds_to_prob(gen_model.w[j]), int(L_train[i,j])

In [None]:
mesh_to_terms[cd.cid_to_sid[755]]

In [None]:
THRESH            = 0.75
def vector_matches(p, Dt, vectorizer, thresh=THRESH):
    cx  = cd_vectorizer.vectorize_phrases([p])
    m   = cx * Dt
    m   = m.tocoo()
    out = []
    for i, s in enumerate(m.data):
        if s > THRESH:
            j    = m.col[i]
            t    = cd.pos_terms[j]
            sids = cd.term_to_sids[t]
            out.append((s, j, t, sids))
    return out

In [None]:
# Find most common words
word_fs = [(len(cids), word) for word, cids in cd_vectorizer.word_to_cids.iteritems()]
word_fs.sort(key=lambda x : -x[0])
word_fs[:10]

# Running gen. model: Dev Set

In [91]:
from utils import binarize_LF_matrix, get_binarized_score
from snorkel.annotations import merge_annotations

L_dev = merge_annotations([
        L_dev_1
        , L_dev_2
        , L_dev_3
        , L_dev_4
        #, L_dev_4_T1
        , L_dev_4_T2
        , L_dev_4_N
        , L_dev_5
        , L_dev_6
    ])

L_dev_b = binarize_LF_matrix(L_dev)
L_dev_b

<27896x1557 sparse matrix of type '<type 'numpy.float64'>'
	with 44845 stored elements in Compressed Sparse Row format>

In [126]:
from snorkel.learning import NaiveBayes

gen_model_dev = NaiveBayes()
%time gen_model_dev.train(L_dev_b, n_iter=10000, rate=1e-1, verbose=True)

Training marginals (!= 0.5):	27896
Features:			1557
Begin training for rate=0.1, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.077622
	Learning epoch = 250	Gradient mag. = 0.097931
	Learning epoch = 500	Gradient mag. = 0.117168
	Learning epoch = 750	Gradient mag. = 0.141006
	Learning epoch = 1000	Gradient mag. = 0.157745
	Learning epoch = 1250	Gradient mag. = 0.153413
	Learning epoch = 1500	Gradient mag. = 0.149396
	Learning epoch = 1750	Gradient mag. = 0.142963
	Learning epoch = 2000	Gradient mag. = 0.134517
	Learning epoch = 2250	Gradient mag. = 0.127591
	Learning epoch = 2500	Gradient mag. = 0.122599
	Learning epoch = 2750	Gradient mag. = 0.118446
	Learning epoch = 3000	Gradient mag. = 0.113645
	Learning epoch = 3250	Gradient mag. = 0.109864
	Learning epoch = 3500	Gradient mag. = 0.105658
	Learning epoch = 3750	Gradient mag. = 0.102304
	Learning epoch = 4000	Gradient mag. = 0.099610
	Learning epoch = 4250	Gradient mag. = 0.097014
	Learning epoch = 4500	Gradient mag. = 0.094186
	Lea

In [127]:
yp_gd_dev = gen_model_dev.predict(L_dev_b, b=0.5)
get_binarized_score(yp_gd_dev, L_gold_dev)

P :	0.848303393214
R :	0.641348088531
F1:	0.730449727872


# Training Discriminative model

In [128]:
from snorkel.learning import LogReg

train_marginals = gen_model_train.marginals(L_train_b)

disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=2000, rate=1e-3, mu=1e-6)

Training marginals (!= 0.5):	20038
Features:			69885
Using gradient descent...
	Learning epoch = 0	Step size = 0.001
	Loss = 13889.283204	Gradient magnitude = 9831.201766
	Learning epoch = 100	Step size = 0.000904792147114
	Loss = 3049.872338	Gradient magnitude = 365.791692
	Learning epoch = 200	Step size = 0.000818648829479
	Loss = 2926.206468	Gradient magnitude = 1374.522873
	Learning epoch = 300	Step size = 0.000740707032156
	Loss = 2493.111263	Gradient magnitude = 293.460134
	Learning epoch = 400	Step size = 0.000670185906007
	Loss = 2393.886256	Gradient magnitude = 31.987879
	Learning epoch = 500	Step size = 0.000606378944861
	Loss = 2336.523571	Gradient magnitude = 28.337040
	Learning epoch = 600	Step size = 0.000548646907485
	Loss = 2294.764978	Gradient magnitude = 25.766155
	Learning epoch = 700	Step size = 0.000496411413431
	Loss = 2263.040483	Gradient magnitude = 23.838157
	Learning epoch = 800	Step size = 0.00044914914861
	Loss = 2238.207216	Gradient magnitude = 22.339846
	L

In [129]:
yp_d_train = disc_model.predict(F_train)
get_binarized_score(yp_d_train, L_gold_train)

P :	0.84604233111
R :	0.754005167959
F1:	0.797376690805


In [130]:
yp_d_dev = disc_model.predict(F_dev, b=0.5)
get_binarized_score(yp_d_dev, L_gold_dev)

P :	0.837987112611
R :	0.686871227364
F1:	0.754941257775


## Directly-supervised ET baseline

In [73]:
disc_model = LogReg()

gold_marginals_train = np.array([1.0 if L_gold_train[i,0] > 0.0 else 0.0 for i in range(L_gold_train.shape[0])])

disc_model.train(F_train, gold_marginals_train, n_iter=2000, rate=1e-3, mu=1e-6)

Training marginals (!= 0.5):	28087
Features:			69885
Using gradient descent...
	Learning epoch = 0	Step size = 0.001
	Loss = 19468.424860	Gradient magnitude = 14785.293512
	Learning epoch = 100	Step size = 0.000904792147114
	Loss = 1524.705832	Gradient magnitude = 296.134661
	Learning epoch = 200	Step size = 0.000818648829479
	Loss = 1049.483908	Gradient magnitude = 59.100446
	Learning epoch = 300	Step size = 0.000740707032156
	Loss = 837.540392	Gradient magnitude = 46.218910
	Learning epoch = 400	Step size = 0.000670185906007
	Loss = 711.786007	Gradient magnitude = 38.758125
	Learning epoch = 500	Step size = 0.000606378944861
	Loss = 628.601384	Gradient magnitude = 33.831222
	Learning epoch = 600	Step size = 0.000548646907485
	Loss = 569.803256	Gradient magnitude = 30.325525
	Learning epoch = 700	Step size = 0.000496411413431
	Loss = 526.304388	Gradient magnitude = 27.711398
	Learning epoch = 800	Step size = 0.00044914914861
	Loss = 493.015905	Gradient magnitude = 25.699114
	Learning 

In [74]:
yp = disc_model.predict(F_train)
get_binarized_score(yp, L_gold_train)

P :	0.989754098361
R :	0.998449612403
F1:	0.994082840237


In [75]:
yp = disc_model.predict(F_dev, b=0.5)
get_binarized_score(yp, L_gold_dev)

P :	0.901342833653
R :	0.827213279678
F1:	0.86268852459


In [60]:
L_gold_dev[1,0]

-1.0

In [None]:
supervised_labels

In [None]:
plt.hist(disc_model.marginals(F_dev))

In [None]:
from snorkel.learning.gen_learning import odds_to_prob
plt.hist(odds_to_prob(gen_model.w))

In [None]:
L_train.lf_stats(labels=L_gold_train, est_accs=odds_to_prob(gen_model.w))

# Hard filtering

In [131]:
# Try overriding with any exact matches...
yp_d_dev_um = np.zeros(L_gold_dev.shape[0])
for i,c in enumerate(dev):
    pos = 0
    neg = 0
    for lf_name, label in LFG_MESH_exact(c):
        if label > 0:
            pos += 1
        else:
            neg += 1
    
    for lf_name, label in LFG_drop_leading_modifiers(c):
        if label > 0:
            pos += 1
        else:
            neg += 1
    
    for lf_name, label in LFG_MESH_exact_remove_common(c):
        if label > 0:
            pos += 1
        else:
            neg += 1
    
    if neg > 0 and pos == 0:
        yp_d_dev_um[i] = -1
    elif pos > 0 and neg == 0:
        yp_d_dev_um[i] = 1
    else:
        yp_d_dev_um[i] = yp_d_dev[i]

In [132]:
get_binarized_score(yp_d_dev_um, L_gold_dev)

P :	0.838038209296
R :	0.739185110664
F1:	0.785513831351


# Exact match baseline

In [78]:
# Try overriding with any exact matches...
yp_em = np.zeros(L_gold_dev.shape[0])
for i,c in enumerate(dev):
    pos = 0
    neg = 0
    for lf_name, label in LFG_MESH_exact(c):
        if label > 0:
            pos += 1
        else:
            neg += 1
    
    if neg > 0:
        yp_em[i] = -1
    elif pos > 0:
        yp_em[i] = 1

In [79]:
get_binarized_score(yp_em, L_gold_dev)

P :	0.879276952873
R :	0.342555331992
F1:	0.493031674208


# Now, getting CIDs in simple heuristic way

In [110]:
N = L_gold_train.shape[0]

In [111]:
pp_idxs = [i for i in range(N) if yp[i] == 1 and L_gold_train[i] > 0]
pp_idxs[:5]

[18, 19, 22, 30, 82]

In [113]:
i = 18
for j in L_train.getrow(i).nonzero()[1]:
    print L_train.get_key(j), gen_model.w[j], L_train[i,j]

AnnotationKey (C-17-800) 1.9844840388 61.0
AnnotationKey (C-17-800-RC) 1.94488694522 61.0
AnnotationKey (C-04-557-c) 0.432414162861 3163.0
AnnotationKey (C-17-800-c) 0.764422180699 61.0


In [114]:
i = 19
for j in L_train.getrow(i).nonzero()[1]:
    print L_train.get_key(j), gen_model.w[j], L_train[i,j]

AnnotationKey (C-04-182) 1.17840563711 2216.0
AnnotationKey (C-04-182-RC) 1.00830578462 2216.0
AnnotationKey (C-04-182-c) 1.06969451628 2216.0


In [116]:
L_gold_train[19,0]

2216.0

In [None]:
print L_train.get_key(558)
print gen_model.w[558]
print L_train[18,558]

# TODO:

Take the weighted-sum max (i.e. if same CID appears multiple times, add them up!!)

In [137]:
N         = L_gold_dev.shape[0]
correct   = 0
predicted = 0
total     = 0
for i in range(N):
    if L_gold_dev[i,0] > 0:
        total += 1
    
    if yp_d_dev_um[i] > 0:
        predicted += 1
        
        scored = defaultdict(float)
        for j in L_dev.getrow(i).nonzero()[1]:
            scored[L_dev[i,j]] += gen_model_dev.w[j]
        scores = list(scored.iteritems())
        if len(scores) > 0:
            scores.sort(key=lambda x : -x[1])
            cid = scores[0][0]
        else:
            cid = -1
        
        if cid == L_gold_dev[i,0]:
            correct += 1

prec   = correct / float(predicted)
recall = correct / float(total)

print "P:\t", prec
print "R:\t", recall
print "F1:\t", (2*prec*recall) / (prec+recall)

P:	0.670943826632
R:	0.591800804829
F1:	0.628892155553


In [139]:
N         = L_gold_dev.shape[0]
correct   = 0
predicted = 0
total     = 0
for i in range(N):
    if L_gold_dev[i,0] > 0:
        total += 1
        
    scored = defaultdict(float)
    for j in L_dev.getrow(i).nonzero()[1]:
        scored[L_dev[i,j]] += gen_model_dev.w[j]
    scores = list(scored.iteritems())
    if len(scores) > 0:
        scores.sort(key=lambda x : -x[1])
        cid = scores[0][0]
    else:
        cid = -1
    
    if cid > 0:
        predicted += 1
        if cid == L_gold_dev[i,0]:
            correct += 1

prec   = correct / float(predicted)
recall = correct / float(total)

print "P:\t", prec
print "R:\t", recall
print "F1:\t", (2*prec*recall) / (prec+recall)

P:	0.659133709981
R:	0.528169014085
F1:	0.586428371963


In [133]:
N         = L_gold_dev.shape[0]
correct   = 0
predicted = 0
total     = 0
for i in range(N):
    if L_gold_dev[i,0] > 0:
        total += 1
    
    if yp_d_dev_um[i] > 0:
        predicted += 1
        
        max_w = 0.0
        cid   = -1
        for j in L_dev.getrow(i).nonzero()[1]:
            if gen_model_dev.w[j] > max_w:
                max_w = gen_model_dev.w[j]
                cid   = L_dev[i,j]
        
        if cid == L_gold_dev[i,0]:
            correct += 1

prec   = correct / float(predicted)
recall = correct / float(total)

print "P:\t", prec
print "R:\t", recall
print "F1:\t", (2*prec*recall) / (prec+recall)

P:	0.651839178785
R:	0.574949698189
F1:	0.610984899105


In [135]:
N         = L_gold_dev.shape[0]
correct   = 0
predicted = 0
total     = 0
for i in range(N):
    if L_gold_dev[i,0] > 0:
        total += 1
        
    max_w = 0.0
    cid   = -1
    for j in L_dev.getrow(i).nonzero()[1]:
        if gen_model_dev.w[j] > max_w:
            max_w = gen_model_dev.w[j]
            cid   = L_dev[i,j]
    
    if cid > 0:
        predicted += 1
        if cid == L_gold_dev[i,0]:
            correct += 1

prec   = correct / float(predicted)
recall = correct / float(total)

print "P:\t", prec
print "R:\t", recall
print "F1:\t", (2*prec*recall) / (prec+recall)

P:	0.661619486504
R:	0.505533199195
F1:	0.573139435415


In [118]:
correct   = 0
predicted = 0
total     = 0
for i in range(N):
    if L_gold_train[i,0] > 0:
        total += 1
        
    max_w = 0.0
    cid   = -1
    for j in L_train.getrow(i).nonzero()[1]:
        if gen_model.w[j] > max_w:
            max_w = gen_model.w[j]
            cid   = L_train[i,j]
        
    if cid > 0:
        predicted += 1  
        if cid == L_gold_train[i,0]:
            correct += 1

prec   = correct / float(predicted)
recall = correct / float(total)

print "P:\t", prec
print "R:\t", recall
print "F1:\t", (2*prec*recall) / (prec+recall)

P:	0.677807486631
R:	0.524031007752
F1:	0.5910813174


# ETC

# Different cut levels:

Note: G = gen model on training set, D = disc. model on test set

* Pos: 1, Neg: 1, Pos-cosine: 1, Neg-cosine: 1, Thresh-cosine: 0.75 = 56 F1 G / 63 F1 D
* TODO...
* Pos: 3, Neg: 3, Pos-cosine: 3, Neg-cosine: 3, Thresh-cosine: 0.75 = 61 F1 G / 68 F1 D
* Pos: 3, Neg: 3, Pos-cosine: 3, Neg-cosine: 3, Thresh-cosine: [0.5, 0.75] = 63 F1 G / 65 F1 D
* Pos: 4, Neg: 4, Pos-cosine: 4, Neg-cosine: 4, Thresh-cosine: 0.75 = 60 F1 G / 64 F1 D

### Adding in drop_JJs + NEG LFs:

* Pos: 2, Neg: 2, Pos-cosine: 2, Neg-cosine: 2, Thresh-cosine: 0.75 = 69 F1 G / 71 F1 D
* Pos: 3, Neg: 3, Pos-cosine: 3, Neg-cosine: 3, Thresh-cosine: 0.75 = 70 F1 G / 73 F1 D


#### Note: we're not yet dealing with acronyms!!!

In [None]:
hf = np.ones(L_train_b.shape[0])

In [None]:
for i in range(L_train_b.shape[0]):
    

In [None]:
L_train_b.getrow(0)

In [None]:
L_train_b.sum(1).shape

In [None]:
from snorkel.utils import sparse_abs
sparse_abs(L_train_b).sum(1).shape

In [None]:
hf = np.where(L_train_b.sum(1) == sparse_abs(L_train_b).sum(1), np.sign(L_train_b.sum(1)), 0)

In [None]:
hf.shape

In [None]:
L_gold_train_b = np.sign(L_gold_train.todense())

In [None]:
L_gold_train_b

In [None]:
L_gold_train_b.T.dot(hf)

In [None]:
hf.sum

In [None]:
L_train_b_sum = L_train_b.sum(1)
L_train_b_abs_sum = sparse_abs(L_train_b).sum(1)
L_train_b_sum_abs = sparse_abs(L_train_b.sum(1))

In [None]:
L_train_b

In [None]:
correct = 0
wrong   = 0
for i in range(L_train_b.shape[0]):
    if L_train_b_sum[i] < 0 and L_train_b_sum_abs[i] == L_train_b_abs_sum[i]:
        if L_gold_train[i] < 0:
            correct += 1
        else:
            wrong += 1
            
print correct
print wrong
print correct / float(correct + wrong)

In [None]:
N = L_train.shape[0]
N

In [None]:
filtered = 0
correct  = 0
for i,c in enumerate(train):
    if i % 1000 == 0:
        if filtered > 0:
            print i, filtered, correct, correct / float(filtered)
        else:
            print i, filtered, correct
    
    pos = 0
    neg = 0
    for lf_name, label in LFG_MESH_exact(c):
        if label > 0:
            pos += 1
        else:
            neg += 1
    
    for lf_name, label in LFG_MESH_exact_drop_JJs(c):
        if label > 0:
            pos += 1
        else:
            neg += 1
    
    #if neg > 0 and pos == 0:
    if neg > pos:
        filtered += 1
        if L_gold_train[i] < 0:
            correct += 1

print filtered
print correct
print correct / float(filtered)

In [None]:
filtered = 0
correct  = 0
for i,c in enumerate(train):
    if i % 1000 == 0:
        if filtered > 0:
            print i, filtered, correct, correct / float(filtered)
        else:
            print i, filtered, correct
    
    if len(c.disease.get_attrib_tokens()) == 1 and c.disease.get_span().lower() not in cd.term_to_sids:
        filtered += 1
        if L_gold_train[i] < 0:
            correct += 1

print filtered
print correct
print correct / float(filtered)

# LF STATS ANALYSIS

In [None]:
lfs = L_train.lf_stats(labels=L_gold_train, est_accs=odds_to_prob(gen_model.w))

In [None]:
gen_model.w

In [None]:
BAD_LFs = [578, 627, 603, 687, 573, 579]
for i in BAD_LFs:
    gen_model.w[i] = 0.0

In [None]:
lfs.nsmallest(50, "accuracy")

In [None]:
for lf in lfs.iterrows():
    lf_name, s = lf
    

# ERROR ANALYSIS

In [None]:
from random import shuffle
N_dev = L_gold_dev.shape[0]

fps = []
fns = []
for i in range(N_dev):
    if yp[i] > 0 and L_gold_dev[i] < 0:
        fps.append(i)
    elif yp[i] < 0 and L_gold_dev[i] > 0:
        fns.append(i)

shuffle(fps)
shuffle(fns)

print len(fps)
print len(fns)

from snorkel.viewer import SentenceNgramViewer
fn_cands = [F_dev.get_candidate(i) for i in fns[:100]]
svn      = SentenceNgramViewer(fn_cands, session)
svn

In [None]:
exact_match = 0
for i in fns:
    c = F_dev.get_candidate(i)
    if c.disease.get_span() in mesh_tree:
        exact_match += 1

In [None]:
exact_match

In [None]:
c = svn.get_selected()

mesh_tree[c.disease.get_span()]

In [None]:
c.disease.get_attrib_tokens('pos_tags')

In [None]:
mesh_tree['alcohol abuse']

In [None]:
from snorkel.models import Label

l = session.query(Label).filter(Label.candidate == c).one()
CID_to_MESH[l.value]

In [None]:
i = F_dev.get_row_index(c)
[(F_dev.get_key(k), disc_model.w[k]) for k in F_dev.getrow(i).nonzero()[1]]

In [None]:
F_dev.get_key(1)

* Why is Parkinson's disease not caught?

In [None]:
from snorkel.viewer import SentenceNgramViewer
fp_cands = [F_dev.get_candidate(i) for i in fps[:100]]
sv       = SentenceNgramViewer(fp_cands, session)
sv

In [None]:
NEG_PHRASES = [
    'stenosis',
    'further attention',
    'presence',
    'absence',
    'syndrome',
    'association',
    'strain',
    'progression'
]

NEG_END_WORDS = [
    'therapies',
    'muscles',
    'concentrations',
    'normal',
    'heart',
    'side',
    'sinus',
    'convulsants',
    'latencies',
    'findings',
    'doses',
    'remission'
]

def end_in_plural(c):
    pass

def body_part(c):
    pass

def not_exact_single_word(d):
    pass