# Disease Norm

In this example, we'll be writing an application to extract *mentions of* diseases from PubMed abstracts, using annotations from the [BioCreative CDR Challenge](http://www.biocreative.org/resources/corpora/biocreative-v-cdr-corpus/).  This tutorial, which has 5 parts, walks through the process of constructing a model to classify each _candidate_ disease mention as either true (i.e., it really is a mention of a disease) or false.

In [None]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

# Create the Snorkel session object; later cells issue their database
# queries, adds and commits through this `session`.
from snorkel import SnorkelSession
session = SnorkelSession()

In [None]:
from snorkel.models import candidate_subclass

# Candidate subclass named 'Disease' with a single argument, 'disease'
# (the labeling code below reads it as `candidate.disease`).
Disease = candidate_subclass('Disease', ['disease'])

## Loading a MESH_ID -> CID mapping

For now, just store this as a pickle file...

In [None]:
from utils import load_mesh_dict

# Load the MeSH dictionary from the 2017 descriptor file, restricted to the
# tree prefixes 'C' and 'F'. The values of this dictionary are the MESH IDs
# used to build the CID mapping below.
diseases = load_mesh_dict(
    'data/desc2017.xml',
    tree_prefixes=['C', 'F'],
)
print("Loaded dictionary with %s entries" % len(diseases))

In [None]:
# Give every distinct MESH ID a dense integer class ID (CID). The IDs are
# sorted first so the numbering is reproducible across runs, and numbering
# starts at 1 because CID = 0 is reserved for the null vote.
mesh_ids    = sorted(set(diseases.values()))
MESH_to_CID = {mid: cid for cid, mid in enumerate(mesh_ids, 1)}
print(len(MESH_to_CID))

In [None]:
from cPickle import dump

# Persist the MESH ID -> CID mapping. The file is opened in a `with` block so
# the handle is flushed and closed deterministically instead of being left
# open until garbage collection.
with open('MESH_to_CID.pkl', 'wb') as pkl_file:
    dump(MESH_to_CID, pkl_file)

## Labeling the candidates based on the gold annotations

We look for the candidates that are equal to or contain a gold annotation, and then label each of them with the CID corresponding to that annotation's MESH ID. Every remaining candidate in the set is labeled as negative (-1).

In [None]:
from utils import get_docs_xml, get_CD_mentions_by_MESHID
from snorkel.models import Document, TemporarySpan, Label, AnnotationKeySet, AnnotationKey, Span, CandidateSet
from snorkel.loaders import create_or_fetch
import os
# Directory holding the CDR BioC XML files; requires the SNORKELHOME
# environment variable to be set (a missing variable raises KeyError).
ROOT = os.environ['SNORKELHOME'] + '/tutorials/disease_norm/data/'

def load_BioC_CDR_entity_labels(name, entity_class):
    """Create gold labels for the candidates of one CDR split.

    Reads the annotated BioC XML file for the split, and for every gold
    mention with a known MESH ID labels the first not-yet-labeled candidate
    in the same sentence whose span contains the mention, using the CID from
    `MESH_to_CID` as the label value. Every candidate of the split that
    received no such label is then labeled -1.

    Args:
        name: Split name; it is interpolated into the candidate set name
            ('CDR %s Candidates'), the label key / key set names and the
            file name ('CDR_%sSet.BioC.xml'). The notebook calls this with
            "Training", "Development" and "Test".
        entity_class: Candidate class whose `__name__` selects the mentions
            returned by `get_CD_mentions_by_MESHID`.
            NOTE(review): the candidate query and the `.disease` attribute
            access below are specific to `Disease`, so other classes are
            not actually supported here.

    Side effects:
        Adds `Label` rows to the module-level `session` and commits them.

    Raises:
        Whatever `.one()` raises when the candidate set or a document with
        the expected stable id is missing (or ambiguous) in the database.
    """
    # Candidates that already received a positive label.
    seen = set()

    candidates    = session.query(CandidateSet).filter(CandidateSet.name == 'CDR %s Candidates' % name).one()
    label_key_set = create_or_fetch(session, AnnotationKeySet, "CDR %s Label Set" % name)
    label_key     = create_or_fetch(session, AnnotationKey, "CDR %s Label" % name)
    if label_key not in label_key_set.keys:
        label_key_set.append(label_key)
    session.commit()

    # Get all the annotated Pubtator documents as XML trees
    file_name = 'CDR_%sSet.BioC.xml' % name
    doc_xmls  = get_docs_xml(ROOT + file_name)
    for doc_id, doc_xml in doc_xmls.iteritems():

        # Get the corresponding Document object
        stable_id = "%s::document:0:0" % doc_id
        doc       = session.query(Document).filter(Document.stable_id == stable_id).one()

        # Use custom script to extract the annotations, grouped by MESH ID, as
        # (sentence, char_start, char_end, text) tuples
        mentions_by_mesh_id = get_CD_mentions_by_MESHID(doc_xml, doc.sentences)[entity_class.__name__]
        for mesh_id, mentions in mentions_by_mesh_id.iteritems():

            # HACK: "-1" is skipped outright (presumably "no MESH ID" -- confirm
            # against get_CD_mentions_by_MESHID); composite IDs such as "A|B"
            # are reduced to their first component.
            if mesh_id == "-1":
                continue
            if "|" in mesh_id:
                mesh_id = mesh_id.split("|")[0]

            # This check must also run for the reduced composite ID, otherwise
            # the MESH_to_CID lookup below raises KeyError for unknown IDs.
            if mesh_id not in MESH_to_CID:
                continue
            cid = MESH_to_CID[mesh_id]

            for sent, char_start, char_end, _txt in mentions:

                # Get the candidates in our NP candidate set which are in the same sentence
                ds = session.query(Disease).join(Span)\
                    .filter(Disease.sets.contains(candidates))\
                    .filter(Span.parent == sent).all()

                # Check for the superset candidate which contains the gold span
                for d in ds:

                    # Note that a small number of candidates contain > 1 gold
                    # mention; heuristically, each candidate is labeled at most
                    # once (first match wins) and each mention labels at most
                    # one candidate.
                    if d in seen:
                        continue
                    if char_start >= d.disease.char_start and char_end <= d.disease.char_end:
                        session.add(Label(key=label_key, candidate=d, value=cid))
                        seen.add(d)
                        break

    # Label all other candidates as negative
    for d in candidates:
        if d not in seen:
            session.add(Label(key=label_key, candidate=d, value=-1))

    # Persist the new labels; without this they only live in the pending
    # session state and are lost if the session is closed or rolled back.
    session.commit()

In [None]:
# Create the gold labels for the "Training" split; %time reports the run time.
%time load_BioC_CDR_entity_labels("Training", Disease)

In [None]:
# Create the gold labels for the "Development" split; %time reports the run time.
%time load_BioC_CDR_entity_labels("Development", Disease)

In [None]:
# Create the gold labels for the "Test" split; %time reports the run time.
%time load_BioC_CDR_entity_labels("Test", Disease)