The purpose of this notebook is to find candidates for explanations that have no linked candidate for consistency checking. For each parse whose explanation is missing a candidate, loop through the candidates until you find one on which the parse's function fires. Confirm that this parse of the explanation is the intended one, then move on to the next.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Notebook-level settings:
#   domain   -- used to build the default DB name and to select the pipeline
#   debug    -- when truthy, '_debug' is appended to the default DB name
#   postgres -- when truthy, the postgres backend is used instead of sqlite
config = dict(
    domain='cdr',
    debug=False,
    postgres=False,
)

In [3]:
# Assemble the database connection string and publish it through the
# environment so that snorkel can pick it up.
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

# Default name is 'babble_<domain>', with a '_debug' suffix in debug mode;
# an explicit 'db_name' entry in the config takes precedence.
debug_suffix = '_debug' if config.get('debug', False) else ''
default_db_name = 'babble_' + config['domain'] + debug_suffix
DB_NAME = config.get('db_name', default_db_name)

# Backend selection: postgres only when the flag is present and truthy;
# otherwise sqlite, whose database name gets a '.db' extension.
use_postgres = bool(config.get('postgres'))
DB_TYPE = 'postgres' if use_postgres else 'sqlite'
if not use_postgres:
    DB_NAME += '.db'

# The host part is only filled in when a port is configured.
if 'db_port' in config:
    DB_ADDR = "localhost:{0}".format(config['db_port'])
else:
    DB_ADDR = ""

os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_cdr.db


In [4]:
# Create the Snorkel session used by the rest of the notebook. Per the note in
# the cell that sets $SNORKELDB, that variable must already be set before these
# snorkel imports run, so keep this cell after it.
from snorkel import SnorkelSession
session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
from snorkel.contrib.babble.pipelines import merge_configs, get_local_pipeline
# Rebinds `config` to the merged result. Presumably this is where
# 'candidate_name' and 'candidate_entities' come from, since the notebook-level
# config dict does not define them -- TODO confirm.
config = merge_configs(config)

# Build the candidate class from the 'candidate_name' and 'candidate_entities'
# entries of the merged config.
from snorkel.models import candidate_subclass
candidate_class = candidate_subclass(config['candidate_name'], config['candidate_entities'])

# Look up the pipeline for the configured domain and call it with the session,
# candidate class, and merged config; later cells read `pipe.candidate_class`.
pipeline = get_local_pipeline(config['domain'])
pipe = pipeline(session, candidate_class, config)

Overwriting domain=None to domain=cdr


In [5]:
# Load every candidate of the pipeline's candidate class. No split filter is
# applied here, so candidates from all splits are included.
candidate_query = session.query(pipe.candidate_class)
candidates = candidate_query.all()

In [6]:
from tutorials.babble.cdr.cdr_examples import get_explanations, get_user_lists

# Load the CDR example explanations and the user lists they refer to.
# Explanations are loaded first, then the user lists (tuple elements are
# evaluated left to right).
explanations, user_lists = get_explanations(), get_user_lists()

Loading canonical ID ontologies...
Finished loading canonical ID ontologies.


In [7]:
# Link the explanations to their target candidates. The result replaces
# `explanations`, so re-running this cell feeds the already-linked list back in.
# NOTE(review): the recorded output reports "Linked 28/30 explanations", i.e.
# two explanations end up without a candidate; those are the ones the
# candidate-search cell later in this notebook is meant to handle.
from snorkel.contrib.babble.utils import link_explanation_candidates
explanations = link_explanation_candidates(explanations, candidates)

Building list of target candidate ids...
Collected 28 unique target candidate ids from 30 explanations.
Gathering desired candidates...
Found 28/28 desired candidates
Linking explanations to candidates...
Linked 28/30 explanations


In [8]:
from snorkel.contrib.babble import Babbler

# Keyword options for the Babbler, gathered in one place.
babbler_options = dict(
    mode='text',
    candidate_class=pipe.candidate_class,
    user_lists=user_lists,
)
babbler = Babbler(session, **babbler_options)

# Parse the explanations and run the resulting labeling functions on split 1
# (presumably the dev split -- TODO confirm).
babbler.apply(explanations, split=1)

Flushing all parses from previous explanation set.
Created grammar with 592 rules
30 explanation(s) out of 30 were parseable.
106 parse(s) generated from 30 explanation(s).
82 parse(s) remain (24 parse(s) removed by DuplicateSemanticsFilter).
Note: 9 LFs did not have candidates and therefore could not be filtered.
53 parse(s) remain (29 parse(s) removed by ConsistencyFilter).
### Applying labeling functions to split 1

### Done in 5.6s.

44 parse(s) remain (9 parse(s) removed by UniformSignatureFilter: (9 None, 0 All)).
31 parse(s) remain (13 parse(s) removed by DuplicateSignatureFilter).
Added 31 parse(s) from 27 explanations to set. (Total # parses = 31)


In [9]:
# Fetch the parses currently held by the Babbler. translate=False is passed
# because the following cells use the results as objects (reading their
# .explanation, .semantics, and .function attributes) -- presumably
# translate=True would return readable strings instead; TODO confirm.
parses = babbler.get_parses(translate=False)

In [10]:
# Dump every parse for inspection: the source explanation, the raw semantics
# tuple, and the grammar's readable translation of those semantics, followed
# by a blank separator line.
grammar = babbler.semparser.grammar
for parse in parses:
    print(parse.explanation)
    semantics = parse.semantics
    print(semantics)
    print(grammar.translate(semantics))
    print("")

Explanation("LF_c_cause_d: True, between the chemical and the disease, there is a causal word and the word 'not' is not between them.")
('.root', ('.label', ('.bool', True), ('.and', ('.any', ('.map', ('.in', ('.extract_text', ('.between', ('.list', ('.arg', ('.int', 1)), ('.arg', ('.int', 2)))))), ('.user_list', ('.string', 'causal')))), ('.not', ('.call', ('.in', ('.extract_text', ('.between', ('.list', ('.arg', ('.int', 1)), ('.arg', ('.int', 2)))))), ('.string', 'not'))))))
return 1 if (any([s.in(text(between([X,Y]))) for s in user_list('causal')]) and not ('not'.in(text(between([X,Y]))))) else 0

Explanation("LF_c_d: True, the disease is immediately preceded by the chemical.")
('.root', ('.label', ('.bool', True), ('.call', ('.in', ('.extract_text', ('.right', ('.arg', ('.int', 1)), ('.string', '.eq'), ('.int', 1), ('.string', 'words')))), ('.arg_to_string', ('.arg', ('.int', 2))))))
return 1 if text(Y).in(text(exactly 1 word(s) to the right of X)) else 0

Explanation("LF_c_induce

In [11]:
from collections import defaultdict

# NOTE(review): `seen` and `explanation_map` are never read or written by this
# cell; they are kept only in case cells outside this view rely on them.
seen = set()
explanation_map = defaultdict(list)

# For every parse whose explanation has no linked candidate, scan the
# candidates in order and show the first one on which the parse's function
# returns a truthy value, so the parse can be checked by eye.
for parse in parses:
    if parse.explanation.candidate is not None:
        # Already has a candidate for consistency checking; nothing to find.
        continue

    print(parse.explanation)
    print("")
    for c in candidates:
        if parse.function(c):
            # Show the two argument spans and the text of the candidate's
            # parent context, then stop at this first match.
            print((c[0].get_span(), c[1].get_span()))
            print("")
            print(c.get_parent().text)
            print("")
            break
    else:
        # for/else: runs only when the scan finished without `break`, i.e. no
        # candidate matched. Report it explicitly so a parse with no match is
        # not mistaken for missing output.
        print("No matching candidate found for this parse.")
        print("")

In [12]:
# TRAIN = 0
# candidates = session.query(pipe.candidate_class).filter(
#     pipe.candidate_class.split == TRAIN)
# for c in candidates:
#     sentence = c.get_parent().text
#     if 'develop' in sentence and 'following' in sentence:
#         print(sentence)
#         print("")
#         print(c)
#         print(c.get_stable_id())
#         print("\n")