In [1]:
import os
os.remove('snorkel.db')

In [2]:
from snorkel import SnorkelSession
session = SnorkelSession()

import os, sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

In [3]:
import os
from snorkel.parser import CorpusParser
from snorkel.parser import HTMLParser
from snorkel.parser import OmniParser

doc_parser = HTMLParser(path=os.environ['SNORKELHOME'] + '/tutorials/tables/sandbox/diseases.xhtml')
context_parser = OmniParser()
cp = CorpusParser(doc_parser, context_parser, max_docs=1)

In [4]:
%time corpus = cp.parse_corpus(name='Sandbox', session=session)
print "%d Phrases in %s" % (
    len([phrase for doc in corpus.documents for phrase in doc.phrases]), corpus)


CPU times: user 295 ms, sys: 28.6 ms, total: 323 ms
Wall time: 2.47 s
32 Phrases in Corpus (Sandbox)


In [5]:
# for phrase in [phrase for doc in corpus.documents for phrase in doc.phrases]:
#     print phrase

In [6]:
from snorkel.models import candidate_subclass

# Year = candidate_subclass('Year', ['year'])
# Temp = candidate_subclass('Temp', ['temp'])
# Disease = candidate_subclass('Disease', ['disease'])
# Part = candidate_subclass('Part', ['part'])
Disease_Year = candidate_subclass('Disease_Year', ['disease','year'])

In [7]:
from snorkel.candidates import OmniNgrams
from hardware_utils import OmniNgramsPart, OmniNgramsTemp

omni_ngrams = OmniNgrams(n_max=3, split_tokens=None)
# omni_part = OmniNgramsPart(n_max=3, split_tokens=None)
# omni_temp = OmniNgramsTemp(n_max=3, split_tokens=None)

In [8]:
diseases = ['viruses', 'coughs', 'colds', 'brain', 'cancer', 'shingles', 'warts',
              'brain cancer', 'disease', 'location', 'polio', 'chicken pox', 
              'bubonic plague', 'plague', 'scurvy', 'infectious diseases', 
              'arthritis', 'yellow fever', 'fever', 'hypochondria', 'pneumonia']
print "Loaded %d diseases." % len(diseases)

Loaded 21 diseases.


In [9]:
from snorkel.matchers import DictionaryMatch, RegexMatchEach, RegexMatchSpan, DateMatcher

disease_matcher = DictionaryMatch(d=diseases, ignore_case=True)
year_matcher = DateMatcher()
# part_matcher = RegexMatchEach(rgx='BC.*')
# temp_matcher = RegexMatchSpan(rgx=r'-[5-7][05]')

In [10]:
from snorkel.candidates import CandidateExtractor

# ce = CandidateExtractor(Part, [omni_part], [part_matcher])
# ce = CandidateExtractor(Disease, [omni_ngrams], [disease_matcher])
# ce = CandidateExtractor(Temp, [omni_temp], [temp_matcher])
ce = CandidateExtractor(Disease_Year, 
                        [omni_ngrams, omni_ngrams], 
                        [disease_matcher, year_matcher])

In [11]:
%time candidates = ce.extract(corpus.documents, 'Sandbox Candidates', session)
print "%s contains %d Candidates" % (candidates, len(candidates))


CPU times: user 220 ms, sys: 10.1 ms, total: 230 ms
Wall time: 427 ms
Candidate Set (Sandbox Candidates) contains 51 Candidates


In [12]:
print candidates[31]

Disease_Year(Span("Infectious diseases", parent=20, chars=[9,27], words=[3,4]), Span("1901", parent=47, chars=[0,3], words=[0,0]))


In [28]:
# for c in candidates:
#     print c
c = candidates[14]
print c
print c.disease.parent.document
print c.disease.parent.table
# print c.disease.parent.row
# print c.disease.parent.col
print c.disease.parent.cell
print c.disease.parent

Disease_Year(Span("Polio", parent=38, chars=[0,4], words=[0,0]), Span("2012", parent=44, chars=[0,3], words=[0,0]))
Document diseases
Table(Document diseases, 0)
Cell(Document diseases, Table(Document diseases, 0), 3, u'<th>Polio is ?C cold.</th>')
Phrase(Document diseases, 6, 11, 0, u'Polio is %C cold.')


In [14]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()
%time F_train = feature_manager.create(session, candidates, 'Sandbox Features')

# from snorkel.features import get_span_feats
# c = candidates[10]
# %prun for feat in get_span_feats(c): print feat


Loading sparse Feature matrix...
CPU times: user 5.88 s, sys: 85.4 ms, total: 5.96 s
Wall time: 6.22 s


In [15]:
c = candidates[25]
print c
print c.disease.parent.row_num
print c.disease.parent.col_num

Disease_Year(Span("colds", parent=2, chars=[30,34], words=[7,7]), Span("1901", parent=47, chars=[0,3], words=[0,0]))
None
None


In [16]:
from snorkel.utils import get_keys_by_candidate

print candidates[25]
print "Limited number of features being shown:"
for f in get_keys_by_candidate(F_train, F_train.get_candidate(4))[:10]: print f

Disease_Year(Span("colds", parent=2, chars=[30,34], words=[7,7]), Span("1901", parent=47, chars=[0,3], words=[0,0]))
Limited number of features being shown:
AnnotationKey (TAB_e1_POS_SEQ_[NNP])
AnnotationKey (TAB_e1_STARTS_WITH_CAPITAL)
AnnotationKey (TAB_e1_LENGTH_1)
AnnotationKey (TAB_e1_HTML_TAG_th)
AnnotationKey (TAB_e1_HTML_ANC_TAG_[html])
AnnotationKey (TAB_e1_HTML_ANC_TAG_[body])
AnnotationKey (TAB_e1_HTML_ANC_TAG_[table])
AnnotationKey (TAB_e1_HTML_ANC_TAG_[tbody])
AnnotationKey (TAB_e1_HTML_ANC_TAG_[tr])
AnnotationKey (TAB_e1_HTML_ANC_TAG_[th])


In [17]:
# c = candidates[0]
# print c
# print c.year
# print c.year.parent
# print c.year.char_start
# print c.year.char_end
# print c.year.parent.words
# print c.year.parent.char_offsets
# print c.year.parent.lemmas
# print c.year.parent.pos_tags
# print c.year.parent.ner_tags
# print c.year.parent.dep_parents
# print c.year.parent.dep_labels
# print c.year.get_word_start()
# print c.year.get_word_end()
# # print get_row_ngrams(c.year)
# # print get_row_ngrams(c.year, infer=True)
# %time for i in range(1000): _get_aligned_cells((c.year).parent.cell, 'col', infer=True)

In [18]:
# from snorkel.models import Span, ImplicitSpan, TemporarySpan

# print isinstance(c.year, TemporarySpan)

In [19]:
# from snorkel.lf_helpers import get_between_ngrams, get_left_ngrams, get_right_ngrams
# from snorkel.lf_helpers import contains_token, contains_regex
# from snorkel.lf_helpers import get_phrase_ngrams, get_cell_ngrams, get_neighbor_cell_ngrams
# from snorkel.lf_helpers import get_row_ngrams, get_col_ngrams, get_aligned_ngrams
# from snorkel.lf_helpers import same_document, same_table, same_cell, same_phrase
# from snorkel.lf_helpers import _get_aligned_cells, _get_nonempty_cell

In [20]:
# print get_left_ngrams(c.disease)
# print get_right_ngrams(c.disease)
# print contains_token(c, 'plague')
# print contains_regex(c, r'pla')
# print same_document(c)
# print same_table(c)
# print same_cell(c)
# print same_phrase(c)
# print get_phrase_ngrams(c.disease, n_min=1, n_max=3, case_sensitive=True)
# print get_cell_ngrams(c.disease, attrib='pos_tags')
# print get_neighbor_cell_ngrams(c.disease, dist=2, directions=True)
# print get_row_ngrams(c.disease)
# print get_col_ngrams(c.disease)
# print get_aligned_ngrams(c.disease)
# print get_aligned_ngrams(c.disease, infer=True)

In [21]:
# from hardware_utils import expand_implicit_text 

# for part in expand_implicit_text(''.join(['BC856/857/858', '/', '859/860'])): print part

In [22]:
# def foo(n):
#     phrase = 'repeat me'
#     pmul = phrase * n
#     pjoi = ''.join([phrase for x in xrange(n)])
#     pinc = ''
#     for x in xrange(n):
#         pinc += phrase
#     del pmul, pjoi, pinc

In [23]:
# %lprun -f foo foo(1000)

In [24]:
# from hardware_utils import get_gold_dict

# filename = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# gold_dict = get_gold_dict(filename, 'stg_temp_min')

In [25]:
# print len(gold_dict.values())
# print gold_dict.values().count(1)

In [26]:
# from hardware_utils import count_hardware_labels

# filename = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# %time (certain, maybe) = count_hardware_labels(loader, candidates, filename, attrib='stg_temp_min', attrib_class='temp')