In [1]:
import os
os.remove('snorkel.db')

In [2]:
from snorkel import SnorkelSession
session = SnorkelSession()

import os, sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

In [3]:
import os
from snorkel.parser import CorpusParser
from snorkel.parser import HTMLParser
from snorkel.parser import OmniParser

doc_parser = HTMLParser(path=os.environ['SNORKELHOME'] + '/tutorials/tables/sandbox/diseases.xhtml')
context_parser = OmniParser()
cp = CorpusParser(doc_parser, context_parser, max_docs=1)

In [4]:
%time corpus = cp.parse_corpus(name='Sandbox', session=session)
print "%d Phrases in %s" % (
    len([phrase for doc in corpus.documents for phrase in doc.phrases]), corpus)


CPU times: user 368 ms, sys: 29.2 ms, total: 398 ms
Wall time: 1.24 s
32 Phrases in Corpus (Sandbox)


In [5]:
# for cell in corpus.documents[0].cells: print cell.text[:10]

In [6]:
# for phrase in [phrase for doc in corpus.documents for phrase in doc.phrases]:
#     print phrase

In [7]:
# (a,b,c) = (2,(3 if True else 5) ,4)

In [8]:
from snorkel.models import candidate_subclass

# Year = candidate_subclass('Year', ['year'])
# Temp = candidate_subclass('Temp', ['temp'])
# Disease = candidate_subclass('Disease', ['disease'])
# Part = candidate_subclass('Part', ['part'])
Disease_Year = candidate_subclass('Disease_Temp', ['disease','temp'])

In [9]:
from snorkel.candidates import OmniNgrams
from hardware_utils import OmniNgramsPart, OmniNgramsTemp

omni_ngrams = OmniNgrams(n_max=3, split_tokens=None)
# omni_part = OmniNgramsPart(n_max=3)
omni_temp = OmniNgramsTemp(n_max=3)

In [10]:
diseases = ['viruses', 'coughs', 'colds', 'brain', 'cancer', 'shingles', 'warts',
              'brain cancer', 'disease', 'location', 'polio', 'chicken pox', 
              'bubonic plague', 'plague', 'scurvy', 'infectious diseases', 
              'arthritis', 'yellow fever', 'fever', 'hypochondria', 'pneumonia',
              'whooping cough', 'heart attack']
print "Loaded %d diseases." % len(diseases)

Loaded 23 diseases.


In [11]:
from snorkel.matchers import DictionaryMatch, RegexMatchEach, RegexMatchSpan, DateMatcher

disease_matcher = DictionaryMatch(d=diseases, ignore_case=True)
# year_matcher = DateMatcher()
# part_matcher = RegexMatchEach(rgx='BC.*')
temp_matcher = RegexMatchSpan(rgx=r'-[5-7][05]')

In [12]:
from snorkel.candidates import CandidateExtractor

# ce = CandidateExtractor(Part, [omni_part], [part_matcher])
# ce = CandidateExtractor(Disease, [omni_ngrams], [disease_matcher])
# ce = CandidateExtractor(Temp, [omni_temp], [temp_matcher])
ce = CandidateExtractor(Disease_Year, 
                        [omni_ngrams, omni_temp], 
                        [disease_matcher, temp_matcher])

In [13]:
%time candidates = ce.extract(corpus.documents, 'Sandbox Candidates', session)
print "%s contains %d Candidates" % (candidates, len(candidates))


CPU times: user 86.4 ms, sys: 8.22 ms, total: 94.6 ms
Wall time: 95.2 ms
Candidate Set (Sandbox Candidates) contains 20 Candidates


In [14]:
example = candidates[0].temp
print example

ImplicitSpan("-55", parent=61, words=[4,4], position=[0])


In [15]:
c = candidates[0]
print c

Disease_Temp(Span("Arthritis", parent=65, chars=[0,8], words=[0,0]), ImplicitSpan("-55", parent=61, words=[4,4], position=[0]))


In [16]:
from snorkel.lf_helpers import *

# print [cell.text[:8] for cell in _get_aligned_cells(root_phrase.cell, axis, infer=infer)]
# print [phrase.text for cell in _get_aligned_cells(root_phrase.cell, axis, infer=infer) for phrase in cell.phrases]
print c.disease.parent
print list(get_left_ngrams(c.temp, n_max=1, window=10, lower=False))
print list(get_right_ngrams(c.temp, n_max=1, window=5))
print list(get_phrase_ngrams(c.temp, n_max=1))
print list(get_cell_ngrams(c.temp, n_max=1))
print list(get_row_ngrams(c.temp, n_max=1, direct=True, infer=True))
print list(get_head_ngrams(c.temp, 'row'))
print list(get_head_ngrams(c.temp, 'col'))

Phrase(Doc: diseases, Table: 1, Row: 1, Col: 0, Position: 0, Text: Arthritis)
[u'Junction', u'and', u'Storage', u'Temperature']
[u'to', u'150', u'o', u'c']
[u'junction', u'and', u'storage', u'temperature', u'to', u'150', u'o', u'c']
[u'junction', u'and', u'storage', u'temperature', u'to', u'150', u'o', u'c']
[u'junction', u'and', u'storage', u'temperature', u'to', u'150', u'o', u'c', u'scurvy', u'annapolis']
[u'scurvy']
[u'year']


In [17]:
from snorkel.models import CandidateSet
train = CandidateSet(name='train', candidates=candidates[:25])
dev = CandidateSet(name='test', candidates=candidates[25:])

In [18]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()
%time F_train = feature_manager.create(session, train, 'Sandbox Features')
F_train

# from snorkel.features import get_span_feats
# c = candidates[10]
# %prun for feat in get_span_feats(c): print feat

Generating annotations for 20 candidates...

Loading sparse Feature matrix...
CPU times: user 3.73 s, sys: 30.5 ms, total: 3.76 s
Wall time: 3.85 s


<20x635 sparse matrix of type '<type 'numpy.float64'>'
	with 3039 stored elements in Compressed Sparse Row format>

In [19]:
from snorkel.utils import get_keys_by_candidate

c = candidates[0]
print c
print
for f in get_keys_by_candidate(F_train, c)[:]: print f

Disease_Temp(Span("Arthritis", parent=65, chars=[0,8], words=[0,0]), ImplicitSpan("-55", parent=61, words=[4,4], position=[0]))

AnnotationKey (TAB_e1_HTML_TAG_th)
AnnotationKey (TAB_e1_HTML_ANC_TAG_[html])
AnnotationKey (TAB_e1_HTML_ANC_TAG_[body])
AnnotationKey (TAB_e1_HTML_ANC_TAG_[table])
AnnotationKey (TAB_e1_HTML_ANC_TAG_[tbody])
AnnotationKey (TAB_e1_HTML_ANC_TAG_[tr])
AnnotationKey (TAB_e1_HTML_ANC_TAG_[th])
AnnotationKey (TAB_e1_ROW_NUM_[1])
AnnotationKey (TAB_e1_COL_NUM_[0])
AnnotationKey (TAB_e1_ROW_HEAD_WORDS_[pokemon])
AnnotationKey (TAB_e1_ROW_HEAD_WORDS_[pokemon go])
AnnotationKey (TAB_e1_ROW_HEAD_WORDS_[go])
AnnotationKey (TAB_e1_COL_HEAD_WORDS_[problem])
AnnotationKey (TAB_e1_ROW_WORDS_[pokemon])
AnnotationKey (TAB_e1_ROW_WORDS_[pokemon go])
AnnotationKey (TAB_e1_ROW_WORDS_[go])
AnnotationKey (TAB_e1_ROW_WORDS_[free])
AnnotationKey (TAB_e1_COL_WORDS_[problem])
AnnotationKey (TAB_e1_COL_WORDS_[yellow])
AnnotationKey (TAB_e1_COL_WORDS_[yellow fever])
AnnotationKey (TAB_e

In [20]:
# c = candidates[0]
# print c
# print c.year
# print c.year.parent
# print c.year.char_start
# print c.year.char_end
# print c.year.parent.words
# print c.year.parent.char_offsets
# print c.year.parent.lemmas
# print c.year.parent.pos_tags
# print c.year.parent.ner_tags
# print c.year.parent.dep_parents
# print c.year.parent.dep_labels
# print c.year.get_word_start()
# print c.year.get_word_end()
# # print get_row_ngrams(c.year)
# # print get_row_ngrams(c.year, infer=True)
# %time for i in range(1000): _get_aligned_cells((c.year).parent.cell, 'col', infer=True)

In [21]:
# from snorkel.models import Span, ImplicitSpan, TemporarySpan

# print isinstance(c.year, TemporarySpan)

In [22]:
# from snorkel.lf_helpers import get_between_ngrams, get_left_ngrams, get_right_ngrams
# from snorkel.lf_helpers import contains_token, contains_regex
# from snorkel.lf_helpers import get_phrase_ngrams, get_cell_ngrams, get_neighbor_cell_ngrams
# from snorkel.lf_helpers import get_row_ngrams, get_col_ngrams, get_aligned_ngrams
# from snorkel.lf_helpers import same_document, same_table, same_cell, same_phrase
# from snorkel.lf_helpers import _get_aligned_cells, _get_nonempty_cell

In [23]:
from snorkel.lf_helpers import *
c = candidates[2]
print list(get_head_ngrams(c.disease, axis='col'))
# print get_left_ngrams(c.disease)
# print get_right_ngrams(c.disease)
# print contains_token(c, 'plague')
# print contains_regex(c, r'pla')
# print same_document(c)
# print same_table(c)
# print same_cell(c)
# print same_phrase(c)
# print get_phrase_ngrams(c.disease, n_min=1, n_max=3, case_sensitive=True)
# print get_cell_ngrams(c.disease, attrib='pos_tags')
# print get_neighbor_cell_ngrams(c.disease, dist=2, directions=True)
# print get_row_ngrams(c.disease)
# print get_col_ngrams(c.disease)
# print get_aligned_ngrams(c.disease)
# print get_aligned_ngrams(c.disease, infer=True)

[]


In [24]:
# from hardware_utils import expand_implicit_text 

# for part in expand_implicit_text(''.join(['BC856/857/858', '/', '859/860'])): print part

In [25]:
# def foo(n):
#     phrase = 'repeat me'
#     pmul = phrase * n
#     pjoi = ''.join([phrase for x in xrange(n)])
#     pinc = ''
#     for x in xrange(n):
#         pinc += phrase
#     del pmul, pjoi, pinc

In [26]:
# %lprun -f foo foo(1000)

In [27]:
# from hardware_utils import get_gold_dict

# filename = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# gold_dict = get_gold_dict(filename, 'stg_temp_min')

In [28]:
# print len(gold_dict.values())
# print gold_dict.values().count(1)

In [29]:
# from hardware_utils import count_hardware_labels

# filename = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# %time (certain, maybe) = count_hardware_labels(loader, candidates, filename, attrib='stg_temp_min', attrib_class='temp')

In [30]:
# %time 
# for i in range(10000): 
#     for j in range(100):
#         1 + 1

In [31]:
# from snorkel.utils import ProgressBar;

In [32]:
# %%time
# n = 100000
# pb = ProgressBar(n)
# for i in xrange(n):
#     pb.bar(i)
# pb.close()

In [33]:
# N = 235
# N = 10
# ticks = set([int(i * N/100.0) for i in range(1,101)])
# print ticks