# Tutorial: Extracting Formation Measurements from Paleontology Literature

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
import json

PARALLEL = 1
os.environ['SNORKELDB'] = 'sqlite:///gwas_nature'

In [2]:
from snorkel.contrib.fonduer import SnorkelSession

session = SnorkelSession()

In [3]:
from snorkel.contrib.fonduer.models import candidate_subclass

Pvalue = candidate_subclass('Pvalue', ['pvalue'])

In [4]:
from snorkel.contrib.fonduer import HTMLPreprocessor, OmniParser

docs_path = os.environ['DATA'] + '/gwas/db/papers'

max_docs = 30
doc_preprocessor = HTMLPreprocessor(docs_path, max_docs=max_docs)

In [5]:
corpus_parser = OmniParser(structural=True, lingual=True, visual=False)
%time corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)

Clearing existing...
Running UDF...
CPU times: user 1min 43s, sys: 5.26 s, total: 1min 48s
Wall time: 4min 27s


In [6]:
from snorkel.contrib.fonduer.models import Document, Phrase

print "Documents:", session.query(Document).count()
print "Phrases:", session.query(Phrase).count()

Documents: 30
Phrases: 66728


In [7]:
docs = session.query(Document).order_by(Document.name).all()

In [8]:
from snorkel.matchers import RegexMatchSpan, Union

# p-value matcher
rgx1 = u'[1-9]\d?[\xb7\.]?\d*[\s\u2009]*[\xd7\xb7\*][\s\u2009]*10[\s\u2009]*[-\u2212\u2013\u2012][\s\u2009]*\d+'
pval_rgx_matcher1 = RegexMatchSpan(rgx=rgx1)
rgx2 = u'[1-9]\d?[\xb7\.]?\d*[\s\u2009]*[eE][\s\u2009]*[-\u2212\u2013\u2012][\s\u2009]*\d+'
pval_rgx_matcher2 = RegexMatchSpan(rgx=rgx2)
rgx3 = u'0\.0000+\d+'
pval_rgx_matcher3 = RegexMatchSpan(rgx=rgx3)
pval_rgx_matcher = Union(pval_rgx_matcher1, pval_rgx_matcher2, pval_rgx_matcher3)

In [9]:
from snorkel.contrib.fonduer.fonduer.candidates import OmniNgrams

heptagrams = OmniNgrams(n_max=7, split_tokens=[])

In [10]:
from snorkel.contrib.fonduer.candidates import CandidateExtractor

candidate_extractor = CandidateExtractor(Pvalue, [heptagrams], [pval_rgx_matcher])

%time candidate_extractor.apply(docs, split=0, parallelism=PARALLEL)

Clearing existing...
Running UDF...
CPU times: user 32.2 s, sys: 633 ms, total: 32.8 s
Wall time: 32.4 s


Here we specified that these `Candidates` belong to the training set by specifying `split=0`; recall that we're referring to train/dev/test as splits 0/1/2.

In [11]:
candidates = session.query(Pvalue).filter(Pvalue.split == 0).all()
print "Number of candidates:", len(candidates)

Number of candidates: 763


In [15]:
import re

pvalue_rgx = u'{}|{}|{}'.format(rgx1, rgx2, rgx3)
pvalue_matcher = re.compile(pvalue_rgx, flags=(re.I|re.UNICODE))

def overlap(a1, b1, a2, b2):
    return not (b1 < a2 or a1 > b2)


def extract_metadata(phrase):
    return [p.text.encode('utf-8') for p in phrase.table.phrases if 
        p.row_end <= 2 and
        overlap(p.col_start, p.col_end, phrase.col_start, phrase.col_end) and 
        not pvalue_matcher.match(p.text)]


def make_line(phrase): 
    doc_id = phrase.document.name
    table_idx = phrase.table.position + 1 # make table_idx 1-indexed
    row = phrase.row_start
    col = phrase.col_start
    pvalue = c[0].get_span().ljust(10) # pad for uniform width
    metadata = extract_metadata(phrase)
#     print('{}: {}\t{}'.format(doc_id, pvalue, metadata))
    line = map(lambda x: unicode(x).encode('utf-8'), [doc_id, table_idx, row, col, pvalue, metadata])
    return line

In [16]:
import csv

OUTFILE = 'pvalue_metadata.tsv'

with open(OUTFILE, 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["doc_id", "table_index", "rows", "cols", "p-value", "metadata"])
    for c in candidates:
        phrase = c.get_parent()
        if not phrase.table:
            continue
        line = make_line(phrase)
        print(line)
        writer.writerow(line)

['18604267', '1', '19', '12', '2.1E-12   ', "['Combined', 'P-Value']"]
['18604267', '1', '12', '12', '4.8E-25   ', "['Combined', 'P-Value']"]
['18604267', '1', '16', '12', '3.6E-23   ', "['Combined', 'P-Value']"]
['18604267', '4', '2', '5', '1.7E-95   ', "['P-Value']"]
['18604267', '2', '3', '11', '2.0E-16   ', "['Combined', 'c', 'P-Value', '<2.0E-16']"]
['18604267', '1', '16', '10', '3.2E-07   ', "['WGHS-2', 'P-Value']"]
['18604267', '1', '18', '10', '5.2E-06   ', "['WGHS-2', 'P-Value']"]
['18604267', '1', '7', '12', '5.1E-29   ', "['Combined', 'P-Value']"]
['18604267', '2', '3', '7', '2.0E-16   ', "['WGHS-1', 'c', 'P-Value', '<2.0E-16']"]
['18604267', '1', '14', '10', '2.7E-07   ', "['WGHS-2', 'P-Value']"]
['18604267', '1', '13', '8', '5.7E-17   ', "['WGHS-1', 'P-Value']"]
['18604267', '2', '4', '7', '1.3E-08   ', "['WGHS-1', 'c', 'P-Value', '<2.0E-16']"]
['18604267', '1', '14', '12', '5.9E-23   ', "['Combined', 'P-Value']"]
['18604267', '1', '15', '10', '4.9E-05   ', "['WGHS-2', 'P-

We express several of these simple patterns below as a set of labeling functions: