# Table Tutorial

## Part I: Preprocessing

In [1]:
# Optional clean start: delete the database file left over from a previous run.
# Guarded so that a fresh checkout (no snorkel.db yet) does not raise OSError.
import os
if os.path.exists('snorkel.db'):
    os.remove('snorkel.db')

In [2]:
# Reload imported modules automatically before each cell runs, so edits to
# library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook.
%matplotlib inline

from snorkel import SnorkelSession
# Session handle used by the cells below to add, commit and query objects.
session = SnorkelSession()

## Parse the Train `Corpus`

In [3]:
from snorkel.parser import CorpusParser
from snorkel.parser import HTMLParser
from snorkel.parser import OmniParser

# Document-level parser pointed at the directory of training HTML files.
doc_parser = HTMLParser(
    path='data/hardware/hardware1000_html/',
)
# Context-level parser handed to the CorpusParser alongside doc_parser.
context_parser = OmniParser()
# Combine both parsers; max_docs caps the run at 100 documents.
cp = CorpusParser(
    doc_parser,
    context_parser,
    max_docs=100,
)

In [4]:
# Parse the documents into a corpus named 'Hardware Training'.
# %time reports the CPU and wall-clock time of this (slow) step.
%time corpus = cp.parse_corpus(name='Hardware Training', session=session)


CPU times: user 2min 6s, sys: 6.21 s, total: 2min 13s
Wall time: 5min 18s


In [5]:
# Show the first three parsed documents as a quick sanity check.
# print() with a single argument behaves the same on Python 2 and Python 3.
for doc in corpus.documents[:3]:
    print(doc)

Document 1N4006-Fairchild-datasheet-16963
Document 1N4003-Fairchild-Semiconductor-datasheet-33025849
Document 1N4005-Fairchild-Semiconductor-datasheet-37002469


### Saving the `Corpus`
Next, we persist the parsed corpus in Snorkel's database backend so that it can be reloaded later without re-parsing:

In [6]:
# Stage the parsed corpus in the session, then commit it to the database.
session.add(corpus)
session.commit()

### Reloading the `Corpus`
If the corpus has already been parsed, load it here:

In [7]:
from snorkel.models import Corpus

# Look up the stored corpus by its name.
# NOTE(review): .one() looks like a SQLAlchemy-style query that expects
# exactly one matching row - confirm.
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Training').one()
# Parenthesised print works on both Python 2 and Python 3.
print("%s contains %d Documents" % (corpus, len(corpus)))

Corpus (Hardware Training) contains 100 Documents


## Parse the Dev/Test `Corpus`

In [8]:
# Document-level parser pointed at the directory of dev/test HTML files.
doc_parser = HTMLParser(
    path='data/hardware/hardware100_html/',
)
# Context-level parser handed to the CorpusParser alongside doc_parser.
context_parser = OmniParser()
# Combine both parsers; max_docs caps the run at 100 documents.
cp = CorpusParser(
    doc_parser,
    context_parser,
    max_docs=100,
)

In [9]:
# Parse the dev/test documents into a corpus named 'Hardware'.
# NOTE: this rebinds `corpus`, so from here on the name refers to the
# 'Hardware' corpus rather than 'Hardware Training'.
%time corpus = cp.parse_corpus(name='Hardware', session=session)


CPU times: user 4min 8s, sys: 14.2 s, total: 4min 22s
Wall time: 9min 59s


In [10]:
# Stage the 'Hardware' corpus parsed above in the session and commit it.
session.add(corpus)
session.commit()

### Split the `Corpus` into Dev/Test

In [11]:
from snorkel.utils import split_corpus, get_ORM_instance

# No training share is requested: the documents are divided evenly between
# a development and a test corpus. The seed is fixed (presumably so the
# split is reproducible - confirm against split_corpus).
split_corpus(
    session,
    corpus,
    train=0,
    development=0.5,
    test=0.5,
    seed=1,
)

50 Documents added to corpus Hardware Development
50 Documents added to corpus Hardware Test


In [14]:
from snorkel.utils import get_ORM_instance
from snorkel.models import Corpus

# Fetch the development and test corpora by name and report their sizes.
# Parenthesised print works on both Python 2 and Python 3.
corpus_dev = get_ORM_instance(Corpus, session, 'Hardware Development')
print("%s contains %d Documents" % (corpus_dev, len(corpus_dev)))

corpus_test = get_ORM_instance(Corpus, session, 'Hardware Test')
print("%s contains %d Documents" % (corpus_test, len(corpus_test)))

Corpus (Hardware Development) contains 50 Documents
Corpus (Hardware Test) contains 50 Documents


In [15]:
# Optional: snapshot the database so this stage can be restored later
# without re-parsing. The copy is named 'snorkel.db corpus' (with a space).
# shutil.copy replaces the shell `cp` call: it is portable, avoids the
# invalid '\ ' escape in the string literal, and raises if the copy fails.
import os
import shutil
if os.path.exists('snorkel.db'):
    shutil.copy('snorkel.db', 'snorkel.db corpus')

Next, in Part II, we will look at how to extract `Candidate` relations from our saved `Corpus`.