# Table Tutorial

## Part I: Preprocessing

In [1]:
# If necessary: delete a database file left over from a previous run.
import os

# Guard the removal: os.remove raises OSError when the path does not exist,
# which would abort a first run (or a Restart & Run All) of this notebook.
if os.path.exists('snorkel.db'):
    os.remove('snorkel.db')

In [2]:
# Load the autoreload extension and enable mode 2 so edited modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline

# Create the session object that the cells below pass to the parser and use
# to add, commit and query corpora.
from snorkel import SnorkelSession
session = SnorkelSession()

## Parse the Train `Corpus`

In [3]:
from snorkel.parser import CorpusParser, HTMLParser, OmniParser

# Document-level parser pointed at the directory of hardware HTML files.
doc_parser = HTMLParser(
    path='data/hardware/hardware100_html/'
)

# Context-level parser applied to each document.
context_parser = OmniParser()

# Combine both parsers; at most 100 documents are processed.
cp = CorpusParser(
    doc_parser,
    context_parser,
    max_docs=100
)

In [4]:
# Parse the documents into a corpus named 'Hardware'. %time reports the cost;
# the recorded run below took roughly 10 minutes of wall time.
%time corpus = cp.parse_corpus(name='Hardware', session=session)


CPU times: user 4min 9s, sys: 13.5 s, total: 4min 23s
Wall time: 10min 28s


In [5]:
# Show the first three parsed documents as a quick sanity check.
# A parenthesised single-argument print behaves identically on Python 2 and
# also runs on Python 3, unlike the bare print statement.
for doc in corpus.documents[:3]:
    print(doc)

Document 2N3906-D
Document BC182-D
Document PNJIS01453-1


### Saving the `Corpus`
Finally, we persist the parsed corpus in Snorkel's database backend:

In [6]:
# Register the parsed corpus with the session, then commit the session.
session.add(corpus)
session.commit()

### Reloading the `Corpus`
If the corpus has already been parsed, load it here:

In [7]:
from snorkel.models import Corpus

# Fetch the previously saved corpus by its name ('Hardware').
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware').one()

# Parenthesised single-argument print: same output on Python 2, and valid on
# Python 3 as well.
print("%s contains %d Documents" % (corpus, len(corpus)))

Corpus (Hardware) contains 100 Documents


## Parse the Dev/Test `Corpus`

In [8]:
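# Disabled placeholder: no separate Dev/Test corpus is parsed in this notebook;
# the single 'Hardware' corpus is split into train/dev sets further below.
# doc_parser = HTMLParser(path='data/hardware/hardware100_html/')
# context_parser = OmniParser()
# cp = CorpusParser(doc_parser, context_parser, max_docs=100)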

In [9]:
# %time corpus = cp.parse_corpus(name='Hardware', session=session)

In [10]:
# session.add(corpus)
# session.commit()

### Split the `Corpus` into Train/Dev/Test

In [None]:
# Import Corpus in this cell too: the only earlier import lives in the optional
# "Reloading the Corpus" cell, so skipping that cell would otherwise leave the
# name undefined here.
from snorkel.models import Corpus
from snorkel.utils import split_corpus, get_ORM_instance

# Look up the full 'Hardware' corpus, then split it 80% train / 20% development
# with no test portion. The seed is fixed, presumably so the split is
# reproducible (confirm against split_corpus).
corpus = get_ORM_instance(Corpus, session, 'Hardware')
split_corpus(session, corpus, train=0.8, development=0.2, test=0, seed=3)

> /Users/bradenhancock/snorkel/snorkel/utils.py(187)split_corpus()
-> if num_train > 0:
(Pdb) print num_train
16
(Pdb) print num_development
4
(Pdb) print num_test
19.0
(Pdb) print n
20


In [12]:
from snorkel.utils import get_ORM_instance
from snorkel.models import Corpus

# Report the size of each split. Looping removes the duplicated lookup/print
# pair; `corpus` is still left bound to the Development corpus afterwards, as
# in the original cell.
for corpus_name in ('Hardware Training', 'Hardware Development'):
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    # Parenthesised single-argument print works on both Python 2 and 3.
    print("%s contains %d Documents" % (corpus, len(corpus)))

Corpus (Hardware Training) contains 80 Documents
Corpus (Hardware Development) contains 20 Documents


In [13]:
# If necessary: keep a copy of the database as it stands after corpus parsing.
import os
import shutil

# Copy with the standard library instead of shelling out to `cp`: this works
# on platforms without `cp`, needs no shell escaping for the space in the
# target name, and avoids the invalid '\ ' string escape.
# The existence check keeps the cell best-effort, as before: a missing
# database does not raise.
if os.path.exists('snorkel.db'):
    shutil.copy('snorkel.db', 'snorkel.db corpus')
else:
    print("snorkel.db not found; nothing to back up")

Next, in Part II, we will look at how to extract `Candidate` relations from our saved `Corpus`.