# Table Tutorial

## Part I: Preprocessing

In [1]:
# If necessary: uncomment the two lines below to delete the existing
# 'snorkel.db' file before running the rest of this notebook.
# import os
# os.remove('snorkel.db')

In [2]:
# Enable IPython's autoreload extension (mode 2) and inline matplotlib output.
# NOTE(review): the recorded output of this cell shows the "already loaded"
# notice, which itself suggests `%reload_ext autoreload` for re-runs.
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Create the Snorkel session object that the cells below pass around as `session`.
from snorkel import SnorkelSession
session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Parse the `Corpus`

In [3]:
from snorkel.parser import CorpusParser, HTMLParser, OmniParser

# Document-level parser pointed at the hardware HTML directory.
doc_parser = HTMLParser(path='data/hardware/hardware_html/')

# Context-level parser, constructed with its defaults.
context_parser = OmniParser()

# Combine the two parsers into one corpus parser.
# NOTE(review): max_docs presumably limits how many documents are parsed; the
# recorded output further down reports 100 Documents for max_docs=101 -- confirm.
cp = CorpusParser(
    doc_parser,
    context_parser,
    max_docs=101,
)

In [4]:
# Parse the corpus under the name 'Hardware' and time the call.
# The recorded run below took about 11 minutes of wall time.
%time corpus = cp.parse_corpus(name='Hardware', session=session)


CPU times: user 5min 5s, sys: 13.2 s, total: 5min 18s
Wall time: 11min 8s


In [5]:
# Preview the first five documents of the parsed corpus.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print doc` statement is a SyntaxError on Python 3.
for doc in corpus.documents[:5]:
    print(doc)

Document UTCLS01324-1
Document MOTOS04676-1
Document LITES00424-1
Document PHGLS20267-1
Document MMMCS17742-1


### Saving the `Corpus`
Finally, we persist the parsed corpus in Snorkel's database backend:

In [6]:
# Register the parsed corpus with the session, then commit to persist it.
session.add(corpus)
session.commit()

### Reloading the `Corpus`
If the corpus has already been parsed, load it here:

In [7]:
from snorkel.models import Corpus

# Fetch the corpus named 'Hardware' from the session.
# NOTE(review): `.one()` presumably requires exactly one matching row -- confirm.
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware').one()

# Single-argument print(...) is valid on both Python 2 and Python 3; the
# original print statement only parses on Python 2.
print("%s contains %d Documents" % (corpus, len(corpus)))

Corpus (Hardware) contains 100 Documents


### Split the `Corpus` into Train/Dev/Test

In [9]:
# If necessary: uncomment the lines below to delete the 'Hardware Training',
# 'Hardware Development' and 'Hardware Test' corpora left over from an earlier
# run (e.g. before calling split_corpus again), then commit the deletions.
# train = session.query(Corpus).filter(Corpus.name == 'Hardware Training').one()
# session.delete(train)

# dev = session.query(Corpus).filter(Corpus.name == 'Hardware Development').one()
# session.delete(dev)

# test = session.query(Corpus).filter(Corpus.name == 'Hardware Test').one()
# session.delete(test)

# session.commit()

In [10]:
from snorkel.utils import split_corpus

# Split the corpus 50% / 25% / 25% into training, development and test sets,
# with a fixed seed.
split_corpus(
    session,
    corpus,
    train=0.5,
    development=0.25,
    test=0.25,
    seed=0,
)

50 Documents added to corpus Hardware Training
25 Documents added to corpus Hardware Development
25 Documents added to corpus Hardware Test


In [11]:
# If necessary: snapshot the database file as 'snorkel.db corpus'.
# shutil.copy replaces the shelled-out `cp`: it works without a POSIX shell,
# needs no backslash-escaped space in the target name, and raises if the copy
# fails instead of silently discarding a non-zero exit status.
import os  # kept so the notebook namespace is unchanged by this cell
import shutil

shutil.copy('snorkel.db', 'snorkel.db corpus');

Next, in Part II, we will look at how to extract `Candidate` relations from our saved `Corpus`.