In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Notebook-wide settings.
#   domain: task name; used below to derive the default database name.
#   split:  which candidate split to query.
# Optional keys read by the next cell: 'debug', 'db_name', 'postgres', 'db_port'.
config = dict(
    domain='spouse',
    split=1,
)

In [3]:
# Assemble the database connection string and export it as $SNORKELDB.
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

# Default name is 'babble_<domain>', with a '_debug' suffix when config['debug'] is truthy.
default_db_name = 'babble_' + config['domain']
if config.get('debug', False):
    default_db_name += '_debug'
DB_NAME = config.get('db_name', default_db_name)

# Postgres only when explicitly requested; otherwise a local sqlite file,
# whose name gets a '.db' extension.
DB_TYPE = 'postgres' if config.get('postgres') else 'sqlite'
if DB_TYPE == 'sqlite':
    DB_NAME += '.db'

# The host part stays empty unless a port was configured.
if 'db_port' in config:
    DB_ADDR = "localhost:{0}".format(config['db_port'])
else:
    DB_ADDR = ""

os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_spouse.db


In [4]:
# Create the Snorkel session used for all queries below.
# NOTE(review): presumably bound to the database named by $SNORKELDB, which is
# why that variable is exported before this import -- confirm.
from snorkel import SnorkelSession
session = SnorkelSession()

In [5]:
from snorkel.models import candidate_subclass

# Build the Spouse candidate class from its name and its two field names,
# person1 and person2; it is the type queried in the next cell.
Spouse = candidate_subclass('Spouse', ['person1', 'person2'])

In [6]:
# Fetch every Spouse candidate belonging to the configured split, ordered by id.
candidates = (
    session.query(Spouse)
    .filter(Spouse.split == config['split'])
    .order_by(Spouse.id)
    .all()
)
print("Candidates: {}".format(len(candidates)))

Candidates: 2796


### Write Input File

In [7]:
from snorkel.annotations import load_gold_labels

# Load the gold labels for the same split the candidates were queried from.
# Using config['split'] instead of a second hard-coded split number keeps
# L_gold in step with `candidates` (looked up row-by-row in the next cell)
# whenever the configured split is changed.
L_gold = load_gold_labels(session, annotator_name='gold', split=config['split'])
L_gold

<2796x1 sparse matrix of type '<type 'numpy.int64'>'
	with 2796 stored elements in Compressed Sparse Row format>

In [8]:
# Collect one gold label per candidate, in the same order as `candidates`.
labels = []
for cand in candidates:
    # Find the candidate's row in the gold-label matrix; column 0 holds the label.
    row = L_gold.get_row_index(cand)
    labels.append(L_gold[row, 0])
assert len(labels) == len(candidates)

In [9]:
from gradturk_processing import GradTurkHelper

# Build the helper from the candidates and their gold labels.
# num_hits * candidates_per_hit = 75 * 5 = 375 candidates requested in total,
# with pct_positive asking for half of them to be positive.
helper = GradTurkHelper(
    candidates,
    labels,
    pct_positive=0.5,
    num_hits=75,
    candidates_per_hit=5,
)

Found 196 positive, 2600 negative, 0 unknown candidates.
Using 187 positive, 188 negative candidates.


In [10]:
import os

# Write the candidate index CSV under $SNORKELHOME.
# os.path.join keeps the path valid whether or not $SNORKELHOME ends with a
# separator, and avoids hard-coding '/' as the separator.
index_path = os.path.join(
    os.environ['SNORKELHOME'],
    'tutorials', 'babble', 'spouse', 'data', 'gradturk_candidate_index.csv')
helper.write_candidate_index(fpath=index_path)

In [16]:
# Write the candidate HTML file under $SNORKELHOME (relies on `os` imported in
# an earlier cell). os.path.join keeps the path valid whether or not
# $SNORKELHOME ends with a separator.
html_path = os.path.join(
    os.environ['SNORKELHOME'],
    'tutorials', 'babble', 'spouse', 'data', 'gradturk_candidate_html.html')
helper.write_candidate_html(fpath=html_path)

In [None]:
# input_csv_path= os.environ['SNORKELHOME'] + '/tutorials/babble/spouse/data/mturk_spouse_04_in.csv'
# helper.preprocess(input_csv_path)

### Read Output File

In [None]:
from mturk_processing import MTurkHelper

# NOTE: this rebinds `helper`, which until now referred to the GradTurkHelper
# used by the "Write Input File" cells; re-create that helper before re-running them.
helper = MTurkHelper()

# Location of the MTurk results CSV under $SNORKELHOME (relies on `os` imported
# in an earlier cell). os.path.join keeps the path valid whether or not
# $SNORKELHOME ends with a separator.
output_csv_path = os.path.join(
    os.environ['SNORKELHOME'],
    'tutorials', 'babble', 'spouse', 'data', 'mturk_spouse_all_out.csv')

# postprocess() returns the explanations used by the following cells, which
# read each one's `.condition` and `.candidate`.
explanations = helper.postprocess(output_csv_path, candidates=candidates, verbose=True)

In [None]:
# Wrap the explanations in an iterator so the next cell can be re-run to step
# through them one at a time.
exp_iterator = iter(explanations)

In [None]:
from snorkel.viewer import SentenceNgramViewer

# Advance to the next explanation. The builtin next() works on both Python 2
# and Python 3 iterators, whereas the `.next()` method exists only on Python 2.
# Raises StopIteration once every explanation has been shown.
exp = next(exp_iterator)
print(exp.condition)

# Show the explanation's candidate in the viewer; the bare `sv` on the last
# line makes the notebook display it.
sv = SentenceNgramViewer([exp.candidate], session, n_per_page=3, height=150)
sv

In [None]:
# Print the `condition` of each of the first 25 explanations, one per line.
for exp in explanations[:25]:
    print(exp.condition)

### Write Explanations File

In [None]:
import os
from snorkel.contrib.babble import ExplanationIO

# Destination file for the explanations, under $SNORKELHOME.
# os.path.join keeps the path valid whether or not $SNORKELHOME ends with a
# separator. Alternate target in the same directory: 'grad_explanations.tsv'.
fpath = os.path.join(
    os.environ['SNORKELHOME'],
    'tutorials', 'babble', 'spouse', 'data', 'mturk_explanations_all.tsv')

# `expio` and `fpath` are reused by the read-back cell below.
expio = ExplanationIO()
expio.write(explanations, fpath)

### Read Explanations File

In [None]:
from pprint import pprint

# Read the explanations back from `fpath` using the `expio` object created in
# the write cell (this rebinds `explanations`), then pretty-print the first ten.
explanations = expio.read(fpath)
pprint(explanations[:10])