In [1]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported before each cell runs and no kernel restart is needed.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from os.path import realpath

# Prepend the resolved ../src directory to the module search path so the
# project packages imported in later cells are found first.
sys.path[:0] = [realpath('../src')]

## Load Datasets

In [3]:
from common.event_script import ScriptCorpus

2018-04-26 01:36:16,308 - INFO - 'pattern' package not found; tag filters are not available for English


In [4]:
# Build a ScriptCorpus from the full text of the OnShort scripts file.
with open('../data/ontonotes/on_short_scripts.txt', 'r') as fin:
    on_short_scripts = ScriptCorpus.from_text(fin.read())

# Parenthesized single-argument print: identical output under Python 2,
# and valid syntax under Python 3 as well.
print('Found {} scripts in OnShort dataset'.format(on_short_scripts.num_scripts))

Found 1027 scripts in OnShort dataset


In [5]:
# Build a ScriptCorpus from the full text of the OnLong scripts file.
with open('../data/ontonotes/on_long_scripts.txt', 'r') as fin:
    on_long_scripts = ScriptCorpus.from_text(fin.read())

# Parenthesized single-argument print: identical output under Python 2,
# and valid syntax under Python 3 as well.
print('Found {} scripts in OnLong dataset'.format(on_long_scripts.num_scripts))

Found 597 scripts in OnLong dataset


## Create SeqWord2Vec Evaluator

In [6]:
import os
from os.path import join

from utils import Word2VecModel

# Directory holding the pre-trained word2vec space. Set the WORD2VEC_DIR
# environment variable to point at a local copy; when it is unset, the path
# used for the recorded run is kept as the default.
word2vec_dir = os.environ.get(
    'WORD2VEC_DIR',
    '/home/pengxiang/workspace/corpora/enwiki-20160901/word2vec/spaces/sample_1e-4_min_500')

# The model is read from the .bin vectors file plus its companion .vocab file.
word2vec = Word2VecModel.load_model(
    join(word2vec_dir, 'min_500_dim300vecs.bin'),
    fvocab=join(word2vec_dir, 'min_500_dim300vecs.vocab'))

2018-04-26 01:36:21,690 - INFO - loading word counts from /home/pengxiang/workspace/corpora/enwiki-20160901/word2vec/spaces/sample_1e-4_min_500/min_500_dim300vecs.vocab
2018-04-26 01:36:21,779 - INFO - loading projection weights from /home/pengxiang/workspace/corpora/enwiki-20160901/word2vec/spaces/sample_1e-4_min_500/min_500_dim300vecs.bin
2018-04-26 01:36:22,218 - INFO - loaded (53345, 300) matrix from /home/pengxiang/workspace/corpora/enwiki-20160901/word2vec/spaces/sample_1e-4_min_500/min_500_dim300vecs.bin
2018-04-26 01:36:22,221 - INFO - precomputing L2-norms of word weight vectors


In [7]:
from evaluate.seq_word2vec_evaluator import SeqWord2VecEvaluator

# Keyword flags are grouped the same way the evaluator reports them in its
# log output (embedding / general / evaluator-specific configs).
word2vec_evaluator = SeqWord2VecEvaluator(
    # embedding configs
    use_lemma=True,
    include_type=True,
    # general configs
    ignore_first_mention=False,
    filter_stop_events=True,
    filter_repetitive_prep=True,
    # evaluator specific configs
    use_max_score=True,
)

# Attach the word2vec embedding model loaded in the previous cell.
word2vec_evaluator.set_model(word2vec)

Using cuDNN version 7004 on context None
Mapped name None to device cuda: GeForce GTX 1080 Ti (0000:01:00.0)
2018-04-26 01:36:24,865 - INFO - set embedding model: min_500_dim300vecs


## Evaluate the SeqWord2Vec Baseline on OnShort

In [8]:
# Run the evaluator over every script in the OnShort corpus. It logs the
# active configs and prints accuracy tables (overall, by arg type, by POS).
# The recorded run took about 1 min 9 s for 1027 scripts.
word2vec_evaluator.evaluate(on_short_scripts.scripts)

2018-04-26 01:36:27,021 - INFO - evaluation based on word2vec (sequential), with embedding model min_500_dim300vecs
2018-04-26 01:36:27,023 - INFO - embedding configs: use_lemma = True, include_type = True
2018-04-26 01:36:27,024 - INFO - general configs: include_all_pobj = False, ignore_first_mention = False, filter_stop_events = True
2018-04-26 01:36:27,025 - INFO - evaluator specific configs: use_max_score = True
2018-04-26 01:36:27,026 - INFO - general configs: filter_repetitive_prep = True
Processed: 100%|████████████████████████████████████████████████| 1027/1027 [01:09<00:00, 14.73it/s]


+-------------------------------------------------------------------------------+
|                    # Cases     # Correct     Accuracy (%)      Avg # Choices  |
|       All            7781         2845           36.56             25.70      |
+-------------------------------------------------------------------------------+
+-------------------------------------------------------------------------------+
|    Arg Type        # Cases     # Correct     Accuracy (%)      Avg # Choices  |
|      SUBJ            5080         1909           37.58             26.18      |
|       OBJ            1681         581            34.56             24.13      |
|      POBJ            1020         355            34.80             25.90      |
+-------------------------------------------------------------------------------+
+-------------------------------------------------------------------------------+
|       POS          # Cases     # Correct     Accuracy (%)      Avg # Choices  |
|      Noun    




## Evaluate the SeqWord2Vec Baseline on OnLong

In [9]:
# Run the evaluator over every script in the OnLong corpus. It logs the
# active configs and prints accuracy tables (overall, by arg type, by POS).
# The recorded run took about 9 min 54 s for 597 scripts.
word2vec_evaluator.evaluate(on_long_scripts.scripts)

2018-04-26 01:41:33,203 - INFO - evaluation based on word2vec (sequential), with embedding model min_500_dim300vecs
2018-04-26 01:41:33,204 - INFO - embedding configs: use_lemma = True, include_type = True
2018-04-26 01:41:33,205 - INFO - general configs: include_all_pobj = False, ignore_first_mention = False, filter_stop_events = True
2018-04-26 01:41:33,206 - INFO - evaluator specific configs: use_max_score = True
2018-04-26 01:41:33,208 - INFO - general configs: filter_repetitive_prep = True
Processed: 100%|██████████████████████████████████████████████████| 597/597 [09:54<00:00,  1.00it/s]


+-------------------------------------------------------------------------------+
|                    # Cases     # Correct     Accuracy (%)      Avg # Choices  |
|       All           10539         2509           23.81             70.63      |
+-------------------------------------------------------------------------------+
+-------------------------------------------------------------------------------+
|    Arg Type        # Cases     # Correct     Accuracy (%)      Avg # Choices  |
|      SUBJ            7605         1898           24.96             70.84      |
|       OBJ            2036         409            20.09             69.11      |
|      POBJ            898          202            22.49             72.34      |
+-------------------------------------------------------------------------------+
+-------------------------------------------------------------------------------+
|       POS          # Cases     # Correct     Accuracy (%)      Avg # Choices  |
|      Noun    


