
Initial commit
yuhaozhang committed Sep 26, 2018
0 parents commit 49c848f
Showing 21 changed files with 1,507 additions and 0 deletions.
9 changes: 9 additions & 0 deletions .gitignore
@@ -0,0 +1,9 @@
__pycache__/
*.py[cod]
*$py.class

.DS_Store
*.env
*.tmp
*.conf

13 changes: 13 additions & 0 deletions LICENSE
@@ -0,0 +1,13 @@
Copyright 2018 The Board of Trustees of The Leland Stanford Junior University

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
144 changes: 144 additions & 0 deletions data/loader.py
@@ -0,0 +1,144 @@
"""
Data loader for TACRED json files.
"""

import json
import random
import torch
import numpy as np

from utils import constant, helper, vocab

class DataLoader(object):
"""
Load data from json files, preprocess and prepare batches.
"""
def __init__(self, filename, batch_size, opt, vocab, evaluation=False):
self.batch_size = batch_size
self.opt = opt
self.vocab = vocab
self.eval = evaluation
self.label2id = constant.LABEL_TO_ID

with open(filename) as infile:
data = json.load(infile)
self.raw_data = data
data = self.preprocess(data, vocab, opt)

# shuffle for training
if not evaluation:
indices = list(range(len(data)))
random.shuffle(indices)
data = [data[i] for i in indices]
self.id2label = dict([(v,k) for k,v in self.label2id.items()])
self.labels = [self.id2label[d[-1]] for d in data]
self.num_examples = len(data)

# chunk into batches
data = [data[i:i+batch_size] for i in range(0, len(data), batch_size)]
self.data = data
print("{} batches created for {}".format(len(data), filename))

def preprocess(self, data, vocab, opt):
""" Preprocess the data and convert to ids. """
processed = []
for d in data:
tokens = list(d['token'])
if opt['lower']:
tokens = [t.lower() for t in tokens]
# anonymize tokens
ss, se = d['subj_start'], d['subj_end']
os, oe = d['obj_start'], d['obj_end']
tokens[ss:se+1] = ['SUBJ-'+d['subj_type']] * (se-ss+1)
tokens[os:oe+1] = ['OBJ-'+d['obj_type']] * (oe-os+1)
tokens = map_to_ids(tokens, vocab.word2id)
pos = map_to_ids(d['stanford_pos'], constant.POS_TO_ID)
ner = map_to_ids(d['stanford_ner'], constant.NER_TO_ID)
deprel = map_to_ids(d['stanford_deprel'], constant.DEPREL_TO_ID)
head = [int(x) for x in d['stanford_head']]
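            # dependency heads are 1-indexed token positions; 0 marks the root, so every sentence must contain a 0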
assert any([x == 0 for x in head])
l = len(tokens)
subj_positions = get_positions(d['subj_start'], d['subj_end'], l)
obj_positions = get_positions(d['obj_start'], d['obj_end'], l)
subj_type = [constant.SUBJ_NER_TO_ID[d['subj_type']]]
obj_type = [constant.OBJ_NER_TO_ID[d['obj_type']]]
relation = self.label2id[d['relation']]
processed += [(tokens, pos, ner, deprel, head, subj_positions, obj_positions, subj_type, obj_type, relation)]
return processed

def gold(self):
""" Return gold labels as a list. """
return self.labels

def __len__(self):
return len(self.data)

def __getitem__(self, key):
""" Get a batch with index. """
if not isinstance(key, int):
raise TypeError
if key < 0 or key >= len(self.data):
raise IndexError
batch = self.data[key]
batch_size = len(batch)
batch = list(zip(*batch))
assert len(batch) == 10

# sort all fields by lens for easy RNN operations
lens = [len(x) for x in batch[0]]
batch, orig_idx = sort_all(batch, lens)

# word dropout
if not self.eval:
words = [word_dropout(sent, self.opt['word_dropout']) for sent in batch[0]]
else:
words = batch[0]

# convert to tensors
words = get_long_tensor(words, batch_size)
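        # padding mask: True at positions whose id is 0 (PAD)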
masks = torch.eq(words, 0)
pos = get_long_tensor(batch[1], batch_size)
ner = get_long_tensor(batch[2], batch_size)
deprel = get_long_tensor(batch[3], batch_size)
head = get_long_tensor(batch[4], batch_size)
subj_positions = get_long_tensor(batch[5], batch_size)
obj_positions = get_long_tensor(batch[6], batch_size)
subj_type = get_long_tensor(batch[7], batch_size)
obj_type = get_long_tensor(batch[8], batch_size)

rels = torch.LongTensor(batch[9])

return (words, masks, pos, ner, deprel, head, subj_positions, obj_positions, subj_type, obj_type, rels, orig_idx)

def __iter__(self):
for i in range(self.__len__()):
yield self.__getitem__(i)

def map_to_ids(tokens, vocab):
ids = [vocab[t] if t in vocab else constant.UNK_ID for t in tokens]
return ids

def get_positions(start_idx, end_idx, length):
""" Get subj/obj position sequence. """
return list(range(-start_idx, 0)) + [0]*(end_idx - start_idx + 1) + \
list(range(1, length-end_idx))

def get_long_tensor(tokens_list, batch_size):
""" Convert list of list of tokens to a padded LongTensor. """
token_len = max(len(x) for x in tokens_list)
tokens = torch.LongTensor(batch_size, token_len).fill_(constant.PAD_ID)
for i, s in enumerate(tokens_list):
tokens[i, :len(s)] = torch.LongTensor(s)
return tokens

def sort_all(batch, lens):
""" Sort all fields by descending order of lens, and return the original indices. """
unsorted_all = [lens] + [range(len(lens))] + list(batch)
sorted_all = [list(t) for t in zip(*sorted(zip(*unsorted_all), reverse=True))]
return sorted_all[2:], sorted_all[1]

def word_dropout(tokens, dropout):
""" Randomly dropout tokens (IDs) and replace them with <UNK> tokens. """
return [constant.UNK_ID if x != constant.UNK_ID and np.random.random() < dropout \
else x for x in tokens]
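
For reference, a minimal usage sketch of the loader above, assuming a prebuilt Vocab pickle; the vocab path and option values are illustrative, and only the 'lower' and 'word_dropout' options are read by DataLoader itself:

from utils.vocab import Vocab
from data.loader import DataLoader

vocab = Vocab('saved_models/00/vocab.pkl', load=True)  # hypothetical path to a saved vocab file
opt = {'lower': True, 'word_dropout': 0.04}  # minimal subset of options the loader reads
train_batches = DataLoader('dataset/tacred/train.json', 50, opt, vocab, evaluation=False)
for words, masks, pos, ner, deprel, head, subj_pos, obj_pos, subj_type, obj_type, rels, orig_idx in train_batches:
    pass  # feed the batch tensors to a model here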

4 changes: 4 additions & 0 deletions dataset/tacred/README.md
@@ -0,0 +1,4 @@
TACRED Data
=================

Note that the full TACRED dataset will be licensed and distributed via the LDC (Linguistic Data Consortium). For completeness, this repo only contains sampled data from TACRED.
1 change: 1 addition & 0 deletions dataset/tacred/dev.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions dataset/tacred/test.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions dataset/tacred/train.json

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions download.sh
@@ -0,0 +1,14 @@
#!/bin/bash

cd dataset; mkdir glove
cd glove

echo "==> Downloading glove vectors..."
wget http://nlp.stanford.edu/data/glove.840B.300d.zip

echo "==> Unzipping glove vectors..."
unzip glove.840B.300d.zip
rm glove.840B.300d.zip

echo "==> Done."

66 changes: 66 additions & 0 deletions eval.py
@@ -0,0 +1,66 @@
"""
Run evaluation with saved models.
"""
import random
import argparse
from tqdm import tqdm
import torch

from data.loader import DataLoader
from model.trainer import GCNTrainer
from utils import torch_utils, scorer, constant, helper
from utils.vocab import Vocab

parser = argparse.ArgumentParser()
parser.add_argument('model_dir', type=str, help='Directory of the model.')
parser.add_argument('--model', type=str, default='best_model.pt', help='Name of the model file.')
parser.add_argument('--data_dir', type=str, default='dataset/tacred')
parser.add_argument('--dataset', type=str, default='test', help="Evaluate on dev or test.")

parser.add_argument('--seed', type=int, default=1234)
parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available())
parser.add_argument('--cpu', action='store_true')
args = parser.parse_args()

torch.manual_seed(args.seed)
random.seed(args.seed)
if args.cpu:
args.cuda = False
elif args.cuda:
torch.cuda.manual_seed(args.seed)

# load opt
model_file = args.model_dir + '/' + args.model
print("Loading model from {}".format(model_file))
opt = torch_utils.load_config(model_file)
trainer = GCNTrainer(opt)
trainer.load(model_file)

# load vocab
vocab_file = args.model_dir + '/vocab.pkl'
vocab = Vocab(vocab_file, load=True)
assert opt['vocab_size'] == vocab.size, "Vocab size must match that in the saved model."

# load data
data_file = opt['data_dir'] + '/{}.json'.format(args.dataset)
print("Loading data from {} with batch size {}...".format(data_file, opt['batch_size']))
batch = DataLoader(data_file, opt['batch_size'], opt, vocab, evaluation=True)

helper.print_config(opt)
label2id = constant.LABEL_TO_ID
id2label = dict([(v,k) for k,v in label2id.items()])

predictions = []
all_probs = []
batch_iter = tqdm(batch)
for i, b in enumerate(batch_iter):
preds, probs, _ = trainer.predict(b)
predictions += preds
all_probs += probs

predictions = [id2label[p] for p in predictions]
p, r, f1 = scorer.score(batch.gold(), predictions, verbose=True)
print("{} set evaluate result: {:.2f}\t{:.2f}\t{:.2f}".format(args.dataset,p,r,f1))

print("Evaluation ended.")
