
Commit

added evaluation scripts
U-REDMOND\mgalley committed Jul 27, 2019
1 parent d92bae5 commit 7bce86f
Showing 15 changed files with 16,210 additions and 2 deletions.
4 changes: 2 additions & 2 deletions data/Makefile
@@ -37,8 +37,8 @@ $(TARGET)/train/merged.convos.txt: $(OFFICIAL_TRAIN_CONVOS)
$(TARGET)/train/merged.facts.txt: $(OFFICIAL_TRAIN_FACTS)
cat $+ > $@

#data-official-test/test.refs.txt: $(OFFICIAL_TEST_REFS)
# cat $+ | sort | uniq > $@
data-official-test/test.refs.txt: $(OFFICIAL_TEST_REFS)
cat $+ | sort | uniq > $@

data-official-valid/valid.convos.txt: $(OFFICIAL_VALID_CONVOS)
cat $+ | sort | uniq > $@
Empty file added evaluation/3rdparty/.create
Empty file.
50 changes: 50 additions & 0 deletions evaluation/README.md
@@ -0,0 +1,50 @@
# Evaluation

## Requirements
* Works with both Python 2.7 and 3.6
* Please **download** the following 3rd-party packages and save them in a new folder `3rdparty`:
* [**mteval-v14c.pl**](https://goo.gl/YUFajQ) to compute [NIST](http://www.mt-archive.info/HLT-2002-Doddington.pdf). You may need to install the following [perl](https://www.perl.org/get.html) modules (e.g. by `cpan install`): XML::Twig, Sort::Naturally, and String::Util.
* [**meteor-1.5**](http://www.cs.cmu.edu/~alavie/METEOR/download/meteor-1.5.tar.gz) to compute [METEOR](http://www.cs.cmu.edu/~alavie/METEOR/index.html). It requires [Java](https://www.java.com/en/download/help/download_options.xml).


## Create test data:

Please refer to the [data extraction page](https://github.com/qkaren/converse_reading_cmr/tree/master/data) to create the data. To create validation and test data, please run the following command:

```make -j4 valid test refs```

This will create the multi-reference file, along with the following four files:

* Validation data: ``valid.convos.txt`` and ``valid.facts.txt``
* Test data: ``test.convos.txt`` and ``test.facts.txt``

These files are in exactly the same format as ``train.convos.txt`` and ``train.facts.txt``, already explained [here](https://github.com/qkaren/converse_reading_cmr/tree/master/data). The only difference is that the ``response`` field of ``test.convos.txt`` has been replaced with the string ``__UNDISCLOSED__``.

Notes:
* The two validation files are optional, and you can skip them if you want (e.g., there is no need to send us system outputs for them). We provide them so that you can run your own automatic evaluation (BLEU, etc.) by comparing the ``response`` field with your own system outputs; a sketch of this is shown after these notes.
* Data creation should take about 1-4 days (depending on your internet connection, etc.). If you run into trouble creating the data, please contact us.
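
For illustration, the sketch below runs such a self-evaluation with the `nlp_metrics` helper used by `demo.py`. It assumes that the ``response`` field is the last tab-separated column of ``valid.convos.txt`` and that a hypothetical file ``my_valid_outputs.txt`` holds one system response per line, in the same order:

```
from metrics import *      # provides nlp_metrics, as used in demo.py
from tokenizers import *

# Assumption: the `response` field is the last tab-separated column of valid.convos.txt.
with open('valid.convos.txt', encoding='utf-8') as f:
    refs = [line.rstrip('\n').split('\t')[-1] for line in f]
with open('valid.refs.txt', 'w', encoding='utf-8') as f:   # temporary single-reference file
    f.write('\n'.join(refs))

# my_valid_outputs.txt is a hypothetical file: one system response per line,
# aligned with valid.convos.txt.
nist, bleu, meteor, entropy, diversity, avg_len = nlp_metrics(
    path_refs=['valid.refs.txt'],
    path_hyp='my_valid_outputs.txt')
print(nist, bleu, meteor, entropy, diversity, avg_len)
```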

### Data statistics

Number of conversational responses:
* Validation (valid.convos.txt): 4542 lines
* Test (test.convos.txt): 13440 lines

Because the data is created by querying Common Crawl, there may be small differences between your version of the data and our own. To compare the systems of each pair of participants, we will rely on the largest subset of the test set that is common to both participants. **However, if your file ``test.convos.txt`` contains fewer than 13,000 lines, this might indicate a problem, so please contact us immediately**.
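
As a rough sanity check (this is not the official comparison procedure), you can estimate the overlap between two copies of the test data with a sketch like the one below. It assumes, as `dstc.py` does, that the first tab-separated column of each ``test.convos.txt`` line is the hash key identifying the conversation; the second file path is a placeholder:

```
# Rough check: intersect the first-column hash keys of two copies of test.convos.txt.
def read_keys(path):
    with open(path, encoding='utf-8') as f:
        return set(line.split('\t')[0] for line in f if line.strip())

keys_mine = read_keys('test.convos.txt')            # your copy
keys_other = read_keys('other/test.convos.txt')     # placeholder: another participant's copy
common = keys_mine & keys_other
print('%d keys in common (%d vs. %d)' % (len(common), len(keys_mine), len(keys_other)))
```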

## Prepare your system output for evaluation:

To create a system output for evaluation, keep the ``test.convos.txt`` file and replace each ``__UNDISCLOSED__`` string with your own system output.
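
A minimal sketch of this step is shown below. It assumes, following the way `dstc.py` reads submissions, that the ``response`` is the last tab-separated field of each line; `my_model_response` and `submission.txt` are placeholders for your own generation code and output file name:

```
# Sketch: build a submission file by filling in the __UNDISCLOSED__ responses.
def my_model_response(columns):
    # placeholder: generate a response from the conversation context here
    return 'i do not know .'

with open('test.convos.txt', encoding='utf-8') as fin, \
        open('submission.txt', 'w', encoding='utf-8') as fout:
    for line in fin:
        cols = line.rstrip('\n').split('\t')
        if cols[-1] == '__UNDISCLOSED__':    # assumption: response is the last field
            cols[-1] = my_model_response(cols)
        fout.write('\t'.join(cols) + '\n')
```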

## Evaluation script:

Steps:
1) Make sure you `git pull` the latest changes, including changes in `../data`.
2) cd to `../data` and type `make`. This will create the multi-reference file used by the metrics (`../data/test.refs`).
3) Install the 3rd-party software as instructed above (METEOR and mteval-v14c.pl).
4) Run the following command, where `[SUBMISSION]` is the submission file you want to evaluate (same format as the one you submitted on Oct 8):
```
python dstc.py -c [SUBMISSION] --refs ../data/test.refs
```

Important: the results printed by `dstc.py` might differ slightly from the official results if part of your test set failed to download.
19 changes: 19 additions & 0 deletions evaluation/automatic-evaluation.sh
@@ -0,0 +1,19 @@
#!/bin/bash

# Automatic evaluation script:
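# Usage: ./automatic-evaluation.sh [SUBMISSION] [REFS]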

# Name of any of the files submitted to DSTC7 Task2
# (or any more recent file)
SUBMISSION=$1
#SUBMISSION=systems/constant-baseline.txt

# Make sure this file exists:
REFS=$2
#REFS=../../data_extraction/test.refs

if [ ! -f "$REFS" ]; then
    echo "Reference file not found. Please move to ../../data_extraction and type make."
else
    python dstc.py -c "$SUBMISSION" --refs "$REFS"
fi

21 changes: 21 additions & 0 deletions evaluation/demo.py
@@ -0,0 +1,21 @@
from metrics import *
from tokenizers import *

# evaluation


nist, bleu, meteor, entropy, diversity, avg_len = nlp_metrics(
path_refs=['demo/ref0.txt', 'demo/ref1.txt'],
path_hyp='demo/hyp.txt')

print(nist)
print(bleu)
print(meteor)
print(entropy)
print(diversity)
print(avg_len)

# tokenization

s = " I don't know:). how about this?https://github.com/golsun/deep-RL-time-series"
print(clean_str(s))
3 changes: 3 additions & 0 deletions evaluation/demo/hyp.txt
@@ -0,0 +1,3 @@
i do n't know .
he is a rocket scientist .
i love it !
3 changes: 3 additions & 0 deletions evaluation/demo/ref0.txt
@@ -0,0 +1,3 @@
ok that 's fine
he is a trader
i love it !
3 changes: 3 additions & 0 deletions evaluation/demo/ref1.txt
@@ -0,0 +1,3 @@
well it 's ok
he is an engineer
i 'm not a fan
154 changes: 154 additions & 0 deletions evaluation/dstc.py
@@ -0,0 +1,154 @@
# author: Xiang Gao @ Microsoft Research, Oct 2018
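# Scores a DSTC7 Task 2 submission against the multi-reference test set:
# computes NIST, BLEU, METEOR, entropy, diversity and average length via nlp_metrics.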

from util import *
from metrics import *
from tokenizers import *

def extract_cells(path_in, path_hash):
keys = [line.strip('\n') for line in open(path_hash)]
cells = dict()
for line in open(path_in, encoding='utf-8'):
c = line.strip('\n').split('\t')
k = c[0]
if k in keys:
cells[k] = c[1:]
return cells


def extract_hyp_refs(raw_hyp, raw_ref, path_hash, fld_out, n_refs=6, clean=False, vshuman=-1):
cells_hyp = extract_cells(raw_hyp, path_hash)
cells_ref = extract_cells(raw_ref, path_hash)
if not os.path.exists(fld_out):
os.makedirs(fld_out)

def _clean(s):
if clean:
return clean_str(s)
else:
return s

keys = sorted(cells_hyp.keys())
with open(fld_out + '/hash.txt', 'w', encoding='utf-8') as f:
f.write(unicode('\n'.join(keys)))

lines = [_clean(cells_hyp[k][-1]) for k in keys]
path_hyp = fld_out + '/hyp.txt'
with open(path_hyp, 'w', encoding='utf-8') as f:
f.write(unicode('\n'.join(lines)))

lines = []
for _ in range(n_refs):
lines.append([])
for k in keys:
refs = cells_ref[k]
for i in range(n_refs):
idx = i % len(refs)
if idx == vshuman:
idx = (idx + 1) % len(refs)
lines[i].append(_clean(refs[idx].split('|')[1]))

path_refs = []
for i in range(n_refs):
path_ref = fld_out + '/ref%i.txt'%i
with open(path_ref, 'w', encoding='utf-8') as f:
f.write(unicode('\n'.join(lines[i])))
path_refs.append(path_ref)

return path_hyp, path_refs


def eval_one_system(submitted, keys, multi_ref, n_refs=6, n_lines=None, clean=False, vshuman=-1, PRINT=True):

print('evaluating %s' % submitted)

fld_out = submitted.replace('.txt','')
if clean:
fld_out += '_cleaned'
path_hyp, path_refs = extract_hyp_refs(submitted, multi_ref, keys, fld_out, n_refs, clean=clean, vshuman=vshuman)
nist, bleu, meteor, entropy, div, avg_len = nlp_metrics(path_refs, path_hyp, fld_out, n_lines=n_lines)

if n_lines is None:
n_lines = len(open(path_hyp, encoding='utf-8').readlines())

if PRINT:
print('n_lines = '+str(n_lines))
print('NIST = '+str(nist))
print('BLEU = '+str(bleu))
print('METEOR = '+str(meteor))
print('entropy = '+str(entropy))
print('diversity = ' + str(div))
print('avg_len = '+str(avg_len))

return [n_lines] + nist + bleu + [meteor] + entropy + div + [avg_len]


def eval_all_systems(files, path_report, keys, multi_ref, n_refs=6, n_lines=None, clean=False, vshuman=-1):
# evaluate all systems (*.txt) in each folder `files`

with open(path_report, 'w') as f:
f.write('\t'.join(
['fname', 'n_lines'] + \
['nist%i'%i for i in range(1, 4+1)] + \
['bleu%i'%i for i in range(1, 4+1)] + \
['meteor'] + \
['entropy%i'%i for i in range(1, 4+1)] +\
['div1','div2','avg_len']
) + '\n')

for fl in files:
if fl.endswith('.txt'):
submitted = fl
results = eval_one_system(submitted, keys=keys, multi_ref=multi_ref, n_refs=n_refs, clean=clean, n_lines=n_lines, vshuman=vshuman, PRINT=False)
with open(path_report, 'a') as f:
f.write('\t'.join(map(str, [submitted] + results)) + '\n')
else:
for fname in os.listdir(fl):
if fname.endswith('.txt'):
submitted = fl + '/' + fname
results = eval_one_system(submitted, keys=keys, multi_ref=multi_ref, n_refs=n_refs, clean=clean, n_lines=n_lines, vshuman=vshuman, PRINT=False)
with open(path_report, 'a') as f:
f.write('\t'.join(map(str, [submitted] + results)) + '\n')

print('report saved to: '+path_report, file=sys.stderr)


if __name__ == '__main__':

parser = argparse.ArgumentParser()
parser.add_argument('submitted') # if 'all' or '*', eval all teams listed in dstc/teams.txt
# elif endswith '.txt', eval this single file
# else, eval all *.txt in folder `submitted_fld`

parser.add_argument('--clean', '-c', action='store_true') # whether to clean ref and hyp before eval
parser.add_argument('--n_lines', '-n', type=int, default=-1) # eval all lines (default) or top n_lines (e.g., for fast debugging)
parser.add_argument('--n_refs', '-r', type=int, default=6) # number of references
parser.add_argument('--vshuman', '-v', type=int, default='1') # when evaluating against human performance (N in refN.txt that should be removed)
# in which case we need to remove human output from refs
parser.add_argument('--refs', '-g', default='dstc/test.refs')
parser.add_argument('--keys', '-k', default='keys/test.2k.txt')
parser.add_argument('--teams', '-i', type=str, default='dstc/teams.txt')
parser.add_argument('--report', '-o', type=str, default=None)
args = parser.parse_args()
print('Args: %s\n' % str(args), file=sys.stderr)

if args.n_lines < 0:
n_lines = None # eval all lines
else:
n_lines = args.n_lines # just eval top n_lines

if args.submitted.endswith('.txt'):
eval_one_system(args.submitted, keys=args.keys, multi_ref=args.refs, clean=args.clean, n_lines=n_lines, n_refs=args.n_refs, vshuman=args.vshuman)
else:
fname_report = 'report_ref%i'%args.n_refs
if args.clean:
fname_report += '_cleaned'
fname_report += '.tsv'
if args.submitted == 'all' or args.submitted == '*':
files = ['dstc/' + line.strip('\n') for line in open(args.teams)]
path_report = 'dstc/' + fname_report
else:
files = [args.submitted]
path_report = args.submitted + '/' + fname_report
if args.report != None:
path_report = args.report
eval_all_systems(files, path_report, keys=args.keys, multi_ref=args.refs, clean=args.clean, n_lines=n_lines, n_refs=args.n_refs, vshuman=args.vshuman)
