From 48096b6c7b3722928662e9a12b1d64612e4a8a80 Mon Sep 17 00:00:00 2001
From: Rudolf Kadlec
Date: Thu, 11 Feb 2016 17:44:09 +0100
Subject: [PATCH 1/3] It is now possible to generate more training examples
 from each dialog. Max context size is now a parameter of the script.

---
 src/create_ubuntu_dataset.py | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/src/create_ubuntu_dataset.py b/src/create_ubuntu_dataset.py
index f790ff6..36343e8 100644
--- a/src/create_ubuntu_dataset.py
+++ b/src/create_ubuntu_dataset.py
@@ -1,5 +1,4 @@
 import argparse
-import cPickle as pickle
 import os
 import unicodecsv
 import random
@@ -11,7 +10,9 @@
 """
 Script for generation of train, test and valid datasets from Ubuntu Corpus 1 on 1 dialogs.
 
-Copyright IBM 2015
+Copyright IBM Corporation 2016
+LICENSE: Apache License 2.0 URL: http://www.apache.org/licenses/LICENSE-2.0
+Contact: Rudolf Kadlec (rudolf_kadlec@cz.ibm.com)
 """
 
 dialog_end_symbol = "__dialog_end__"
@@ -163,7 +164,7 @@ def create_single_dialog_train_example(context_dialog_path, candidate_dialog_pat
 
     return context_str, response, label
 
-def create_single_dialog_test_example(context_dialog_path, candidate_dialog_paths, rng, distractors_num):
+def create_single_dialog_test_example(context_dialog_path, candidate_dialog_paths, rng, distractors_num, max_context_length):
     """
     Creates a single example for testing or validation. Each line contains a context, one positive example and N negative examples.
     :param context_dialog_path:
@@ -175,7 +176,7 @@ def create_single_dialog_test_example(context_dialog_path, candidate_dialog_path
 
     dialog = translate_dialog_to_lists(context_dialog_path)
 
-    context_str, next_utterance_ix = create_random_context(dialog, rng)
+    context_str, next_utterance_ix = create_random_context(dialog, rng, max_context_length=max_context_length)
 
     # use the next utterance as positive example
     positive_response = singe_user_utterances_to_string(dialog[next_utterance_ix])
@@ -184,7 +185,7 @@ def create_single_dialog_test_example(context_dialog_path, candidate_dialog_path
 
     return context_str, positive_response, negative_responses
 
-def create_examples_train(candidate_dialog_paths, rng, positive_probability=0.5):
+def create_examples_train(candidate_dialog_paths, rng, positive_probability=0.5, max_context_length=20):
     """
     Creates single training example.
     :param candidate_dialog_paths:
@@ -198,11 +199,12 @@ def create_examples_train(candidate_dialog_paths, rng, positive_probability=0.5)
         if i % 1000 == 0:
             print str(i)
         dialog_path = candidate_dialog_paths[i]
-        examples.append(create_single_dialog_train_example(dialog_path, candidate_dialog_paths, rng, positive_probability))
+        examples.append(create_single_dialog_train_example(dialog_path, candidate_dialog_paths, rng, positive_probability,
+                                                           max_context_length=max_context_length))
         i+=1
     #return map(lambda dialog_path : create_single_dialog_train_example(dialog_path, candidate_dialog_paths, rng, positive_probability), candidate_dialog_paths)
 
-def create_examples(candidate_dialog_paths, creator_function):
+def create_examples(candidate_dialog_paths, examples_num, creator_function):
     """
     Creates a list of training examples from a list of dialogs and function that transforms a dialog to an example.
     :param candidate_dialog_paths:
@@ -211,7 +213,10 @@ def create_examples(candidate_dialog_paths, creator_function):
     """
     i = 0
     examples = []
-    for context_dialog in candidate_dialog_paths:
+    unique_dialogs_num = len(candidate_dialog_paths)
+
+    while i < examples_num:
+        context_dialog = candidate_dialog_paths[i % unique_dialogs_num]
         # counter for tracking progress
         if i % 1000 == 0:
             print str(i)
@@ -280,7 +285,10 @@ def create_eval_dataset(args, file_list_csv):
     dialog_paths = map(lambda path: os.path.join(args.data_root, "dialogs", path), convert_csv_with_dialog_paths(f))
 
     data_set = create_examples(dialog_paths,
-                               lambda context_dialog, candidates : create_single_dialog_test_example(context_dialog, candidates, rng, args.n))
+                               args.examples,
+                               lambda context_dialog, candidates :
+                               create_single_dialog_test_example(context_dialog, candidates, rng,
+                                                                 args.n, args.create_single_dialog_test_example))
     # output the dataset
     w = unicodecsv.writer(open(args.output, 'w'), encoding='utf-8')
     # header
@@ -302,9 +310,12 @@ def train_cmd(args):
     f = open(os.path.join("meta", "trainfiles.csv"), 'r')
     dialog_paths = map(lambda path: os.path.join(args.data_root, "dialogs", path), convert_csv_with_dialog_paths(f))
 
-    dialog_paths = dialog_paths[:args.examples]
-    train_set = create_examples(dialog_paths, lambda context_dialog, candidates : create_single_dialog_train_example(context_dialog, candidates, rng, args.p))
+    train_set = create_examples(dialog_paths,
+                                args.examples,
+                                lambda context_dialog, candidates :
+                                create_single_dialog_train_example(context_dialog, candidates, rng,
+                                                                   args.p, max_context_length=args.max_context_length))
 
     # output the dataset
     w = unicodecsv.writer(open(args.output, 'w'), encoding='utf-8')
 
@@ -331,6 +342,9 @@ def test_cmd(args):
 parser.add_argument('--seed', type=int, default=1234,
                     help='seed for random number generator')
 
+parser.add_argument('--max_context_length', type=int, default=20,
+                    help='maximum number of dialog turns in the context')
+
 parser.add_argument('-o', '--output', default=None,
                     help='output csv')
 

From 1b10fc79613e7e8e209bfcaa92db48217093afdd Mon Sep 17 00:00:00 2001
From: Rudolf Kadlec
Date: Wed, 17 Feb 2016 13:31:44 +0100
Subject: [PATCH 2/3] Solves crashes during test and valid set generation
 introduced in last commit.
Fixes #1
---
 src/create_ubuntu_dataset.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/create_ubuntu_dataset.py b/src/create_ubuntu_dataset.py
index 36343e8..0e49870 100644
--- a/src/create_ubuntu_dataset.py
+++ b/src/create_ubuntu_dataset.py
@@ -285,10 +285,9 @@ def create_eval_dataset(args, file_list_csv):
     dialog_paths = map(lambda path: os.path.join(args.data_root, "dialogs", path), convert_csv_with_dialog_paths(f))
 
     data_set = create_examples(dialog_paths,
-                               args.examples,
-                               lambda context_dialog, candidates :
-                               create_single_dialog_test_example(context_dialog, candidates, rng,
-                                                                 args.n, args.create_single_dialog_test_example))
+                               len(dialog_paths),
+                               lambda context_dialog, candidates : create_single_dialog_test_example(context_dialog, candidates, rng,
+                                                                                                     args.n, args.max_context_length))
     # output the dataset
     w = unicodecsv.writer(open(args.output, 'w'), encoding='utf-8')
     # header
@@ -337,7 +336,7 @@ def test_cmd(args):
                                  "The script downloads 1on1 dialogs from internet and then it randomly samples all the datasets with positive and negative examples.")
 
 parser.add_argument('--data_root', default='.',
-                    help='directory where 1on1 dialogs will downloaded and extracted, the data will be downloaded from cs.mcgill.ca/~jpineau/datasets/ubuntu-corpus-1.0/ubuntu_dialogs.tgz')
+                    help='directory where 1on1 dialogs will be downloaded and extracted, the data will be downloaded from cs.mcgill.ca/~jpineau/datasets/ubuntu-corpus-1.0/ubuntu_dialogs.tgz')
 
 parser.add_argument('--seed', type=int, default=1234,
                     help='seed for random number generator')

From 043e00c51f505d2efb18da17c8fb9497058f9a96 Mon Sep 17 00:00:00 2001
From: ryan-lowe
Date: Sat, 20 Feb 2016 15:58:59 -0500
Subject: [PATCH 3/3] Updated readme with baseline results, named entity changes

---
 README.md | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/README.md b/README.md
index 4b0f059..addcae7 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,13 @@ real life implementation, where you are training a model on past data to predict
 (between 2 and the max context size). This increases the average context length, which we consider desirable
 since we would like to model long-term dependencies.
 
+-Changed the tokenization and entity replacement procedure. After complaints that v1 was too aggressive, we've decided to remove these steps.
+It is up to each person using the dataset to come up with their own tokenization/entity replacement scheme. We plan to use twokenize internally.
+
+-Added differentiation between the end of an utterance (__eou__) and the end of a turn (__eot__). In the original dataset, we concatenated all consecutive
+utterances by the same user into one utterance, and put __EOS__ at the end. Here, we also denote where the original utterances were (with __eou__). Also, the
+terminology should now be consistent between the training and test set (instead of both __EOS__ and ).
+
 -Fixed a bug that caused the distribution of false responses in the test and validation sets to be different from the true
 responses. In particular, the number of words in the false responses was shorter on average than for the true responses,
 which could have been exploited by some models.
@@ -76,3 +83,94 @@ Contains the test set. Formatted in the same way as the validation set. When gen
 vocabulary size of 115,623.
 
+##BASELINE RESULTS
+
+####Dual Encoder LSTM model:
+1 in 2:
+ recall@1: 0.868730970907
+1 in 10:
+ recall@1: 0.552213717862
+ recall@2: 0.72099120433
+ recall@5: 0.924285351827
+
+####Dual Encoder RNN model:
+1 in 2:
+ recall@1: 0.776539210705
+1 in 10:
+ recall@1: 0.379139142954
+ recall@2: 0.560689786585
+ recall@5: 0.836350355691
+
+####TF-IDF model:
+1 in 2:
+ recall@1: 0.749260042283
+1 in 10:
+ recall@1: 0.48810782241
+ recall@2: 0.587315010571
+ recall@5: 0.763054968288
+
+
+##HYPERPARAMETERS USED
+
+Code for the model can be found here (might not be up to date with the new dataset): https://github.com/npow/ubottu
+
+####Dual Encoder LSTM model:
+
+act_penalty=500
+batch_size=256
+conv_attn=False
+corr_penalty=0.0
+emb_penalty=0.001
+fine_tune_M=True
+fine_tune_W=False
+forget_gate_bias=2.0
+hidden_size=200
+is_bidirectional=False
+lr=0.001
+lr_decay=0.95
+max_seqlen=160
+n_epochs=100
+n_recurrent_layers=1
+optimizer='adam'
+penalize_activations=False
+penalize_emb_drift=False
+penalize_emb_norm=False
+pv_ndims=100
+seed=42
+shuffle_batch=False
+sort_by_len=False
+sqr_norm_lim=1
+use_pv=False
+xcov_penalty=0.0
+
+####Dual Encoder RNN model:
+
+act_penalty=500
+batch_size=512
+conv_attn=False
+corr_penalty=0.0
+emb_penalty=0.001
+fine_tune_M=False
+fine_tune_W=False
+forget_gate_bias=2.0
+hidden_size=100
+is_bidirectional=False
+lr=0.0001
+lr_decay=0.95
+max_seqlen=160
+n_epochs=100
+n_recurrent_layers=1
+optimizer='adam'
+penalize_activations=False
+penalize_emb_drift=False
+penalize_emb_norm=False
+pv_ndims=100
+seed=42
+shuffle_batch=False
+sort_by_len=False
+sqr_norm_lim=1
+use_pv=False
+xcov_penalty=0.0
+
+
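The Recall@k figures above follow the usual evaluation for this corpus: each test row pairs one ground-truth response with n distractors (one distractor for "1 in 2", nine for "1 in 10"), the model scores every candidate, and Recall@k is the fraction of rows where the true response is ranked in the top k. A minimal sketch of that computation, assuming each row's score list puts the ground-truth candidate at index 0:

```python
def recall_at_k(scored_rows, k):
    # scored_rows: one list of candidate scores per test row, with the
    # ground-truth response's score at index 0 and distractors after it.
    hits = 0
    for scores in scored_rows:
        true_score = scores[0]
        # Rank of the truth = 1 + number of distractors scored strictly higher.
        rank = 1 + sum(1 for s in scores[1:] if s > true_score)
        if rank <= k:
            hits += 1
    return float(hits) / len(scored_rows)

# Toy usage: two "1 in 10" rows (1 true response + 9 distractors each).
rows = [
    [0.9, 0.1, 0.2, 0.3, 0.05, 0.4, 0.6, 0.2, 0.1, 0.3],  # truth ranked 1st
    [0.5, 0.7, 0.6, 0.1, 0.2, 0.3, 0.4, 0.45, 0.2, 0.1],  # truth ranked 3rd
]
print(recall_at_k(rows, 1))  # 0.5
print(recall_at_k(rows, 2))  # 0.5
print(recall_at_k(rows, 5))  # 1.0
```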
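The TF-IDF baseline ranks candidate responses by lexical similarity to the context. The implementation that produced the numbers above is not part of this repository, so the sketch below is only one common way to build such a ranker, using scikit-learn (an assumed dependency) and cosine similarity between TF-IDF vectors; the example context and candidates are made up.

```python
# Illustrative only; this is not the implementation behind the reported numbers.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def rank_responses_tfidf(context, candidate_responses):
    # Fit one vocabulary over the context plus all candidates, then score
    # each candidate by cosine similarity to the context vector.
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([context] + candidate_responses)
    scores = cosine_similarity(vectors[0], vectors[1:])[0]
    # Candidate indices, best match first.
    return sorted(range(len(candidate_responses)), key=lambda i: -scores[i])

# Toy usage with one context and three hypothetical candidate responses.
context = "how do i mount an ntfs partition __eou__ __eot__"
candidates = [
    "sudo mount -t ntfs-3g /dev/sda1 /mnt __eou__",
    "try rebooting your router __eou__",
    "which version of ubuntu are you running __eou__",
]
print(rank_responses_tfidf(context, candidates))
```

In practice the vectorizer would normally be fitted on the full training corpus rather than on each row, but the ranking logic is the same.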
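On the data-generation side, the reworked `create_examples` from patch 1 takes an explicit `examples_num` and indexes the dialog list with `i % unique_dialogs_num`, so more examples can be requested than there are unique dialogs; a revisited dialog can still yield a new example because the context is sampled at random. A standalone sketch of that cycling behaviour, with purely illustrative toy paths and creator function:

```python
import random

def create_examples_sketch(candidate_dialog_paths, examples_num, creator_function):
    # Cycle over the available dialogs so that examples_num may exceed the
    # number of unique dialogs; a revisited dialog can still produce a new
    # example when the creator function samples the context at random.
    examples = []
    unique_dialogs_num = len(candidate_dialog_paths)
    i = 0
    while i < examples_num:
        context_dialog = candidate_dialog_paths[i % unique_dialogs_num]
        examples.append(creator_function(context_dialog, candidate_dialog_paths))
        i += 1
    return examples

# Toy usage: three "dialog files", five requested examples -> paths wrap around.
rng = random.Random(1234)
paths = ["dialogs/1.tsv", "dialogs/2.tsv", "dialogs/3.tsv"]  # illustrative paths
pairs = create_examples_sketch(paths, 5,
                               lambda dialog, candidates: (dialog, rng.choice(candidates)))
for context_path, other_path in pairs:
    print("%s paired with %s" % (context_path, other_path))
```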