From 48096b6c7b3722928662e9a12b1d64612e4a8a80 Mon Sep 17 00:00:00 2001
From: Rudolf Kadlec
Date: Thu, 11 Feb 2016 17:44:09 +0100
Subject: [PATCH 1/3] It is now possible to generate more training examples
 from each dialog. Max context size is now a parameter of the script.

---
 src/create_ubuntu_dataset.py | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/src/create_ubuntu_dataset.py b/src/create_ubuntu_dataset.py
index f790ff6..36343e8 100644
--- a/src/create_ubuntu_dataset.py
+++ b/src/create_ubuntu_dataset.py
@@ -1,5 +1,4 @@
 import argparse
-import cPickle as pickle
 import os
 import unicodecsv
 import random
@@ -11,7 +10,9 @@
 """
 Script for generation of train, test and valid datasets from Ubuntu Corpus 1 on 1 dialogs.
 
-Copyright IBM 2015
+Copyright IBM Corporation 2016
+LICENSE: Apache License 2.0 URL: http://www.apache.org/licenses/LICENSE-2.0
+Contact: Rudolf Kadlec (rudolf_kadlec@cz.ibm.com)
 """
 
 dialog_end_symbol = "__dialog_end__"
@@ -163,7 +164,7 @@ def create_single_dialog_train_example(context_dialog_path, candidate_dialog_pat
 
     return context_str, response, label
 
-def create_single_dialog_test_example(context_dialog_path, candidate_dialog_paths, rng, distractors_num):
+def create_single_dialog_test_example(context_dialog_path, candidate_dialog_paths, rng, distractors_num, max_context_length):
     """
     Creates a single example for testing or validation. Each line contains a context, one positive example and N negative examples.
     :param context_dialog_path:
@@ -175,7 +176,7 @@ def create_single_dialog_test_example(context_dialog_path, candidate_dialog_path
 
     dialog = translate_dialog_to_lists(context_dialog_path)
 
-    context_str, next_utterance_ix = create_random_context(dialog, rng)
+    context_str, next_utterance_ix = create_random_context(dialog, rng, max_context_length=max_context_length)
 
     # use the next utterance as positive example
     positive_response = singe_user_utterances_to_string(dialog[next_utterance_ix])
@@ -184,7 +185,7 @@ def create_single_dialog_test_example(context_dialog_path, candidate_dialog_path
 
     return context_str, positive_response, negative_responses
 
-def create_examples_train(candidate_dialog_paths, rng, positive_probability=0.5):
+def create_examples_train(candidate_dialog_paths, rng, positive_probability=0.5, max_context_length=20):
     """
     Creates single training example.
     :param candidate_dialog_paths:
@@ -198,11 +199,12 @@ def create_examples_train(candidate_dialog_paths, rng, positive_probability=0.5)
         if i % 1000 == 0:
             print str(i)
         dialog_path = candidate_dialog_paths[i]
-        examples.append(create_single_dialog_train_example(dialog_path, candidate_dialog_paths, rng, positive_probability))
+        examples.append(create_single_dialog_train_example(dialog_path, candidate_dialog_paths, rng, positive_probability,
+                                                           max_context_length=max_context_length))
         i+=1
     #return map(lambda dialog_path : create_single_dialog_train_example(dialog_path, candidate_dialog_paths, rng, positive_probability), candidate_dialog_paths)
 
-def create_examples(candidate_dialog_paths, creator_function):
+def create_examples(candidate_dialog_paths, examples_num, creator_function):
     """
     Creates a list of training examples from a list of dialogs and function that transforms a dialog to an example.
     :param candidate_dialog_paths:
@@ -211,7 +213,10 @@ def create_examples(candidate_dialog_paths, creator_function):
     """
     i = 0
     examples = []
-    for context_dialog in candidate_dialog_paths:
+    unique_dialogs_num = len(candidate_dialog_paths)
+
+    while i < examples_num:
+        context_dialog = candidate_dialog_paths[i % unique_dialogs_num]
         # counter for tracking progress
         if i % 1000 == 0:
             print str(i)
@@ -280,7 +285,10 @@ def create_eval_dataset(args, file_list_csv):
     dialog_paths = map(lambda path: os.path.join(args.data_root, "dialogs", path), convert_csv_with_dialog_paths(f))
 
     data_set = create_examples(dialog_paths,
-                               lambda context_dialog, candidates : create_single_dialog_test_example(context_dialog, candidates, rng, args.n))
+                               args.examples,
+                               lambda context_dialog, candidates :
+                               create_single_dialog_test_example(context_dialog, candidates, rng,
+                                                                 args.n, args.create_single_dialog_test_example))
     # output the dataset
     w = unicodecsv.writer(open(args.output, 'w'), encoding='utf-8')
     # header
@@ -302,9 +310,12 @@ def train_cmd(args):
     f = open(os.path.join("meta", "trainfiles.csv"), 'r')
     dialog_paths = map(lambda path: os.path.join(args.data_root, "dialogs", path), convert_csv_with_dialog_paths(f))
 
-    dialog_paths = dialog_paths[:args.examples]
-    train_set = create_examples(dialog_paths, lambda context_dialog, candidates : create_single_dialog_train_example(context_dialog, candidates, rng, args.p))
+    train_set = create_examples(dialog_paths,
+                                args.examples,
+                                lambda context_dialog, candidates :
+                                create_single_dialog_train_example(context_dialog, candidates, rng,
+                                                                   args.p, max_context_length=args.max_context_length))
 
     # output the dataset
     w = unicodecsv.writer(open(args.output, 'w'), encoding='utf-8')
 
@@ -331,6 +342,9 @@ def test_cmd(args):
 parser.add_argument('--seed', type=int, default=1234,
                     help='seed for random number generator')
 
+parser.add_argument('--max_context_length', type=int, default=20,
+                    help='maximum number of dialog turns in the context')
+
 parser.add_argument('-o', '--output', default=None,
                     help='output csv')
 

From 1b10fc79613e7e8e209bfcaa92db48217093afdd Mon Sep 17 00:00:00 2001
From: Rudolf Kadlec
Date: Wed, 17 Feb 2016 13:31:44 +0100
Subject: [PATCH 2/3] Solves crashes during test and valid set generation
 introduced in last commit.
Fixes #1
---
 src/create_ubuntu_dataset.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/create_ubuntu_dataset.py b/src/create_ubuntu_dataset.py
index 36343e8..0e49870 100644
--- a/src/create_ubuntu_dataset.py
+++ b/src/create_ubuntu_dataset.py
@@ -285,10 +285,9 @@ def create_eval_dataset(args, file_list_csv):
     dialog_paths = map(lambda path: os.path.join(args.data_root, "dialogs", path), convert_csv_with_dialog_paths(f))
 
     data_set = create_examples(dialog_paths,
-                               args.examples,
-                               lambda context_dialog, candidates :
-                               create_single_dialog_test_example(context_dialog, candidates, rng,
-                                                                 args.n, args.create_single_dialog_test_example))
+                               len(dialog_paths),
+                               lambda context_dialog, candidates : create_single_dialog_test_example(context_dialog, candidates, rng,
+                                                                                                     args.n, args.max_context_length))
     # output the dataset
     w = unicodecsv.writer(open(args.output, 'w'), encoding='utf-8')
     # header
@@ -337,7 +336,7 @@ def test_cmd(args):
                                  "The script downloads 1on1 dialogs from internet and then it randomly samples all the datasets with positive and negative examples.")
 
 parser.add_argument('--data_root', default='.',
-                    help='directory where 1on1 dialogs will downloaded and extracted, the data will be downloaded from cs.mcgill.ca/~jpineau/datasets/ubuntu-corpus-1.0/ubuntu_dialogs.tgz')
+                    help='directory where 1on1 dialogs will be downloaded and extracted, the data will be downloaded from cs.mcgill.ca/~jpineau/datasets/ubuntu-corpus-1.0/ubuntu_dialogs.tgz')
 
 parser.add_argument('--seed', type=int, default=1234,
                     help='seed for random number generator')

From 043e00c51f505d2efb18da17c8fb9497058f9a96 Mon Sep 17 00:00:00 2001
From: ryan-lowe
Date: Sat, 20 Feb 2016 15:58:59 -0500
Subject: [PATCH 3/3] Updated readme with baseline results, named entity changes

---
 README.md | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/README.md b/README.md
index 4b0f059..addcae7 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,13 @@ real life implementation, where you are training a model on past data to predict
 (between 2 and the max context size). This increases the average context length, which we consider desirable
 since we would like to model long-term dependencies.
 
+-Changed the tokenization and entity replacement procedure. After complaints that v1 was too aggressive, we've decided to remove these steps.
+It is up to each person using the dataset to come up with their own tokenization/entity replacement scheme. We plan to use twokenize internally.
+
+-Added differentiation between the end of an utterance (__eou__) and the end of a turn (__eot__). In the original dataset, we concatenated all consecutive
+utterances by the same user into one utterance, and put __EOS__ at the end. Here, we also denote where the original utterances were (with __eou__). Also, the
+terminology should now be consistent between the training and test set (instead of both __EOS__ and ).
+
 -Fixed a bug that caused the distribution of false responses in the test and validation sets to be different from the true
 responses. In particular, the number of words in the false responses was shorter on average than for the true responses,
 which could have been exploited by some models.
@@ -76,3 +83,94 @@ Contains the test set. Formatted in the same way as the validation set. When gen
 vocabulary size of 115,623.
 
+##BASELINE RESULTS
+
+####Dual Encoder LSTM model:
+1 in 2:
+ recall@1: 0.868730970907
+1 in 10:
+ recall@1: 0.552213717862
+ recall@2: 0.72099120433
+ recall@5: 0.924285351827
+
+####Dual Encoder RNN model:
+1 in 2:
+ recall@1: 0.776539210705
+1 in 10:
+ recall@1: 0.379139142954
+ recall@2: 0.560689786585
+ recall@5: 0.836350355691
+
+####TF-IDF model:
+1 in 2:
+ recall@1: 0.749260042283
+1 in 10:
+ recall@1: 0.48810782241
+ recall@2: 0.587315010571
+ recall@5: 0.763054968288
+
+
+##HYPERPARAMETERS USED
+
+Code for the model can be found here (might not be up to date with the new dataset): https://github.com/npow/ubottu
+
+####Dual Encoder LSTM model:
+
+act_penalty=500
+batch_size=256
+conv_attn=False
+corr_penalty=0.0
+emb_penalty=0.001
+fine_tune_M=True
+fine_tune_W=False
+forget_gate_bias=2.0
+hidden_size=200
+is_bidirectional=False
+lr=0.001
+lr_decay=0.95
+max_seqlen=160
+n_epochs=100
+n_recurrent_layers=1
+optimizer='adam'
+penalize_activations=False
+penalize_emb_drift=False
+penalize_emb_norm=False
+pv_ndims=100
+seed=42
+shuffle_batch=False
+sort_by_len=False
+sqr_norm_lim=1
+use_pv=False
+xcov_penalty=0.0
+
+####Dual Encoder RNN model:
+
+act_penalty=500
+batch_size=512
+conv_attn=False
+corr_penalty=0.0
+emb_penalty=0.001
+fine_tune_M=False
+fine_tune_W=False
+forget_gate_bias=2.0
+hidden_size=100
+is_bidirectional=False
+lr=0.0001
+lr_decay=0.95
+max_seqlen=160
+n_epochs=100
+n_recurrent_layers=1
+optimizer='adam'
+penalize_activations=False
+penalize_emb_drift=False
+penalize_emb_norm=False
+pv_ndims=100
+seed=42
+shuffle_batch=False
+sort_by_len=False
+sqr_norm_lim=1
+use_pv=False
+xcov_penalty=0.0
+
+
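The Recall@k figures above follow the usual evaluation for this corpus: each test row pairs one ground-truth response with n distractors (one distractor for "1 in 2", nine for "1 in 10"), the model scores every candidate, and Recall@k is the fraction of rows where the true response is ranked in the top k. A minimal sketch of that computation, assuming each row's score list puts the ground-truth candidate at index 0:

```python
def recall_at_k(scored_rows, k):
    # scored_rows: one list of candidate scores per test row, with the
    # ground-truth response's score at index 0 and distractors after it.
    hits = 0
    for scores in scored_rows:
        true_score = scores[0]
        # Rank of the truth = 1 + number of distractors scored strictly higher.
        rank = 1 + sum(1 for s in scores[1:] if s > true_score)
        if rank <= k:
            hits += 1
    return float(hits) / len(scored_rows)

# Toy usage: two "1 in 10" rows (1 true response + 9 distractors each).
rows = [
    [0.9, 0.1, 0.2, 0.3, 0.05, 0.4, 0.6, 0.2, 0.1, 0.3],  # truth ranked 1st
    [0.5, 0.7, 0.6, 0.1, 0.2, 0.3, 0.4, 0.45, 0.2, 0.1],  # truth ranked 3rd
]
print(recall_at_k(rows, 1))  # 0.5
print(recall_at_k(rows, 2))  # 0.5
print(recall_at_k(rows, 5))  # 1.0
```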
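The TF-IDF baseline ranks candidate responses by lexical similarity to the context. The implementation that produced the numbers above is not part of this repository, so the sketch below is only one common way to build such a ranker, using scikit-learn (an assumed dependency) and cosine similarity between TF-IDF vectors; the example context and candidates are made up.

```python
# Illustrative only; this is not the implementation behind the reported numbers.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def rank_responses_tfidf(context, candidate_responses):
    # Fit one vocabulary over the context plus all candidates, then score
    # each candidate by cosine similarity to the context vector.
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([context] + candidate_responses)
    scores = cosine_similarity(vectors[0], vectors[1:])[0]
    # Candidate indices, best match first.
    return sorted(range(len(candidate_responses)), key=lambda i: -scores[i])

# Toy usage with one context and three hypothetical candidate responses.
context = "how do i mount an ntfs partition __eou__ __eot__"
candidates = [
    "sudo mount -t ntfs-3g /dev/sda1 /mnt __eou__",
    "try rebooting your router __eou__",
    "which version of ubuntu are you running __eou__",
]
print(rank_responses_tfidf(context, candidates))
```

In practice the vectorizer would normally be fitted on the full training corpus rather than on each row, but the ranking logic is the same.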
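On the data-generation side, the reworked `create_examples` from patch 1 takes an explicit `examples_num` and indexes the dialog list with `i % unique_dialogs_num`, so more examples can be requested than there are unique dialogs; a revisited dialog can still yield a new example because the context is sampled at random. A standalone sketch of that cycling behaviour, with purely illustrative toy paths and creator function:

```python
import random

def create_examples_sketch(candidate_dialog_paths, examples_num, creator_function):
    # Cycle over the available dialogs so that examples_num may exceed the
    # number of unique dialogs; a revisited dialog can still produce a new
    # example when the creator function samples the context at random.
    examples = []
    unique_dialogs_num = len(candidate_dialog_paths)
    i = 0
    while i < examples_num:
        context_dialog = candidate_dialog_paths[i % unique_dialogs_num]
        examples.append(creator_function(context_dialog, candidate_dialog_paths))
        i += 1
    return examples

# Toy usage: three "dialog files", five requested examples -> paths wrap around.
rng = random.Random(1234)
paths = ["dialogs/1.tsv", "dialogs/2.tsv", "dialogs/3.tsv"]  # illustrative paths
pairs = create_examples_sketch(paths, 5,
                               lambda dialog, candidates: (dialog, rng.choice(candidates)))
for context_path, other_path in pairs:
    print("%s paired with %s" % (context_path, other_path))
```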