In [1]:
!which python

/sw/centos/anaconda3/2019.10/bin/python


In [2]:
import os
import json
import numpy as np

# DailyDialog

In [3]:
daily_dialog_emotion = {0: "neutral", 1: "anger", 2: "disgust", 3: "fear", 4: "happiness", 5: "sadness", 6: "surprise"}

## Parse Train Set

In [4]:
# Load text in trainset
trainset_text = []
with open('./ijcnlp_dailydialog/train/dialogues_train.txt', 'r') as f:
    for line in f:
        assert isinstance(line, str)
        trainset_text.append(line.strip())

In [5]:
print("Totally {} lines of data in the training set, each line forms a dialogue".format(len(trainset_text)))

Totally 11118 lines of data in the training set, each line forms a dialogue


In [6]:
trainset_text[6]

'Frank ’ s getting married , do you believe this ? __eou__ Is he really ? __eou__ Yes , he is . He loves the girl very much . __eou__ Who is he marring ? __eou__ A girl he met on holiday in Spain , I think . __eou__ Have they set a date for the wedding ? __eou__ Not yet . __eou__'

In [7]:
trainset_utterance = []
for i in range(len(trainset_text)):
    cur_utterance = trainset_text[i].split("__eou__")
    clean_utterance = []
    for i in range(len(cur_utterance)):
        if cur_utterance[i] != "":
            clean_utterance.append(cur_utterance[i].strip())
        else:
            pass
    trainset_utterance.append(clean_utterance)
assert len(trainset_text) == len(trainset_utterance)

In [8]:
# Load emotion label in trainset
trainset_emotions = []
with open('./ijcnlp_dailydialog/train/dialogues_emotion_train.txt', 'r') as f:
    for line in f:
        trainset_emotions.append(line.strip().split())
print("Totally {} lines of emotion label in the training set, each line relates to a dialogue".format(len(trainset_emotions)))

Totally 11118 lines of emotion label in the training set, each line relates to a dialogue


In [9]:
trainset_emotions[0]

['0', '0', '0', '0', '0', '0', '4', '4', '4', '4']

In [10]:
# make sure that the number of emotion label for each dialogue is the same as the number of utterance in the diaglogue'
assert len(trainset_emotions) == len(trainset_text)
for i in range(len(trainset_text)):
    if len(trainset_emotions[i]) != len(trainset_utterance[i]):
        print(trainset_text)
        print(trainset_utterance)
        print(trainset_emotions)

This shows that in the train set, each utterance already being labeled by an emotion label

In [11]:
# Let's count how many data instance should there be
cnt_data_instance = 0
for i in range(len(trainset_emotions)):
    cnt_data_instance += (len(trainset_emotions[i]) - 1)
print("There should be {} data instances".format(cnt_data_instance))

There should be 76052 data instances


In [12]:
user_num = 0
for i in range(len(trainset_utterance)):
    for j in range(len(trainset_utterance[i])):
        if user_num == 0:
            # add <user0>
            trainset_utterance[i][j] = "<user0> " + trainset_utterance[i][j]
        else:
            # add <user1>
            trainset_utterance[i][j] = "<user1> " + trainset_utterance[i][j]
        user_num = (user_num + 1) % 2

In [13]:
trainset_utterance[0]

['<user0> Say , Jim , how about going for a few beers after dinner ?',
 '<user1> You know that is tempting but is really not good for our fitness .',
 '<user0> What do you mean ? It will help us to relax .',
 "<user1> Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ?",
 "<user0> I guess you are right.But what shall we do ? I don't feel like sitting at home .",
 '<user1> I suggest a walk over to the gym where we can play singsong and meet some of our friends .',
 "<user0> That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them .",
 '<user1> Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too .',
 "<user0> Good.Let ' s go now .",
 '<user1> All right .']

In [14]:
trainset_utterance[1]

['<user0> Can you do push-ups ?',
 "<user1> Of course I can . It's a piece of cake ! Believe it or not , I can do 30 push-ups a minute .",
 "<user0> Really ? I think that's impossible !",
 '<user1> You mean 30 push-ups ?',
 '<user0> Yeah !',
 "<user1> It's easy . If you do exercise everyday , you can make it , too ."]

In [16]:
trainset_instances = []
for i in range(len(trainset_utterance)):
    assert len(trainset_utterance[i]) == len(trainset_emotions[i])
    assert len(trainset_utterance[i]) > 1
    for j in range(1, len(trainset_utterance[i])):
        src_text = " __eou__ ".join(trainset_utterance[i][:j])
        trg_text = trainset_utterance[i][j]
        trg_emotion = daily_dialog_emotion[int(trainset_emotions[i][j])]
        cur_instance = {"src": src_text, "trg": trg_text, "trg_emotion": trg_emotion}
        trainset_instances.append(cur_instance)

In [17]:
trainset_instances[:3]

[{'src': '<user0> Say , Jim , how about going for a few beers after dinner ?',
  'trg': '<user1> You know that is tempting but is really not good for our fitness .',
  'trg_emotion': 'neutral'},
 {'src': '<user0> Say , Jim , how about going for a few beers after dinner ? __eou__ <user1> You know that is tempting but is really not good for our fitness .',
  'trg': '<user0> What do you mean ? It will help us to relax .',
  'trg_emotion': 'neutral'},
 {'src': '<user0> Say , Jim , how about going for a few beers after dinner ? __eou__ <user1> You know that is tempting but is really not good for our fitness . __eou__ <user0> What do you mean ? It will help us to relax .',
  'trg': "<user1> Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ?",
  'trg_emotion': 'neutral'}]

In [18]:
# Construct the training json file
""" Format of the training json file. Each line in the json file is a dict which contains:
    {"src": text in the historty, "trg": text need to be generated in response, "trg_emotion": target emotion}
"""
cnt_lines = 0
with open('./ijcnlp_dailydialog/train/train.json', 'w') as f:
    for trainset_instance in trainset_instances:
        json.dump(trainset_instance, f)
        f.write('\n')
        cnt_lines += 1
print("Totally {} lines of data in the trainset.".format(cnt_lines))
assert cnt_lines == cnt_data_instance

Totally 76052 lines of data in the trainset.


## Parse Valid Set

In [19]:
# Load text in validset
validset_text = []
with open('./ijcnlp_dailydialog/validation/dialogues_validation.txt', 'r') as f:
    for line in f:
        assert isinstance(line, str)
        validset_text.append(line.strip())

In [20]:
print("Totally {} lines of data in the validation set, each line forms a dialogue".format(len(validset_text)))

Totally 1000 lines of data in the validation set, each line forms a dialogue


In [21]:
validset_text[0]

"Good morning , sir . Is there a bank near here ? __eou__ There is one . 5 blocks away from here ? __eou__ Well , that's too far.Can you change some money for me ? __eou__ Surely , of course . What kind of currency have you got ? __eou__ RIB . __eou__ How much would you like to change ? __eou__ 1000 Yuan.Here you are . __eou__"

In [22]:
validset_utterance = []
for i in range(len(validset_text)):
    cur_utterance = validset_text[i].split("__eou__")
    clean_utterance = []
    for i in range(len(cur_utterance)):
        if cur_utterance[i] != "":
            clean_utterance.append(cur_utterance[i].strip())
        else:
            pass
    validset_utterance.append(clean_utterance)
assert len(validset_text) == len(validset_utterance)

In [23]:
# Load emotion label in trainset
validset_emotions = []
with open('./ijcnlp_dailydialog/validation/dialogues_emotion_validation.txt', 'r') as f:
    for line in f:
        validset_emotions.append(line.strip().split())
print("Totally {} lines of emotion label in the validation set, each line relates to a dialogue".format(len(validset_emotions)))

Totally 1000 lines of emotion label in the validation set, each line relates to a dialogue


In [24]:
validset_emotions[0]

['0', '0', '0', '0', '0', '0', '0']

In [25]:
# make sure that the number of emotion label for each dialogue is the same as the number of utterance in the diaglogue
assert len(validset_emotions) == len(validset_text)
for i in range(len(validset_text)):
    if len(validset_emotions[i]) != len(validset_utterance[i]):
        print(validset_text)
        print(validset_utterance)
        print(validset_emotions)

In [26]:
user_num = 0
for i in range(len(validset_utterance)):
    for j in range(len(validset_utterance[i])):
        if user_num == 0:
            # add <user0>
            validset_utterance[i][j] = "<user0> " + validset_utterance[i][j]
        else:
            # add <user1>
            validset_utterance[i][j] = "<user1> " + validset_utterance[i][j]
        user_num = (user_num + 1) % 2

In [27]:
validset_utterance[0]

['<user0> Good morning , sir . Is there a bank near here ?',
 '<user1> There is one . 5 blocks away from here ?',
 "<user0> Well , that's too far.Can you change some money for me ?",
 '<user1> Surely , of course . What kind of currency have you got ?',
 '<user0> RIB .',
 '<user1> How much would you like to change ?',
 '<user0> 1000 Yuan.Here you are .']

In [29]:
validset_instances = []
for i in range(len(validset_utterance)):
    assert len(validset_utterance[i]) == len(validset_emotions[i])
    assert len(validset_utterance[i]) > 1
    for j in range(1, len(validset_utterance[i])):
        src_text = " __eou__ ".join(validset_utterance[i][:j])
        trg_text = validset_utterance[i][j]
        trg_emotion = daily_dialog_emotion[int(validset_emotions[i][j])]
        cur_instance = {"src": src_text, "trg": trg_text, "trg_emotion": trg_emotion}
        validset_instances.append(cur_instance)

In [30]:
validset_instances[:3]

[{'src': '<user0> Good morning , sir . Is there a bank near here ?',
  'trg': '<user1> There is one . 5 blocks away from here ?',
  'trg_emotion': 'neutral'},
 {'src': '<user0> Good morning , sir . Is there a bank near here ? __eou__ <user1> There is one . 5 blocks away from here ?',
  'trg': "<user0> Well , that's too far.Can you change some money for me ?",
  'trg_emotion': 'neutral'},
 {'src': "<user0> Good morning , sir . Is there a bank near here ? __eou__ <user1> There is one . 5 blocks away from here ? __eou__ <user0> Well , that's too far.Can you change some money for me ?",
  'trg': '<user1> Surely , of course . What kind of currency have you got ?',
  'trg_emotion': 'neutral'}]

In [31]:
# Construct the training json file
""" Format of the training json file. Each line in the json file is a dict which contains:
    {"src": text in the historty, "trg": text need to be generated in response, "trg_emotion": target emotion}
"""
# Let's first compute how many data instance should there be
cnt_data_instance = 0
for i in range(len(validset_emotions)):
    cnt_data_instance += len(validset_emotions[i]) - 1
print('There should be {} lines of data in valid set'.format(cnt_data_instance))
cnt_lines = 0
with open('./ijcnlp_dailydialog/validation/valid.json', 'w') as f:
    for validset_instance in validset_instances:
        json.dump(validset_instance, f)
        f.write('\n')
        cnt_lines += 1
assert cnt_lines == cnt_data_instance
print("Totally {} lines of data in the validset.".format(cnt_lines))

There should be 7069 lines of data in valid set
Totally 7069 lines of data in the validset.


## Parse Test Set

In [32]:
# Load text in testset
testset_text = []
with open('./ijcnlp_dailydialog/test/dialogues_test.txt', 'r') as f:
    for line in f:
        assert isinstance(line, str)
        testset_text.append(line.strip())

In [33]:
print("Totally {} lines of data in the test set, each line forms a dialogue".format(len(testset_text)))

Totally 1000 lines of data in the test set, each line forms a dialogue


In [34]:
testset_text[0]

'Hey man , you wanna buy some weed ? __eou__ Some what ? __eou__ Weed ! You know ? Pot , Ganja , Mary Jane some chronic ! __eou__ Oh , umm , no thanks . __eou__ I also have blow if you prefer to do a few lines . __eou__ No , I am ok , really . __eou__ Come on man ! I even got dope and acid ! Try some ! __eou__ Do you really have all of these drugs ? Where do you get them from ? __eou__ I got my connections ! Just tell me what you want and I ’ ll even give you one ounce for free . __eou__ Sounds good ! Let ’ s see , I want . __eou__ Yeah ? __eou__ I want you to put your hands behind your head ! You are under arrest ! __eou__'

In [35]:
testset_utterance = []
for i in range(len(testset_text)):
    cur_utterance = testset_text[i].split("__eou__")
    clean_utterance = []
    for i in range(len(cur_utterance)):
        if cur_utterance[i] != "":
            clean_utterance.append(cur_utterance[i].strip())
        else:
            pass
    testset_utterance.append(clean_utterance)
assert len(testset_text) == len(testset_utterance)

In [36]:
# Load emotion label in trainset
testset_emotions = []
with open('./ijcnlp_dailydialog/test/dialogues_emotion_test.txt', 'r') as f:
    for line in f:
        testset_emotions.append(line.strip().split())
print("Totally {} lines of emotion label in the test set, each line relates to a dialogue".format(len(testset_emotions)))

Totally 1000 lines of emotion label in the test set, each line relates to a dialogue


In [37]:
testset_emotions[0]

['0', '6', '0', '0', '0', '0', '0', '0', '0', '0', '3', '0']

In [38]:
# make sure that the number of emotion label for each dialogue is the same as the number of utterance in the diaglogue'
assert len(testset_emotions) == len(testset_text)
for i in range(len(testset_text)):
    if len(testset_emotions[i]) != len(testset_utterance[i]):
        print(testset_text)
        print(testset_utterance)
        print(testset_emotions)

In [39]:
user_num = 0
for i in range(len(testset_utterance)):
    for j in range(len(testset_utterance[i])):
        if user_num == 0:
            # add <user0>
            testset_utterance[i][j] = "<user0> " + testset_utterance[i][j]
        else:
            # add <user1>
            testset_utterance[i][j] = "<user1> " + testset_utterance[i][j]
        user_num = (user_num + 1) % 2

In [40]:
testset_utterance[0]

['<user0> Hey man , you wanna buy some weed ?',
 '<user1> Some what ?',
 '<user0> Weed ! You know ? Pot , Ganja , Mary Jane some chronic !',
 '<user1> Oh , umm , no thanks .',
 '<user0> I also have blow if you prefer to do a few lines .',
 '<user1> No , I am ok , really .',
 '<user0> Come on man ! I even got dope and acid ! Try some !',
 '<user1> Do you really have all of these drugs ? Where do you get them from ?',
 '<user0> I got my connections ! Just tell me what you want and I ’ ll even give you one ounce for free .',
 '<user1> Sounds good ! Let ’ s see , I want .',
 '<user0> Yeah ?',
 '<user1> I want you to put your hands behind your head ! You are under arrest !']

In [41]:
testset_instances = []
for i in range(len(testset_utterance)):
    assert len(testset_utterance[i]) == len(testset_emotions[i])
    assert len(testset_utterance[i]) > 1
    for j in range(1, len(testset_utterance[i])):
        src_text = " __eou__ ".join(testset_utterance[i][:j])
        trg_text = testset_utterance[i][j]
        trg_emotion = daily_dialog_emotion[int(testset_emotions[i][j])]
        cur_instance = {"src": src_text, "trg": trg_text, "trg_emotion": trg_emotion}
        testset_instances.append(cur_instance)

In [42]:
testset_instances[:3]

[{'src': '<user0> Hey man , you wanna buy some weed ?',
  'trg': '<user1> Some what ?',
  'trg_emotion': 'surprise'},
 {'src': '<user0> Hey man , you wanna buy some weed ? __eou__ <user1> Some what ?',
  'trg': '<user0> Weed ! You know ? Pot , Ganja , Mary Jane some chronic !',
  'trg_emotion': 'neutral'},
 {'src': '<user0> Hey man , you wanna buy some weed ? __eou__ <user1> Some what ? __eou__ <user0> Weed ! You know ? Pot , Ganja , Mary Jane some chronic !',
  'trg': '<user1> Oh , umm , no thanks .',
  'trg_emotion': 'neutral'}]

In [43]:
# Construct the training json file
""" Format of the training json file. Each line in the json file is a dict which contains:
    {"utterance": [list of text], "emotion": [list of emotion label]}
"""
# Let's first compute how many data instance should there be
cnt_data_instance = 0
for i in range(len(testset_emotions)):
    cnt_data_instance += len(testset_emotions[i]) - 1
print('There should be {} lines of data in valid set'.format(cnt_data_instance))
cnt_lines = 0
with open('./ijcnlp_dailydialog/test/test.json', 'w') as f:
    for testset_instance in testset_instances:
        json.dump(testset_instance, f)
        f.write('\n')
        cnt_lines += 1
assert cnt_lines == cnt_data_instance
print("Totally {} lines of data in the testset.".format(cnt_lines))

There should be 6740 lines of data in valid set
Totally 6740 lines of data in the testset.


# Empathetic Dialogues

In [44]:
import numpy as np

In [45]:
def load_file(path):
    with open(path) as f:
        cache = f.readline().split(',')[0] 
        corpus, history = [], []
        emotion = []
        user = 0
        for line in f.readlines():
            items = line.strip().split(',')
            utterance = f'<user{user}> ' + items[5].replace('_comma_', ',')
            utterance = utterance.strip()
            current_emotion = items[2]
            if items[0] == cache:
                history.append(utterance)
                emotion.append(current_emotion)
            else:
                if history:
                    assert len(history) == len(emotion)
                    corpus.append((history, emotion))    # append the dialogue
                history = [utterance]
                emotion = [current_emotion]
            user = 1 if user == 0 else 0
            cache = items[0]

    avg_turn = np.mean([len(i[0]) for i in corpus])
    max_turn = max([len(i[0]) for i in corpus])
    min_turn = min([len(i[0]) for i in corpus])
    print(f'[!] find {len(corpus)} dialogue, turns(avg/max/min): {avg_turn}/{max_turn}/{min_turn}')
    return corpus

In [46]:
def write_file(mode, corpus):
    with open('{}.json'.format(mode), 'w') as f:
        for data_chunk in corpus:
            dialog = data_chunk[0]
            emotion = data_chunk[1]
            for i in range(1, len(dialog)):
                src_text = ' __eou__ '.join(dialog[:i])
                trg_text = dialog[i]
                cur_emotion = emotion[i]
                cur_data_instance = {'src':src_text, 'trg':trg_text, 'trg_emotion':cur_emotion}
                json.dump(cur_data_instance, f)
                f.write('\n')

    print(f'[!] write into {mode} file over ...')

In [47]:
train_data = load_file('EmpatheticDialog/train.csv')
write_file('train', train_data)

[!] find 19532 dialogue, turns(avg/max/min): 4.309082531230801/8/1
[!] write into train file over ...


In [48]:
valid_data = load_file('EmpatheticDialog/valid.csv')
write_file('valid', valid_data)
test_data = load_file('EmpatheticDialog/test.csv')
write_file('test', test_data)

[!] find 2769 dialogue, turns(avg/max/min): 4.360418923799205/8/1
[!] write into valid file over ...
[!] find 2546 dialogue, turns(avg/max/min): 4.308326787117046/8/2
[!] write into test file over ...
