In [1]:
import json
import random
import pathlib
import joblib
from collections import defaultdict
import plotly.express as px

In [2]:
def get_data_path(output_key='topicalchat', feature_name='discourse_features', n=2):
    """Return the path of the ``data.joblib`` file for one results folder.

    Args:
        output_key: Dataset key used in the folder name, e.g. 'topicalchat'
            or 'multi-woz2'.
        feature_name: Feature-set name used in the folder name, e.g.
            'dialog_tagger_features', 'discourse_features' or
            'topic_model_features'.
        n: Value embedded between the dataset key and the feature name.

    Returns:
        The string 'data/results/<output_key>_<n>_<feature_name>/data.joblib'.
    """
    return f'data/results/{output_key}_{n}_{feature_name}/' + 'data.joblib'

def get_output_path(output_key='topicalchat', feature_name='discourse_features', n=2):
    """Return the results folder (with trailing slash) for one dataset/feature run.

    Args:
        output_key: Dataset key used in the folder name, e.g. 'topicalchat'
            or 'multi-woz2'.
        feature_name: Feature-set name used in the folder name, e.g.
            'dialog_tagger_features', 'discourse_features' or
            'topic_model_features'.
        n: Value embedded between the dataset key and the feature name.

    Returns:
        The string 'data/results/<output_key>_<n>_<feature_name>/'.
    """
    return f'data/results/{output_key}_{n}_{feature_name}/'

In [3]:
def load_data(data_path):
    """Load and return the object stored in the joblib file at ``data_path``.

    NOTE(review): joblib files are pickle-based, so only load files from a
    trusted source.
    """
    return joblib.load(data_path)

def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of two collections, rounded to 5 decimals.

    The similarity is ``len(s1 & s2) / len(s1 | s2)``.

    Args:
        s1: A set/frozenset, or any iterable of hashable items (coerced to a
            set). ``None`` is treated as an empty collection.
        s2: Same as ``s1``.

    Returns:
        A float in [0, 1]. When both inputs are empty the ratio is undefined
        and 0.0 is returned.
    """
    # Reuse real sets as-is; coerce everything else so lists/tuples work too.
    set1 = s1 if isinstance(s1, (set, frozenset)) else set(s1 or ())
    set2 = s2 if isinstance(s2, (set, frozenset)) else set(s2 or ())
    union = set1 | set2
    if not union:
        return 0.0
    # |intersection| <= |union|, so the ratio is always within [0, 1] and
    # needs no clamping.
    return round(len(set1 & set2) / len(union), 5)

In [4]:
# TopicalChat: input file, output folder, loaded results (topic-model features).
topicalchat_data_path = get_data_path(output_key='topicalchat', feature_name='topic_model_features')
topicalchat_output_path = get_output_path(output_key='topicalchat', feature_name='topic_model_features')

# MultiWOZ: input file and output folder for the same feature set.
multiwoz_data_path = get_data_path(output_key='multi-woz2', feature_name='topic_model_features')
multiwoz_output_path = get_output_path(output_key='multi-woz2', feature_name='topic_model_features')

# Load both result files (TopicalChat first, then MultiWOZ).
topicalchat_data = load_data(topicalchat_data_path)
multiwoz_data = load_data(multiwoz_data_path)

In [5]:
# Invert 'new_clusters' (cluster key -> cluster id) into cluster id -> cluster key,
# visiting the pairs in ascending id order.
reverse_index_topicalchat = {
    cluster_id: cluster_key
    for cluster_key, cluster_id in sorted(
        topicalchat_data['new_clusters'].items(), key=lambda pair: pair[1]
    )
}
reverse_index_multiwoz = {
    cluster_id: cluster_key
    for cluster_key, cluster_id in sorted(
        multiwoz_data['new_clusters'].items(), key=lambda pair: pair[1]
    )
}

In [6]:
def get_thread_key(n):
    """Return the dialog-dict key under which the thread rows are stored.

    The argument ``n`` is accepted but currently ignored; the key is always
    'thread'.
    """
    return 'thread'


def calc_cluster_cluster_usage_distribution(dialogs, n):
    """Count how many thread rows are assigned to each cluster id.

    Args:
        dialogs: Mapping of dialog id -> dialog dict. Dialogs without the
            thread key (see ``get_thread_key``) are skipped.
        n: Forwarded to ``get_thread_key``.

    Returns:
        A ``defaultdict(int)`` mapping ``row['cluster_id']`` -> number of rows.
    """
    thread_key = get_thread_key(n)
    usage_counts = defaultdict(int)
    for dialog in dialogs.values():
        if thread_key in dialog:
            for row in dialog[thread_key]:
                usage_counts[row['cluster_id']] += 1
    return usage_counts

# Per-dataset count of thread rows assigned to each cluster id.
topicalchat_usage = calc_cluster_cluster_usage_distribution(dialogs=topicalchat_data['dialog'], n=2)
multiwoz_usage = calc_cluster_cluster_usage_distribution(dialogs=multiwoz_data['dialog'], n=2)

In [7]:
# Compare only the top_k most frequently used clusters of each dataset.
# (To compare every cluster instead, use sorted(reverse_index_topicalchat.keys())
# and sorted(reverse_index_multiwoz.keys()).)
top_k = 25
topical_ids = sorted(topicalchat_usage, key=topicalchat_usage.get, reverse=True)[:top_k]
multiwoz_ids = sorted(multiwoz_usage, key=multiwoz_usage.get, reverse=True)[:top_k]

# sims[i][j] is the Jaccard similarity between the features of TopicalChat
# cluster topical_ids[i] and MultiWOZ cluster multiwoz_ids[j].
sims = [
    [
        jaccard_similarity(topical_features, reverse_index_multiwoz[multiwoz_id])
        for multiwoz_id in multiwoz_ids
    ]
    for topical_features in (reverse_index_topicalchat[topical_id] for topical_id in topical_ids)
]

In [8]:
# Sanity check: each id list holds at most top_k entries.
len(topical_ids), len(multiwoz_ids)

(25, 25)

In [9]:
def get_cluster_examples(dialogs, n):
    """Group the texts of every thread row by the row's cluster id.

    Args:
        dialogs: Mapping of dialog id -> dialog dict. Dialogs without the
            thread key (see ``get_thread_key``) are skipped.
        n: Number of text fields read from each row ('text0' .. 'text{n-1}');
            also forwarded to ``get_thread_key``.

    Returns:
        A ``defaultdict(list)`` mapping ``row['cluster_id']`` -> list of
        examples, where each example is the list of the row's ``n`` texts.
    """
    thread_key = get_thread_key(n)
    examples_by_cluster = defaultdict(list)
    for dialog in dialogs.values():
        if thread_key not in dialog:
            continue
        for row in dialog[thread_key]:
            texts = [row[f'text{i}'] for i in range(n)]
            examples_by_cluster[row['cluster_id']].append(texts)
    return examples_by_cluster

# Example text pairs (text0, text1) per cluster id, for each dataset.
topicalchat_cluster_examples = get_cluster_examples(dialogs=topicalchat_data['dialog'], n=2)
multiwoz_cluster_examples = get_cluster_examples(dialogs=multiwoz_data['dialog'], n=2)

In [18]:
# Spot-check one cluster pair: print its similarity, both feature sets and
# up to five example text pairs from each dataset.
tk = 14  # TopicalChat cluster id
mw = 6   # MultiWOZ cluster id
tk_features = reverse_index_topicalchat[tk]
mw_features = reverse_index_multiwoz[mw]
print(jaccard_similarity(tk_features, mw_features))
print(tk_features, mw_features)

print("-----------multiwoz-------------")
print(multiwoz_cluster_examples[mw][:5])

print("-----------topicalchat-----------")
print(topicalchat_cluster_examples[tk][:5])

0.13333
frozenset({'like', 'read', 'think', 'know', 'book', '$', 'Amazon'}) frozenset({'leave', 'reference', 'arrive', 'like', 'need', 'train', 'Cambridge', 'book', 'number', 'help'})
-----------multiwoz-------------
[['I meant Cambridge please. I would like to arrive in Cambridge on Monday after 21:15.', 'Ok I will book that for you and get you a confirmation number'], ['Ok I will book that for you and get you a confirmation number', "I don't need anything booked.  I just need to get the arrival time, travel time and price of a train from norwich to cambridge leaving after 21:15."], ['The price is 16.50 pounds and the train will arrive in Peterborough by 13:56 Wednesday.  Would you like me to book a ticket for you?', 'Yes, one ticket please, can I also get the reference number?'], ['Yes, one ticket please, can I also get the reference number?', 'I booked you one seat on TR1879 departing Cambridge for Peterborough on Wednesday at 13:06, Reference: xu1qlhvw. Can I help further today?'],

In [11]:
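# Saved output from an earlier run (topic model example).
# First line: 0.42857 (presumably the Jaccard similarity of the cluster pair shown below).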
# -----------multiwoz-------------
# [['No, that will be all. Good bye.', 'Thank you for using our services.'], ['You were great. Goodbye.', 'We are happy to help. Have a good day!'], ['You are welcome.  Is there anything else I can help you with today?', 'No, I am all set.  Have a nice day.  Bye.'], ['You are more than welcome!\n', 'Ok, have a good day. Goodbye.'], ["That's all I need, thanks so much for all of your help! Have a great day!", 'you are welcome ']]
# -----------topicalchat-----------
# [['its been nice talking to you', 'You as well!  Have a great day!'], ['It was very nice chatting with you. Have a great day', 'You as well.  Have a great day.'], ['You as well.  Have a great day.', 'thanks'], ['I agree, Something to be recognized!! Great chat!!', 'Great chat, cat!'], ['Great chat, cat!', 'Have a great day!!']]

In [12]:
import numpy as np
# Convert the nested similarity lists to an array: rows follow topical_ids,
# columns follow multiwoz_ids (the nesting order used when sims was built).
z = np.array(sims)

In [13]:
import plotly.graph_objects as go

# z is transposed so that TopicalChat clusters ('t_<id>') run along x and
# MultiWOZ clusters ('m_<id>') along y, matching the tick labels below.
fig = go.Figure(data=go.Heatmap(
        z=z.T,
        x=[f't_{e}' for e in topical_ids],
        y=[f'm_{e}' for e in multiwoz_ids],
        colorscale='Viridis',
        colorbar=dict(title='Jaccard similarity')))
# Title and axis titles so the saved HTML is understandable on its own.
fig.update_layout(
    title='Jaccard similarity between the most-used TopicalChat and MultiWOZ clusters',
    xaxis_title='TopicalChat cluster',
    yaxis_title='MultiWOZ cluster')
# NOTE: the file name says "scatter_plot" although the figure is a heatmap;
# the name is kept so existing references to the output file keep working.
fig.write_html(topicalchat_output_path + 'scatter_plot.html', auto_open=True)