In [1]:
import json
import random
import pathlib
import joblib
from collections import defaultdict
import plotly.express as px
import networkx as nx
import numpy as np

In [2]:
def get_data_path(output_key='topicalchat', feature_name='discourse_features', n=2):
    """Return the path of the ``data.joblib`` file for one result folder.

    The path has the form
    ``data/results/<output_key>_<n>_<feature_name>/data.joblib``.

    Args:
        output_key: Dataset identifier, e.g. 'multi-woz2' or 'topicalchat'.
        feature_name: Feature-set identifier, e.g. 'dialog_tagger_features',
            'discourse_features' or 'topic_model_features'.
        n: Value placed between the dataset and feature names in the folder
            name (its meaning is not visible in this notebook).

    Returns:
        The relative path as a string.
    """
    return f'data/results/{output_key}_{n}_{feature_name}/data.joblib'

def get_output_path(output_key='topicalchat', feature_name='discourse_features', n=2):
    """Return the result folder for one dataset / feature-set combination.

    The folder has the form ``data/results/<output_key>_<n>_<feature_name>/``
    (with a trailing slash, so file names can be appended directly).

    Args:
        output_key: Dataset identifier, e.g. 'multi-woz2' or 'topicalchat'.
        feature_name: Feature-set identifier, e.g. 'dialog_tagger_features',
            'discourse_features' or 'topic_model_features'.
        n: Value placed between the dataset and feature names in the folder
            name (its meaning is not visible in this notebook).

    Returns:
        The relative folder path as a string.
    """
    return 'data/results/{}_{}_{}/'.format(output_key, n, feature_name)

In [3]:
def load_data(data_path):
    """Load and return the object stored at ``data_path`` with ``joblib.load``.

    NOTE(review): joblib files are pickle-based, so only load files that come
    from a trusted source.
    """
    return joblib.load(data_path)

def jaccard_similarity(s1, s2):
    """Jaccard index of two sets, rounded to 5 decimal places.

    Args:
        s1: First set (must provide ``intersection`` and ``union``).
        s2: Second set.

    Returns:
        ``0`` when both sets are empty, otherwise
        ``|s1 & s2| / |s1 | s2|`` rounded to 5 digits and clamped to [0, 1].
    """
    if not (s1 or s2):
        # Both empty: the union is empty too, so avoid dividing by zero.
        return 0
    shared = s1.intersection(s2)
    combined = s1.union(s2)
    score = round(len(shared) / len(combined), 5)
    if score < 0:
        return 0
    if score > 1:
        return 1
    return score

In [4]:
# Notebook-wide configuration: which dataset / feature set to analyse.
# These names are read as globals by later cells.
dataset_name = 'topicalchat'
feature_name_1 = 'dialog_tagger_features'
n = 1
# Resolve the input file and the result folder, then load the stored data.
f1_data_path = get_data_path(dataset_name, feature_name=feature_name_1, n=n)
f1_output_path = get_output_path(dataset_name, feature_name=feature_name_1, n=n)
f1_data = load_data(f1_data_path)


In [5]:
# Invert f1_data['new_clusters'] (key -> value) into value -> key.
# Later cells look this up with row['cluster_id'] and join the result with "---"
# to obtain a cluster name.
reverse_index_f1 = {i: c for c, i in sorted(f1_data['new_clusters'].items(), key=lambda x: x[1])} 

In [6]:
def normalize_graph_dict(graph_dict):
    """Log-scale the edge weights of a graph stored as a dict.

    Args:
        graph_dict: Mapping whose keys are indexable with at least two
            elements (taken as source and target) and whose values are
            numeric weights.

    Returns:
        A new dict keyed by ``(key[0], key[1])`` whose values are
        ``np.log(weight + 1)``. An empty input yields an empty dict.
    """
    # The previous max/min bookkeeping was never used in the result and made
    # the function raise ValueError on an empty dict, so it has been removed.
    return {(edge[0], edge[1]): np.log(weight + 1) for edge, weight in graph_dict.items()}

In [7]:
# Dialog-act labels used as row labels (nodes1) and column labels (nodes2)
# of the adjacency tables and plots built below.
nodes1 = [
    'Statement', 'Directive', 'Thanking', 'CheckQ', 'Feedback',
    'Salutation', 'Commissive', 'Apology', 'SetQ', 'PropQ',
]

# Same labels in the same order, kept as an independent list object.
nodes2 = list(nodes1)

In [8]:
def build_graph(save_filename, dialogs, step_max=150000):
    """Build a normalized dialog-act transition graph and dump it as a table.

    For every dialog, each pair of consecutive utterances adds one count to
    the edge ``(previous cluster name, current cluster name)``; pairs where
    either side is 'Undetected' are ignored. The counts are then log-scaled
    (``log(count + 1)``) and divided by the norm of the whole adjacency
    matrix (``np.linalg.norm``).

    Cluster names are looked up in the notebook-level ``reverse_index_f1``;
    the rows / columns of the written table come from the notebook-level
    ``nodes1`` / ``nodes2``.
    NOTE(review): ``reverse_index_f1`` is built from one dataset's clusters;
    confirm the same ids apply when ``dialogs`` comes from another dataset.

    Args:
        save_filename: Path of the ';'-separated text file to write.
        dialogs: Mapping of dialog id -> dict with a 'thread' sequence whose
            items carry a 'cluster_id'.
        step_max: Only the first ``step_max - 1`` transitions of each dialog
            are counted.

    Returns:
        A ``networkx.DiGraph`` whose edge attribute 'weight' holds the
        normalized log-count. The graph is empty when no transition was
        counted.
    """
    graph_dict = defaultdict(int)
    for dialog in dialogs.values():
        thread = dialog['thread']
        # Walk consecutive (previous, current) utterance pairs.
        for step, (prev_row, row) in enumerate(zip(thread, thread[1:]), start=1):
            if step >= step_max:
                break
            prev_cluster_name = "---".join(reverse_index_f1[prev_row['cluster_id']])
            cur_cluster_name = "---".join(reverse_index_f1[row['cluster_id']])
            if prev_cluster_name == 'Undetected' or cur_cluster_name == 'Undetected':
                continue
            graph_dict[(prev_cluster_name, cur_cluster_name)] += 1

    G = nx.DiGraph()
    G.add_weighted_edges_from((src, dst, count) for (src, dst), count in graph_dict.items())

    # Skip normalization for an empty graph (nothing to scale, and the norm
    # would be zero).
    if G.number_of_nodes() > 0:
        nodes = list(G.nodes)
        # to_numpy_array / from_numpy_array replace adjacency_matrix().todense()
        # and from_numpy_matrix, which was removed in networkx 3.0.
        m = nx.to_numpy_array(G, nodelist=nodes, weight='weight', dtype=float)
        m = np.log(m + 1)
        m = m / np.linalg.norm(m)
        G = nx.from_numpy_array(m, create_using=nx.DiGraph)
        # from_numpy_array labels nodes 0..n-1; restore the cluster names.
        G = nx.relabel_nodes(G, dict(enumerate(nodes)))

    with open(save_filename, 'w') as f:
        # Header cells label the columns, which iterate nodes2 below.
        print(";" + ";".join(nodes2), file=f)
        for node1 in nodes1:
            sim_arr = [node1]
            for node2 in nodes2:
                if G.has_edge(node1, node2):
                    sim_arr.append(str(G[node1][node2]['weight']))
                else:
                    sim_arr.append(str(0))
            print(";".join(sim_arr), file=f)
    return G

In [9]:
# Build the transition graph for the dataset loaded above and write its
# adjacency table next to the other results for this dataset / feature set.
csv_graph_path = f1_output_path + f'graph_{dataset_name}_{feature_name_1}_{n}.csv'
graph = build_graph(csv_graph_path, f1_data['dialog'])

In [10]:
# sims[i][j] is the weight of the edge nodes1[i] -> nodes2[j],
# or None when the graph has no such edge.
sims = [
    [graph.edges[src, dst]['weight'] if (src, dst) in graph.edges else None for dst in nodes2]
    for src in nodes1
]

In [11]:
# Adjacency table as an array (numpy is already imported as np in the first cell).
# Missing edges are None in `sims`.
z = np.array(sims)

In [12]:
# NOTE(review): this import sits mid-notebook; later cells also rely on `go`.
# Consider moving it to the import cell at the top.
import plotly.graph_objects as go

# Rows of `z` follow nodes1 (edge sources), so the transpose puts sources on
# the x axis and targets on the y axis, matching the axis titles below.
fig = go.Figure(data=go.Heatmap(
        z=z.T,
        x=nodes1,
        y=nodes2,
        colorscale='Viridis'))
# NOTE(review): the title says "scatterplot" but the figure is a heatmap.
fig.update_layout(
    title=f"Dialog act adjacency matrix scatterplot",
    xaxis_title=" Nodes From",
    yaxis_title=" Nodes To",
)
# Writes the figure into the result folder; auto_open=True asks plotly to open the written file.
fig.write_html(f1_output_path + f'{dataset_name}_{feature_name_1}_{n}_graph_scatter_plot.html', auto_open=True)

- X - cluster count in MultiWOZ (plotted on a log scale)
- Y - cluster count in TopicalChat (plotted on a log scale)
- cluster_i = (X, Y) - the same cluster as it appears in MultiWOZ and in TopicalChat

In [13]:
def get_cluster_counts_in_dataset(dataset_name, feature_name='dialog_tagger_features', n=1):
    """Count how many utterances fall into each cluster of one dataset.

    Loads the dataset's ``data.joblib`` (located with ``get_data_path``) and
    counts, over every dialog thread, the cluster name of each row. A cluster
    name is the "---"-joined entry of the inverted ``'new_clusters'`` mapping
    for the row's ``'cluster_id'``.

    Args:
        dataset_name: Dataset identifier passed to ``get_data_path``.
        feature_name: Feature-set identifier passed to ``get_data_path``.
        n: Passed to ``get_data_path``.

    Returns:
        ``defaultdict(int)`` mapping cluster name -> number of rows.
    """
    # Use this function's own feature_name / n; the previous version defined a
    # local feature_name but read the notebook-level feature_name_1 instead.
    data = load_data(get_data_path(dataset_name, feature_name=feature_name, n=n))
    # Invert cluster key -> id into id -> cluster key.
    reverse_index = {cluster_id: cluster for cluster, cluster_id in data['new_clusters'].items()}
    cluster_counts = defaultdict(int)
    for dialog in data['dialog'].values():
        for row in dialog['thread']:
            cluster_name = "---".join(reverse_index[row['cluster_id']])
            cluster_counts[cluster_name] += 1
    return cluster_counts

In [14]:
# Per-dataset row counts for every cluster name (each call loads that dataset's data file).
cluster_counts_multiwoz = get_cluster_counts_in_dataset('multi-woz2')
cluster_counts_topicalchat = get_cluster_counts_in_dataset('topicalchat')

In [15]:
# Alias (not a copy) of nodes1: the labels compared across datasets below.
all_nodes = nodes1

In [16]:
# One point per label: x is the log of its MultiWOZ count, y the log of its
# TopicalChat count, and the label itself is kept as the point's text.
texts = [node for node in all_nodes]
xs = [np.log(cluster_counts_multiwoz[node]) for node in all_nodes]
ys = [np.log(cluster_counts_topicalchat[node]) for node in all_nodes]

In [17]:

# Scatter of the per-label values computed above; `go` comes from the earlier
# plotly.graph_objects import.
fig = go.Figure(data=go.Scatter(x=xs,
                                y=ys,
                                mode='markers',
                                text=texts,))

# NOTE(review): xs / ys hold log counts, while the axis titles say "data count".
fig.update_layout(title='Multiwoz and topicalchat data counts', 
                  xaxis_title=" Multiwoz data count",
                  yaxis_title=" Topicalchat data count",)
# Written into f1_output_path; the file name uses the current global dataset_name.
fig.write_html(f1_output_path + f'counts_comparison_{dataset_name}_{feature_name_1}_{n}_plot.html', auto_open=True)

In [18]:
def print_table(graph, cluster_counts, prefix):
    """Print one line of degree statistics per node, strongest node first.

    Nodes are ordered by descending weighted degree. Each line holds the
    prefixed node name, its count, the unweighted out / in / total degree and
    the weighted out / in / total degree, separated by spaces.

    Args:
        graph: Directed graph with 'weight' edge attributes.
        cluster_counts: Mapping indexed with the node name to get its count.
        prefix: Text put in front of every node name, joined with '_'.
    """
    w_out = graph.out_degree(weight='weight')
    w_in = graph.in_degree(weight='weight')
    w_total = graph.degree(weight='weight')
    print("Node", "Count", "OutDeg", "InDeg", "Deg", "WeighOutDeg", "WeighInDeg", "WeighDeg")
    ranked = sorted(w_total, key=lambda pair: pair[1], reverse=True)
    for node, _ in ranked:
        print(
            f"{prefix}_{node}",
            cluster_counts[node],
            graph.out_degree[node],
            graph.in_degree[node],
            graph.degree[node],
            w_out[node],
            w_in[node],
            w_total[node],
        )

In [19]:
def print_table_normalized(graph):
    """Print degree statistics per label, each scaled by its column maximum.

    For every label in the notebook-level ``all_nodes`` one line is printed
    with the unweighted out / in / total degree and the weighted
    out / in / total degree, each divided by the largest value of that
    statistic over the graph.

    Args:
        graph: Directed graph with 'weight' edge attributes.
    """
    # Column order matches the header printed below.
    views = (
        graph.out_degree(),
        graph.in_degree(),
        graph.degree(),
        graph.out_degree(weight='weight'),
        graph.in_degree(weight='weight'),
        graph.degree(weight='weight'),
    )
    peaks = [max(dict(view).values()) for view in views]

    print("Node", "OutDeg", "InDeg", "Deg", "WeighOutDeg", "WeighInDeg", "WeighDeg")
    for node in all_nodes:
        print(node, *(view[node] / peak for view, peak in zip(views, peaks)))

In [20]:
# Build one transition graph per dataset; the adjacency tables go to /tmp/.
# NOTE(review): this cell rebinds the global dataset_name, so it is
# 'multi-woz2' for every cell that runs afterwards.
dataset_name = 'topicalchat'
topical_data_path = get_data_path(dataset_name, feature_name=feature_name_1, n=n)
topical_data = load_data(topical_data_path)

csv_graph_path = '/tmp/' + f'graph_{dataset_name}_{feature_name_1}_{n}.csv'
graph_topical = build_graph(csv_graph_path, topical_data['dialog'])

# NOTE(review): build_graph resolves cluster ids through the global
# reverse_index_f1, which was built from the dataset loaded at the top of the
# notebook - confirm the ids are the same for multi-woz2.
dataset_name = 'multi-woz2'
multiwoz_data_path = get_data_path(dataset_name, feature_name=feature_name_1, n=n)
multiwoz_data = load_data(multiwoz_data_path)

csv_graph_path = '/tmp/' + f'graph_{dataset_name}_{feature_name_1}_{n}.csv'
graph_multiwoz = build_graph(csv_graph_path, multiwoz_data['dialog'])

In [21]:
# Show the graph object (only its repr is displayed).
graph_topical

<networkx.classes.digraph.DiGraph at 0x7f253d517650>

In [22]:
# Degree statistics for the TopicalChat graph, node names prefixed with 'TC'.
print_table(graph_topical, cluster_counts_topicalchat, 'TC')

Node Count OutDeg InDeg Deg WeighOutDeg WeighInDeg WeighDeg
TC_Statement 138531 11 11 22 1.6030474709459104 1.6312620551644925 3.234309526110403
TC_Feedback 19883 11 10 21 1.1439962774506378 1.1546102552433706 2.2986065326940084
TC_Undetected 12342 10 10 20 1.0771940988759996 1.1230417624749822 2.200235861350982
TC_Directive 11682 10 10 20 1.102454348727868 1.097442641833719 2.1998969905615873
TC_Thanking 2133 9 10 19 0.7620443166832148 0.846852042898295 1.6088963595815098
TC_Salutation 2223 9 9 18 0.7946234263726227 0.7715268522247241 1.566150278597347
TC_CheckQ 526 9 9 18 0.45599551934575916 0.5436089222575134 0.9996044416032726
TC_Commissive 305 8 8 16 0.4438722067610031 0.4311781498955639 0.875050356656567
TC_SetQ 223 8 7 15 0.46940767537286837 0.24032824251237994 0.7097359178852483
TC_Apology 141 5 5 10 0.27060435674766525 0.27226690581788043 0.5428712625655456
TC_PropQ 5 1 2 3 0.039010788643072426 0.05013265560370088 0.08914344424677331


In [23]:
# Degree statistics for the MultiWOZ graph, node names prefixed with 'MW'.
print_table(graph_multiwoz, cluster_counts_multiwoz, 'MW')

Node Count OutDeg InDeg Deg WeighOutDeg WeighInDeg WeighDeg
MW_Statement 85923 11 11 22 1.5523486305168581 1.5103553244930377 3.062703955009896
MW_Directive 24321 10 10 20 1.228708473723932 1.2432442906813264 2.471952764405258
MW_Thanking 16137 10 11 21 1.1963608059726218 1.1294023723415394 2.325763178314161
MW_CheckQ 6779 10 10 20 0.9825125115216726 0.9627280044250204 1.945240515946693
MW_Undetected 3095 11 9 20 0.8219991198594876 0.8219606365864054 1.643959756445893
MW_Salutation 2908 10 9 19 0.7392901643641677 0.7368476860722901 1.4761378504364577
MW_Feedback 1725 10 9 19 0.7421285326069114 0.7031488968072367 1.4452774294141482
MW_Commissive 1280 10 10 20 0.6940766238252151 0.7024346286767051 1.39651125250192
MW_Apology 331 10 10 20 0.45820064981258823 0.4824623983119287 0.940663048124517
MW_SetQ 546 6 9 15 0.3803946458121342 0.5034359196200989 0.8838305654322332
MW_PropQ 3 2 2 4 0.03579606919116487 0.03579606919116487 0.07159213838232974


In [24]:
# Cluster counts listed in descending weighted-degree order, first for
# MultiWOZ and then, after a blank line, for TopicalChat.
for node, _ in sorted(graph_multiwoz.degree(weight='weight'), key=lambda x: x[1], reverse=True):    
    print(node, cluster_counts_multiwoz[node])
print()
for node, _ in sorted(graph_topical.degree(weight='weight'), key=lambda x: x[1], reverse=True):    
    print(node, cluster_counts_topicalchat[node])

Statement 85923
Directive 24321
Thanking 16137
CheckQ 6779
Undetected 3095
Salutation 2908
Feedback 1725
Commissive 1280
Apology 331
SetQ 546
PropQ 3

Statement 138531
Feedback 19883
Undetected 12342
Directive 11682
Thanking 2133
Salutation 2223
CheckQ 526
Commissive 305
SetQ 223
Apology 141
PropQ 5


In [25]:
def get_cluster_places_in_dialogs(dialogs):
    """Group cluster names by where their utterance sits in the dialog.

    Every row of every dialog thread is translated to a cluster name through
    the notebook-level ``reverse_index_f1``. Rows named 'Undetected' are
    skipped; the others are appended under 'start' (first row), 'end' (last
    row) or 'in_between'. A single-row thread contributes to 'start' only.

    Args:
        dialogs: Mapping of dialog id -> dict with a 'thread' sequence whose
            items carry a 'cluster_id'.

    Returns:
        ``defaultdict(list)`` mapping position label -> list of cluster names.
    """
    cluster_places = defaultdict(list)
    for dialog in dialogs.values():
        thread = dialog['thread']
        last_ind = len(thread) - 1
        for ind, row in enumerate(thread):
            cluster_name = "---".join(reverse_index_f1[row['cluster_id']])
            if cluster_name == 'Undetected':
                continue
            # The first position wins when a thread has a single row.
            if ind == 0:
                place = 'start'
            elif ind == last_ind:
                place = 'end'
            else:
                place = 'in_between'
            cluster_places[place].append(cluster_name)
    return cluster_places

In [26]:
# Cluster names of the loaded TopicalChat dialogs, grouped by position in the dialog.
cluster_places = get_cluster_places_in_dialogs(topical_data['dialog'])

In [29]:
# Tally how often each cluster name opens a dialog.
counts_dict = defaultdict(int)
for e in cluster_places['start']:
    counts_dict[e] += 1
print(counts_dict)

defaultdict(<class 'int'>, {'Thanking': 219, 'Directive': 2142, 'Statement': 4805, 'Salutation': 1109, 'CheckQ': 52, 'SetQ': 154, 'Undetected': 82, 'Feedback': 54, 'Commissive': 3})


In [34]:
def get_examples(dialogs, node_from, node_to):
    """Collect utterance pairs that realise the transition node_from -> node_to.

    Consecutive rows of every dialog thread are compared by cluster name
    (looked up through the notebook-level ``reverse_index_f1``). Pairs where
    either side is 'Undetected' never match.

    Args:
        dialogs: Mapping of dialog id -> dict with a 'thread' sequence whose
            items carry 'cluster_id' and 'text0'.
        node_from: Cluster name required for the earlier utterance.
        node_to: Cluster name required for the later utterance.

    Returns:
        List of ``(previous text0, current text0)`` tuples in dialog order.
    """
    examples = []
    for dialog in dialogs.values():
        thread = dialog['thread']
        # Explicit pairwise walk; the earlier prev/cur bookkeeping depended on
        # row truthiness and would misalign pairs on a falsy row.
        for prev_row, row in zip(thread, thread[1:]):
            prev_cluster_name = "---".join(reverse_index_f1[prev_row['cluster_id']])
            cur_cluster_name = "---".join(reverse_index_f1[row['cluster_id']])
            if prev_cluster_name == 'Undetected' or cur_cluster_name == 'Undetected':
                continue
            if node_from == prev_cluster_name and node_to == cur_cluster_name:
                examples.append((prev_row['text0'], row['text0']))
    return examples

In [102]:
# Transition to inspect: earlier utterance n1, later utterance n2.
n1 = 'CheckQ'
n2 = 'Thanking'
# Alternative pair, left commented out:
# n1 = 'SetQ'
# n2 = 'Directive'
# Show at most five example pairs from the TopicalChat dialogs.
get_examples(topical_data['dialog'], n1, n2)[:5]

[('Did you know the first number the white house used was 1?',
  'No didnt know about that theres a lot of interesting facts about the White House and presidents thanks for all the facts its been great chatting with you'),
 ('Hi! Do you like horses? Did you know Clint Eastwood has a mild horse allergy?',
  'Hello,  yes love horses,  so beautiful.  Thats so interesting considering he was in all those western movies lol Did you know 95% of all modern thoroughbred racehorses can trace their y chromosome to one horse?')]

In [103]:
# Same transition, at most five example pairs from the MultiWOZ dialogs.
get_examples(multiwoz_data['dialog'], n1, n2)[:5]

[('You are welcome.  Is there anything else I can assist you with today?',
  'That is all. Thank you for your help! '),
 ('I was able to book you 6 tickets for that train. Your reference number is  PSQ9JOOI. Is there anything else I can help with? ',
  'No, I do not think so. Thank you for your help. '),
 ("Aylesbray postcode is cd17sr and Rosa's postcode is cb22ha. Is there anything else I can help you with today?",
  "No thanks. That's all the help I need. Take care. Bye. "),
 ('Okay, will that be all today or is there anything else I can help you with?',
  'That is all. Thank you!'),
 ("The address is king's parade, is there anything else I can do?",
  'No thank you. Thank you for your help.')]

In [45]:
# Hard-coded two-column tables, ten rows per dataset.
# NOTE(review): these look like rounded (WeighOutDeg, WeighInDeg) pairs copied
# from a print_table run, but they do not match the tables printed earlier in
# this notebook - confirm where they came from and which row is which label.
topical_chat= np.array([[1.638, 1.675],
[1.152, 1.164],
[1.122, 1.119],
[0.760, 0.851],
[0.808, 0.784],
[0.442, 0.546],
[0.436, 0.421],
[0.476, 0.245],
[0.247, 0.262],
[0.045, 0.058]])

multiwoz_arr = np.array([
    [1.529, 1.488],
[1.210,1.225],
[1.155,1.115],
[0.977,0.944],
[0.710,0.720],
[0.725,0.698],
[0.694,0.697],
[0.460,0.495],
[0.414,0.505],
[0.039,0.024]
])

In [55]:
# Scale the whole table by its norm (np.linalg.norm over the full 2-D array)
# and print the second column of every row.
for e in topical_chat / np.linalg.norm(topical_chat):
    print(e[1])

0.4393693711832147
0.3053289242132906
0.2935249709576222
0.22322587156830787
0.20565109672097928
0.14322129950211057
0.11043254045858708
0.06426596772530602
0.06872523895522521
0.015213984196194895


In [57]:
# Scale the whole table by its norm (np.linalg.norm over the full 2-D array)
# and print the second column of every row.
for e in multiwoz_arr / np.linalg.norm(multiwoz_arr):
    print(e[1])

0.37402117814673025
0.30791394034257025
0.28026452529140067
0.23728225280276433
0.1809779894258372
0.17544810641560327
0.1751967480969563
0.12442236773026308
0.12693595091673304
0.006032599647527907
