In [1]:
import json
import random
from collections import defaultdict

In [2]:
# Seed Python's RNG so that the random.sample call used to print cluster examples is repeatable.
random.seed(42)

In [3]:
# Path of the JSON file to load. Swap in one of the commented-out paths to analyse another dataset.
input_json = 'data/convai1.spacy.dialogact.discourse.dialogtagger.3009.json'
# input_json = 'data/convai2.spacy.dialogact.discourse.dialogtagger.0110.json'
# input_json = 'data/multi-woz2.spacy.dialogact.discourse.dialogtagger.0110.json'

In [4]:
# Load the dialogs. The encoding is given explicitly because JSON text is UTF-8 and the
# utterances contain non-ASCII characters; the platform default encoding may differ.
with open(input_json, 'r', encoding='utf-8') as f:
    dialogs = json.load(f)

In [5]:
def get_features(utterance, include_pos=False):
    """Build the list of categorical features that describe one utterance.

    Parameters
    ----------
    utterance : dict
        Must provide ``'predictions'`` (items whose first element is a label),
        ``'SVM_predictions'`` (dicts with ``'dimension'`` and
        ``'communicative_function'``) and ``'single_discourse_type'``.
        ``'pair_discourse_type'`` is optional. ``'features_dict'`` (dicts with a
        ``'pos'`` entry) is read only when ``include_pos`` is true.
    include_pos : bool, default False
        When true, add one extra feature: the ``'|'``-joined POS tags of the
        utterance. Off by default, which matches the previous behaviour where
        this feature was commented out.

    Returns
    -------
    list of str
        Labels from ``'predictions'`` containing ``'_dci'``, one
        ``"dim_<dimension> comm_func_<function>"`` string per SVM prediction,
        the single discourse type, optionally the POS string, and the pair
        discourse type when it is set and is not ``'PAIR_NONE'``.
    """
    # Keep only the prediction labels that carry the '_dci' marker.
    dialog_act_features = [p[0] for p in utterance['predictions'] if '_dci' in p[0]]
    dialog_tagger_features = [
        f"dim_{p['dimension']} comm_func_{p['communicative_function']}"
        for p in utterance['SVM_predictions']
    ]

    features = dialog_act_features + dialog_tagger_features + [utterance['single_discourse_type']]

    if include_pos:
        # Computed only on request, so utterances without 'features_dict' still work by default.
        features.append("|".join(feats['pos'] for feats in utterance['features_dict']))

    pair_discourse_type = utterance.get('pair_discourse_type')
    if pair_discourse_type and pair_discourse_type != 'PAIR_NONE':
        features.append(pair_discourse_type)
    return features

# def get_features(utterance):
#     dialog_act_features = [p[0] for p in utterance['predictions']]
#     pos_features = "|".join([feats['pos'] for feats in utterance['features_dict']])
#     single_discourse_type = utterance['single_discourse_type']
#     pair_discourse_type = utterance.get('pair_discourse_type')
    
#     dialog_tagger_features = [f"dim_{p['dimension']} comm_func_{p['communicative_function']}" for p in utterance['SVM_predictions']]

#     features = dialog_act_features  + dialog_tagger_features + [single_discourse_type] + [pos_features]
#     if pair_discourse_type:
#         features += [pair_discourse_type]
#     return features

In [6]:
# Attach a hashable feature set to every utterance; it is later used as the cluster key.
for dialog_id, dialog in dialogs.items():
    thread = dialog['thread']
    for row in thread:
        features = get_features(row)
        row['final_features'] = frozenset(features)

In [7]:
def collect_clusters(dialogs):
    """Index the distinct utterance feature sets and label every utterance.

    Each distinct ``row['final_features']`` value found in the dialogs'
    ``'thread'`` lists becomes one cluster. Clusters are numbered from 0 in the
    order given by the string form of their sorted members, so the numbering
    does not depend on set iteration order.

    Side effect: every row gets a ``'cluster_id'`` entry.

    Returns
    -------
    dict
        Mapping ``feature set -> cluster id``.
    """
    unique_feature_sets = {
        row['final_features']
        for dialog in dialogs.values()
        for row in dialog['thread']
    }
    ordered = sorted(unique_feature_sets, key=lambda feats: str(sorted(feats)))
    clusters_index = {feats: number for number, feats in enumerate(ordered)}

    for dialog in dialogs.values():
        for row in dialog['thread']:
            row['cluster_id'] = clusters_index[row['final_features']]
    return clusters_index

In [26]:
def collect_duo_cluster(dialogs):
    """Build clusters for pairs of consecutive utterances.

    For every utterance after the first in a dialog's ``'thread'``, a pair row
    is created with the union of its and the previous utterance's
    ``'final_features'`` plus both texts (``'text1'`` is the previous one,
    ``'text2'`` the current one). The pair rows are stored under
    ``dialog['duo_thread']`` and each receives a ``'cluster_id'``.

    Cluster ids start at 0 and follow the string form of the sorted members of
    each merged feature set.

    Returns
    -------
    dict
        Mapping ``merged feature set -> cluster id``.
    """
    seen_feature_sets = set()
    for dialog in dialogs.values():
        utterances = dialog['thread']
        pair_rows = []
        # Walk over (previous, current) neighbours; a one-utterance thread yields no pairs.
        for previous, current in zip(utterances, utterances[1:]):
            merged = frozenset.union(current['final_features'], previous['final_features'])
            pair_rows.append({
                'final_features': merged,
                'text1': previous['text'],
                'text2': current['text'],
            })
            seen_feature_sets.add(merged)
        dialog['duo_thread'] = pair_rows

    ordered = sorted(seen_feature_sets, key=lambda feats: str(sorted(feats)))
    clusters_index = {feats: number for number, feats in enumerate(ordered)}

    for dialog in dialogs.values():
        for pair_row in dialog['duo_thread']:
            pair_row['cluster_id'] = clusters_index[pair_row['final_features']]
    return clusters_index

In [8]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity ``|s1 & s2| / |s1 | s2|`` rounded to 5 decimals.

    Parameters
    ----------
    s1, s2 : set or frozenset
        The two sets to compare.

    Returns
    -------
    float
        A value in ``[0, 1]``. Two empty sets are treated as identical and give
        ``1.0``; this case used to raise ``ZeroDivisionError``.
    """
    union_size = len(s1.union(s2))
    if union_size == 0:
        # Both sets are empty: the ratio is undefined, so report "identical".
        return 1.0
    # The intersection is never larger than the union, so the ratio is already
    # inside [0, 1] and needs no clamping.
    return round(len(s1.intersection(s2)) / union_size, 5)

In [9]:
# Index the distinct per-utterance feature sets and tag every utterance with its cluster id.
uno_clusters = collect_clusters(dialogs)

In [33]:
# Do the same for pairs of consecutive utterances (pair rows are stored under each dialog's 'duo_thread').
duo_clusters = collect_duo_cluster(dialogs)

In [36]:
# Number of distinct clusters: single utterances vs. pairs of consecutive utterances.
len(uno_clusters), len(duo_clusters)

(248, 597)

In [10]:
def get_sims_info(clusters, for_hist=False):
    """Find, for each cluster, the most similar cluster with a higher id.

    Parameters
    ----------
    clusters : dict
        Mapping ``feature set -> cluster id``. Ids must be unique; the lookup
        below indexes them as ``0 .. len(clusters) - 1``.
    for_hist : bool, default False
        When true, one entry is reported for every cluster. When false, an
        entry is reported only if neither its source nor its target already
        appears in an earlier reported entry, so each id is used at most once.

    Returns
    -------
    list of dict
        Entries ``{'source_ind', 'target_ind', 'sim'}``. Among equally similar
        candidates the one with the highest id is kept. A cluster with no
        higher-id candidate (the last one) gets ``target_ind`` 0 and ``sim`` 0.
    """
    assert len(clusters.values()) == len(set(clusters.values()))
    features_by_id = {cluster_id: feats for feats, cluster_id in clusters.items()}
    total = len(clusters)

    already_paired = set()
    found = []
    for source in range(total):
        best_sim, best_target = 0, 0
        for candidate in range(source + 1, total):
            similarity = jaccard_similarity(features_by_id[source], features_by_id[candidate])
            if similarity >= best_sim:
                best_sim, best_target = similarity, candidate

        entry = {'source_ind': source, 'target_ind': best_target, 'sim': best_sim}
        if for_hist:
            found.append(entry)
        elif source not in already_paired and best_target not in already_paired:
            found.append(entry)
            already_paired.update((source, best_target))
    return found

In [11]:
# Pair each cluster with its most similar higher-indexed cluster (each index is used at most once).
sims_info = get_sims_info(uno_clusters)
source_inds = [e['source_ind'] for e in sims_info]
target_inds = [e['target_ind'] for e in sims_info]
# Sums of the paired indices; presumably a quick fingerprint for comparing runs (TODO confirm).
sum(source_inds), sum(target_inds)

(7149, 16576)

In [12]:
import plotly.express as px

# Histogram of best-match similarities between unique clusters.
# for_hist=True reports one entry per cluster instead of one per disjoint pair.
sims_info_for_hist = get_sims_info(uno_clusters, True)
fig = px.histogram(sims_info_for_hist, x="sim", nbins=len(sims_info_for_hist))
# The figure is written to sims_hist.html.
fig.write_html('sims_hist.html', auto_open=True)

In [13]:
# Similarity threshold passed to filter_sims_and_merge_clusters: pairs with sim >= this value are merged.
min_thresh = 0.74

In [14]:
def _merge_similar_pairs(sims_info, clusters, threshold):
    """Run one merge pass; return ``(new_clusters, cluster_id_mapping, is_merged)``."""
    assert len(clusters.values()) == len(set(clusters.values()))
    reverse_index = {i: c for c, i in clusters.items()}

    new_clusters = {}
    cluster_id_mapping = {}
    used_inds = set()
    is_merged = False

    for e in sims_info:
        # A cluster paired with itself is not a merge. Treating it as one never
        # shrinks the cluster set and would keep the merge rounds going forever.
        if e['sim'] >= threshold and e['source_ind'] != e['target_ind']:
            new_cluster = frozenset().union(reverse_index[e['source_ind']], reverse_index[e['target_ind']])
            if new_cluster not in new_clusters:
                new_clusters[new_cluster] = len(new_clusters)
            is_merged = True
            used_inds.update((e['source_ind'], e['target_ind']))
            cluster_id_mapping[e['source_ind']] = new_clusters[new_cluster]
            cluster_id_mapping[e['target_ind']] = new_clusters[new_cluster]

    # Carry over the untouched clusters in ascending id order. One whose feature
    # set equals a freshly merged set shares that set's id.
    for ind in sorted(reverse_index):
        if ind not in used_inds:
            if reverse_index[ind] not in new_clusters:
                new_clusters[reverse_index[ind]] = len(new_clusters)
            cluster_id_mapping[ind] = new_clusters[reverse_index[ind]]
    return new_clusters, cluster_id_mapping, is_merged


def filter_sims_and_merge_clusters(sims_info, clusters, threshold, dialogs):
    """Merge similar clusters round by round until no pair reaches ``threshold``.

    In each round every ``sims_info`` entry with ``sim >= threshold`` merges its
    source and target clusters into the union of their feature sets, the
    clusters are renumbered from 0, and ``row['cluster_id']`` of every row in
    each dialog's ``'thread'`` is remapped in place. The similarities are then
    recomputed with ``get_sims_info`` for the next round.

    The rounds run in a loop rather than by recursion, so the number of rounds
    is not limited by the interpreter's recursion depth. Self-pairs
    (``source_ind == target_ind``) are ignored; previously such a pair reaching
    the threshold caused unbounded recursion.

    Parameters
    ----------
    sims_info : list of dict
        Entries ``{'source_ind', 'target_ind', 'sim'}`` computed for ``clusters``.
    clusters : dict
        Mapping ``feature set -> cluster id`` with unique ids.
    threshold : float
        Minimum similarity at which a pair is merged.
    dialogs : dict
        Dialogs whose ``'thread'`` rows carry a ``'cluster_id'``; modified in place.

    Returns
    -------
    dict
        The final mapping ``feature set -> cluster id``.
    """
    while True:
        new_clusters, cluster_id_mapping, is_merged = _merge_similar_pairs(sims_info, clusters, threshold)
        if not is_merged:
            return new_clusters

        for dialog in dialogs.values():
            for row in dialog['thread']:
                row['cluster_id'] = cluster_id_mapping[row['cluster_id']]

        if len(new_clusters) < 2:
            # A single remaining cluster has nothing left to merge with.
            return new_clusters
        clusters = new_clusters
        sims_info = get_sims_info(new_clusters)

In [15]:
# Repeatedly merge cluster pairs whose similarity reaches min_thresh; utterance cluster ids are remapped in place.
new_clusters = filter_sims_and_merge_clusters(sims_info, uno_clusters, min_thresh, dialogs)

In [16]:
def check_correctness(new_clusters, dialogs):
    """Assert that the utterances use exactly as many cluster ids as there are clusters.

    Collects the distinct ``row['cluster_id']`` values over every dialog's
    ``'thread'`` and compares their count with ``len(new_clusters)``.

    Raises
    ------
    AssertionError
        If the two counts differ.
    """
    ids_in_use = {
        row['cluster_id']
        for dialog in dialogs.values()
        for row in dialog['thread']
    }
    assert len(ids_in_use) == len(new_clusters)

# Sanity check: the number of cluster ids found on the utterances must equal the number of merged clusters.
check_correctness(new_clusters, dialogs)

In [17]:
# Cluster count before and after merging, plus the first three merged clusters.
print(len(uno_clusters), len(new_clusters), list(new_clusters.items())[:3])
# Lookup from cluster id to feature set for the merged clusters; read by later cells.
reverse_index = {i: c for c, i in sorted(new_clusters.items(), key=lambda x: x[1])} 

248 140 [(frozenset({'PAIR_CONN', 'SINGLE_S_COORD', 'General_ChatIntent_dci', 'dim_Task comm_func_Statement', 'ClarificationIntent_dci'}), 0), (frozenset({'PAIR_CONN', 'Information_DeliveryIntent_dci', 'General_ChatIntent_dci', 'dim_Task comm_func_Statement', 'SINGLE_VP_COORD'}), 1), (frozenset({'PAIR_ANAPHORA', 'SINGLE_RELATIVE', 'General_ChatIntent_dci', 'dim_Task comm_func_Statement', 'Information_RequestIntent_dci'}), 2)]


In [18]:
# Count how many utterances carry each cluster id.
cluster_usage_distribution = defaultdict(int)
for dialog_id, dialog in dialogs.items():
        thread = dialog['thread']
        for ind, row in enumerate(thread):
            cluster_usage_distribution[row['cluster_id']] += 1

In [19]:
# Plot cluster usage with one category per cluster id, labelled "c_<id>".
xs = [f"c_{e}" for e in list(cluster_usage_distribution.keys())]
fig = px.histogram(x=xs, y=list(cluster_usage_distribution.values()), nbins=len(cluster_usage_distribution.keys()), labels={'x': 'cluster id'})
# Order the categories by their total, largest first.
fig.update_layout(xaxis={'categoryorder':'total descending'})
# The figure is written to usage_hist.html.
fig.write_html('usage_hist.html', auto_open=True)

In [20]:
# For every cluster id, collect [previous utterance text, utterance text] pairs.
# The previous text is None for the first utterance of a dialog.
cluster_examples = defaultdict(list)
for dialog_id, dialog in dialogs.items():
        thread = dialog['thread']
        for ind, row in enumerate(thread):
            prev_text = None
            if ind > 0:
                prev_text = thread[ind - 1]['text']
            cluster_examples[row['cluster_id']].append([prev_text, row['text']])

In [21]:
# Print the ten most frequent clusters: their features and up to ten random example exchanges.
top_cluster_ids = []
for cluster_id, freq in sorted(cluster_usage_distribution.items(), key=lambda x: x[1], reverse=True)[:10]:
    print(f"Cluster id: {cluster_id}; Frequency: {freq}")
    print("-------------")
    for e in sorted(reverse_index[cluster_id]):
        print(e)
    print("Samples: ")
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the sample size for clusters with fewer than 10 examples.
    for s in random.sample(cluster_examples[cluster_id], min(10, len(cluster_examples[cluster_id]))):
        s1, s2 = s
        # s1 is None when the sampled utterance opens its dialog.
        if s1:
            print(f"- S1: {s1}")
        print(f"- S2: {s2}")
    print("-------------")
    print()
    top_cluster_ids.append(cluster_id)

Cluster id: 0; Frequency: 161
-------------
ClarificationIntent_dci
General_ChatIntent_dci
PAIR_CONN
SINGLE_S_COORD
dim_Task comm_func_Statement
Samples: 
- S1: Infact I did mentioned the same
- S2: How old are you?
- S1: The Atlantic
- S2: It is incorrect! Hint: first 3 letters is atl. Try again.
- S1: What
- S2: Are you confused?
- S1: That is a tautology
- S2: A tautology is a terrible thing to waste.
- S1: I'm not gon na let you go.
- S2: I am not happy with ur use of language
- S1: How much of the law did the economist describe to the free market?
- S2: I dont know
- S1: 3?
- S2: It is incorrect:) Hint: first 3 letters is thr. Try again
- S1: Did the front range really become the front range?
- S2: It's not that hard.
- S1: Are you shy?
- S2: no, are you?
- S1: I'm so sorry for the loss of your soul.
- S2: It is okay... I forgive myself. I still have my soul sir... it's is my spirit I'm working to reconcile.
-------------

Cluster id: 17; Frequency: 153
-------------
General_ChatI


- General_ChatIntent_dci - confidence (freq in cluster)
- SINGLE_APPOSITION - confidence
- Turn - distribution , mean
- Попробовать двойки, четверки

- Описать все признаки

- Попробовать Левенштейна

In [23]:
# Peek at the feature sets of a few merged clusters.
reverse_index[0], reverse_index[3], reverse_index[5], reverse_index[19]

(frozenset({'ClarificationIntent_dci',
            'General_ChatIntent_dci',
            'PAIR_CONN',
            'SINGLE_S_COORD',
            'dim_Task comm_func_Statement'}),
 frozenset({'General_ChatIntent_dci',
            'Information_RequestIntent_dci',
            'PAIR_ANAPHORA',
            'SINGLE_VP_COORD',
            'dim_Task comm_func_Statement'}),
 frozenset({'General_ChatIntent_dci',
            'Information_DeliveryIntent_dci',
            'Information_RequestIntent_dci',
            'SINGLE_S_COORD',
            'dim_Task comm_func_Statement'}),
 frozenset({'General_ChatIntent_dci',
            'Opinion_RequestIntent_dci',
            'PAIR_ANAPHORA',
            'SINGLE_APPOSITION',
            'dim_Task comm_func_Directive'}))

Имея разметку по одной реплике, можно построить разметки по N реплик:
- thread => `{'1': [{'text': 'blabla', 'cluster_id': 232}, ...], '2': [{'text1': 'blabla', 'text2': 'blabla', 'cluster_id': 211}, ...]}`
- thread => `[{'text': 'blabla', 'cluster_id': 232, 'final_features_before_clustering': frozenset(...)}, ...]`
- thread2 => `[{'text1': 'blabla', 'text2': 'blabla', 'cluster_id': 211, 'final_features_before_clustering': frozenset(...)}, ...]`

In [25]:
# Show the first utterance of the last dialog. It is looked up explicitly rather than
# through the `thread` variable that earlier loop cells happen to leave behind.
list(dialogs.values())[-1]['thread'][0]

{'userId': 'Bot',
 'evaluation': 0,
 'text': 'What’s up?',
 'skill_id': '9',
 'true_skill_id': '9',
 'predictions': [['Other_dct', '0.9473551'],
  ['Phatic_ct', '0.99975026'],
  ['General_ChatIntent_dci', '0.98970103']],
 'features_dict': [{'lemma': 'what',
   'pos': 'PRON',
   'tag': 'WP',
   'dep': 'nsubj',
   'shape': 'Xxxx',
   'is_alpha': True,
   'is_stop': True},
  {'lemma': '’',
   'pos': 'VERB',
   'tag': 'VBZ',
   'dep': 'ROOT',
   'shape': '’x',
   'is_alpha': False,
   'is_stop': True},
  {'lemma': 'up',
   'pos': 'ADP',
   'tag': 'IN',
   'dep': 'prt',
   'shape': 'xx',
   'is_alpha': True,
   'is_stop': True},
  {'lemma': '?',
   'pos': 'PUNCT',
   'tag': '.',
   'dep': 'punct',
   'shape': '?',
   'is_alpha': False,
   'is_stop': False}],
 'single_discourse_type': 'SINGLE_CATAPHORA',
 'SVM_predictions': [{'dimension': 'Task',
   'communicative_function': 'Statement'}],
 'final_features': frozenset({'General_ChatIntent_dci',
            'SINGLE_CATAPHORA',
            'di