In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import json
import random
import artm
import nltk
import networkx as nx
import numpy as np

from tqdm import tnrange, tqdm_notebook
from pyclustering.cluster.xmeans import xmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.optics import optics
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from bert_serving.client import BertClient
from nltk.stem.wordnet import WordNetLemmatizer
from collections import Counter
from nltk.corpus import stopwords

In [2]:
dialog_dataset_json = 'babi-1-6-full.json'
vw_data_path = 'data/babi-st.vw'
bigartm_batches_path = 'data/bigartm_batches_babi'
max_st = 25
num_topics = 2
gephi_csv_path = 'data/babi-{}.csv'.format(max_st)
!mkdir data/bigartm_batches_babi

In [166]:
dialog_dataset_json = 'convai2_>3.json'
vw_data_path = 'data/convai2_>3-st.vw'
bigartm_batches_path = 'data/convai2_>3_batches_babi'
max_st = 25
num_topics = 3
gephi_csv_path = 'data/convai2_>3-{}.csv'.format(max_st)
!mkdir 'data/convai2_>3_batches_babi'

mkdir: cannot create directory ‘data/convai2_>3_batches_babi’: File exists


In [170]:
dialog_dataset_json = 'f1-labeled-dialogs-with-dm.json'
vw_data_path = 'data/convai1-st.vw'
bigartm_batches_path = 'data/convai1_batches_babi'
max_st = 30
num_topics = 3
gephi_csv_path = 'data/convai1-{}.csv'.format(max_st)
!mkdir 'data/convai1_batches_babi'

mkdir: cannot create directory ‘data/convai1_batches_babi’: File exists


In [174]:
dialog_dataset_json = 'data/multi-woz2.json'
vw_data_path = 'data/multi-woz2-st.vw'
bigartm_batches_path = 'data/multi-woz2_batches'
max_st = 13
num_topics = 7
gephi_csv_path = 'data/multi-woz2-{}.csv'.format(max_st)
!mkdir 'data/multi-woz2_batches'

mkdir: cannot create directory ‘data/multi-woz2_batches’: File exists


In [6]:
stopwords_en = set(stopwords.words('english'))

def convert_to_vw(text, id_):
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    lmtzr = WordNetLemmatizer()
    tokens = [t.lower() for t in tokenizer.tokenize(text)]
    processed = []
    for t in tokens:
        l = lmtzr.lemmatize(t)
        if l not in stopwords_en:
            processed.append(l)
    counted = Counter(processed)
    res_str = str(id_)
    for k, v in counted.items():
        if v != 1:
            res_str = res_str + " {}:{}".format(k, v)
        else:
            res_str = res_str + " {}".format(k)
    return res_str


def convert_to_vw_data(sentences, vw_filename):
    vw_file = open(vw_filename, 'w')
    vw_data = []
    ind = 0
    for sent in sentences:
        converted = convert_to_vw(sent, ind)
        if len(converted.split(" ")) > 1:
            vw_data.append(convert_to_vw(sent, ind))
            ind += 1
    for row in vw_data:
        print(row, file=vw_file)
    vw_file.close()

    
def save_vw_to_file(sentences, vw_filename):
    vw_file = open(vw_filename, 'w')
    vw_data = []
    for sent in sentences:
        if len(sent.split(" ")) > 1:
            vw_data.append(sent)
        else:
            vw_data.append("this is noise entry for topic modelling")
    for row in vw_data:
        print(row, file=vw_file)
    vw_file.close()

In [8]:
def _init_sentence_embedder():
    # Other embedder https://tfhub.dev/google/universal-sentence-encoder-large/3
    tf.logging.set_verbosity(tf.logging.ERROR)
    module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
    embedder = hub.Module(module_url)
    return embedder
        
def _embed_sentences(sentences):
    embedder = _init_sentence_embedder()
    with tf.Session() as session:
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        embeddings = session.run(embedder(sentences))
    return embeddings

def _init_tfidf_vectorizer():
    stop_words = []
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', norm='l2')
    return vectorizer

def _embed_sentences_with_tfidf(sentences):
    vectorizer = _init_tfidf_vectorizer()
    word_doc_matrix = vectorizer.fit_transform(sentences)
    return {'vectorizer': vectorizer, 'word_doc_matrix': word_doc_matrix}

def _embed_sentences_with_bert(sentences):
    bc = BertClient()
    return bc.encode(sentences)

def _extract_df_data_naive(dialogs):
    df_data = defaultdict(list)
    id_ = 0
    for dialog_id, dialog in dialogs.items():
        current_ind_both = 1

        thread = dialog['thread']

        for row in thread:
            text = row['text']
            df_data['st'].append(current_ind_both)
            df_data['sent'].append(text)
            df_data['cluster_id'].append(None)
            df_data['cluster_name'].append(None)
            df_data['user_id'].append(row['userId'])
            df_data['vw_sent'].append(convert_to_vw(text, id_))
            df_data['topic_name'].append('')
            df_data['topic_score'].append(0)
            id_ += 1
            current_ind_both += 1
    return pd.DataFrame(data=df_data) 


def add_cluster_name_to_dialogs(json_filepath, df):
    with open(json_filepath, 'r') as f:
        dialogs = json.load(f)
        
    for dialog_id, dialog in dialogs.items():
        thread = dialog['thread']

        current_ind = 1

        for row in thread:
            text = row['text']
            row['cluster_name'] = df[(df['st'] == current_ind) & (df['sent'] == text)].iloc[0]['topic_name_uniq_with_st']
            current_ind += 1
    return dialogs

In [9]:
with open(dialog_dataset_json, 'r') as f:
    dialogs = json.load(f)
dialogs = dict(list(dialogs.items()))

In [12]:
df = _extract_df_data_naive(dialogs)

In [13]:
df.st.describe() 

count    36680.000000
mean        19.172083
std         11.144493
min          1.000000
25%         10.000000
50%         19.000000
75%         28.000000
max         54.000000
Name: st, dtype: float64

In [14]:
for i in range(1, max_st):    
    sentences = df[df.st == i]['vw_sent']
    save_vw_to_file(sentences, vw_data_path)
    batch_vectorizer = artm.BatchVectorizer(data_path=vw_data_path, data_format='vowpal_wabbit',
                                        target_folder='{}/{}'.format(bigartm_batches_path, i))
    lda = artm.LDA(num_topics=num_topics, alpha=0.01, beta=0.001,
               num_document_passes=5, dictionary=batch_vectorizer.dictionary)
    lda.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=100)
    print(i, lda.sparsity_phi_last_value, lda.sparsity_theta_last_value, lda.perplexity_value[-1])
    top_tokens = lda.get_top_tokens(num_tokens=5)
    topics = {}    
    for j, token_list in enumerate(top_tokens):
        topic_name = 'topic_' + str(j)
        topic_value = ", ".join(token_list)
        print('{}: {}'.format(topic_name, topic_value))
        topics[topic_name] = topic_value
    print("------")
    
    theta = lda.transform(batch_vectorizer)
    sentences_topics = []
    sentences_topics_scores = [] 
    for k in range(min(len(sentences), theta.shape[1])):
        topics_distribution = theta[k]
        top1_topic = sorted(list(topics_distribution.items()), key=lambda x: x[1], reverse=True)[0]
        topic_name = top1_topic[0]        
        topic_str = ", ".join(sorted(set(topics[topic_name].split(", "))))        
        sentences_topics.append(topic_str)
        sentences_topics_scores.append(top1_topic[1])
    
    df.loc[df.st == i, 'topic_name'] = sentences_topics
    df.loc[df.st == i, 'topic_score'] = sentences_topics_scores

1 0.0 0.0 2.3037147521972656
topic_0: hello, hi, morning, good, morning
topic_1: good, hi, hello
------
2 0.0 0.0 2.999999761581421
topic_0: today, hello, help, help, hello
topic_1: today
------
3 0.0 0.0 18.630420684814453
topic_0: price, range, restaurant, make, reservation
topic_1: table, book, may, like, food
------
4 0.0 0.0 6.0
topic_0: topic, entry, noise, for, is
topic_1: modelling, is, for, noise, entry
------
5 0.0 0.0 1.0
topic_0: silence, silence
topic_1: 
------
6 0.0 0.0 6.822171688079834
topic_0: modelling, entry, for, noise, topic
topic_1: preference, cuisine, type, many, party
------
7 0.0 0.0 10.169947624206543
topic_0: please, london, paris, people, six
topic_1: food, love, cuisine, spanish, italian
------
8 0.0 0.0 9.48558235168457
topic_0: look, ok, let, option, topic
topic_1: party, would, many, people, range
------
9 0.0 0.0 8.320347785949707
topic_0: please, people, two, price, range
topic_1: silence, rome, bombay, madrid, please
------
10 0.0 0.0 11.28253173828

In [15]:
similar_topic_names_mapping = {}
topic_names = list(set(df['topic_name']))
for i in range(len(topic_names)):
    t1 = set(topic_names[i].split(", "))
    t1_str = ", ".join(sorted(t1))
    for j in range(i+1, len(topic_names)):
        t2 = set(topic_names[j].split(", "))
        if len(t1 - t2) <= 1:
            t2_str = ", ".join(sorted(t2))
            similar_topic_names_mapping[t1_str] = t2_str    
    if not similar_topic_names_mapping.get(t1_str):
        similar_topic_names_mapping[t1_str] = t1_str

In [16]:
df['topic_name_uniq'] = df['topic_name'].map(lambda x: similar_topic_names_mapping[x])

In [17]:
topic_name_to_st = defaultdict(set)
for _, row in df[['topic_name_uniq', 'st']].iterrows():
    topic_name_to_st[row['topic_name_uniq']].add(row['st'])

orig_topic_names_to_names_with_st = {}
for topic_name in topic_name_to_st.keys():    
    speech_turns = [str(e) for e in sorted(topic_name_to_st[topic_name])]
    orig_topic_names_to_names_with_st[topic_name] = "{} [{}]".format(topic_name, ",".join(speech_turns))

In [18]:
df['topic_name_uniq_with_st'] = df['topic_name_uniq'].map(lambda x: orig_topic_names_to_names_with_st[x])

In [19]:
st_sent_map = {}
for _, row in df.iterrows():
    st_sent_map[(row['st'], row['sent'])] = row['topic_name_uniq_with_st']

In [94]:
# st_sent_map = {}
# for _, row in df.iterrows():
#     st_sent_map[(row['st'], row['sent'])] = "{} [{}]".format(row['topic_name_uniq'], row['st'])

In [20]:
with open(dialog_dataset_json, 'r') as f:
    dialogs = json.load(f)
        
    for dialog_id, dialog in tqdm_notebook(dialogs.items()):
        thread = dialog['thread']

        current_ind = 1

        for row in thread:
            text = row['text']
            row['cluster_name'] = st_sent_map[(current_ind, text)]
            current_ind += 1

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [21]:
def build_graph(save_filename, dialogs, step_max=15):
    graph_dict = defaultdict(int)
    for dialog_id, dialog in dialogs.items():
        thread = dialog['thread']

        current_ind = 1
        prev_row = None
        row = None
        for ind in range(len(thread)):
            if prev_row:
                row = thread[ind]

            if not prev_row:
                prev_row = thread[ind]
                row = None

            if row and current_ind < step_max:
                edge = (prev_row['cluster_name'], row['cluster_name'])
                graph_dict[edge] += 1
                current_ind += 1
                prev_row = row
                row = None
    G = nx.DiGraph()
    weighted_edges = [(k[0], k[1], v) for k, v in graph_dict.items()]
    G.add_weighted_edges_from(weighted_edges)

    m = nx.adjacency_matrix(G).todense().astype(float)
    m = np.squeeze(np.asarray(m))

    for arr in m.tolist():
        str_arr = [str(e) for e in arr]
        print(",".join(str_arr))
    print("\n".join(list(G.nodes)))
    with open(save_filename, 'w') as f:
        nodes = list(G.nodes)
        print(";" + ";".join(nodes), file=f)
        for ind, arr in enumerate(m.tolist()):
            str_arr = [nodes[ind]]
            str_arr += [str(e) for e in arr]
            print(";".join(str_arr), file=f)
    return G

In [22]:
g = build_graph(gephi_csv_path, dialogs, max_st)

0.0,1000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,517.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,483.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,517.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,1281.0,0.0,0.0,596.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,194.0,0.0,0.0,0.0,0.0,81.0,0.0,0.0,0.0
0.0,0.0,0.0,362.0,0.0,103.0,0.0,322.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,638.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,267.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,512.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,128.0,0.0,0.0,282.0,0.0,246.0,665.0,0.0,0.0,0.0,338.0,392.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,451.0,482.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,286.0,0.0,0.0,108.0,0.0,556.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,137.0,0.0,0.0,315.0