In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import json
import random
import artm
import nltk
import networkx as nx
import numpy as np
import copy

from tqdm import tnrange, tqdm_notebook
from pyclustering.cluster.xmeans import xmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.optics import optics
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from bert_serving.client import BertClient
from nltk.stem.wordnet import WordNetLemmatizer
from collections import Counter
from nltk.corpus import stopwords

In [28]:
!mkdir data/predict
!mkdir data/predict/bigartm_batches

mkdir: cannot create directory ‘data/predict/bigartm_batches’: File exists


In [237]:
dialog_dataset_json = 'babi-1-6-full.json'
vw_data_path = 'data/babi-st.vw'
bigartm_batches_path = 'data/bigartm_batches_babi'
max_st = 54
num_topics = 2
gephi_csv_path = 'data/babi-{}.csv'.format(max_st)
!mkdir data/bigartm_batches_babi

mkdir: cannot create directory ‘data/bigartm_batches_babi’: File exists


In [166]:
dialog_dataset_json = 'convai2_>3.json'
vw_data_path = 'data/convai2_>3-st.vw'
bigartm_batches_path = 'data/convai2_>3_batches_babi'
max_st = 25
num_topics = 3
gephi_csv_path = 'data/convai2_>3-{}.csv'.format(max_st)
!mkdir 'data/convai2_>3_batches_babi'

mkdir: cannot create directory ‘data/convai2_>3_batches_babi’: File exists


In [170]:
dialog_dataset_json = 'f1-labeled-dialogs-with-dm.json'
vw_data_path = 'data/convai1-st.vw'
bigartm_batches_path = 'data/convai1_batches_babi'
max_st = 30
num_topics = 3
gephi_csv_path = 'data/convai1-{}.csv'.format(max_st)
!mkdir 'data/convai1_batches_babi'

mkdir: cannot create directory ‘data/convai1_batches_babi’: File exists


In [2]:
dialog_dataset_json = 'multi-woz2.json'
vw_data_path = 'data/multi-woz2-st.vw'
bigartm_batches_path = 'data/multi-woz2_batches'
max_st = 13
num_topics = 7
gephi_csv_path = 'data/multi-woz2-{}.csv'.format(max_st)
!mkdir 'data/multi-woz2_batches'

mkdir: cannot create directory ‘data/multi-woz2_batches’: File exists


In [3]:
# English stop-word set; convert_to_vw drops every lemma found in it.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be available locally - confirm.
stopwords_en = set(stopwords.words('english'))

def convert_to_vw(text, id_, user_id):
    """Render one utterance as a single bag-of-words line.

    The text is split into word tokens, lower-cased, lemmatized with WordNet
    and filtered against ``stopwords_en``. The resulting line has the form
    ``"<id_> lemma lemma:count ... <user_id>"``: a lemma seen once is written
    bare, a repeated lemma is written as ``lemma:count``, and ``user_id`` is
    always appended as the last field.

    Args:
        text: raw utterance.
        id_: value written as the first field of the line.
        user_id: value written as the last field of the line.

    Returns:
        The formatted line as a string.
    """
    word_tokenizer = nltk.RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(raw.lower()) for raw in word_tokenizer.tokenize(text))
    # Counter keeps first-occurrence order, which fixes the order of the fields.
    frequencies = Counter(lemma for lemma in lemmas if lemma not in stopwords_en)

    fields = [str(id_)]
    for lemma, count in frequencies.items():
        fields.append(" {}".format(lemma) if count == 1 else " {}:{}".format(lemma, count))
    fields.append(' {}'.format(user_id))
    return "".join(fields)


def convert_to_vw_data(sentences, vw_filename, user_id=''):
    """Convert raw sentences with ``convert_to_vw`` and write them to a file.

    Sentences that yield no tokens after preprocessing are skipped; the
    remaining rows are numbered consecutively from 0.

    Args:
        sentences: iterable of raw sentence strings.
        vw_filename: path of the output file (overwritten).
        user_id: value forwarded to ``convert_to_vw`` as its ``user_id``
            argument. Defaults to an empty string, in which case no trailing
            field is written.
    """
    rows = []
    ind = 0
    for sent in sentences:
        # convert_to_vw requires three arguments; the previous two-argument
        # call raised TypeError on the first sentence.
        converted = convert_to_vw(sent, ind, user_id)
        # convert_to_vw always emits "<id> ... <user_id>", so a sentence without
        # any surviving token produces exactly "<id> <user_id>". Counting
        # space-separated fields cannot detect that, hence the direct comparison.
        if converted != '{} {}'.format(ind, user_id):
            # rstrip only matters for the empty default user_id.
            rows.append(converted.rstrip())
            ind += 1
    # The context manager closes the file even if writing fails.
    with open(vw_filename, 'w') as vw_file:
        for row in rows:
            print(row, file=vw_file)

    
def save_vw_to_file(sentences, vw_filename):
    """Write already formatted rows to ``vw_filename``, one per line.

    A row that consists of a single space-separated field is replaced by a
    fixed noise sentence, so the number of lines written always equals the
    number of input rows.

    Args:
        sentences: iterable of row strings.
        vw_filename: path of the output file (overwritten).
    """
    noise_row = "this is noise entry for topic modelling"
    # The context manager guarantees the handle is closed even if a row is
    # not a string or the write fails part-way through.
    with open(vw_filename, 'w') as vw_file:
        for sent in sentences:
            row = sent if len(sent.split(" ")) > 1 else noise_row
            print(row, file=vw_file)

In [4]:
def _init_sentence_embedder():
    """Load the Universal Sentence Encoder (v2) module from TF Hub.

    Also lowers TensorFlow's log verbosity to ERROR as a side effect.

    Returns:
        The ``hub.Module`` for the encoder.
    """
    # Other embedder https://tfhub.dev/google/universal-sentence-encoder-large/3
    tf.logging.set_verbosity(tf.logging.ERROR)
    return hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
        
def _embed_sentences(sentences):
    """Embed ``sentences`` with the module from ``_init_sentence_embedder``.

    A fresh TensorFlow session is opened for the call; variables and lookup
    tables are initialised before the embedding op is evaluated.

    Returns:
        The value produced by running the embedding op in the session.
    """
    sentence_encoder = _init_sentence_embedder()
    with tf.Session() as sess:
        init_ops = [tf.global_variables_initializer(), tf.tables_initializer()]
        sess.run(init_ops)
        return sess.run(sentence_encoder(sentences))

def _init_tfidf_vectorizer():
    """Create an unfitted TF-IDF vectorizer.

    The vectorizer uses unigrams and bigrams, scikit-learn's built-in
    ``'english'`` stop-word list and L2 normalisation.

    Returns:
        A new ``TfidfVectorizer`` instance.
    """
    # The former unused local `stop_words = []` was removed: it suggested a
    # custom stop list although the built-in 'english' list is what is used.
    return TfidfVectorizer(ngram_range=(1, 2), stop_words='english', norm='l2')

def _embed_sentences_with_tfidf(sentences):
    """Fit a TF-IDF vectorizer on ``sentences`` and transform them.

    Returns:
        A dict with the fitted vectorizer under ``'vectorizer'`` and the
        result of ``fit_transform`` under ``'word_doc_matrix'``.
    """
    tfidf = _init_tfidf_vectorizer()
    return {'vectorizer': tfidf, 'word_doc_matrix': tfidf.fit_transform(sentences)}

def _embed_sentences_with_bert(sentences):
    """Encode ``sentences`` through a newly created ``BertClient``.

    Returns:
        Whatever ``BertClient.encode`` returns for the given sentences.
    """
    client = BertClient()
    encoded = client.encode(sentences)
    return encoded

def _extract_df_data_naive(dialogs):
    """Flatten dialogs into one DataFrame row per utterance.

    Args:
        dialogs: mapping of dialog id to a dict with a ``'thread'`` list; each
            thread entry provides ``'text'`` and ``'userId'``.

    Returns:
        DataFrame with the columns ``st`` (1-based position of the utterance
        inside its dialog), ``sent``, ``cluster_id``, ``cluster_name``,
        ``user_id``, ``vw_sent`` (output of ``convert_to_vw``),
        ``topic_name`` and ``topic_score``. ``cluster_id``/``cluster_name``
        start as ``None``, ``topic_name`` as ``''`` and ``topic_score`` as 0.
    """
    columns = defaultdict(list)
    # Running id over all utterances of all dialogs, passed to convert_to_vw.
    row_id = 0
    for dialog in dialogs.values():
        for turn_number, message in enumerate(dialog['thread'], start=1):
            utterance = message['text']
            columns['st'].append(turn_number)
            columns['sent'].append(utterance)
            columns['cluster_id'].append(None)
            columns['cluster_name'].append(None)
            columns['user_id'].append(message['userId'])
            columns['vw_sent'].append(convert_to_vw(utterance, row_id, message['userId']))
            columns['topic_name'].append('')
            columns['topic_score'].append(0)
            row_id += 1
    return pd.DataFrame(data=columns)


def add_cluster_name_to_dialogs(json_filepath, df):
    """Load dialogs from JSON and attach a ``cluster_name`` to every utterance.

    Args:
        json_filepath: path of a JSON file mapping dialog ids to dicts with a
            ``'thread'`` list whose entries have a ``'text'`` field.
        df: DataFrame with the columns ``st``, ``sent`` and
            ``topic_name_uniq_with_st``.

    Returns:
        The loaded dialogs; each thread entry gains ``'cluster_name'``, taken
        from the first ``df`` row whose ``st`` equals the 1-based position of
        the utterance and whose ``sent`` equals its text.

    Raises:
        IndexError: if no ``df`` row matches an utterance (same exception type
            as the previous per-row ``.iloc[0]`` lookup).
    """
    with open(json_filepath, 'r') as f:
        dialogs = json.load(f)

    # Build the (st, sent) -> name lookup once instead of running a boolean
    # scan over the whole frame for every utterance. setdefault keeps the
    # first matching row, mirroring the former `.iloc[0]`.
    cluster_by_turn = {}
    for st, sent, name in zip(df['st'], df['sent'], df['topic_name_uniq_with_st']):
        cluster_by_turn.setdefault((st, sent), name)

    for dialog in dialogs.values():
        for turn_number, row in enumerate(dialog['thread'], start=1):
            key = (turn_number, row['text'])
            if key not in cluster_by_turn:
                raise IndexError('no row with st={!r} and sent={!r}'.format(*key))
            row['cluster_name'] = cluster_by_turn[key]
    return dialogs

In [5]:
# Load the dialogs of the configured dataset into a plain dict (id -> dialog).
with open(dialog_dataset_json, 'r') as f:
    dialogs = dict(list(json.load(f).items()))

In [6]:
# One row per utterance: speech-turn index, raw text, VW line and empty topic placeholders.
df = _extract_df_data_naive(dialogs)

In [7]:
# Summary statistics of the speech-turn index; useful when choosing max_st.
df.st.describe()

count    143048.000000
mean          8.356943
std           5.453840
min           1.000000
25%           4.000000
50%           8.000000
75%          12.000000
max          44.000000
Name: st, dtype: float64

In [8]:
# Fit one LDA model per speech-turn index i in [1, max_st), store the model on
# disk, remember its metadata in ldas_pickle_data and label every utterance of
# that turn with the top tokens of its highest-scoring topic.
ldas_pickle_data = {}
for i in range(1, max_st):    
    # VW lines of all utterances at turn i; vw_data_path is overwritten on every iteration.
    sentences = df[df.st == i]['vw_sent']
    save_vw_to_file(sentences, vw_data_path)
    batch_vectorizer = artm.BatchVectorizer(data_path=vw_data_path, data_format='vowpal_wabbit',
                                        target_folder='{}/{}'.format(bigartm_batches_path, i))
    lda = artm.LDA(num_topics=num_topics, alpha=0.01, beta=0.001,
               num_document_passes=5, dictionary=batch_vectorizer.dictionary)
    lda.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=100)
    print(i, lda.sparsity_phi_last_value, lda.sparsity_theta_last_value, lda.perplexity_value[-1])
    # Describe each topic by its 7 top tokens, joined into one string.
    top_tokens = lda.get_top_tokens(num_tokens=7)
    topics = {}    
    for j, token_list in enumerate(top_tokens):
        topic_name = 'topic_' + str(j)
        topic_value = ", ".join(token_list)
        print('{}: {}'.format(topic_name, topic_value))
        topics[topic_name] = topic_value
    print("------")
    save_path = '{}/{}/dumped_model_{}'.format(bigartm_batches_path, i, i)
    lda.save(save_path)
    # Everything load_lda / predict_topic need later to restore and use this model.
    ldas_pickle_data[i] = {
        'save_path': save_path,
        'num_topics': num_topics,
        'topics': topics
    }
    
    theta = lda.transform(batch_vectorizer)
    sentences_topics = []
    sentences_topics_scores = [] 
    # NOTE(review): theta[k] presumably selects the distribution of the k-th
    # document, in the same order as `sentences` - confirm against artm's theta layout.
    for k in range(min(len(sentences), theta.shape[1])):
        topics_distribution = theta[k]
        # Highest-scoring topic for this document.
        top1_topic = sorted(list(topics_distribution.items()), key=lambda x: x[1], reverse=True)[0]
        topic_name = top1_topic[0]        
        # Canonical label: unique top tokens in alphabetical order.
        topic_str = ", ".join(sorted(set(topics[topic_name].split(", "))))        
        sentences_topics.append(topic_str)
        sentences_topics_scores.append(top1_topic[1])
    
    # NOTE(review): these assignments need exactly one value per row with st == i;
    # if theta has fewer columns than there are sentences the lengths differ - confirm.
    df.loc[df.st == i, 'topic_name'] = sentences_topics
    df.loc[df.st == i, 'topic_score'] = sentences_topics_scores

1 0.0 0.0 55.68326187133789
topic_0: User, hotel, stay, place, free, looking, need
topic_1: User, information, called, taxi, book, looking, get
topic_2: cambridge, User, train, need, looking, leaving, going
topic_3: User, town, looking, place, go, centre, part
topic_4: like, hi, would, User, hospital, good, know
topic_5: find, help, User, please, want, price, priced
topic_6: restaurant, User, looking, food, expensive, serf, cheap
------
2 0.0 0.0 100.6881103515625
topic_0: Bot, price, range, hotel, area, guesthouse, preference
topic_1: would, like, Bot, restaurant, area, sorry, food
topic_2: Bot, sure, day, help, time, train, destination
topic_3: Bot, else, museum, anything, cambridge, parkside, help
topic_4: located, Bot, cambridge, number, phone, road, hospital
topic_5: Bot, restaurant, town, centre, one, part, expensive
topic_6: Bot, type, looking, area, particular, many, attraction
------
3 0.0 0.0 60.822017669677734
topic_0: User, free, star, hotel, need, parking, wifi
topic_1: li

In [9]:
# Merge near-duplicate topic labels: a label is redirected to a later label
# (in sorted order) when at most one of its tokens is missing from that later
# label; the last such match wins. Labels without a match map to themselves.
similar_topic_names_mapping = {}
# sorted() fixes the scan order. Iterating the bare set made the resulting
# mapping depend on string hash randomisation, i.e. differ between kernel restarts.
topic_names = sorted(set(df['topic_name']))
topic_token_sets = [set(name.split(", ")) for name in topic_names]
for i, t1 in enumerate(topic_token_sets):
    t1_str = ", ".join(sorted(t1))
    # The empty placeholder label '' (utterances that never received a topic)
    # splits into the single token '', which would satisfy the "<= 1 missing
    # token" test against every other label. Keep it mapped to itself.
    if t1 != {''}:
        for t2 in topic_token_sets[i + 1:]:
            if len(t1 - t2) <= 1:
                similar_topic_names_mapping[t1_str] = ", ".join(sorted(t2))
    if not similar_topic_names_mapping.get(t1_str):
        similar_topic_names_mapping[t1_str] = t1_str

# Stored next to the per-turn model metadata so predictions can reuse the mapping.
ldas_pickle_data['sim_dict'] = similar_topic_names_mapping

In [10]:
# Replace every topic label by its merged representative (KeyError on an unknown label).
df['topic_name_uniq'] = df['topic_name'].map(similar_topic_names_mapping.__getitem__)

In [11]:
# Collect, for every merged topic label, the set of speech turns it occurs at.
topic_name_to_st = defaultdict(set)
for uniq_name, turn in zip(df['topic_name_uniq'], df['st']):
    topic_name_to_st[uniq_name].add(turn)

# Display name: "<label> [<comma-separated sorted speech turns>]".
orig_topic_names_to_names_with_st = {
    name: "{} [{}]".format(name, ",".join(str(turn) for turn in sorted(turns)))
    for name, turns in topic_name_to_st.items()
}

In [12]:
# Attach the display name (label plus speech turns) to every utterance.
df['topic_name_uniq_with_st'] = df['topic_name_uniq'].map(orig_topic_names_to_names_with_st.__getitem__)

In [13]:
# Lookup (speech turn, sentence) -> display name; for duplicate keys the last row wins.
st_sent_map = dict(zip(zip(df['st'], df['sent']), df['topic_name_uniq_with_st']))

In [94]:
# st_sent_map = {}
# for _, row in df.iterrows():
#     st_sent_map[(row['st'], row['sent'])] = "{} [{}]".format(row['topic_name_uniq'], row['st'])

In [14]:
# Inspect the final display names assigned to the utterances.
df['topic_name_uniq_with_st']

0         User, free, hotel, looking, need, place, stay [1]
1         Bot, area, attraction, looking, many, particul...
2         User, free, hotel, need, parking, star, thanks...
3         Bot, area, hotel, price, range, restaurant, st...
4         User, book, need, night, people, please, yes [...
5         Bot, book, like, reservation, sorry, time, wou...
6         User, book, need, night, people, please, yes [...
7         Bot, booking, number, reference, successful, t...
8         User, great, help, like, much, thank, would [5...
9          Bot, day, great, many, thank, time, welcome [10]
10        User, book, called, get, information, looking,...
11        Bot, anything, cambridge, else, help, museum, ...
12        User, address, could, get, number, phone, plea...
13        Bot, address, located, number, phone, postcode...
14        User, address, could, get, number, phone, plea...
15        Bot, address, centre, number, phone, road, str...
16        User, great, help, like, much,

In [15]:
# Reload the raw dialogs and tag every utterance with its cluster display name.
with open(dialog_dataset_json, 'r') as f:
    dialogs = json.load(f)

for dialog in tqdm_notebook(dialogs.values()):
    # Speech turns are 1-based, matching the `st` column used as lookup key.
    for turn_number, row in enumerate(dialog['thread'], start=1):
        row['cluster_name'] = st_sent_map[(turn_number, row['text'])]

HBox(children=(IntProgress(value=0, max=10438), HTML(value='')))




In [16]:
def build_graph(save_filename, dialogs, step_max=15):
    """Build the cluster transition graph and export its adjacency matrix.

    For every dialog, each pair of consecutive utterances among the first
    ``step_max`` turns contributes one transition from the first utterance's
    ``cluster_name`` to the second's. Transition counts become edge weights.

    The adjacency matrix and the node names are printed, and the matrix is
    written to ``save_filename`` as ``;``-separated text: a header row with
    the node names (first cell empty), then one row per node starting with
    its name.

    Args:
        save_filename: path of the output file (overwritten).
        dialogs: mapping of dialog id to a dict with a ``'thread'`` list whose
            entries carry ``'cluster_name'``.
        step_max: number of leading turns considered per dialog; at most
            ``step_max - 1`` transitions are counted for each dialog.

    Returns:
        The ``networkx.DiGraph`` with weighted edges.
    """
    transition_counts = defaultdict(int)
    turns_considered = max(step_max, 0)
    for dialog in dialogs.values():
        window = dialog['thread'][:turns_considered]
        for prev_row, row in zip(window, window[1:]):
            transition_counts[(prev_row['cluster_name'], row['cluster_name'])] += 1

    G = nx.DiGraph()
    G.add_weighted_edges_from(
        (source, target, weight) for (source, target), weight in transition_counts.items()
    )

    # np.asarray keeps the matrix two-dimensional. The former np.squeeze turned
    # the 1x1 matrix of a single-node graph into a scalar, which broke the
    # row iteration below.
    matrix_rows = np.asarray(nx.adjacency_matrix(G).todense().astype(float)).tolist()
    nodes = list(G.nodes)

    for arr in matrix_rows:
        print(",".join(str(e) for e in arr))
    print("\n".join(nodes))

    with open(save_filename, 'w') as f:
        print(";" + ";".join(nodes), file=f)
        for node, arr in zip(nodes, matrix_rows):
            print(";".join([node] + [str(e) for e in arr]), file=f)
    return G

In [17]:
# Build the transition graph over the first max_st turns and export it to gephi_csv_path.
g = build_graph(gephi_csv_path, dialogs, max_st)

0.0,156.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0,0.0,0.0,0.0,0.0,992.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,135.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0,0.0,0.0,0.0,158.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,177.0,0.0,97.0,0.0,0.0,0.0,0.0,0.0,0.0,433.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,323.0,0.0,0.0,0.0,0.0,443.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,173.0,0.0
0.0,0.0,0.0,629.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0,0.0,0.0,0.0,379.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.0,0.0,0.0,0.0,0.0,198.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,833.0,0.0,0.0,298.0,0.0,0.0,0.0,616.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,83.0,0.0,0.0,0.0,0.0,0.0,0.0,868.0,0.0,222.0,0.0,0.0,0.0,0.0,143.0,0.0,0.0,0.0,40.0,579.0,0.0,0.0,432.0,435.0,0.0,406.0,0.0,0.0,0.0,183.0,0.0,0.0,0.0,435.0,0.0,0.0,75.0,0.0,0.0
0.0,0.0,0.0

# Predict using the existing topic models

Pipeline: sentence => topic => unified topic name

In [172]:
import pickle

# Persist the per-speech-turn model metadata (save paths, topic counts, top
# tokens) together with the similar-topic mapping stored under 'sim_dict'.
# NOTE(review): the file name is hard-coded to "babi" while the dataset is chosen
# by the configuration cells - confirm this is the intended target.
with open('data/babi_lda_data.pickle', 'wb') as f:
    pickle.dump(ldas_pickle_data, f)

In [252]:
def predict_topic(sentence, lda, topics, user_id):
    """Predict the topic label of a single sentence with a fitted LDA model.

    The sentence is converted with ``convert_to_vw``, written to
    ``data/predict/pred.txt``, vectorised into
    ``data/predict/bigartm_batches`` and passed to ``lda.transform``.

    Args:
        sentence: raw utterance.
        lda: fitted model exposing ``transform``.
        topics: mapping of topic name to its ", "-joined top tokens.
        user_id: forwarded to ``convert_to_vw``.

    Returns:
        ``(label, score)`` where ``label`` is the unique top tokens of the
        highest-scoring topic, sorted and joined by ", ". ``('UNK', 1)`` is
        returned when the transform raises ``InvalidOperationException`` or
        yields no document column.
    """
    id_ = 1
    vw_pred_sents = [convert_to_vw(sentence, id_, user_id)]
    vw_pred_data_path = 'data/predict/pred.txt'
    save_vw_to_file(vw_pred_sents, vw_pred_data_path)
    bigartm_pred_batches_path = 'data/predict/bigartm_batches'
    batch_vectorizer = artm.BatchVectorizer(data_path=vw_pred_data_path, data_format='vowpal_wabbit',
                                    target_folder=bigartm_pred_batches_path)
    try:
        theta = lda.transform(batch_vectorizer)
    except artm.wrapper.exceptions.InvalidOperationException:
        return 'UNK', 1

    # Without a document column there is nothing to rank; previously this case
    # ended in an IndexError on an empty result list.
    if theta.shape[1] < 1:
        return 'UNK', 1

    topics_distribution = theta[0]
    # max() returns the first highest-scoring entry, the same element the former
    # stable descending sort placed first.
    topic_name, topic_score = max(topics_distribution.items(), key=lambda item: item[1])
    topic_str = ", ".join(sorted(set(topics[topic_name].split(", "))))
    return topic_str, topic_score

# Restored models keyed by speech-turn index, so each model is loaded once.
ldas_cache = {}
def load_lda(ldas_pickle_data, ind):
    """Return the LDA model for speech turn ``ind``, loading it on first use.

    Args:
        ldas_pickle_data: mapping whose entry ``ind`` provides ``'num_topics'``
            and ``'save_path'``.
        ind: speech-turn index; also the cache key.

    Returns:
        The cached or newly restored ``artm.LDA`` model.
    """
    if ind not in ldas_cache:
        restored = artm.LDA(num_topics=ldas_pickle_data[ind]['num_topics'])
        restored.load(ldas_pickle_data[ind]['save_path'])
        ldas_cache[ind] = restored
    return ldas_cache[ind]

def predict_with_label_replace(sentence, lda, topics, user_id):
    """Predict a topic label and turn it into a single underscore-joined token.

    The label from ``predict_topic`` is replaced by its entry in the global
    ``similar_topic_names_mapping`` when one exists; otherwise a warning is
    printed and the label is kept. ``'UNK'`` is passed through unchanged.

    Returns:
        The label with every ", " separator replaced by "_".
    """
    label, _score = predict_topic(sentence, lda, topics, user_id)
    if label != 'UNK':
        if label in similar_topic_names_mapping:
            label = similar_topic_names_mapping[label]
        else:
            print("Warning {}".format(label))
    return label.replace(", ", "_")

def parse_dialogs(filename, with_history, ignore_options):
    """Parse a dialog file and replace every utterance by its topic label.

    Dialogs are separated by blank lines. Every other line holds a numbered
    user utterance and a bot utterance separated by a tab; lines with a single
    field are treated as options.

    Args:
        filename: path of the dialog file.
        with_history: when true, the user entry of a step is prefixed with the
            user and bot entries of the previous step of the same dialog.
        ignore_options: when true, single-field lines are skipped; otherwise
            they raise ``ValueError``.

    Returns:
        List of dialogs; each dialog is a list of
        ``(utt_num, user_label, bot_label)`` tuples.

    Raises:
        ValueError: for a single-field line when ``ignore_options`` is false.
    """
    dialogs = []
    with open(filename, 'r') as f:
        dialog = []
        st = 1
        for line in tqdm_notebook(f):
            stripped = line.strip()
            if stripped == '':
                # A blank line closes the current dialog.
                st = 1
                dialogs.append(dialog)
                dialog = []
                continue

            splitted = stripped.split('\t')
            if len(splitted) == 1:
                if ignore_options:
                    continue
                raise ValueError('Line has not 2 utterances (seems like an option) {}'.format(splitted))
            user_utt, bot_utt = splitted
            # The leading number is the utterance index; the rest is the text.
            utt_num, _, user_utt = user_utt.partition(' ')
            if user_utt == '':
                user_utt = '<SILENCE>'
            if bot_utt == '':
                bot_utt = '<SILENCE>'

            new_lda_user = load_lda(ldas_pickle_data, st)
            new_lda_bot = load_lda(ldas_pickle_data, st+1)
            user_utt = predict_with_label_replace(user_utt, new_lda_user, ldas_pickle_data[st]['topics'], 'User')
            bot_utt = predict_with_label_replace(bot_utt, new_lda_bot, ldas_pickle_data[st+1]['topics'], 'Bot')

            if with_history and len(dialog) > 0:
                prev_step = dialog[-1]
                user_entry = "{} {} {}".format(prev_step[1], prev_step[2], user_utt)
            else:
                user_entry = user_utt
            dialog.append((utt_num, user_entry, bot_utt))
            # NOTE(review): each line consumes the models of turns st and st+1 but
            # st only advances by one, so the next line's user utterance reuses the
            # model just applied to a bot utterance - confirm whether `st += 2`
            # was intended.
            st += 1
        # Keep the final dialog when the file does not end with a blank line;
        # it used to be dropped silently.
        if dialog:
            dialogs.append(dialog)
    return dialogs

In [257]:
# Convert the bAbI task 5 test dialogs into topic labels (with history, options skipped).
# Note: this rebinds `dialogs` from the annotated dataset dict to a list of parsed dialogs.
with_history, ignore_options = True, True
dialogs = parse_dialogs('../supervised-embedding-model/data/dialog-bAbI-tasks/dialog-babi-task5-full-dialogs-tst.txt', with_history, ignore_options)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))





In [258]:
# Write one "<user label>\t<bot label>" line per parsed dialog step.
with open('../supervised-embedding-model/data/test-task-5-topic.tsv', 'w') as f:
    f.writelines(
        '{}\t{}\n'.format(user_utt, bot_utt)
        for dialog in dialogs
        for _, user_utt, bot_utt in dialog
    )

In [2]:
# Build the vocabulary from the test, train and dev topic TSV files.
# NOTE(review): only the test TSV is written by a cell visible in this notebook; the
# train and dev files presumably come from separate runs with other input files - confirm.
!cat ../supervised-embedding-model/data/test-task-5-topic.tsv ../supervised-embedding-model/data/train-task-5-topic.tsv ../supervised-embedding-model/data/dev-task-5-topic.tsv | python ../supervised-embedding-model/build_vocabulary.py > ../supervised-embedding-model/data/vocab-task-5-topic.tsv

In [6]:
# Show the generated vocabulary file.
!cat ../supervised-embedding-model/data/vocab-task-5-topic.tsv

0	Bot_find_let_look_option_sure_think
1	User_people_please_price_range_restaurant_two
2	User_great_let_like_look_perfect_silence
3	UNK
4	Bot_else_let_look_ok_option_update
5	User_cuisine_food_french_italian_love_spanish
6	User_doe_number_phone_something_thanks_work
7	User_silence
8	User_great_like_look_rock_silence_thank
9	User_address_like_provide_rock_silence_thank
10	Bot_anything_api_call_cheap_else_sure_update
11	Bot_find_let_look_ok_option_think
12	Bot_api_call_cheap_expensive_four_moderate_six
13	Bot_api_call_else_moderate_sure_two_update
14	User_could_cuisine_food_instead_silence_six
15	Bot_anything_else_let_look_sure_update
16	User_actually_moderate_prefer_price_range_would
17	User_address_doe_else_perfect_something_work
18	User_could_four_instead_people_silence_six
19	User_actually_could_instead_prefer_work_would
20	Bot_hello_help_today
21	Bot_anything_great_let_option_reservation_think
22	Bot_api_call_cheap_let_look_moderate_ok
23	User_london_madrid_par

In [192]:
# Smoke test: predict the topic label of one first-turn user utterance.
sentence = "Hi! I'd like to select italian cousine!"

ind = 1
new_lda = load_lda(ldas_pickle_data, ind)
# predict_with_label_replace requires the speaker id as its fourth argument;
# without it the call raises TypeError on a fresh kernel.
predict_with_label_replace(sentence, new_lda, ldas_pickle_data[ind]['topics'], 'User')

'User_good_hello_hi_morning'