In [1]:
import spacy
import json
import sys
import numpy as np

In [2]:
import json


def add_features_to_dialogs_raw(json_path, predict_dialogue_act=None):
    """Optionally annotate every utterance of a dialog file and write a copy.

    The file at ``json_path`` must contain a JSON object mapping dialog ids to
    dialogs; each dialog has a ``'thread'`` list whose rows carry a ``'text'``
    field. The (possibly annotated) structure is written to
    ``json_path + '.out'``.

    Args:
        json_path: Path of the input JSON file.
        predict_dialogue_act: Optional callable ``text -> (name, probability)``.
            When given, every row receives
            ``row['dialogue_act'] = {'name': name, 'probability': probability}``.
            When omitted (the default) rows are written back unchanged.

    Returns:
        None. The result only goes to disk.

    Raises:
        KeyError: If a dialog has no ``'thread'`` or a row has no ``'text'``.
    """
    # Explicit UTF-8 on both ends: the dump below uses ensure_ascii=False, so
    # non-ASCII characters are written raw and must not depend on the locale.
    with open(json_path, 'r', encoding='utf-8') as f:
        dialogs = json.load(f)

    for dialog in dialogs.values():
        for row in dialog['thread']:
            text = row['text']
            if predict_dialogue_act is not None:
                dialog_act, probability = predict_dialogue_act(text)
                row['dialogue_act'] = {'name': dialog_act, 'probability': probability}

    with open(json_path + '.out', 'w', encoding='utf-8') as f:
        json.dump(dialogs, f, ensure_ascii=False, indent=2)

# Run the function defined above on the labeled dialog file; the result is
# written to the same path with '.out' appended.
add_features_to_dialogs_raw('f1-labeled-dialogs-with-dm.json')

In [3]:
def make_features_dict(doc):
    """Describe every token of ``doc`` with a small attribute dictionary.

    Despite the name, the result is a list: one dictionary per token, in
    iteration order. Each dictionary has these entries, read from the token:

    * ``lemma``    - the base form of the word.
    * ``pos``      - the simple UPOS part-of-speech tag.
    * ``tag``      - the detailed part-of-speech tag.
    * ``dep``      - syntactic dependency, i.e. the relation between tokens.
    * ``shape``    - the word shape: capitalization, punctuation, digits.
    * ``is_alpha`` - is the token an alpha character?
    * ``is_stop``  - is the token part of a stop list, i.e. the most common
      words of the language?
    """
    # (output key, token attribute) pairs; their order fixes the key order of
    # every produced dictionary.
    exported = (
        ('lemma', 'lemma_'),
        ('pos', 'pos_'),
        ('tag', 'tag_'),
        ('dep', 'dep_'),
        ('shape', 'shape_'),
        ('is_alpha', 'is_alpha'),
        ('is_stop', 'is_stop'),
    )
    return [
        {out_key: getattr(tok, attr_name) for out_key, attr_name in exported}
        for tok in doc
    ]

def add_spacy_features_to_dialogs(json_path, nlp):
    """Flatten a raw dialog file and attach per-token features to every row.

    The file at ``json_path`` must contain a JSON list of objects, each mapping
    a dialog key to a dialog with a ``'thread'`` list of rows that carry a
    ``'text'`` field.

    Args:
        json_path: Path of the raw JSON file.
        nlp: Callable applied to each row's text; its result is handed to
            ``make_features_dict``.

    Returns:
        A dict of all dialogs keyed by ``"<running index>_<original key>"``.
        The running index counts dialogs across the whole list, which keeps
        keys unique even when several list entries reuse the same key. Every
        row gains a ``'features_dict'`` entry.
    """
    # JSON text is UTF-8; do not let the platform's default encoding decide.
    with open(json_path, 'r', encoding='utf-8') as f:
        raw_dialogs = json.load(f)

    flattened = (
        item
        for dialogs_dict in raw_dialogs
        for item in dialogs_dict.items()
    )
    dialogs = {
        f"{i}_{key}": dialog
        for i, (key, dialog) in enumerate(flattened)
    }

    for dialog in dialogs.values():
        for row in dialog['thread']:
            row['features_dict'] = make_features_dict(nlp(row['text']))
    return dialogs
#     with open(json_path + '.out', 'w') as f:
#         json.dump(dialogs, f, ensure_ascii=False, indent=2)
#     return dialogs

In [4]:
import tomotopy as tp

def get_lemmas_from_row(row):
    """Return the lemmas of a row's content tokens, in order.

    Reads ``row['features_dict']`` and keeps the ``'lemma'`` of every entry
    that is not a stop word and whose ``'pos'`` is none of ``'PUNCT'``,
    ``'SPACE'`` or ``'X'``.
    """
    skipped_pos = ('PUNCT', 'SPACE', 'X')
    return [
        feat['lemma']
        for feat in row['features_dict']
        # The stop-word test comes first, so 'pos' is only read for non-stop tokens.
        if not feat['is_stop'] and feat['pos'] not in skipped_pos
    ]

def topic_modelling_features(json_path, k=30, min_cf=3, seed=42, iterations=1000, save_path=None):
    """Train a tomotopy LDA model on the lemmas of every row in a dialog file.

    The file at ``json_path`` must contain a JSON object mapping dialog ids to
    dialogs with a ``'thread'`` list; each row becomes one document made of
    the lemmas returned by ``get_lemmas_from_row``.

    Args:
        json_path: Path of the JSON file with per-row ``'features_dict'``.
        k: Number of topics.
        min_cf: Minimum collection frequency passed to ``tp.LDAModel``.
        seed: Random seed passed to ``tp.LDAModel``.
        iterations: Training iterations of the logged loop. Training runs in
            chunks of 10, so the value is effectively rounded up to the next
            multiple of 10.
        save_path: Optional file to save the trained model to. Nothing is
            saved, and no "Saving..." message is printed, when it is ``None``.

    Returns:
        The trained model.
    """
    mdl = tp.LDAModel(tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=0, k=k, seed=seed)
    # JSON text is UTF-8; do not let the platform's default encoding decide.
    with open(json_path, 'r', encoding='utf-8') as f:
        dialogs = json.load(f)

    for dialog in dialogs.values():
        for row in dialog['thread']:
            mdl.add_doc(get_lemmas_from_row(row))

    mdl.burn_in = 100
    # Kept as in the original for reproducibility: the argument-less train()
    # runs extra iterations before the logged loop (the recorded run summary
    # reports 1010 iterations for the 1000 requested; presumably the library
    # default of 10 - TODO confirm and drop if unintended).
    mdl.train()
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    print('Training...', file=sys.stderr, flush=True)
    for i in range(0, iterations, 10):
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))
    mdl.summary()

    # Only announce saving when a save actually happens.
    if save_path is not None:
        print('Saving...', file=sys.stderr, flush=True)
        mdl.save(save_path, True)

    for topic_id in range(mdl.k):
        print('Topic #{}'.format(topic_id))
        for word, prob in mdl.get_topic_words(topic_id):
            print('\t', word, prob, sep='\t')
    return mdl


In [5]:
def get_topic_model_features(json_path, mdl, top_n=5):
    """Attach the top words of each row's most likely topic.

    The file at ``json_path`` must contain a JSON object mapping dialog ids to
    dialogs with a ``'thread'`` list; the lemmas of each row come from
    ``get_lemmas_from_row``.

    Args:
        json_path: Path of the JSON file with per-row ``'features_dict'``.
        mdl: Trained topic model providing ``make_doc``, ``infer`` and
            ``get_topic_words``.
        top_n: How many ``[word, probability]`` pairs to keep per row.

    Returns:
        The loaded dialogs; every row gains ``'topic_model_features'``, a list
        of ``[word, probability]`` pairs for the topic with the highest
        inferred weight.
    """
    # JSON text is UTF-8; do not let the platform's default encoding decide.
    with open(json_path, 'r', encoding='utf-8') as f:
        dialogs = json.load(f)

    for dialog in dialogs.values():
        for row in dialog['thread']:
            words = get_lemmas_from_row(row)

            doc_inst = mdl.make_doc(words)
            topic_dist, _log_likelihood = mdl.infer(doc_inst)
            # argmax yields a NumPy integer; hand the model a plain int topic id.
            best_topic = int(np.argmax(topic_dist))
            row['topic_model_features'] = [
                [word, prob]
                for word, prob in mdl.get_topic_words(best_topic, top_n=top_n)
            ]
    return dialogs

In [6]:
import joblib


def load_sklearn_model(sgd_path, le_path, tfidf_path):
    """Load three joblib artifacts and return them as one tuple.

    The files are read with ``joblib.load`` in the order ``sgd_path``,
    ``le_path``, ``tfidf_path`` and returned in that same order; callers in
    this notebook unpack them as classifier, label encoder and TF-IDF
    vectorizer.
    """
    artifact_paths = (sgd_path, le_path, tfidf_path)
    return tuple(joblib.load(path) for path in artifact_paths)


def add_discourse_features_to_dialogs_raw(dialogs, key, clf, le, tfidf):
    """Label utterances with a discourse type and store it under ``row[key]``.

    ``key`` selects what is classified:

    * ``'pair_discourse_type'`` - the previous row's text and the current
      row's text joined by ``' . '``; the first row of a thread has no
      predecessor and is left without the key.
    * ``'single_discourse_type'`` - the current row's text on its own.

    Any other ``key`` leaves every row untouched.

    The text goes through ``tfidf.transform``, ``clf.predict`` and
    ``le.inverse_transform`` in that order, and the first element of the
    result is stored. ``dialogs`` is modified in place and also returned.
    """
    def label_for(sample):
        # One-element batch in, first (only) decoded label out.
        vectorized = tfidf.transform([sample])
        encoded = clf.predict(vectorized)
        return le.inverse_transform(encoded)[0]

    for conversation in dialogs.values():
        turns = conversation['thread']
        for position, turn in enumerate(turns):
            utterance = turn['text']
            if key == 'pair_discourse_type':
                if position == 0:
                    continue
                preceding = turns[position - 1]['text']
                turn[key] = label_for(preceding + ' . ' + utterance)
            elif key == 'single_discourse_type':
                turn[key] = label_for(utterance)
    return dialogs

In [4]:
# Load the spaCy pipeline plus two artifact triples returned by
# load_sklearn_model: one set for utterance pairs, one for single utterances.
# Both triples are consumed by add_discourse_features_to_dialogs_raw below.
nlp = spacy.load("en_core_web_sm")
clf_pair, le_pair, tfidf_pair = load_sklearn_model('pairs/discofuse_sgd_pairs.joblib', 'pairs/discofuse_le_pairs.joblib', 'pairs/discofuse_tfidf_pairs.joblib')
clf_single, le_single, tfidf_single = load_sklearn_model('single/discofuse_sgd_single.joblib', 'single/discofuse_le_single.joblib', 'single/discofuse_tfidf_single.joblib')

In [28]:
# Paths for the first pass: raw dialogs in, spaCy + discourse features out.
# NOTE(review): fname/out_fname are rebound by a later cell for the topic-model
# pass, so re-run this cell before repeating the first pass.
fname = 'data/topicalchat_train.dima.json'
out_fname = 'data/topicalchat.train.spacy.dialogact.discourse.0310.json'

In [29]:
# Flatten the raw dialog list and attach per-token spaCy features to every row.
dialogs_with_spacy = add_spacy_features_to_dialogs(fname, nlp)

In [30]:
# Label each row together with its predecessor. The function mutates and returns
# the dict it was given, so this name aliases dialogs_with_spacy.
dialogs_with_discourse_pairs = add_discourse_features_to_dialogs_raw(dialogs_with_spacy, 'pair_discourse_type', clf_pair, le_pair, tfidf_pair)

In [31]:
# Label each row on its own. Again the same dict object is mutated and returned,
# so all three dialogs_with_* names refer to one structure.
dialogs_with_discourse_single = add_discourse_features_to_dialogs_raw(dialogs_with_discourse_pairs, 'single_discourse_type', clf_single, le_single, tfidf_single)

In [32]:
# Persist the first-pass result. ensure_ascii=False writes non-ASCII characters
# raw, so the file encoding is pinned to UTF-8 instead of the platform default.
with open(out_fname, 'w', encoding='utf-8') as f:
    json.dump(dialogs_with_discourse_single, f, ensure_ascii=False, indent=2)

In [23]:
# Paths for the second pass: the file written by the discourse pass is the input
# for topic modelling.
# NOTE(review): this rebinds fname/out_fname used by the first pass above.
fname = 'data/topicalchat.train.spacy.dialogact.discourse.0310.json'
out_fname = 'data/topicalchat.train.spacy.dialogact.discourse.topicmodel.0310.json'

In [24]:
# Train the LDA topic model on the lemmas stored in `fname`; progress and the
# model summary are printed while it runs.
mdl = topic_modelling_features(fname)

Training...


Num docs: 187010 , Vocab size: 15706 , Num words: 1590617
Removed top words: []
Iteration: 0	Log-likelihood: -8.414755805352737
Iteration: 10	Log-likelihood: -8.216607783780823
Iteration: 20	Log-likelihood: -8.143368958191331
Iteration: 30	Log-likelihood: -8.102733461549374
Iteration: 40	Log-likelihood: -8.077116319869488
Iteration: 50	Log-likelihood: -8.055231666555828
Iteration: 60	Log-likelihood: -8.04585034525188
Iteration: 70	Log-likelihood: -8.033323269350749
Iteration: 80	Log-likelihood: -8.027072525017774
Iteration: 90	Log-likelihood: -8.002522251200942
Iteration: 100	Log-likelihood: -7.880130256852988
Iteration: 110	Log-likelihood: -7.807352781663074
Iteration: 120	Log-likelihood: -7.758575539541434
Iteration: 130	Log-likelihood: -7.7261166249570365
Iteration: 140	Log-likelihood: -7.699629664944132
Iteration: 150	Log-likelihood: -7.679722711802533
Iteration: 160	Log-likelihood: -7.663174643137689
Iteration: 170	Log-likelihood: -7.645517930379691
Iteration: 180	Log-likelihood: 

Saving...


Iteration: 960	Log-likelihood: -7.457616028962733
Iteration: 970	Log-likelihood: -7.457312358410493
Iteration: 980	Log-likelihood: -7.457271955687385
Iteration: 990	Log-likelihood: -7.45789361796443
<Basic Info>
| LDAModel (current version: 0.9.1)
| 187010 docs, 1590617 words
| Total Vocabs: 36865, Used Vocabs: 15706
| Entropy of words: -7.43248
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 1010, Burn-in steps: 100
| Optimization Interval: 10
| Log-likelihood per word: -7.45789
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 3 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| k: 30 (the number of topics between 1 ~ 32767)
| alpha: 0.1 (hyperparameter of Dirichlet distribution for document-topic)
| eta: 0.01 (hyperparameter of Dirichlet distribution for topic-word)
| seed: 42 (random seed)
| trained in version 0.9.1
|
<Parameters>
| alpha (Dirichlet prior on the per-document topi

In [25]:
# Re-read `fname` and attach, per row, the top words of its most likely topic.
dialogs = get_topic_model_features(fname, mdl)

In [26]:
# Persist the topic-model pass. ensure_ascii=False writes non-ASCII characters
# raw, so the file encoding is pinned to UTF-8 instead of the platform default.
with open(out_fname, 'w', encoding='utf-8') as f:
    json.dump(dialogs, f, ensure_ascii=False, indent=2)