In [1]:
import spacy
import json

In [8]:
import json


def add_features_to_dialogs_raw(json_path):
    """Template pass: load a dialog JSON file and re-save it as ``<json_path>.out``.

    The per-utterance loop is a placeholder. The dialogue-act classification
    it is meant to host is still commented out, so the dialogs are currently
    written back unchanged.

    Args:
        json_path: Path to a JSON file holding a mapping of dialog id to a
            dialog object with a ``'thread'`` list of rows, each row having
            a ``'text'`` entry.

    Returns:
        None. The result is written to ``json_path + '.out'``.

    Raises:
        KeyError: If a dialog has no ``'thread'`` or a row has no ``'text'``.
    """
    # Read and write explicitly as UTF-8: the dump below uses
    # ensure_ascii=False, so non-ASCII text is emitted verbatim and would
    # otherwise be encoded with the platform's default encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        dialogs = json.load(f)

    for dialog in dialogs.values():
        for row in dialog['thread']:
            # Kept as the input of the (still disabled) classification step.
            text = row['text']
            # DO CLASSIFICATION HERE
            # dialog_act, probability = predict_dialogue_act(text)
            # row['dialogue_act'] = {'name': dialog_act, 'probability': probability}

    with open(json_path + '.out', 'w', encoding='utf-8') as f:
        json.dump(dialogs, f, ensure_ascii=False, indent=2)

# Run the placeholder pass on the labeled dialogs; the function writes its
# result next to the input as 'f1-labeled-dialogs-with-dm.json.out'.
add_features_to_dialogs_raw('f1-labeled-dialogs-with-dm.json')

In [3]:
def make_features_dict(doc):
    """Collect per-token linguistic attributes from a parsed document.

    Despite the name, the result is a list: one dict per token of ``doc``,
    in token order, with the following keys:

    * ``lemma``    - the base form of the word
    * ``pos``      - the simple UPOS part-of-speech tag
    * ``tag``      - the detailed part-of-speech tag
    * ``dep``      - syntactic dependency, i.e. the relation between tokens
    * ``shape``    - the word shape (capitalization, punctuation, digits)
    * ``is_alpha`` - is the token an alpha character?
    * ``is_stop``  - is the token part of a stop list, i.e. the most common
      words of the language?
    """
    return [
        {
            'lemma': tok.lemma_,
            'pos': tok.pos_,
            'tag': tok.tag_,
            'dep': tok.dep_,
            'shape': tok.shape_,
            'is_alpha': tok.is_alpha,
            'is_stop': tok.is_stop,
        }
        for tok in doc
    ]

def add_spacy_features_to_dialogs(json_path, nlp):
    """Load dialogs from a JSON file and attach token features to every utterance.

    Each row of each dialog's ``'thread'`` gets a new ``'features_dict'``
    entry holding the output of ``make_features_dict`` for that row's text.
    The enriched dialogs are returned, not written to disk; saving them is
    left to the caller.

    Args:
        json_path: Path to a JSON file holding a mapping of dialog id to a
            dialog object with a ``'thread'`` list of rows, each row having
            a ``'text'`` entry.
        nlp: Callable that turns a text into an iterable of tokens (the
            notebook passes a loaded spaCy pipeline).

    Returns:
        The loaded mapping with ``'features_dict'`` added to every row.

    Raises:
        KeyError: If a dialog has no ``'thread'`` or a row has no ``'text'``.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default encoding, which is locale-dependent.
    with open(json_path, 'r', encoding='utf-8') as f:
        dialogs = json.load(f)

    for dialog in dialogs.values():
        for row in dialog['thread']:
            row['features_dict'] = make_features_dict(nlp(row['text']))
    return dialogs

In [52]:
import joblib


def load_sklearn_model(sgd_path, le_path, tfidf_path):
    """Load the three persisted artifacts that make up one model.

    Args:
        sgd_path: Path of the joblib file returned as ``clf``.
        le_path: Path of the joblib file returned as ``le``.
        tfidf_path: Path of the joblib file returned as ``tfidf``.

    Returns:
        A ``(clf, le, tfidf)`` tuple, loaded in that order.
    """
    # NOTE(review): joblib.load unpickles its input, so only point this at
    # trusted files.
    artifact_paths = (sgd_path, le_path, tfidf_path)
    return tuple(joblib.load(path) for path in artifact_paths)


def _predict_discourse_label(text, clf, le, tfidf):
    """Vectorize one text, classify it, and decode the predicted label."""
    vectorized = tfidf.transform([text])
    encoded = clf.predict(vectorized)
    return le.inverse_transform(encoded)[0]


def add_discourse_features_to_dialogs_raw(dialogs, key, clf, le, tfidf):
    """Annotate every utterance with a predicted discourse label under ``key``.

    ``dialogs`` is modified in place and the same object is returned.

    Args:
        dialogs: Mapping of dialog id to a dialog object with a ``'thread'``
            list of rows, each row having a ``'text'`` entry.
        key: Selects the mode and is also the row entry that is written:

            * ``'single_discourse_type'`` - classify each row's own text.
            * ``'pair_discourse_type'`` - classify the previous row's text
              joined to the current one with ``' . '``; the first row of a
              thread has no predecessor and is left without the entry.

            Any other value leaves ``dialogs`` untouched.
        clf: Object whose ``predict`` is applied to the vectorized text.
        le: Object whose ``inverse_transform`` decodes the prediction.
        tfidf: Object whose ``transform`` vectorizes a one-element list of text.

    Returns:
        The ``dialogs`` object that was passed in.
    """
    for dialog in dialogs.values():
        thread = dialog['thread']
        # `key` never changes, so pick the mode once per thread rather than
        # re-checking it for every row.
        if key == 'single_discourse_type':
            for row in thread:
                row[key] = _predict_discourse_label(row['text'], clf, le, tfidf)
        elif key == 'pair_discourse_type':
            # Pair each row with its predecessor; the first row is skipped.
            for prev_row, row in zip(thread, thread[1:]):
                pair_text = prev_row['text'] + ' . ' + row['text']
                row[key] = _predict_discourse_label(pair_text, clf, le, tfidf)
    return dialogs

In [2]:
# Shared resources for the pipeline: the spaCy pipeline plus one
# (clf, le, tfidf) triple per discourse model.
nlp = spacy.load("en_core_web_sm")

clf_pair, le_pair, tfidf_pair = load_sklearn_model(
    'pairs/discofuse_sgd_pairs.joblib',
    'pairs/discofuse_le_pairs.joblib',
    'pairs/discofuse_tfidf_pairs.joblib',
)
clf_single, le_single, tfidf_single = load_sklearn_model(
    'single/discofuse_sgd_single.joblib',
    'single/discofuse_le_single.joblib',
    'single/discofuse_tfidf_single.joblib',
)

In [69]:
# Input dialogs file, read by add_spacy_features_to_dialogs below.
fname = 'data/multi-woz2-dima.json'
# Destination of the fully annotated dialogs, written by the last cell.
out_fname = 'data/multi-woz2.spacy.dialogact.discourse.2909.json'

In [70]:
# Step 1: load the dialogs from `fname` and add 'features_dict' to every row.
dialogs_with_spacy = add_spacy_features_to_dialogs(fname, nlp)

In [71]:
# Step 2: label each utterance together with its predecessor.
# The function annotates its input in place and returns it, so this name
# refers to the same object as `dialogs_with_spacy`.
dialogs_with_discourse_pairs = add_discourse_features_to_dialogs_raw(
    dialogs_with_spacy,
    'pair_discourse_type',
    clf_pair,
    le_pair,
    tfidf_pair,
)

In [72]:
# Step 3: label each utterance on its own.
# The function annotates its input in place and returns it, so this name
# refers to the same object as `dialogs_with_discourse_pairs`.
dialogs_with_discourse_single = add_discourse_features_to_dialogs_raw(
    dialogs_with_discourse_pairs,
    'single_discourse_type',
    clf_single,
    le_single,
    tfidf_single,
)

In [73]:
# Persist the annotated dialogs. ensure_ascii=False writes non-ASCII text
# verbatim, so the file encoding is pinned to UTF-8 instead of being left to
# the platform's default encoding.
with open(out_fname, 'w', encoding='utf-8') as f:
    json.dump(dialogs_with_discourse_single, f, ensure_ascii=False, indent=2)