# Function to Get Closest Cluster Given Question Text
## get_cluster_for_q_text

This notebook defines several helper functions, plus a function that returns the closest cluster number for a given question text. The question text does not have to come from the corpus.

The function 'get_cluster_for_q_text' itself is defined in the last cell of this notebook.

Example usage coming soon...

In [1]:
import os
import pkg_resources
import numpy as np
import json
from pprint import pprint

from convokit import Corpus, QuestionTypology, download, MotifsExtractor, QuestionTypologyUtils

import itertools
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy

from ast import literal_eval as make_tuple
from collections import defaultdict, Counter
from scipy import sparse
from sklearn.externals import joblib
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import Normalizer
from spacy.en import English
from spacy.symbols import *
from spacy.tokens.doc import Doc

In [4]:
# Load the spaCy 'en' pipeline once at module level; extract_arcs reuses this shared instance.
spacy_NLP = spacy.load('en')

In [5]:
def select_all(x):
    '''
    Default selector: accepts every input.
    :param x: the candidate value (ignored)
    :return: always True
    '''
    return True

In [6]:
def extract_arcs(comment_text, selector=select_all):
    '''
    Parses a text with spaCy and collects the arcs of every selected sentence.
    :param comment_text: the raw text to parse
    :param selector: predicate called with each non-empty, stripped sentence string; a sentence
        is kept only when it returns a truthy value (by default every sentence is kept)
    :return: a pandas DataFrame with one row per kept sentence and the columns
        idx, sent_idx, span, arc_sets, content and sent_key
    '''
    doc_id = 'A'  # all sentences of the text are attributed to the same document id
    rows = []
    for position, sentence in enumerate(spacy_NLP(comment_text).sents):
        stripped = sentence.text.strip()
        # `position` counts every sentence of the parse, including the skipped ones.
        if not stripped or not selector(stripped):
            continue
        rows.append({
            'idx': doc_id,
            'sent_idx': position,
            'span': sentence,
            'arc_sets': MotifsExtractor.get_arcs(sentence.root, True),
            'content': stripped,
            'sent_key': doc_id + '_' + str(position),
        })
    return pd.DataFrame(rows)

In [7]:
def load_motif_info(motif_dir):
    '''
    Loads the motif files stored in a motif directory.
    :param motif_dir: directory containing 'question_supersets_arcset_to_super.json',
        'question_tree_downlinks.json' and 'question_tree_arc_set_counts.tsv'
    :return: a tuple (super_mappings, downlinks, node_counts) where super_mappings maps each
        arcset tuple to its superset tuple, and the other two values are whatever
        MotifsExtractor.read_downlinks / MotifsExtractor.read_nodecounts return
    '''
    supersets_path = os.path.join(motif_dir, 'question_supersets_arcset_to_super.json')
    with open(supersets_path) as handle:
        # The file holds one JSON object per line, each with an 'arcset' and a 'super' entry.
        records = [json.loads(raw_line) for raw_line in handle]
    arcset_to_super = {tuple(rec['arcset']): tuple(rec['super']) for rec in records}

    tree_downlinks = MotifsExtractor.read_downlinks(
        os.path.join(motif_dir, 'question_tree_downlinks.json'))
    arcset_counts = MotifsExtractor.read_nodecounts(
        os.path.join(motif_dir, 'question_tree_arc_set_counts.tsv'))
    return arcset_to_super, tree_downlinks, arcset_counts

In [8]:
def fit_questions_and_answers(sent_df, q_vocab, a_vocab,
                            super_mappings, downlink_info, node_count_info,
                            threshold, outfile=None, per_sent=False):
    '''
    Collects, for every document (or sentence), the question motifs and answer arcs it fits.
    :param sent_df: DataFrame whose rows expose idx, sent_key and arc_sets
    :param q_vocab: container of accepted question super-motifs
    :param a_vocab: container of accepted answer arcs
    :param super_mappings: mapping from a motif arcset to its super-motif
    :param downlink_info: passed through to MotifsExtractor.fit_question
    :param node_count_info: passed through to MotifsExtractor.fit_question
    :param threshold: fits whose 'arcset_count' is below this value are discarded; fits whose
        'max_valid_child_count' is below it are additionally recorded as leaves
    :param outfile: if not None, the three mappings are also written to outfile + '.fits.tsv'
    :param per_sent: group by sent_key when True, by idx otherwise
    :return: a tuple of three defaultdict(set): all motif fits, leaf motif fits, answer-arc fits
    '''
    motif_fits_by_key = defaultdict(set)
    leaf_fits_by_key = defaultdict(set)
    answer_fits_by_key = defaultdict(set)

    for row in sent_df.itertuples():
        group_key = row.sent_key if per_sent else row.idx

        # A key is only created in the answer mapping once an arc actually matches.
        for arc in row.arc_sets:
            if arc in a_vocab:
                answer_fits_by_key[group_key].add(arc)

        fitted = MotifsExtractor.fit_question(row.arc_sets, downlink_info, node_count_info)
        for fit in fitted.values():
            arcset = fit['arcset']
            if arcset == ('*', ):
                continue
            super_arcset = super_mappings.get(arcset, '')
            if super_arcset not in q_vocab or fit['arcset_count'] < threshold:
                continue
            if fit['max_valid_child_count'] < threshold:
                leaf_fits_by_key[group_key].add(super_arcset)
            motif_fits_by_key[group_key].add(super_arcset)

    if outfile is not None:
        fits_table = pd.DataFrame.from_dict({
            'question_fits': motif_fits_by_key,
            'question_leaf_fits': leaf_fits_by_key,
            'question_a_fits': answer_fits_by_key,
        })
        fits_table.to_csv(outfile + '.fits.tsv', sep='\t')
    return motif_fits_by_key, leaf_fits_by_key, answer_fits_by_key

In [9]:
def make_new_qa_mtx_obj(question_to_fits, question_to_leaf_fits, question_to_a_fits, ref_mtx_obj,
        outfile=None):
    '''
    Builds a matrix object for new documents that reuses the vocabulary of a reference object.
    :param question_to_fits: mapping doc -> set of question terms; docs with an empty set are dropped
    :param question_to_leaf_fits: mapping doc -> set of question terms counted as leaves
        (indexed with every kept doc, so it should be a defaultdict or contain all of them)
    :param question_to_a_fits: mapping doc -> set of answer terms (same indexing requirement)
    :param ref_mtx_obj: reference matrix object providing 'q_terms', 'q_term_counts',
        'q_term_to_idx', 'a_terms', 'a_term_counts', 'a_term_to_idx' and 'docs'
    :param outfile: if not None, the index arrays are saved as outfile + '.*.npy' and the doc
        list as outfile + '.docs.txt'
    :return: a dict with the reference vocabulary entries, the new 'docs' / 'doc_to_idx', the
        index arrays 'q_tidxes', 'q_didxes', 'q_leaves', 'a_tidxes', 'a_didxes', and
        'N_idf_docs' set to the number of docs in the reference object
    '''
    docs = [doc for doc, fits in question_to_fits.items() if len(fits) > 0]
    doc_to_idx = {doc: idx for idx, doc in enumerate(docs)}
    q_term_to_idx = ref_mtx_obj['q_term_to_idx']
    a_term_to_idx = ref_mtx_obj['a_term_to_idx']

    qterm_idxes = []
    leaves = []
    qdoc_idxes = []
    aterm_idxes = []
    adoc_idxes = []

    for doc in docs:
        doc_idx = doc_to_idx[doc]
        leaf_terms = question_to_leaf_fits[doc]
        for term in question_to_fits[doc]:
            qterm_idxes.append(q_term_to_idx[term])
            leaves.append(term in leaf_terms)
            qdoc_idxes.append(doc_idx)
        for term in question_to_a_fits[doc]:
            aterm_idxes.append(a_term_to_idx[term])
            adoc_idxes.append(doc_idx)

    # Explicit dtypes matter: np.array([]) defaults to float64, and a float array cannot be
    # used to index the term counts or to build the sparse matrix. An empty list is common
    # here, e.g. a question that contains no answer-vocabulary arcs.
    qterm_idxes = np.array(qterm_idxes, dtype=int)
    leaves = np.array(leaves, dtype=bool)
    qdoc_idxes = np.array(qdoc_idxes, dtype=int)
    aterm_idxes = np.array(aterm_idxes, dtype=int)
    adoc_idxes = np.array(adoc_idxes, dtype=int)

    new_mtx_obj = {
        'q_terms': ref_mtx_obj['q_terms'],
        'q_didxes': qdoc_idxes,
        'docs': docs,
        'q_leaves': leaves,
        'q_term_counts': ref_mtx_obj['q_term_counts'],
        'q_term_to_idx': q_term_to_idx,
        'doc_to_idx': doc_to_idx,
        'q_tidxes': qterm_idxes,
        # IDF statistics refer to the reference corpus, not to the new docs.
        'N_idf_docs': len(ref_mtx_obj['docs']),
        'a_terms': ref_mtx_obj['a_terms'],
        'a_term_counts': ref_mtx_obj['a_term_counts'],
        'a_term_to_idx': a_term_to_idx,
        'a_tidxes': aterm_idxes,
        'a_didxes': adoc_idxes,
    }

    if outfile is not None:
        np.save(outfile + '.q.tidx.npy', qterm_idxes)
        np.save(outfile + '.q.leaves.npy', leaves)
        np.save(outfile + '.a.tidx.npy', aterm_idxes)
        np.save(outfile + '.q.didx.npy', qdoc_idxes)
        np.save(outfile + '.a.didx.npy', adoc_idxes)
        with open(outfile + '.docs.txt', 'w') as f:
            f.write('\n'.join(docs))

    return new_mtx_obj

In [10]:
def build_mtx(mtx_obj, data_type, norm, idf, leaves_only):
    '''
    Builds the sparse term-by-document matrix for one side ('q' or 'a') of a matrix object.
    :param mtx_obj: dict with 'docs' and, for the chosen data_type prefix, '<p>_terms',
        '<p>_tidxes', '<p>_didxes', plus '<p>_term_counts' when idf is set and '<p>_leaves'
        when leaves_only is used; an optional 'N_idf_docs' gives the document count to use
        for IDF weights
    :param data_type: key prefix selecting the side, e.g. 'q' or 'a'
    :param norm: norm name handed to sklearn's Normalizer (e.g. 'l2'); falsy to skip normalizing
    :param idf: if truthy, entries are log(N_idf_docs) - log(term count); otherwise they are 1
    :param leaves_only: only consulted when idf is falsy; zeroes the entries not flagged as leaves
    :return: a matrix of shape (number of terms, number of docs)
    '''
    term_idxes = mtx_obj[data_type + '_tidxes']
    doc_idxes = mtx_obj[data_type + '_didxes']
    N_terms = len(mtx_obj[data_type + '_terms'])
    N_docs = len(mtx_obj['docs'])
    # IDFs must be computed against the *training* corpus size. When projecting new
    # documents, N_docs is just the number of new documents (often 1) and would make
    # every weight non-positive.
    N_idf_docs = mtx_obj['N_idf_docs'] if 'N_idf_docs' in mtx_obj else N_docs

    if idf:
        term_counts = mtx_obj[data_type + '_term_counts']
        data = np.log(N_idf_docs) - np.log(term_counts[term_idxes])
    else:
        data = np.ones_like(term_idxes)
        if leaves_only:
            # Coerce to bool so that `~` is a logical negation rather than a bitwise one.
            leaf_mask = np.asarray(mtx_obj[data_type + '_leaves'], dtype=bool)
            data[~leaf_mask] = 0

    mtx = sparse.csr_matrix((data, (term_idxes, doc_idxes)), shape=(N_terms, N_docs))
    if norm:
        mtx = Normalizer(norm=norm).fit_transform(mtx.astype(np.double))
    return mtx

def project_qa_embeddings(mtx_obj, lq, au, outfile=None):
    '''
    Projects the documents of a matrix object into the latent question / answer spaces.
    :param mtx_obj: matrix object accepted by build_mtx for both the 'q' and the 'a' side
    :param lq: latent representation of the question terms (one row per question term)
    :param au: latent representation of the answer terms (one row per answer term)
    :param outfile: if not None, the two results are saved with np.save under
        outfile + '.qdoc' and outfile + '.adoc'
    :return: a tuple (qdoc_vects, adoc_vects) with the projected question and answer documents
    '''
    question_mtx = build_mtx(mtx_obj, 'q', norm='l2', idf=False, leaves_only=True)
    answer_mtx = build_mtx(mtx_obj, 'a', norm='l2', idf=True, leaves_only=False)

    unit_lq = Normalizer().fit_transform(lq)
    # NOTE(review): the normalized copy of `au` is computed but never used; the answer side
    # below multiplies by the raw `au` and does not re-normalize its documents. Confirm
    # whether this asymmetry with the question side is intended.
    Normalizer().fit_transform(au)

    unit_question_docs = Normalizer().fit_transform(question_mtx.T)
    qdoc_vects = unit_question_docs * unit_lq
    adoc_vects = answer_mtx.T * au

    if outfile is not None:
        np.save(outfile + '.qdoc', qdoc_vects)
        np.save(outfile + '.adoc', adoc_vects)

    return qdoc_vects, adoc_vects

In [11]:
def generate_qtype_model(mtx_obj, lq, au, n_clusters, snip=True,
                            random_state=None, max_iter=1000,
                            display=None, max_dist_quantile=None,
                            outfile=None):
    '''
    Fits a KMeans model on the normalized question-term vectors and labels both vocabularies.
    :param mtx_obj: matrix object providing the 'q_terms' and 'a_terms' vocabularies
    :param lq: latent representation of the question terms
    :param au: latent representation of the answer terms
    :param n_clusters: number of clusters to fit
    :param snip: accepted but not used by this function
    :param random_state: forwarded to KMeans and to display_top_motifs
    :param max_iter: forwarded to KMeans
    :param display: if not None, a per-cluster summary is printed; the value is forwarded to
        display_top_motifs
    :param max_dist_quantile: forwarded to display_top_motifs
    :param outfile: if not None, the model and both tables are written to files prefixed with
        outfile and the number of clusters
    :return: a tuple (km, motif_df, aarc_df)
    '''
    question_vects = Normalizer().fit_transform(lq)
    answer_vects = Normalizer().fit_transform(au)

    # The model is fitted on the question side only; answer arcs are assigned afterwards.
    km = KMeans(n_clusters=n_clusters, random_state=random_state, max_iter=max_iter)
    km.fit(question_vects)

    trailing_columns = ['cluster_dist', 'cluster']
    motif_df = pd.DataFrame({
        'motif': mtx_obj['q_terms'],
        'cluster': km.predict(question_vects),
        'cluster_dist': km.transform(question_vects).min(axis=1),
    })[['motif'] + trailing_columns]
    aarc_df = pd.DataFrame({
        'aarc': mtx_obj['a_terms'],
        'cluster': km.predict(answer_vects),
        'cluster_dist': km.transform(answer_vects).min(axis=1),
    })[['aarc'] + trailing_columns]

    if display is not None:
        print('displaying for %d clusters' % n_clusters)
        print('-----')
        for cluster_id in range(n_clusters):
            print(cluster_id)
            print('--------')
            cluster_motifs = motif_df[motif_df.cluster == cluster_id]
            cluster_aarcs = aarc_df[aarc_df.cluster == cluster_id]
            sections = (('\tquestions (%d):', cluster_motifs), ('\tanswers (%d):', cluster_aarcs))
            for heading, subset in sections:
                print(heading % len(subset))
                # NOTE(review): display_top_motifs is not defined in the visible part of this
                # notebook; confirm it is available before passing `display`.
                display_top_motifs(subset, display, max_dist_quantile, random_state)
                print('')
        print('\n=====\n')

    if outfile is not None:
        prefix = (outfile, n_clusters)
        joblib.dump(km, '%s.%d.km' % prefix)
        motif_df.to_csv('%s.%d.motifs.tsv' % prefix, sep='\t')
        aarc_df.to_csv('%s.%d.aarcs.tsv' % prefix, sep='\t')

    return km, motif_df, aarc_df

In [13]:
def get_best_cluster(qdoc_vects, adoc_vects, km):
    '''
    Returns the cluster assigned to the first projected question document.
    :param qdoc_vects: projected question-document vectors, one row per document
    :param adoc_vects: projected answer-document vectors; kept for interface compatibility,
        the cluster is decided from the question side only
    :param km: fitted clustering model exposing predict()
    :return: the cluster number (as a Python int) predicted for the first row of qdoc_vects
    '''
    # Only the question vectors influence the result, so the answer vectors are no longer
    # normalized here: that was unused work and could fail for reasons unrelated to the answer.
    qdoc_norm = Normalizer().fit_transform(qdoc_vects)
    return int(km.predict(qdoc_norm)[0])

In [16]:
def get_cluster_for_q_text(q_text, questionTypology):
    '''
    Finds the cluster of a question typology that is closest to an arbitrary question text.
    The text is parsed into arcs, fitted against the typology's motifs, projected into its
    latent space and finally assigned to a cluster by the typology's KMeans model.
    :param q_text: the question text (it does not have to come from the corpus)
    :param questionTypology: the question typology object; its motifs_dir, mtx_obj,
        question_threshold, lq, a_u and km attributes are used
    :return: the closest cluster number in questionTypology that q_text corresponds to
    '''
    sentences = extract_arcs(q_text)
    super_mappings, downlinks, node_counts = load_motif_info(questionTypology.motifs_dir)

    reference_mtx = questionTypology.mtx_obj
    answer_vocab = set(reference_mtx['a_terms'])
    question_vocab = set(reference_mtx['q_terms'])

    fits = fit_questions_and_answers(
        sentences, question_vocab, answer_vocab,
        super_mappings, downlinks, node_counts,
        questionTypology.question_threshold)
    # `fits` is (question fits, leaf fits, answer fits), in the order the next call expects.
    projected_mtx = make_new_qa_mtx_obj(*fits, reference_mtx)

    qdoc_vects, adoc_vects = project_qa_embeddings(
        projected_mtx, questionTypology.lq, questionTypology.a_u)
    return get_best_cluster(qdoc_vects, adoc_vects, questionTypology.km)