# Sample Usage for Function to Get Closest Cluster Given Question Text

This notebook defines several helper functions, plus a function built on top of them that returns the closest cluster number for a given question text. The question text does not have to come from the corpus.

The function is called `get_cluster_for_q_text`. The examples below apply it both to questions that already exist in the corpus and to questions that do not. For comparison, each example also shows other questions from the cluster that the function predicts.

In [1]:
import os
import pkg_resources
import numpy as np
import json
from pprint import pprint

from convokit import Corpus, QuestionTypology, download, MotifsExtractor, QuestionTypologyUtils

import itertools
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy

from ast import literal_eval as make_tuple
from collections import defaultdict, Counter
from scipy import sparse
from sklearn.externals import joblib
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import Normalizer
from spacy.en import English
from spacy.symbols import *
from spacy.tokens.doc import Doc

In [2]:
# Fetch the 'reddit-iama-corpus' dataset with convokit's download(); the returned
# value is used below as the directory that holds the corpus file.
data_dir = download('reddit-iama-corpus')

Downloading reddit-iama-corpus from http://zissou.infosci.cornell.edu/socialkit/datasets/reddit-iama-corpus/10k.json (181.3MB)... Done


In [3]:
# Number of question-type clusters requested from QuestionTypology below.
num_clusters = 6

In [4]:
# Load the corpus from the file named 'reddit-iama-corpus' inside data_dir.
corpus = Corpus(filename=os.path.join(data_dir, 'reddit-iama-corpus'))

In [5]:
# Extract clusters of the motifs and assign questions to these clusters.
# random_seed is fixed so that the cluster numbers shown in the outputs below are reproducible.
questionTypology = QuestionTypology(corpus, data_dir, dataset_name='reddit-iama', num_dims=25,
num_clusters=num_clusters, verbose=False, random_seed=428)

In [6]:
# spaCy pipeline used by extract_arcs to split text into sentences and parse them.
# NOTE(review): the 'en' shortcut name looks tied to older spaCy releases -- confirm
# against the installed spaCy version (newer releases expect a full model name).
spacy_NLP = spacy.load('en')

In [7]:
def select_all(x):
    """Default sentence selector: accept every sentence, whatever `x` is."""
    return True

In [8]:
def extract_arcs(comment_text, selector=select_all):
    """Parse `comment_text` with spaCy and collect one row of arcs per kept sentence.

    :param comment_text: raw text to parse with the module-level `spacy_NLP` pipeline
    :param selector: callable applied to each stripped sentence string; the
        sentence is kept only when it returns a truthy value
    :return: a pandas DataFrame with one row per kept sentence and the fields
        'idx' (always 'A'), 'sent_idx', 'span', 'arc_sets', 'content' and
        'sent_key' ('A_<sent_idx>'); empty when no sentence is kept
    """
    parsed = spacy_NLP(comment_text)
    rows = []
    for position, sentence in enumerate(parsed.sents):
        stripped = sentence.text.strip()
        # Skip whitespace-only sentences first, then anything the selector rejects.
        if not stripped or not selector(stripped):
            continue
        rows.append({
            'idx': 'A',
            'sent_idx': position,
            'span': sentence,
            # Arcs as produced by MotifsExtractor.get_arcs for the sentence root.
            'arc_sets': MotifsExtractor.get_arcs(sentence.root, True),
            'content': stripped,
            'sent_key': 'A_' + str(position),
        })
    return pd.DataFrame(rows)

In [9]:
def load_motif_info(motif_dir):
    """Load the motif files stored in `motif_dir`.

    :param motif_dir: directory containing 'question_supersets_arcset_to_super.json',
        'question_tree_downlinks.json' and 'question_tree_arc_set_counts.tsv'
    :return: tuple (super_mappings, downlinks, node_counts) where super_mappings
        maps each arcset tuple to its 'super' tuple, and the other two values
        are whatever MotifsExtractor.read_downlinks / read_nodecounts return
    """
    def in_motif_dir(file_name):
        return os.path.join(motif_dir, file_name)

    # The superset file holds one JSON object per line.
    super_mappings = {}
    with open(in_motif_dir('question_supersets_arcset_to_super.json')) as handle:
        for raw_line in handle:
            record = json.loads(raw_line)
            super_mappings[tuple(record['arcset'])] = tuple(record['super'])

    downlinks = MotifsExtractor.read_downlinks(in_motif_dir('question_tree_downlinks.json'))
    node_counts = MotifsExtractor.read_nodecounts(in_motif_dir('question_tree_arc_set_counts.tsv'))
    return super_mappings, downlinks, node_counts

In [10]:
def fit_questions_and_answers(sent_df, q_vocab, a_vocab,
                            super_mappings, downlink_info, node_count_info,
                            threshold, outfile=None, per_sent=False):
    """Match each sentence's arcs against the question-motif and answer-arc vocabularies.

    :param sent_df: DataFrame with 'idx', 'sent_key' and 'arc_sets' columns
        (the shape produced by extract_arcs)
    :param q_vocab: collection of question super-motifs to keep
    :param a_vocab: collection of answer arcs to keep
    :param super_mappings: dict mapping a motif (arcset tuple) to its super-motif
    :param downlink_info: passed through to MotifsExtractor.fit_question
    :param node_count_info: passed through to MotifsExtractor.fit_question
    :param threshold: minimum 'arcset_count' for a motif to be kept; a kept
        motif whose 'max_valid_child_count' is below it is also recorded as a leaf
    :param outfile: if given, the three mappings are written to outfile + '.fits.tsv'
    :param per_sent: group results by sentence key instead of by 'idx'
    :return: tuple of defaultdict(set) objects
        (question_to_fits, question_to_leaf_fits, question_to_a_fits)
    """
    question_to_fits = defaultdict(set)
    question_to_leaf_fits = defaultdict(set)
    question_to_a_fits = defaultdict(set)

    for row in sent_df.itertuples():
        key = row.sent_key if per_sent else row.idx

        # Only touch the defaultdict when something matched, so that no empty
        # entry is created for this key.
        answer_arcs = [arc for arc in row.arc_sets if arc in a_vocab]
        if answer_arcs:
            question_to_a_fits[key].update(answer_arcs)

        fitted = MotifsExtractor.fit_question(row.arc_sets, downlink_info, node_count_info)
        for entry in fitted.values():
            motif = entry['arcset']
            if motif == ('*', ):
                # The bare wildcard motif is never recorded.
                continue
            super_motif = super_mappings.get(motif, '')
            if super_motif not in q_vocab or entry['arcset_count'] < threshold:
                continue
            if entry['max_valid_child_count'] < threshold:
                question_to_leaf_fits[key].add(super_motif)
            question_to_fits[key].add(super_motif)

    if outfile is not None:
        fits_table = pd.DataFrame.from_dict({
            'question_fits': question_to_fits,
            'question_leaf_fits': question_to_leaf_fits,
            'question_a_fits': question_to_a_fits
        })
        fits_table.to_csv(outfile + '.fits.tsv', sep='\t')

    return question_to_fits, question_to_leaf_fits, question_to_a_fits

In [11]:
def make_new_qa_mtx_obj(question_to_fits, question_to_leaf_fits, question_to_a_fits, ref_mtx_obj,
        outfile=None):
    """Build a matrix-description dict for new documents, reusing a reference vocabulary.

    Only documents with at least one question fit are kept. Term indices are
    looked up in `ref_mtx_obj`, so every term in the fits must exist in
    ref_mtx_obj['q_term_to_idx'] / ref_mtx_obj['a_term_to_idx'].

    :param question_to_fits: mapping doc -> set of question terms
    :param question_to_leaf_fits: mapping doc -> set of question terms that are
        leaves; a doc missing from it is treated as having no leaves
    :param question_to_a_fits: mapping doc -> set of answer terms; a doc missing
        from it is treated as having no answer terms
    :param ref_mtx_obj: dict with 'q_terms', 'q_term_counts', 'q_term_to_idx',
        'a_terms', 'a_term_counts', 'a_term_to_idx' and 'docs'
    :param outfile: if given, the index arrays are saved as .npy files and the
        document keys as outfile + '.docs.txt'
    :return: dict with the reference vocabulary entries plus the new index arrays
        ('q_tidxes', 'q_didxes', 'q_leaves', 'a_tidxes', 'a_didxes'), 'docs',
        'doc_to_idx' and 'N_idf_docs' (the number of reference documents)
    """
    docs = [doc for doc, fits in question_to_fits.items() if len(fits) > 0]
    doc_to_idx = {doc: idx for idx, doc in enumerate(docs)}
    q_term_to_idx = ref_mtx_obj['q_term_to_idx']
    a_term_to_idx = ref_mtx_obj['a_term_to_idx']

    qterm_idxes, leaves, qdoc_idxes = [], [], []
    aterm_idxes, adoc_idxes = [], []

    for doc in docs:
        doc_idx = doc_to_idx[doc]
        # .get() instead of [] so plain dicts without the key do not raise and
        # defaultdict inputs are not silently extended with empty entries.
        leaf_fits = question_to_leaf_fits.get(doc, ())
        for term in question_to_fits[doc]:
            qterm_idxes.append(q_term_to_idx[term])
            leaves.append(term in leaf_fits)
            qdoc_idxes.append(doc_idx)
        for term in question_to_a_fits.get(doc, ()):
            aterm_idxes.append(a_term_to_idx[term])
            adoc_idxes.append(doc_idx)

    # Explicit dtypes: np.array([]) would otherwise be float64, which cannot be
    # used as an index array or inverted as a boolean mask.
    qterm_idxes = np.array(qterm_idxes, dtype=int)
    leaves = np.array(leaves, dtype=bool)
    qdoc_idxes = np.array(qdoc_idxes, dtype=int)
    aterm_idxes = np.array(aterm_idxes, dtype=int)
    adoc_idxes = np.array(adoc_idxes, dtype=int)

    new_mtx_obj = {
        'q_terms': ref_mtx_obj['q_terms'],
        'q_didxes': qdoc_idxes,
        'docs': docs,
        'q_leaves': leaves,
        'q_term_counts': ref_mtx_obj['q_term_counts'],
        'q_term_to_idx': q_term_to_idx,
        'doc_to_idx': doc_to_idx,
        'q_tidxes': qterm_idxes,
        # Document count of the reference data, carried along for IDF weighting.
        'N_idf_docs': len(ref_mtx_obj['docs']),
        'a_terms': ref_mtx_obj['a_terms'],
        'a_term_counts': ref_mtx_obj['a_term_counts'],
        'a_term_to_idx': a_term_to_idx,
        'a_tidxes': aterm_idxes,
        'a_didxes': adoc_idxes,
    }

    if outfile is not None:
        np.save(outfile + '.q.tidx.npy', qterm_idxes)
        np.save(outfile + '.q.leaves.npy', leaves)
        np.save(outfile + '.a.tidx.npy', aterm_idxes)
        np.save(outfile + '.q.didx.npy', qdoc_idxes)
        np.save(outfile + '.a.didx.npy', adoc_idxes)
        with open(outfile + '.docs.txt', 'w') as f:
            f.write('\n'.join(docs))

    return new_mtx_obj

In [12]:
def build_mtx(mtx_obj, data_type, norm, idf, leaves_only):
    """Build a sparse term-by-document matrix from a matrix-description dict.

    Typical settings for the question side: norm='l2', idf=False, leaves_only=True.

    :param mtx_obj: dict holding 'docs' and, for the chosen data_type prefix,
        '<prefix>_terms', '<prefix>_tidxes', '<prefix>_didxes' and, depending on
        the flags, '<prefix>_term_counts' (idf) or '<prefix>_leaves' (leaves_only);
        an optional 'N_idf_docs' gives the document count to use for IDF
    :param data_type: key prefix, e.g. 'q' or 'a'
    :param norm: norm name handed to sklearn's Normalizer, or a falsy value to
        skip normalisation
    :param idf: weight entries by log(N) - log(term count) instead of using ones
    :param leaves_only: zero out non-leaf entries; only applied when idf is False
    :return: scipy CSR matrix of shape (number of terms, number of docs)
    """
    prefix = data_type + '_'
    term_idxes = mtx_obj[prefix + 'tidxes']
    doc_idxes = mtx_obj[prefix + 'didxes']
    N_terms = len(mtx_obj[prefix + 'terms'])
    N_docs = len(mtx_obj['docs'])
    # IDFs must be based on the *training* data: when the object describes new
    # documents, 'N_idf_docs' carries the training document count.
    if 'N_idf_docs' in mtx_obj:
        N_idf_docs = mtx_obj['N_idf_docs']
    else:
        N_idf_docs = N_docs

    if idf:
        # The term counts come from the training data, so N must as well; using
        # the number of new documents here would give zero or negative weights.
        data = np.log(N_idf_docs) - np.log(mtx_obj[prefix + 'term_counts'][term_idxes])
    else:
        data = np.ones_like(term_idxes)
        if leaves_only:
            data[~mtx_obj[prefix + 'leaves']] = 0

    mtx = sparse.csr_matrix((data, (term_idxes, doc_idxes)), shape=(N_terms, N_docs))
    if norm:
        mtx = Normalizer(norm=norm).fit_transform(mtx.astype(np.double))
    return mtx

def project_qa_embeddings(mtx_obj, lq, au, outfile=None):
    """Project the documents described by `mtx_obj` into the question/answer latent spaces.

    :param mtx_obj: matrix-description dict accepted by build_mtx for both the
        'q' and the 'a' side
    :param lq: latent representation of the question terms (one row per term)
    :param au: latent representation of the answer terms (one row per term)
    :param outfile: if given, both results are saved with np.save under
        outfile + '.qdoc' and outfile + '.adoc'
    :return: tuple (qdoc_vects, adoc_vects)
    """
    question_mtx = build_mtx(mtx_obj, 'q', norm='l2', idf=False, leaves_only=True)
    answer_mtx = build_mtx(mtx_obj, 'a', norm='l2', idf=True, leaves_only=False)

    lq_unit = Normalizer().fit_transform(lq)
    # NOTE(review): the normalised answer embedding is computed but never used;
    # the answer projection below goes through the raw `au`. Confirm intent.
    au_unit = Normalizer().fit_transform(au)

    # Documents become rows after the transpose; they are normalised and then
    # multiplied by the normalised term embedding.
    question_docs = Normalizer().fit_transform(question_mtx.T)
    qdoc_vects = question_docs * lq_unit
    adoc_vects = answer_mtx.T * au

    if outfile is not None:
        np.save(outfile + '.qdoc', qdoc_vects)
        np.save(outfile + '.adoc', adoc_vects)

    return qdoc_vects, adoc_vects

In [13]:
def generate_qtype_model(mtx_obj, lq, au, n_clusters, snip=True,
                            random_state=None, max_iter=1000,
                            display=None, max_dist_quantile=None,
                            outfile=None):
    """Fit a KMeans model on the normalised question embedding and label motifs and answer arcs.

    :param mtx_obj: dict providing 'q_terms' and 'a_terms' (names for the rows
        of `lq` and `au`)
    :param lq: question-term embedding; KMeans is fitted on its row-normalised form
    :param au: answer-term embedding; labelled with the fitted model
    :param n_clusters: number of KMeans clusters
    :param snip: not used in this function
    :param random_state: forwarded to KMeans (and to display_top_motifs)
    :param max_iter: forwarded to KMeans
    :param display: when not None, a per-cluster summary is printed
    :param max_dist_quantile: forwarded to display_top_motifs
    :param outfile: when not None, the model is dumped with joblib and both
        tables are written as TSV files prefixed by '<outfile>.<n_clusters>'
    :return: tuple (km, motif_df, aarc_df)
    """
    lq_norm = Normalizer().fit_transform(lq)
    au_norm = Normalizer().fit_transform(au)

    km = KMeans(n_clusters=n_clusters, random_state=random_state, max_iter=max_iter)
    km.fit(lq_norm)

    def label_frame(name_col, names, vectors):
        # Assign each row its cluster and the distance to the nearest centre.
        labels = km.predict(vectors)
        dists = km.transform(vectors).min(axis=1)
        frame = pd.DataFrame({name_col: names, 'cluster': labels, 'cluster_dist': dists})
        return frame[[name_col, 'cluster_dist', 'cluster']]

    motif_df = label_frame('motif', mtx_obj['q_terms'], lq_norm)
    aarc_df = label_frame('aarc', mtx_obj['a_terms'], au_norm)

    if display is not None:
        # NOTE(review): display_top_motifs is not defined or imported in the
        # visible part of this notebook -- confirm it exists before passing `display`.
        print('displaying for %d clusters' % n_clusters)
        print('-----')
        for cluster_id in range(n_clusters):
            print(cluster_id)
            print('--------')
            motif_subset = motif_df[motif_df.cluster == cluster_id]
            aarc_subset = aarc_df[aarc_df.cluster == cluster_id]
            print('\tquestions (%d):' % len(motif_subset))
            display_top_motifs(motif_subset, display, max_dist_quantile, random_state)
            print('')
            print('\tanswers (%d):' % len(aarc_subset))
            display_top_motifs(aarc_subset, display, max_dist_quantile, random_state)
            print('')
        print('\n=====\n')

    if outfile is not None:
        joblib.dump(km, '%s.%d.km' % (outfile, n_clusters))
        motif_df.to_csv('%s.%d.motifs.tsv' % (outfile, n_clusters), sep='\t')
        aarc_df.to_csv('%s.%d.aarcs.tsv' % (outfile, n_clusters), sep='\t')

    return km, motif_df, aarc_df

In [14]:
def get_best_cluster(qdoc_vects, adoc_vects, km):
    """Return the cluster that `km` predicts for the first question document.

    :param qdoc_vects: question document vectors, one row per document; rows
        are normalised before prediction
    :param adoc_vects: answer document vectors; accepted for interface
        compatibility but not used, since the label depends on the question side only
    :param km: fitted clustering model exposing predict()
    :return: cluster label of the first row of qdoc_vects, as a Python int
    """
    # The previous version also read km.n_clusters and normalised adoc_vects,
    # then discarded both; that was wasted work and could fail needlessly on
    # the unused answer vectors.
    qdoc_norm = Normalizer().fit_transform(qdoc_vects)
    return int(km.predict(qdoc_norm)[0])

In [15]:
def get_cluster_for_q_text(q_text, questionTypology):
    '''
    Takes a string containing the question text and a question typology object and returns the closest cluster.

    The text is parsed into sentences, its arcs are matched against the motifs stored in
    questionTypology.motifs_dir, and the resulting document is projected and labelled with
    questionTypology.km.

    :param q_text: the question text
    :param questionTypology: the question typology object
    :return: the closest cluster number in questionTypology that q_text corresponds to
    :raises ValueError: if q_text matches no question motif known to questionTypology
    '''
    sent_df = extract_arcs(q_text)
    # NOTE: the motif files are re-read from disk on every call.
    super_mappings, downlinks, node_counts = load_motif_info(questionTypology.motifs_dir)
    avocab = set(questionTypology.mtx_obj['a_terms'])
    qvocab = set(questionTypology.mtx_obj['q_terms'])
    question_to_fits, question_to_leaf_fits, question_to_a_fits = fit_questions_and_answers(
        sent_df, qvocab, avocab, super_mappings, downlinks, node_counts,
        questionTypology.question_threshold)

    # Without a single matched motif there is no document to project; fail here
    # with a clear message rather than with an obscure error from the matrix code.
    if not any(len(fits) > 0 for fits in question_to_fits.values()):
        raise ValueError('No known question motif matches the given text; cannot assign a cluster.')

    new_mtx_obj = make_new_qa_mtx_obj(question_to_fits, question_to_leaf_fits, question_to_a_fits,
        questionTypology.mtx_obj)
    qdoc_vects, adoc_vects = project_qa_embeddings(new_mtx_obj, questionTypology.lq, questionTypology.a_u)
    return get_best_cluster(qdoc_vects, adoc_vects, questionTypology.km)

# Sample Usage of get_cluster_for_q_text for questions in corpus

First off, we can classify questions that occur in the reddit-iama corpus and expect the result to be the same cluster in which the question was clustered when building the Question Typology. Here are some examples:

In [27]:
# Classify a question taken from the corpus, then show sample motifs and
# question-answer pairs of the predicted cluster.
cl = get_cluster_for_q_text("I literally thought you played Amos on the Expanse the first time I saw BSG. Has anyone else ever gotten that mixed up? Please say yes so I don't feel like a total ass. Also, I really enjoy watching the transformation your character has undertaken (halfway through season 4 of bsg) , do you have any other upcoming projects? Yes to both questions is totally acceptable", questionTypology)
questionTypology.display_motifs_for_type(cl, num_egs=10)
# "I literally thought you played..." is Question 1 in the following...
questionTypology.display_question_answer_pairs_for_type(cl, num_egs=3)

	10 sample question motifs for type 4 (214 total motifs):
		1. ('*',)
		2. ('have>*',)
		3. ('gotten_*',)
		4. ('considered_*', 'have>*')
		5. ('considered_*', 'considered_ever')
		6. ('play_*',)
		7. ('so_*',)
		8. ('considered_*', 'considered_ever', 'considered_have', 'have>*')
		9. ('considered_*', 'considered_ever', 'considered_have')
		10. ('feel_*', 'feel_does', 'how>*')
	3 sample question-answer pairs that were assigned type 4 (29525 total questions with this type) :
		Question 1. I literally thought you played Amos on the Expanse the first time I saw BSG. Has anyone else ever gotten that mixed up? Please say yes so I don't feel like a total ass. Also, I really enjoy watching the transformation your character has undertaken (halfway through season 4 of bsg) , do you have any other upcoming projects? Yes to both questions is totally acceptable
		Answer 1. I have gotten that a few times.
Handsome devil....

There are things coming up, just can't talk about them yet.

Code for unem

In [28]:
# Classify a question taken from the corpus, then show sample motifs and
# question-answer pairs of the predicted cluster.
cl = get_cluster_for_q_text("I'm planning on making my way to Cuba soon.  What advice would you give someone visiting Havana for a few days?  What should I definitely do and what things should I avoid?", questionTypology)
questionTypology.display_motifs_for_type(cl, num_egs=10)
# "I'm planning on making my way to Cuba soon..." is Question 1 in the following...
questionTypology.display_question_answer_pairs_for_type(cl, num_egs=3)

	10 sample question motifs for type 0 (134 total motifs):
		1. ('give_*', 'give_would', 'what>*')
		2. ('give_*',)
		3. ('give_*', 'what>*')
		4. ('give_*', 'give_what', 'give_would')
		5. ('give_*', 'give_what')
		6. ('give_*', 'give_to', 'give_what', 'give_would')
		7. ('give_*', 'give_to', 'give_would', 'what>*')
		8. ('give_*', 'give_to', 'give_what')
		9. ('give_*', 'give_would')
		10. ('recommend_*', 'what>*')
	3 sample question-answer pairs that were assigned type 0 (9444 total questions with this type) :
		Question 1. I'm planning on making my way to Cuba soon.  What advice would you give someone visiting Havana for a few days?  What should I definitely do and what things should I avoid?
		Answer 1. The answer to that question depends much on what you want out of a trip to Cuba. I think the first thing to think about, is if you will go to Cuba individually or with a group. For the first time in many years it is much easier to go to Cuba individually rather than in groups. Both 

In [29]:
# Classify a question taken from the corpus; 30 examples are requested so that
# the question itself appears in the listing.
cl = get_cluster_for_q_text("does the constant fame and attention ever get overwhelming?  Would you recommend your path of life for anyone else?", questionTypology)
questionTypology.display_motifs_for_type(cl, num_egs=10)
# "does the constant fame and attention..." is Question 24 in the following...
questionTypology.display_question_answer_pairs_for_type(cl, num_egs=30)

	10 sample question motifs for type 5 (169 total motifs):
		1. ('use_*',)
		2. ('are_*', 'how>*')
		3. ('how>*', 'how>does')
		4. ('use_*', 'use_do')
		5. ('work_*', 'work_do')
		6. ('pay_*',)
		7. ('have_*', 'how>*')
		8. ('what>*', 'what>happens')
		9. ('how>*', 'make_*', 'make_do')
		10. ('is_*', 'is_how')
	30 sample question-answer pairs that were assigned type 5 (16800 total questions with this type) :
		Question 1. Hope I'm not too late. I'm curious what bands or artists someone as eccentric as yourself enjoys? I'm getting a 'They Might Be Giants' vibe from your music, which is definitely a good thing. Thanks!
		Answer 1. I havent been a follow one band type of guy for a long long time. 
Bands are weird. But i am overdue now to check out they might be giants so thankyou for the reminder. 
I love getting turned on to stuff i havent heard before. Like last night i heard some really cool folk pop music from sudan. 
I love folk music especially from romania. 
Also was clued in to the

Just reread your question my bad! My school district sent me checks to re-reimburse the gas money it cost for me to go to all my schools    and sent me old clothes they had which was nice. I have had enough state/district support to live on, but I don't own anything nice. All of my clothes are old and I have a small space to call my own, but I don't mind :) Hope this helps!
		Question 14. Hi Robert! Congrats on your engagement by the way.  I'm curious about who else you would like to see guest judge on Shark Tank?
		Answer 14. Marcus Lemonis. and Gene Simmons. Love that guy
		Question 15. When do you guys predict that the "Big One" will occur?
		Answer 15. Tim and Dave--sometime in the future; exact time unknown but probably sometime in the next 500 years or it could be this week
		Question 16. Would you recommend I sell some of my Ethereum to help fund my company?
		Answer 16. Haha, personally I'm holding on to most of my cryptos (ETH, XRP, BTC, XLM in order of amount I hold). But, if

		Question 28. Would you recommend fillers or an implant to define the jawline? 
		Answer 28. Implants are more powerful, but fillers are nonsurgical. It depends on the route you want to go.
		Question 29. My friend took Lsd for the first time and I showed him super jail and he watched every episode, it's one of his favorite shows now. It's always been one of my favorite shows, I watched it all the time in middle school. I'm just curious what was the inspiration for the characters in that show, especially the warden?
		Answer 29. You wouldn't believe how many people tell me their first LSD experience was watching Superjail! Haha. The Warden was greatly inspired by Willy Wonka, the Joker, and the brilliant David Wain himself.
		Question 30. Would you recommend this as a job? I want to choose between this or a police officer 
		Answer 30. If you can handle the stress from the calls and the hours of sitting, it can be really rewarding! Law Enforcement would be much more physically active 

In [32]:
# Classify a question taken from the corpus; 30 examples are requested so that
# the question itself appears in the listing.
cl = get_cluster_for_q_text("Instead of Cricket flour, do you think \"mixed insect\" flour would sell as well? And also if so, what would \"mixed insect\" flour contain? In terms of A. Best cost vs profit ratio to you the seller and B. Best protein per gram value to the consumer?", questionTypology)
questionTypology.display_motifs_for_type(cl, num_egs=10)
# "Instead of Cricket flour, do you..." is Question 5 in the following...
questionTypology.display_question_answer_pairs_for_type(cl, num_egs=30)

	10 sample question motifs for type 2 (161 total motifs):
		1. ('think_*',)
		2. ('think_*', 'think_do', 'think_what')
		3. ('do>*', 'think_*')
		4. ('think_*', 'what>*')
		5. ('think_*', 'think_do', 'think_of')
		6. ('think_*', 'think_do', 'think_is')
		7. ('think_*', 'think_do', 'think_of', 'think_what', 'what>*', 'what>do')
		8. ('think_*', 'think_is')
		9. ('what>*', 'what>do')
		10. ('do>*', 'think_*', 'think_do', 'think_is')
	30 sample question-answer pairs that were assigned type 2 (21479 total questions with this type) :
		Question 1. I think I know the answer to this, but is it a terrible idea to bicycle down to totality from Portland on the day of?
		Answer 1. I won't say it is a terrible idea but if you do bike down, I would make sure you're very visible, wearing a helmet, have supplies like food and water, and be vigilant in watching out for traffic patterns.
		Question 2. Hey Luis! I've been a massive fan of yours ever since the EP days. I honestly think with some more exp

		Question 9. I am in a points league with daily roster moves.  Think i could get by with using a daily catcher?   Any other position you think i could do this? 
		Answer 9. If you have daily roster moves you're going to need at least two catchers, or one slot where you are picking and dropping them.  Even the catchers who play the most like Buster Posey sit out a fair amount of the time, although NL ones may see pinch hits in those games.
		Question 10. Hello,

Thanks to everyone for doing this AMA. Besides being educational it's also quite funny to read the humourous responses to light-hearted questions.

Q: With the amount of time spent in space has there been any significant temporary/permanent decrease in muscular strength, immunity to common illnesses (colds or flus for instance), or absorption of nutrients from food by Kelly compared to his twin?

Q2: Has radiation levels been tested and compared between the two men?

Q3: In the future do you think a stronger study of twins coul

		Question 21. Thank you so much for the AMA, Allison.

What do you think of the different ways in which we teach young children and high schoolers? In our youth, we see numbers as a part of the world and are taught how to implement ideas with examples. At higher levels, theoriy is often invoked without being properly explained. Do you think that this could be remedied?
		Answer 21. Yes.  One way to integrate those two modes of learning, that I think we need to get better at in general: Don't just present polished, well-known results.  We should walk students through the history of failed and incomplete ideas.  Theory research is a process, not just a single "aha" moment.
		Question 22. The cost of Universities, and law schools in particular, seems to reflect the availability for student loans more than it does the market for jobs. One of the consequences of making loans more widely available is the steady increase in the cost of education to match the maximum amount students can recei

# Sample usage on questions not in the corpus...

Here, we apply `get_cluster_for_q_text` to questions that are not in the corpus. For each one, sample motifs and question-answer pairs from the predicted cluster are shown as well.

In [33]:
# A question clearly not in the corpus: predict its cluster, then show sample
# motifs and question-answer pairs of that cluster.
cl = get_cluster_for_q_text("What advice would you have for turtles looking for jobs?", questionTypology)
questionTypology.display_motifs_for_type(cl, num_egs=10)
questionTypology.display_question_answer_pairs_for_type(cl, num_egs=10)

	10 sample question motifs for type 0 (134 total motifs):
		1. ('give_*', 'give_would', 'what>*')
		2. ('give_*',)
		3. ('give_*', 'what>*')
		4. ('give_*', 'give_what', 'give_would')
		5. ('give_*', 'give_what')
		6. ('give_*', 'give_to', 'give_what', 'give_would')
		7. ('give_*', 'give_to', 'give_would', 'what>*')
		8. ('give_*', 'give_to', 'give_what')
		9. ('give_*', 'give_would')
		10. ('recommend_*', 'what>*')
	10 sample question-answer pairs that were assigned type 0 (9444 total questions with this type) :
		Question 1. I'm planning on making my way to Cuba soon.  What advice would you give someone visiting Havana for a few days?  What should I definitely do and what things should I avoid?
		Answer 1. The answer to that question depends much on what you want out of a trip to Cuba. I think the first thing to think about, is if you will go to Cuba individually or with a group. For the first time in many years it is much easier to go to Cuba individually rather than in groups. Both

		Question 10. What advice would you give someone interviewing to be accepted into an Ivy league school, and to Harvard in particular?
		Answer 10. Be confident.  This is different than being full of yourself.  Have a solid base of interesting things and activities behind you (as well as strong grads and tests scores).  Know what you want to do...even if you're open to this changing.  Harvard in particular likes students who are extremely focused and driven.


In [34]:
# Predict the cluster for a question that is not in the corpus, then show
# sample motifs and question-answer pairs of that cluster.
cl = get_cluster_for_q_text("What was the worst part of the climb up the really tall mountain?", questionTypology)
questionTypology.display_motifs_for_type(cl, num_egs=10)
questionTypology.display_question_answer_pairs_for_type(cl, num_egs=10)

	10 sample question motifs for type 1 (161 total motifs):
		1. ('what>*', 'what>was')
		2. ('get_*', 'get_did')
		3. ('how>*', 'how>did')
		4. ('was_*', 'what>*')
		5. ('have_*', 'have_did')
		6. ('what>*', 'what>did')
		7. ('was_*',)
		8. ('take_*', 'take_did')
		9. ('how>*', 'was_*')
		10. ('meet_*', 'meet_did')
	10 sample question-answer pairs that were assigned type 1 (14017 total questions with this type) :
		Question 1. Happiest guy you know? What was the toughest time in your life that's tried that claim? How did you overcome it? 
		Answer 1. Look, I'm one happy son of a bitch. It's just that simple. I can't name one person that is happier than I am. 

We all have trials. Some get through them better than others. 
		Question 2. How did you get involved into the Let's Play network? Did Roosterteeth they reach out to you?
		Answer 2. ScrewAttack was acquired by Fullscreen in 2014, so after Fullscreen acquired RT it made sense for us to work together. A lot of us have been fans of 

In [36]:
# Predict the cluster for a question that is not in the corpus, then show
# sample motifs and question-answer pairs of that cluster.
cl = get_cluster_for_q_text("What sort of utensils do I need to start cooking?", questionTypology)
questionTypology.display_motifs_for_type(cl, num_egs=10)
questionTypology.display_question_answer_pairs_for_type(cl, num_egs=10)

	10 sample question motifs for type 0 (134 total motifs):
		1. ('give_*', 'give_would', 'what>*')
		2. ('give_*',)
		3. ('give_*', 'what>*')
		4. ('give_*', 'give_what', 'give_would')
		5. ('give_*', 'give_what')
		6. ('give_*', 'give_to', 'give_what', 'give_would')
		7. ('give_*', 'give_to', 'give_would', 'what>*')
		8. ('give_*', 'give_to', 'give_what')
		9. ('give_*', 'give_would')
		10. ('recommend_*', 'what>*')
	10 sample question-answer pairs that were assigned type 0 (9444 total questions with this type) :
		Question 1. I'm planning on making my way to Cuba soon.  What advice would you give someone visiting Havana for a few days?  What should I definitely do and what things should I avoid?
		Answer 1. The answer to that question depends much on what you want out of a trip to Cuba. I think the first thing to think about, is if you will go to Cuba individually or with a group. For the first time in many years it is much easier to go to Cuba individually rather than in groups. Both

		Question 10. What advice would you give someone interviewing to be accepted into an Ivy league school, and to Harvard in particular?
		Answer 10. Be confident.  This is different than being full of yourself.  Have a solid base of interesting things and activities behind you (as well as strong grads and tests scores).  Know what you want to do...even if you're open to this changing.  Harvard in particular likes students who are extremely focused and driven.


In [40]:
# Predict the cluster for a question that is not in the corpus, then show
# sample motifs and question-answer pairs of that cluster.
cl = get_cluster_for_q_text("Do you forsee the discovery of evidence of alien life in our lifetimes?", questionTypology)
questionTypology.display_motifs_for_type(cl, num_egs=10)
questionTypology.display_question_answer_pairs_for_type(cl, num_egs=10)

	10 sample question motifs for type 2 (161 total motifs):
		1. ('think_*',)
		2. ('think_*', 'think_do', 'think_what')
		3. ('do>*', 'think_*')
		4. ('think_*', 'what>*')
		5. ('think_*', 'think_do', 'think_of')
		6. ('think_*', 'think_do', 'think_is')
		7. ('think_*', 'think_do', 'think_of', 'think_what', 'what>*', 'what>do')
		8. ('think_*', 'think_is')
		9. ('what>*', 'what>do')
		10. ('do>*', 'think_*', 'think_do', 'think_is')
	10 sample question-answer pairs that were assigned type 2 (21479 total questions with this type) :
		Question 1. I think I know the answer to this, but is it a terrible idea to bicycle down to totality from Portland on the day of?
		Answer 1. I won't say it is a terrible idea but if you do bike down, I would make sure you're very visible, wearing a helmet, have supplies like food and water, and be vigilant in watching out for traffic patterns.
		Question 2. Hey Luis! I've been a massive fan of yours ever since the EP days. I honestly think with some more exp

		Question 9. I am in a points league with daily roster moves.  Think i could get by with using a daily catcher?   Any other position you think i could do this? 
		Answer 9. If you have daily roster moves you're going to need at least two catchers, or one slot where you are picking and dropping them.  Even the catchers who play the most like Buster Posey sit out a fair amount of the time, although NL ones may see pinch hits in those games.
		Question 10. Hello,

Thanks to everyone for doing this AMA. Besides being educational it's also quite funny to read the humourous responses to light-hearted questions.

Q: With the amount of time spent in space has there been any significant temporary/permanent decrease in muscular strength, immunity to common illnesses (colds or flus for instance), or absorption of nutrients from food by Kelly compared to his twin?

Q2: Has radiation levels been tested and compared between the two men?

Q3: In the future do you think a stronger study of twins coul

In [44]:
# Predict the cluster for a question that is not in the corpus, then show
# sample motifs and question-answer pairs of that cluster.
cl = get_cluster_for_q_text("Do you plan to release a version that would make it easier to operate for people with disabilities?", questionTypology)
questionTypology.display_motifs_for_type(cl, num_egs=10)
questionTypology.display_question_answer_pairs_for_type(cl, num_egs=10)

	10 sample question motifs for type 3 (122 total motifs):
		1. ('will>*',)
		2. ('expect_*',)
		3. ('planning_*',)
		4. ('when>*', 'when>will')
		5. ('are>*', 'planning_*', 'planning_are')
		6. ('plan_*',)
		7. ('expect_*', 'expect_can')
		8. ('have_*', 'have_will', 'will>*')
		9. ('have_*', 'will>*')
		10. ('have_*', 'have_will')
	10 sample question-answer pairs that were assigned type 3 (14057 total questions with this type) :
		Question 1. Will a 16 seed ever upset a 1 seed?

What are some of the most interesting Cinderella teams in the history of the tournament?
		Answer 1. Maybe? Probably? Statistically, the odds for a 16 seed winning are greater than zero, but that doesn't mean we're going to ever see it happen. Though, according to users of our bracket game, we're getting closer to witnessing history. Last year, 5.68 percent of all brackets had a 16 seed over a 1 seed. That's a slight drop since the all-time high of 6.2 percent in 2015, but the percentage is trending upward sinc

In [46]:
# Predict the cluster for a question that is not in the corpus, then show
# sample motifs and question-answer pairs of that cluster.
cl = get_cluster_for_q_text("What is the most number of wars a single individual has participated in?", questionTypology)
questionTypology.display_motifs_for_type(cl, num_egs=10)
questionTypology.display_question_answer_pairs_for_type(cl, num_egs=10)

	10 sample question motifs for type 5 (169 total motifs):
		1. ('use_*',)
		2. ('are_*', 'how>*')
		3. ('how>*', 'how>does')
		4. ('use_*', 'use_do')
		5. ('work_*', 'work_do')
		6. ('pay_*',)
		7. ('have_*', 'how>*')
		8. ('what>*', 'what>happens')
		9. ('how>*', 'make_*', 'make_do')
		10. ('is_*', 'is_how')
	10 sample question-answer pairs that were assigned type 5 (16800 total questions with this type) :
		Question 1. Hope I'm not too late. I'm curious what bands or artists someone as eccentric as yourself enjoys? I'm getting a 'They Might Be Giants' vibe from your music, which is definitely a good thing. Thanks!
		Answer 1. I havent been a follow one band type of guy for a long long time. 
Bands are weird. But i am overdue now to check out they might be giants so thankyou for the reminder. 
I love getting turned on to stuff i havent heard before. Like last night i heard some really cool folk pop music from sudan. 
I love folk music especially from romania. 
Also was clued in to the