In [1]:
import os
# Move the working directory three levels up from wherever the kernel started,
# presumably so the relative paths used later (e.g. 'convokit/classifier/demo')
# resolve. The path is relative, so re-running this cell in the same kernel
# moves up another three levels.
os.chdir('../../..')

This notebook demonstrates the use of vectors in ConvoKit, as well as the use of the bag-of-words transformer, BoWTransformer, and the VectorClassifier.

In [2]:
import convokit

In [3]:
from convokit import Corpus, download

In [4]:
corpus = Corpus(download('subreddit-Cornell'))

Dataset already exists at /Users/calebchiam/.convokit/downloads/subreddit-Cornell


In [5]:
corpus.print_summary_stats()

Number of Speakers: 7568
Number of Utterances: 74467
Number of Conversations: 10744


## Using a Transformer to add a vector

In [6]:
from convokit import VectorClassifier, BoWTransformer

In [7]:
random_utt = corpus.random_utterance()
random_utt

Utterance({'obj_type': 'utterance', '_owner': <convokit.model.corpus.Corpus object at 0x1233a9790>, 'meta': {'score': 6, 'top_level_comment': 'd66ewdu', 'retrieved_on': 1473250252, 'gilded': 0, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '', 'author_flair_text': ''}, '_id': 'd66ewdu', 'vectors': [], 'speaker': Speaker({'obj_type': 'speaker', '_owner': <convokit.model.corpus.Corpus object at 0x1233a9790>, 'meta': {}, '_id': 'deesta', 'vectors': []}), 'conversation_id': '4wdr4e', '_root': '4wdr4e', 'reply_to': '4wdr4e', 'timestamp': 1470463334, 'text': "Hockey is one of a very limited number of sports that Cornell is really good at (I'd even argue that it's our best sport, both men's and women's). \n\nHockey season tickets are definitely worth it in my opinion, but splitting the cost with friends might be a good idea for your first year (rotate who goes to games, decide if it's worth the investment to buy tickets in your later years, etc.). I missed less tha

In [8]:
random_utt.vectors

[]

In [9]:
corpus.vectors

set()

In [10]:
corpus.vectors

set()

In [11]:
bow_transformer = BoWTransformer(obj_type="utterance", vector_name='bow')
bow_transformer.fit_transform(corpus)

Initializing default unigram CountVectorizer...Done.


<convokit.model.corpus.Corpus at 0x1233a9790>

In [12]:
random_utt

Utterance({'obj_type': 'utterance', '_owner': <convokit.model.corpus.Corpus object at 0x1233a9790>, 'meta': {'score': 6, 'top_level_comment': 'd66ewdu', 'retrieved_on': 1473250252, 'gilded': 0, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '', 'author_flair_text': ''}, '_id': 'd66ewdu', 'vectors': ['bow'], 'speaker': Speaker({'obj_type': 'speaker', '_owner': <convokit.model.corpus.Corpus object at 0x1233a9790>, 'meta': {}, '_id': 'deesta', 'vectors': []}), 'conversation_id': '4wdr4e', '_root': '4wdr4e', 'reply_to': '4wdr4e', 'timestamp': 1470463334, 'text': "Hockey is one of a very limited number of sports that Cornell is really good at (I'd even argue that it's our best sport, both men's and women's). \n\nHockey season tickets are definitely worth it in my opinion, but splitting the cost with friends might be a good idea for your first year (rotate who goes to games, decide if it's worth the investment to buy tickets in your later years, etc.). I missed les

In [13]:
random_utt.vectors

['bow']

### Fetching the vector for the utterance

In [14]:
# Replace the stored 'bow' matrix with its dense array form by writing to the
# private `_vector_matrices` mapping; the cells below then display plain arrays.
# NOTE(review): a dense 74467 x 9340 int64 array is roughly 5.6 GB — confirm this fits in memory.
corpus._vector_matrices['bow'].matrix = corpus._vector_matrices['bow'].matrix.toarray()

In [15]:
random_utt.get_vector('bow')

array([[0, 0, 0, ..., 0, 0, 0]])

In [16]:
random_utt.get_vector('bow', as_dataframe=True)

Unnamed: 0,00,000,00am,00pm,01,02,03,04,05,06,...,youtu,youtube,yr,yrs,yup,zero,zeus,zimride,zip,zone
d66ewdu,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
random_utt.get_vector('bow', as_dataframe=True, columns=['youtu', 'youtube', 'yr'])

Unnamed: 0,youtu,youtube,yr
d66ewdu,0,0,0


In [18]:
random_utt.get_vector('bow', as_dataframe=False, columns=['youtu', 'youtube', 'yr'])

array([[0, 0, 0]])

### What does this look like at the Corpus level?

In [19]:
corpus.vectors

{'bow'}

In [20]:
corpus.get_vector_matrix('bow')

ConvoKitMatrix('name': bow, 'matrix': array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]))

In [21]:
bow_matrix = corpus.get_vector_matrix('bow')

In [22]:
bow_matrix.to_dataframe().head()

Unnamed: 0,00,000,00am,00pm,01,02,03,04,05,06,...,youtu,youtube,yr,yrs,yup,zero,zeus,zimride,zip,zone
nyx4d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
o0145,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
o1gca,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
o0ss4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
o31u0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
bow_matrix.columns[:10]

['00', '000', '00am', '00pm', '01', '02', '03', '04', '05', '06']

In [24]:
bow_matrix.ids[:10]

['nyx4d',
 'o0145',
 'o1gca',
 'o0ss4',
 'o31u0',
 'o4ipd',
 'o456r',
 'o4544',
 'o3l7i',
 'o3fqm']

In [25]:
bow_matrix.name

'bow'

In [26]:
bow_matrix.matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Dumping and loading

### Dumps all vectors by default

In [27]:
# Copy the corpus-level metadata value stored under 'num_user' to the key
# 'num_users'. Guarded so that re-running the cell after the old key has been
# removed is a no-op instead of an error.
if 'num_user' in corpus.meta:
    corpus.meta['num_users'] = corpus.meta['num_user']

In [28]:
# Drop the old 'num_user' metadata key. Guarded so that re-running the cell
# after the key is already gone is a no-op instead of an error.
if 'num_user' in corpus.meta:
    del corpus.meta['num_user']

In [29]:
corpus.meta_index.overall_index

{'subreddit': "<class 'str'>",
 'num_posts': "<class 'int'>",
 'num_comments': "<class 'int'>",
 'num_users': "<class 'int'>"}

In [30]:
# dumps all vectors by default
corpus.dump('cornell-with-bow', base_path='convokit/classifier/demo')

In [31]:
os.listdir('convokit/classifier/demo/cornell-with-bow')

['utterances.jsonl',
 'conversations.json',
 'vectors.bow.p',
 'corpus.json',
 'speakers.json',
 'index.json']

### But vectors can be excluded

In [32]:
corpus.dump('cornell-no-bow', base_path='convokit/classifier/demo', exclude_vectors=['bow'])

In [33]:
os.listdir('convokit/classifier/demo/cornell-no-bow')

['utterances.jsonl',
 'conversations.json',
 'corpus.json',
 'speakers.json',
 'index.json']

Let's check if they really are excluded:

In [34]:
corpus = Corpus(filename='convokit/classifier/demo/cornell-no-bow')

In [35]:
corpus.vectors

set()

In [36]:
corpus.random_utterance().vectors

[]

### When the corpus is loaded, vectors are present 'structurally' but not actually loaded

In [37]:
corpus = Corpus(filename='convokit/classifier/demo/cornell-with-bow')

In [38]:
corpus.vectors

{'bow'}

In [39]:
corpus.random_utterance()

Utterance({'obj_type': 'utterance', '_owner': <convokit.model.corpus.Corpus object at 0x28e22ca90>, 'meta': {'score': 1, 'top_level_comment': None, 'retrieved_on': 1500177226, 'gilded': 0, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '/r/Cornell/comments/6j9iby/how_hard_is_it_to_transfer_from_an_md_community/', 'author_flair_text': ''}, '_id': '6j9iby', 'vectors': ['bow'], 'speaker': Speaker({'obj_type': 'speaker', '_owner': <convokit.model.corpus.Corpus object at 0x28e22ca90>, 'meta': {}, '_id': '[deleted]', 'vectors': []}), 'conversation_id': '6j9iby', '_root': '6j9iby', 'reply_to': None, 'timestamp': 1498326927, 'text': '[deleted]'})

In [40]:
corpus.random_utterance().vectors

['bow']

In [41]:
corpus._vector_matrices # private variable storing vector matrices

{}

In [42]:
# when fetched normally
corpus.get_vector_matrix('bow')

ConvoKitMatrix('name': bow, 'matrix': <74467x9340 sparse matrix of type '<class 'numpy.int64'>'
	with 2108383 stored elements in Compressed Sparse Row format>)

In [43]:
corpus._vector_matrices

{'bow': ConvoKitMatrix('name': bow, 'matrix': <74467x9340 sparse matrix of type '<class 'numpy.int64'>'
 	with 2108383 stored elements in Compressed Sparse Row format>)}

### We can also load the corpus with vectors fully loaded

In [44]:
corpus = Corpus(filename='convokit/classifier/demo/cornell-with-bow', preload_vectors=['bow'])

In [45]:
corpus._vector_matrices # private variable storing matrices

{'bow': ConvoKitMatrix('name': bow, 'matrix': <74467x9340 sparse matrix of type '<class 'numpy.int64'>'
 	with 2108383 stored elements in Compressed Sparse Row format>)}

In [46]:
corpus.get_vector_matrix('bow')

ConvoKitMatrix('name': bow, 'matrix': <74467x9340 sparse matrix of type '<class 'numpy.int64'>'
	with 2108383 stored elements in Compressed Sparse Row format>)

### Training a VectorClassifier

Setting up a basic prediction task where we predict whether an Utterance (corresponding to a Reddit comment) has a score above 0 based on its bag-of-words vector.

In [47]:
bow_classifier = VectorClassifier(obj_type="utterance", labeller=lambda utt: utt.meta['score'] > 0)

Initializing default classification model (standard scaled logistic regression)


In [None]:
bow_classifier.fit(corpus, vector_name='bow')

In [None]:
bow_classifier.transform(corpus, vector_name='bow')

In [None]:
bow_classifier.summarize(corpus).head()

In [None]:
bow_classifier.get_coefs(feature_names=bow_transformer.get_vocabulary()).head()

In [None]:
bow_classifier.get_coefs(feature_names=bow_transformer.get_vocabulary()).tail()

In [None]:
y_true, y_pred = bow_classifier.get_y_true_pred(corpus)

In [None]:
bow_classifier.base_accuracy(corpus)

In [None]:
# Note: `corpus` is the same corpus the classifier was fit on above, so this is
# accuracy on the training data, not on held-out data.
bow_classifier.accuracy(corpus)

In [None]:
print(bow_classifier.classification_report(corpus))

## A bag-of-words prediction task at the Conversation-level

For each comment thread, we use the bag-of-words of its first 5 utterances to predict whether the thread will eventually at least double in length (reach 10 or more utterances) or stay at exactly 5 utterances.

In [None]:
# Collect the ids of top-level comments: utterances whose own id equals the
# 'top_level_comment' value recorded in their metadata.
top_level_comment_ids = []
for utt in corpus.iter_utterances():
    if utt.id == utt.meta['top_level_comment']:
        top_level_comment_ids.append(utt.id)

In [None]:
corpus.print_summary_stats()

In [None]:
len(top_level_comment_ids)

In [None]:
threads_corpus = corpus.reindex_conversations(new_convo_roots=top_level_comment_ids)

In [None]:
threads_corpus.print_summary_stats()

In [None]:
# Label every thread for the prediction task from its total utterance count:
#   exactly 5 utterances -> False (the thread stayed the same length),
#   10 or more           -> True  (the thread at least doubled),
#   anything else        -> None  (skipped later by the `is not None` selectors).
for thread in threads_corpus.iter_conversations():
    n_utts = sum(1 for _ in thread.iter_utterances())
    if n_utts >= 10:
        doubles = True
    elif n_utts == 5:
        doubles = False
    else:
        doubles = None
    thread.meta['thread_doubles'] = doubles

In [None]:
from collections import Counter

In [None]:
# Count of negative / positive examples
Counter(threads_corpus.get_conversations_dataframe()['meta.thread_doubles'])

In [None]:
# Build each thread's text from only its first 5 utterances (in chronological
# order), matching the vector name and the stated prediction task. Without an
# explicit text_func nothing restricts the text to those 5 utterances, so text
# from the later replies that determine `thread_doubles` could leak into the
# features.
bow_transformer2 = BoWTransformer(
    obj_type="conversation",
    vector_name='first_5_BoW',
    text_func=lambda convo: " ".join(
        utt.text for utt in convo.get_chronological_utterance_list()[:5]
    ),
)

In [None]:
bow_transformer2.fit_transform(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

In [None]:
threads_corpus.vectors

In [None]:
bow_classifier2 = VectorClassifier(obj_type="conversation", 
                    labeller=lambda convo: convo.meta['thread_doubles'])

In [None]:
bow_classifier2.fit_transform(threads_corpus, vector_name='first_5_BoW',
                              selector=lambda convo: convo.meta['thread_doubles'] is not None)

In [None]:
summary = bow_classifier2.summarize(threads_corpus, 
                                    selector=lambda convo: convo.meta['thread_doubles'] is not None)

In [None]:
summary.head()

In [None]:
summary.tail()

In [None]:
bow_classifier2.base_accuracy(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

In [None]:
bow_classifier2.accuracy(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

In [None]:
print(bow_classifier2.classification_report(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None))

In [None]:
bow_classifier2.get_coefs(feature_names=bow_transformer2.get_vocabulary()).head(10)

In [None]:
bow_classifier2.get_coefs(feature_names=bow_transformer2.get_vocabulary()).tail(10)

In [None]:
bow_classifier2.confusion_matrix(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

In [None]:
bow_classifier2.evaluate_with_cv(threads_corpus, vector_name='first_5_BoW', selector=lambda convo: convo.meta['thread_doubles'] is not None)

In [None]:
bow_classifier2.evaluate_with_train_test_split(threads_corpus, vector_name='first_5_BoW',
                                               selector=lambda convo: convo.meta['thread_doubles'] is not None)

In [None]:
objs = list(threads_corpus.iter_conversations(selector=lambda convo: convo.meta['thread_doubles'] is not None))

Since vectors are tied to Corpus, you cannot apply a VectorClassifier to an arbitrary set of objects.

In [None]:
# NOTE(review): the preceding text says a VectorClassifier cannot be applied to an
# arbitrary list of objects, yet this cell calls transform_objs and overwrites `objs`
# with its return value — confirm what is returned here.
objs = bow_classifier2.transform_objs(objs)

In [None]:
# bow_classifier2.summarize_objs(objs)

# Here be dragons (i.e. this is not updated yet)

## Paired bag-of-words prediction for comment thread doubling in length versus staying the same length based on first 5 utterances

In [None]:
from convokit import Pairer, PairedBoW

In [None]:
corpus = Corpus(filename=download('subreddit-Cornell'))

In [None]:
# Reuses `top_level_comment_ids` computed from the corpus in the previous section.
# Unlike the earlier reindex call, preserve_convo_meta=True is passed here.
threads_corpus = corpus.reindex_conversations(new_convo_roots=top_level_comment_ids, preserve_convo_meta=True)

In [None]:
next(threads_corpus.iter_conversations())

In [None]:
# Label every thread from its total utterance count: False when it has exactly
# 5 utterances, True when it has 10 or more, None otherwise.
for thread in threads_corpus.iter_conversations():
    thread_len = len(list(thread.iter_utterances()))
    thread.meta['thread_doubles'] = (
        False if thread_len == 5
        else True if thread_len >= 10
        else None
    )

In [None]:
next(threads_corpus.iter_conversations()).meta

In [None]:
# Configure how threads are labelled and matched for the paired task:
#   - pos_label_func: returns meta['thread_doubles'] (truthy only for True);
#   - neg_label_func: true only when meta['thread_doubles'] equals False, so
#     threads labelled None satisfy neither function;
#   - pairing_func: returns meta['original_convo_id'], presumably the key on
#     which positive and negative threads are matched — TODO confirm where
#     this metadata field is written.
pairer = Pairer(obj_type="conversation", 
                pos_label_func=lambda convo: convo.meta['thread_doubles'], 
                neg_label_func=lambda convo: convo.meta['thread_doubles'] == False,
                pairing_func=lambda convo: convo.meta['original_convo_id']
               )

In [None]:
pairer.transform(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

In [None]:
print(next(threads_corpus.iter_conversations()))

In [None]:
# Print the first conversation whose 'pair_id' metadata is not None, if any.
first_paired = next(
    (c for c in threads_corpus.iter_conversations() if c.meta['pair_id'] is not None),
    None,
)
if first_paired is not None:
    print(first_paired)

In [None]:
from convokit import BoWTransformer

In [None]:
# Conversation-level bag-of-words: each conversation's text is the
# space-joined text of its first 5 utterances in chronological order.
bow_transformer = BoWTransformer(
    obj_type="conversation",
    text_func=lambda convo: " ".join(
        utt.text for utt in convo.get_chronological_utterance_list()[:5]
    ),
)

In [None]:
bow_transformer.fit_transform(threads_corpus, selector=lambda convo: convo.meta['pair_id'] is not None)

In [None]:
paired_bow = PairedBoW(obj_type="conversation")

In [None]:
paired_bow.fit(threads_corpus)

In [None]:
paired_bow.summarize(threads_corpus)

In [None]:
paired_bow.get_coefs(feature_names=bow_transformer.get_vocabulary())