In [1]:
import os
os.chdir('../../..')

This notebook demonstrates the use of vectors in ConvoKit, as well as the use of the bag-of-words transformer, BoWTransformer, and the VectorClassifier.

In [2]:
import convokit

In [3]:
from convokit import Corpus, download

In [4]:
corpus = Corpus(download('subreddit-Cornell'))

Dataset already exists at /Users/calebchiam/.convokit/downloads/subreddit-Cornell


In [5]:
corpus.print_summary_stats()

Number of Speakers: 7568
Number of Utterances: 74467
Number of Conversations: 10744


## Using a Transformer to add a vector

In [6]:
from convokit import VectorClassifier, BoWTransformer

In [7]:
random_utt = corpus.random_utterance()
random_utt

Utterance({'obj_type': 'utterance', '_owner': <convokit.model.corpus.Corpus object at 0x12cee4590>, 'meta': {'score': 6, 'top_level_comment': 'dn1wqtd', 'retrieved_on': 1506808451, 'gilded': 0, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '', 'author_flair_text': ''}, '_id': 'dn1wqtd', 'vectors': [], 'speaker': Speaker({'obj_type': 'speaker', '_owner': <convokit.model.corpus.Corpus object at 0x12cee4590>, 'meta': {'num_posts': 0, 'num_comments': 25}, '_id': 'skiptheroutine', 'vectors': []}), 'conversation_id': '70b8pd', '_root': '70b8pd', 'reply_to': '70b8pd', 'timestamp': 1505499162, 'text': "The Cornell IT site has a list of services for students that are free. You can get Microsoft Office for free, as well as access to Lynda.com and Box.com (on that note, don't ever save your files to just your hard drive, make sure everything is in Google Drive / Box). I would also recommend you set up Net Print and test it out preemptively. Once set up you can print to

In [8]:
random_utt.vectors

[]

In [9]:
corpus.vectors

[]

In [10]:
bow_transformer = BoWTransformer(obj_type="utterance", vector_name='bow')
bow_transformer.fit_transform(corpus)

Initializing default unigram CountVectorizer...Done.


<convokit.model.corpus.Corpus at 0x12cee4590>

In [11]:
random_utt

Utterance({'obj_type': 'utterance', '_owner': <convokit.model.corpus.Corpus object at 0x12cee4590>, 'meta': {'score': 6, 'top_level_comment': 'dn1wqtd', 'retrieved_on': 1506808451, 'gilded': 0, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '', 'author_flair_text': ''}, '_id': 'dn1wqtd', 'vectors': ['bow'], 'speaker': Speaker({'obj_type': 'speaker', '_owner': <convokit.model.corpus.Corpus object at 0x12cee4590>, 'meta': {'num_posts': 0, 'num_comments': 25}, '_id': 'skiptheroutine', 'vectors': []}), 'conversation_id': '70b8pd', '_root': '70b8pd', 'reply_to': '70b8pd', 'timestamp': 1505499162, 'text': "The Cornell IT site has a list of services for students that are free. You can get Microsoft Office for free, as well as access to Lynda.com and Box.com (on that note, don't ever save your files to just your hard drive, make sure everything is in Google Drive / Box). I would also recommend you set up Net Print and test it out preemptively. Once set up you can pri

In [12]:
random_utt.vectors

['bow']

### Fetching the vector for the utterance

In [13]:
random_utt.get_vector('bow')

<1x9340 sparse matrix of type '<class 'numpy.int64'>'
	with 106 stored elements in Compressed Sparse Row format>

In [14]:
random_utt.get_vector('bow', as_dataframe=True)

Unnamed: 0,00,000,00am,00pm,01,02,03,04,05,06,...,youtu,youtube,yr,yrs,yup,zero,zeus,zimride,zip,zone
dn1wqtd,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
random_utt.get_vector('bow', as_dataframe=True, columns=['youtu', 'youtube', 'yr'])

Unnamed: 0,youtu,youtube,yr
dn1wqtd,0,0,0


In [16]:
random_utt.get_vector('bow', as_dataframe=False, columns=['youtu', 'youtube', 'yr'])

array([[0, 0, 0]])

### What does this look like at the Corpus level?

In [17]:
corpus.vectors

['bow']

In [18]:
corpus.get_vector_matrix('bow')

ConvoKitMatrix('name': bow, 'matrix': <74467x9340 sparse matrix of type '<class 'numpy.int64'>'
	with 2108383 stored elements in Compressed Sparse Row format>)

In [19]:
bow_matrix = corpus.get_vector_matrix('bow')

In [20]:
bow_matrix.to_dataframe().head()

Unnamed: 0,00,000,00am,00pm,01,02,03,04,05,06,...,youtu,youtube,yr,yrs,yup,zero,zeus,zimride,zip,zone
nyx4d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
o0145,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
o1gca,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
o0ss4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
o31u0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
bow_matrix.columns[:10]

['00', '000', '00am', '00pm', '01', '02', '03', '04', '05', '06']

In [22]:
bow_matrix.ids[:10]

['nyx4d',
 'o0145',
 'o1gca',
 'o0ss4',
 'o31u0',
 'o4ipd',
 'o456r',
 'o4544',
 'o3l7i',
 'o3fqm']

In [23]:
bow_matrix.name

'bow'

In [24]:
bow_matrix.matrix

<74467x9340 sparse matrix of type '<class 'numpy.int64'>'
	with 2108383 stored elements in Compressed Sparse Row format>

## Dumping and loading

### Dumps all vectors by default

In [25]:
# dumps all vectors by default
corpus.dump('cornell-with-bow', base_path='convokit/classifier/demo')

In [26]:
os.listdir('convokit/classifier/demo/cornell-with-bow')

['utterances.jsonl',
 'conversations.json',
 'vectors.bow.p',
 'corpus.json',
 'speakers.json',
 'index.json']

### But vectors can be excluded

In [27]:
corpus.dump('cornell-no-bow', base_path='convokit/classifier/demo', exclude_vectors=['bow'])

In [28]:
os.listdir('convokit/classifier/demo/cornell-no-bow')

['utterances.jsonl',
 'conversations.json',
 'corpus.json',
 'speakers.json',
 'index.json']

Let's check if they really are excluded:

In [29]:
corpus = Corpus(filename='convokit/classifier/demo/cornell-no-bow')

In [30]:
corpus.vectors

[]

In [31]:
corpus.random_utterance().vectors

[]

### When the corpus is loaded, vectors are present 'structurally' but not actually loaded

In [32]:
corpus = Corpus(filename='convokit/classifier/demo/cornell-with-bow')

In [33]:
corpus.vectors

['bow']

In [34]:
corpus.random_utterance()

Utterance({'obj_type': 'utterance', '_owner': <convokit.model.corpus.Corpus object at 0x28ebf1a50>, 'meta': {'score': 11, 'top_level_comment': 'e8m5m8w', 'retrieved_on': 1541563337, 'gilded': 0, 'gildings': {'gid_1': 0, 'gid_2': 0, 'gid_3': 0}, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '/r/Cornell/comments/9s0u0e/the_wrong_costume/e8m5m8w/', 'author_flair_text': "IS &amp; CS '20"}, '_id': 'e8m5m8w', 'vectors': ['bow'], 'speaker': Speaker({'obj_type': 'speaker', '_owner': <convokit.model.corpus.Corpus object at 0x28ebf1a50>, 'meta': {'num_posts': 30, 'num_comments': 296}, '_id': 'Straight_Derpin', 'vectors': []}), 'conversation_id': '9s0u0e', '_root': '9s0u0e', 'reply_to': '9s0u0e', 'timestamp': 1540750277, 'text': "Honestly it sucks that SS uniforms are so cool looking. If it weren't for the whole...Nazis being Nazis thing, it'd make a great costume. Fuck that guy tho, such a neckbeard thing to do"})

In [35]:
corpus.random_utterance().vectors

['bow']

In [36]:
corpus._vector_matrices # private variable storing vector matrices

{}

In [37]:
# when fetched normally
corpus.get_vector_matrix('bow')

ConvoKitMatrix('name': bow, 'matrix': <74467x9340 sparse matrix of type '<class 'numpy.int64'>'
	with 2108383 stored elements in Compressed Sparse Row format>)

In [38]:
corpus._vector_matrices

{'bow': ConvoKitMatrix('name': bow, 'matrix': <74467x9340 sparse matrix of type '<class 'numpy.int64'>'
 	with 2108383 stored elements in Compressed Sparse Row format>)}

### We can also load the corpus with vectors fully loaded

In [39]:
corpus = Corpus(filename='convokit/classifier/demo/cornell-with-bow', vectors=['bow'])

In [40]:
corpus._vector_matrices # private variable storing matrices

{'bow': ConvoKitMatrix('name': bow, 'matrix': <74467x9340 sparse matrix of type '<class 'numpy.int64'>'
 	with 2108383 stored elements in Compressed Sparse Row format>)}

In [41]:
corpus.get_vector_matrix('bow')

ConvoKitMatrix('name': bow, 'matrix': <74467x9340 sparse matrix of type '<class 'numpy.int64'>'
	with 2108383 stored elements in Compressed Sparse Row format>)

### Training a VectorClassifier

Setting up a basic prediction task where we predict whether an Utterance (corresponding to a Reddit comment) has a score above 0 based on its bag-of-words vector.

In [42]:
bow_classifier = VectorClassifier(obj_type="utterance", labeller=lambda utt: utt.meta['score'] > 0)

Initializing default classification model (standard scaled logistic regression)


In [43]:
bow_classifier.fit(corpus, vector_name='bow')

<convokit.classifier.vector_classifier.VectorClassifier at 0x28f2fe6d0>

In [44]:
bow_classifier.transform(corpus, vector_name='bow')

<convokit.model.corpus.Corpus at 0x13a8c7510>

In [45]:
bow_classifier.summarize(corpus).head()

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
dhhm9sa,True,1.0
dw553ml,True,1.0
dvzmhdx,True,1.0
dvzpp79,True,1.0
dw0imao,True,1.0


In [46]:
bow_classifier.get_coefs(feature_names=bow_transformer.get_vocabulary()).head()

Unnamed: 0_level_0,coef
feat_name,Unnamed: 1_level_1
hotels,1.270001
hbhs,1.11569
engine,1.109702
involves,1.081836
lincoln,1.071464


In [47]:
bow_classifier.get_coefs(feature_names=bow_transformer.get_vocabulary()).tail()

Unnamed: 0_level_0,coef
feat_name,Unnamed: 1_level_1
mahogany,-0.667785
ignoreme,-0.722992
hilton,-0.742234
binary,-0.764383
creation,-0.784593


In [48]:
y_true, y_pred = bow_classifier.get_y_true_pred(corpus)

In [49]:
bow_classifier.base_accuracy(corpus)

0.9279546644822538

In [50]:
bow_classifier.accuracy(corpus)

0.9491452589737737

In [51]:
print(bow_classifier.classification_report(corpus))

              precision    recall  f1-score   support

       False       0.88      0.34      0.49      5365
        True       0.95      1.00      0.97     69102

    accuracy                           0.95     74467
   macro avg       0.91      0.67      0.73     74467
weighted avg       0.95      0.95      0.94     74467



## A bag-of-words prediction task at the Conversation-level

For each conversation, based on the bag-of-words of the first 5 utterances, we ask the question of whether the comment thread will eventually double in length or stay the same length. 

In [52]:
top_level_comment_ids = [utt.id for utt in corpus.iter_utterances() if utt.id == utt.meta['top_level_comment']]

In [53]:
corpus.print_summary_stats()

Number of Speakers: 7568
Number of Utterances: 74467
Number of Conversations: 10744


In [54]:
len(top_level_comment_ids)

32893

In [55]:
threads_corpus = corpus.reindex_conversations(new_convo_roots=top_level_comment_ids)


['c3p1rn8', 'c3oyf4d', 'c3p8bze', 'c3ocsyl', 'c3od15i']


In [56]:
threads_corpus.print_summary_stats()

Number of Speakers: 6160
Number of Utterances: 63697
Number of Conversations: 32888


In [57]:
for thread in threads_corpus.iter_conversations():
    thread_len = len(list(thread.iter_utterances()))
    if thread_len == 5:
        thread.meta['thread_doubles'] = False
    elif thread_len >= 10:
        thread.meta['thread_doubles'] = True
    else:
        thread.meta['thread_doubles'] = None

In [58]:
from collections import Counter

In [59]:
# Count of negative / positive examples
Counter(threads_corpus.get_conversations_dataframe()['meta.thread_doubles'])

Counter({None: 31628, False: 852, True: 408})

In [60]:
bow_transformer2 = BoWTransformer(obj_type="conversation", vector_name='first_5_BoW')

Initializing default unigram CountVectorizer...Done.


In [61]:
bow_transformer2.fit_transform(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

<convokit.model.corpus.Corpus at 0x28f6557d0>

In [62]:
threads_corpus.vectors

['first_5_BoW']

In [63]:
bow_classifier2 = VectorClassifier(obj_type="conversation", 
                    labeller=lambda convo: convo.meta['thread_doubles'])

Initializing default classification model (standard scaled logistic regression)


In [64]:
bow_classifier2.fit_transform(threads_corpus, vector_name='first_5_BoW',
                              selector=lambda convo: convo.meta['thread_doubles'] is not None)

<convokit.model.corpus.Corpus at 0x28f6557d0>

In [65]:
summary = bow_classifier2.summarize(threads_corpus, 
                                    selector=lambda convo: convo.meta['thread_doubles'] is not None)

In [66]:
summary.head()

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
e5626fc,True,1.0
dnqc6mc,True,1.0
cx87pi5,True,1.0
e8p3t2v,True,1.0
dxw7g0r,True,1.0


In [67]:
summary.tail()

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
e0iez9l,False,4.210991e-08
cyeq0e8,False,2.785094e-08
dmtcex3,False,2.70029e-08
ck1dyvi,False,1.802432e-08
e6m7j9z,False,2.434951e-10


In [68]:
bow_classifier2.base_accuracy(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

0.6761904761904762

In [69]:
bow_classifier2.accuracy(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

1.0

In [70]:
print(bow_classifier2.classification_report(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00       852
        True       1.00      1.00      1.00       408

    accuracy                           1.00      1260
   macro avg       1.00      1.00      1.00      1260
weighted avg       1.00      1.00      1.00      1260



In [71]:
bow_classifier2.get_coefs(feature_names=bow_transformer2.get_vocabulary()).head(10)

Unnamed: 0_level_0,coef
feat_name,Unnamed: 1_level_1
removed,0.681566
welcome,0.620911
word,0.429693
hour,0.375569
brought,0.359174
profile,0.351579
http,0.351215
head,0.32328
www,0.310239
comp,0.300922


In [72]:
bow_classifier2.get_coefs(feature_names=bow_transformer2.get_vocabulary()).tail(10)

Unnamed: 0_level_0,coef
feat_name,Unnamed: 1_level_1
tried,-0.265996
desk,-0.269321
internet,-0.270021
long,-0.278097
dean,-0.278983
23,-0.282168
extra,-0.316261
hill,-0.317753
goes,-0.361453
thanks,-0.362681


In [73]:
bow_classifier2.confusion_matrix(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

array([[852,   0],
       [  0, 408]])

In [74]:
bow_classifier2.evaluate_with_cv(threads_corpus, vector_name='first_5_BoW', selector=lambda convo: convo.meta['thread_doubles'] is not None)

Running a cross-validated evaluation...
Done.


array([0.6468254 , 0.6984127 , 0.69047619, 0.6984127 , 0.67063492])

In [75]:
bow_classifier2.evaluate_with_train_test_split(threads_corpus, vector_name='first_5_BoW',
                                               selector=lambda convo: convo.meta['thread_doubles'] is not None)

Running a train-test-split evaluation...
Done.


(0.6904761904761905, array([[140,  24],
        [ 54,  34]]))

In [76]:
objs = list(threads_corpus.iter_conversations(selector=lambda convo: convo.meta['thread_doubles'] is not None))

Since vectors are tied to Corpus, you cannot apply a VectorClassifier to an arbitrary set of objects.

In [77]:
objs = bow_classifier2.transform_objs(objs)

NotImplementedError: transform_objs() is not supported for VectorClassifier

In [None]:
# bow_classifier2.summarize_objs(objs)

# Here be dragons (i.e. this is not updated yet)

## Paired bag-of-words prediction for comment thread doubling in length versus staying the same length based on first 5 utterances

In [None]:
from convokit import Pairer, PairedBoW

In [None]:
corpus = Corpus(filename=download('subreddit-Cornell'))

In [None]:
threads_corpus = corpus.reindex_conversations(new_convo_roots=top_level_comment_ids, preserve_convo_meta=True)

In [None]:
next(threads_corpus.iter_conversations())

In [None]:
for thread in threads_corpus.iter_conversations():
    thread_len = len(list(thread.iter_utterances()))
    if thread_len == 5:
        thread.meta['thread_doubles'] = False
    elif thread_len >= 10:
        thread.meta['thread_doubles'] = True
    else:
        thread.meta['thread_doubles'] = None

In [None]:
next(threads_corpus.iter_conversations()).meta

In [None]:
pairer = Pairer(obj_type="conversation", 
                pos_label_func=lambda convo: convo.meta['thread_doubles'], 
                neg_label_func=lambda convo: convo.meta['thread_doubles'] == False,
                pairing_func=lambda convo: convo.meta['original_convo_id']
               )

In [None]:
pairer.transform(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

In [None]:
print(next(threads_corpus.iter_conversations()))

In [None]:
for convo in threads_corpus.iter_conversations():
    if convo.meta['pair_id'] is not None:
        print(convo)
        break

In [None]:
from convokit import BoWTransformer

In [None]:
bow_transformer = BoWTransformer(obj_type="conversation", 
                                 text_func=lambda convo: " ".join([utt.text for utt in convo.get_chronological_utterance_list()][:5]),
                    )

In [None]:
bow_transformer.fit_transform(threads_corpus, selector=lambda convo: convo.meta['pair_id'] is not None)

In [None]:
paired_bow = PairedBoW(obj_type="conversation")

In [None]:
paired_bow.fit(threads_corpus)

In [None]:
paired_bow.summarize(threads_corpus)

In [None]:
paired_bow.get_coefs(feature_names=bow_transformer.get_vocabulary())