In [1]:
import os
os.chdir('../../..')

This notebook demonstrates the use of vectors in ConvoKit, as well as the use of the bag-of-words transformer, BoWTransformer, and the VectorClassifier.

In [2]:
import convokit

In [3]:
from convokit import Corpus, download

In [4]:
corpus = Corpus(download('subreddit-Cornell'))

Dataset already exists at /Users/calebchiam/.convokit/downloads/subreddit-Cornell


In [5]:
corpus.print_summary_stats()

Number of Speakers: 7568
Number of Utterances: 74467
Number of Conversations: 10744


## Using a Transformer to add a vector

In [6]:
from convokit import VectorClassifier, BoWTransformer

In [7]:
random_utt = corpus.random_utterance()
random_utt

Utterance({'obj_type': 'utterance', 'meta': {'score': 1, 'top_level_comment': 'chi6twz', 'retrieved_on': 1433800228, 'gilded': 0, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '', 'author_flair_text': ''}, 'vectors': [], 'speaker': Speaker({'obj_type': 'speaker', 'meta': {}, 'vectors': [], 'owner': <convokit.model.corpus.Corpus object at 0x120f00890>, 'id': 'Adium'}), 'conversation_id': '25aqqf', 'reply_to': 'chi6twz', 'timestamp': 1400111772, 'text': 'Just make sure you activate it before you graduate.', 'owner': <convokit.model.corpus.Corpus object at 0x120f00890>, 'id': 'chi76a9'})

In [8]:
random_utt.vectors

[]

In [9]:
corpus.vectors

set()

In [10]:
bow_transformer = BoWTransformer(obj_type="utterance", vector_name='bow')
bow_transformer.fit_transform(corpus)

Initializing default unigram CountVectorizer...Done.


<convokit.model.corpus.Corpus at 0x120f00890>

In [11]:
random_utt

Utterance({'obj_type': 'utterance', 'meta': {'score': 1, 'top_level_comment': 'chi6twz', 'retrieved_on': 1433800228, 'gilded': 0, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '', 'author_flair_text': ''}, 'vectors': ['bow'], 'speaker': Speaker({'obj_type': 'speaker', 'meta': {}, 'vectors': [], 'owner': <convokit.model.corpus.Corpus object at 0x120f00890>, 'id': 'Adium'}), 'conversation_id': '25aqqf', 'reply_to': 'chi6twz', 'timestamp': 1400111772, 'text': 'Just make sure you activate it before you graduate.', 'owner': <convokit.model.corpus.Corpus object at 0x120f00890>, 'id': 'chi76a9'})

In [12]:
random_utt.vectors

['bow']

### Fetching the vector for the utterance

In [13]:
random_utt.get_vector('bow')

<1x9340 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [14]:
random_utt.get_vector('bow', as_dataframe=True)

Unnamed: 0,00,000,00am,00pm,01,02,03,04,05,06,...,youtu,youtube,yr,yrs,yup,zero,zeus,zimride,zip,zone
chi76a9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
random_utt.get_vector('bow', as_dataframe=True, columns=['youtu', 'youtube', 'yr'])

Unnamed: 0,youtu,youtube,yr
chi76a9,0,0,0


In [16]:
random_utt.get_vector('bow', as_dataframe=False, columns=['youtu', 'youtube', 'yr'])

<1x3 sparse matrix of type '<class 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Row format>

### What does this look like at the Corpus level?

In [17]:
corpus.vectors

{'bow'}

In [18]:
corpus.get_vector_matrix('bow')

ConvoKitMatrix('name': bow, 'matrix': <74467x9340 sparse matrix of type '<class 'numpy.int64'>'
	with 2108383 stored elements in Compressed Sparse Row format>)

In [19]:
corpus.get_vector_matrix('bow')._sparse

True

In [20]:
bow_matrix = corpus.get_vector_matrix('bow')

In [21]:
bow_matrix.to_dataframe().head()

Unnamed: 0,00,000,00am,00pm,01,02,03,04,05,06,...,youtu,youtube,yr,yrs,yup,zero,zeus,zimride,zip,zone
nyx4d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
o0145,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
o1gca,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
o0ss4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
o31u0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
bow_matrix.columns[:10]

['00', '000', '00am', '00pm', '01', '02', '03', '04', '05', '06']

In [23]:
bow_matrix.ids[:10]

['nyx4d',
 'o0145',
 'o1gca',
 'o0ss4',
 'o31u0',
 'o4ipd',
 'o456r',
 'o4544',
 'o3l7i',
 'o3fqm']

In [24]:
bow_matrix.name

'bow'

In [25]:
bow_matrix.matrix

<74467x9340 sparse matrix of type '<class 'numpy.int64'>'
	with 2108383 stored elements in Compressed Sparse Row format>

## Dumping and loading

### Dumps all vectors by default

In [26]:
corpus.meta['num_users'] = corpus.meta['num_user']

In [27]:
del corpus.meta['num_user']

In [28]:
corpus.meta_index.overall_index

{'subreddit': "<class 'str'>",
 'num_posts': "<class 'int'>",
 'num_comments': "<class 'int'>",
 'num_users': "<class 'int'>"}

In [29]:
# dumps all vectors by default
corpus.dump('cornell-with-bow', base_path='convokit/classifier/demo')

In [30]:
os.listdir('convokit/classifier/demo/cornell-with-bow')

['utterances.jsonl',
 'conversations.json',
 'vectors.bow.p',
 'corpus.json',
 'speakers.json',
 'index.json']

### But vectors can be excluded

In [31]:
corpus.dump('cornell-no-bow', base_path='convokit/classifier/demo', exclude_vectors=['bow'])

In [32]:
os.listdir('convokit/classifier/demo/cornell-no-bow')

['utterances.jsonl',
 'conversations.json',
 'corpus.json',
 'speakers.json',
 'index.json']

Let's check if they really are excluded:

In [33]:
corpus = Corpus(filename='convokit/classifier/demo/cornell-no-bow')

In [34]:
corpus.vectors

set()

In [35]:
corpus.random_utterance().vectors

[]

### When the corpus is loaded, vectors are present 'structurally' but not actually loaded

In [36]:
corpus = Corpus(filename='convokit/classifier/demo/cornell-with-bow')

In [37]:
corpus.vectors

{'bow'}

In [38]:
corpus.random_utterance()

Utterance({'obj_type': 'utterance', 'meta': {'score': 1, 'top_level_comment': 'e1mtoy5', 'retrieved_on': 1535637246, 'gilded': 0, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '/r/Cornell/comments/8vdvxb/confused_about_signing_up_for_classes/e1r3cuw/', 'author_flair_text': ''}, 'vectors': ['bow'], 'speaker': Speaker({'obj_type': 'speaker', 'meta': {}, 'vectors': [], 'owner': <convokit.model.corpus.Corpus object at 0x29afef950>, 'id': 'mbigred6'}), 'conversation_id': '8vdvxb', 'reply_to': 'e1mtoy5', 'timestamp': 1530665356, 'text': 'so helpful thank u!!', 'owner': <convokit.model.corpus.Corpus object at 0x29afef950>, 'id': 'e1r3cuw'})

In [39]:
corpus.random_utterance().vectors

['bow']

In [40]:
corpus.vectors

{'bow'}

In [41]:
# when fetched normally
corpus.get_vector_matrix('bow')

ConvoKitMatrix('name': bow, 'matrix': <74467x9340 sparse matrix of type '<class 'numpy.int64'>'
	with 2108383 stored elements in Compressed Sparse Row format>)

### We can also load the corpus with vectors fully loaded

In [43]:
corpus = Corpus(filename='convokit/classifier/demo/cornell-with-bow', preload_vectors=['bow'])

In [44]:
corpus.vectors

{'bow'}

In [45]:
corpus.get_vector_matrix('bow')

ConvoKitMatrix('name': bow, 'matrix': <74467x9340 sparse matrix of type '<class 'numpy.int64'>'
	with 2108383 stored elements in Compressed Sparse Row format>)

### Training a VectorClassifier

Setting up a basic prediction task where we predict whether an Utterance (corresponding to a Reddit comment) has a score above 0 based on its bag-of-words vector.

In [46]:
bow_classifier = VectorClassifier(obj_type="utterance", 
                                  vector_name='bow',
                                  labeller=lambda utt: utt.meta['score'] > 0)

Initialized default classification model (standard scaled logistic regression).


In [47]:
bow_classifier.fit(corpus)

<convokit.classifier.vector_classifier.VectorClassifier at 0x29b811a50>

In [48]:
bow_classifier.transform(corpus)

<convokit.model.corpus.Corpus at 0x13a42e2d0>

In [49]:
bow_classifier.summarize(corpus).head()

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
dhhm9sa,True,1.0
dw553ml,True,1.0
dvzmhdx,True,1.0
dvzpp79,True,1.0
dw0imao,True,1.0


In [50]:
bow_classifier.get_coefs(feature_names=bow_transformer.get_vocabulary()).head()

Unnamed: 0_level_0,coef
feat_name,Unnamed: 1_level_1
hotels,1.270001
hbhs,1.11569
engine,1.109702
involves,1.081836
lincoln,1.071464


In [51]:
bow_classifier.get_coefs(feature_names=bow_transformer.get_vocabulary()).tail()

Unnamed: 0_level_0,coef
feat_name,Unnamed: 1_level_1
mahogany,-0.667785
ignoreme,-0.722992
hilton,-0.742234
binary,-0.764383
creation,-0.784593


In [52]:
y_true, y_pred = bow_classifier.get_y_true_pred(corpus)

In [53]:
bow_classifier.base_accuracy(corpus)

0.9279546644822538

In [54]:
bow_classifier.accuracy(corpus)

0.9491452589737737

In [55]:
print(bow_classifier.classification_report(corpus))

              precision    recall  f1-score   support

       False       0.88      0.34      0.49      5365
        True       0.95      1.00      0.97     69102

    accuracy                           0.95     74467
   macro avg       0.91      0.67      0.73     74467
weighted avg       0.95      0.95      0.94     74467



## A bag-of-words prediction task at the Conversation-level

For each conversation, based on the bag-of-words of the first 5 utterances, we ask the question of whether the comment thread will eventually double in length or stay the same length. 

In [56]:
top_level_comment_ids = [utt.id for utt in corpus.iter_utterances() if utt.id == utt.meta['top_level_comment']]

In [57]:
corpus.print_summary_stats()

Number of Speakers: 7568
Number of Utterances: 74467
Number of Conversations: 10744


In [58]:
len(top_level_comment_ids)

32893

In [59]:
threads_corpus = corpus.reindex_conversations(new_convo_roots=top_level_comment_ids)


['c3ocsyl', 'c3oyf4d', 'c3p8bze', 'c3od15i', 'c3p1rn8']


In [60]:
threads_corpus.print_summary_stats()

Number of Speakers: 6160
Number of Utterances: 63697
Number of Conversations: 32888


In [61]:
for thread in threads_corpus.iter_conversations():
    thread_len = len(list(thread.iter_utterances()))
    if thread_len == 5:
        thread.meta['thread_doubles'] = False
    elif thread_len >= 10:
        thread.meta['thread_doubles'] = True
    else:
        thread.meta['thread_doubles'] = None

In [62]:
from collections import Counter

In [63]:
# Count of negative / positive examples
Counter(threads_corpus.get_conversations_dataframe()['meta.thread_doubles'])

Counter({None: 31628, True: 408, False: 852})

In [64]:
bow_transformer2 = BoWTransformer(obj_type="conversation", vector_name='first_5_BoW')

Initializing default unigram CountVectorizer...Done.


In [65]:
bow_transformer2.fit_transform(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

<convokit.model.corpus.Corpus at 0x296203f90>

In [66]:
threads_corpus.vectors

{'first_5_BoW'}

In [68]:
bow_classifier2 = VectorClassifier(obj_type="conversation", 
                                   vector_name='first_5_BoW',
                                   labeller=lambda convo: convo.meta['thread_doubles'])

Initialized default classification model (standard scaled logistic regression).


In [70]:
bow_classifier2.fit_transform(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

<convokit.model.corpus.Corpus at 0x296203f90>

In [71]:
summary = bow_classifier2.summarize(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

In [72]:
summary.head()

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
e8p3t2v,True,1.0
dxw7g0r,True,1.0
dnqc6mc,True,1.0
cx87pi5,True,1.0
e5626fc,True,1.0


In [73]:
summary.tail()

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
e0iez9l,False,4.207159e-08
cyeq0e8,False,2.786999e-08
dmtcex3,False,2.698015e-08
ck1dyvi,False,1.798191e-08
e6m7j9z,False,2.424837e-10


In [74]:
bow_classifier2.base_accuracy(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

0.6761904761904762

In [75]:
bow_classifier2.accuracy(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

1.0

In [76]:
print(bow_classifier2.classification_report(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00       852
        True       1.00      1.00      1.00       408

    accuracy                           1.00      1260
   macro avg       1.00      1.00      1.00      1260
weighted avg       1.00      1.00      1.00      1260



In [77]:
bow_classifier2.get_coefs(feature_names=bow_transformer2.get_vocabulary()).head(10)

Unnamed: 0_level_0,coef
feat_name,Unnamed: 1_level_1
removed,0.68155
welcome,0.620885
word,0.429735
hour,0.375601
brought,0.359192
profile,0.351574
http,0.351239
head,0.323378
www,0.310207
comp,0.300936


In [78]:
bow_classifier2.get_coefs(feature_names=bow_transformer2.get_vocabulary()).tail(10)

Unnamed: 0_level_0,coef
feat_name,Unnamed: 1_level_1
tried,-0.26594
desk,-0.269333
internet,-0.269927
long,-0.278174
dean,-0.279008
23,-0.282138
extra,-0.316226
hill,-0.317767
goes,-0.361459
thanks,-0.362673


In [79]:
bow_classifier2.confusion_matrix(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

array([[852,   0],
       [  0, 408]])

In [81]:
bow_classifier2.evaluate_with_cv(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

Running a cross-validated evaluation...Done.


array([0.69444444, 0.71031746, 0.70238095, 0.6547619 , 0.63492063])

In [82]:
bow_classifier2.evaluate_with_train_test_split(threads_corpus, 
                                               selector=lambda convo: convo.meta['thread_doubles'] is not None)

Running a train-test-split evaluation...
Done.


(0.6865079365079365, array([[141,  29],
        [ 50,  32]]))

# Here be dragons (i.e. this is not updated yet)

## Paired bag-of-words prediction for comment thread doubling in length versus staying the same length based on first 5 utterances

In [None]:
from convokit import Pairer, PairedVectorPrediction

In [None]:
corpus = Corpus(filename=download('subreddit-Cornell'))

In [None]:
threads_corpus = corpus.reindex_conversations(new_convo_roots=top_level_comment_ids, preserve_convo_meta=True)

In [None]:
next(threads_corpus.iter_conversations())

In [None]:
for thread in threads_corpus.iter_conversations():
    thread_len = len(list(thread.iter_utterances()))
    if thread_len == 5:
        thread.meta['thread_doubles'] = False
    elif thread_len >= 10:
        thread.meta['thread_doubles'] = True
    else:
        thread.meta['thread_doubles'] = None

In [None]:
next(threads_corpus.iter_conversations()).meta

In [None]:
pairer = Pairer(obj_type="conversation", 
                pos_label_func=lambda convo: convo.meta['thread_doubles'], 
                neg_label_func=lambda convo: convo.meta['thread_doubles'] == False,
                pairing_func=lambda convo: convo.meta['original_convo_id']
               )

In [None]:
pairer.transform(threads_corpus, selector=lambda convo: convo.meta['thread_doubles'] is not None)

In [None]:
print(next(threads_corpus.iter_conversations()))

In [None]:
for convo in threads_corpus.iter_conversations():
    if convo.meta['pair_id'] is not None:
        print(convo)
        break

In [None]:
from convokit import BoWTransformer

In [None]:
bow_transformer = BoWTransformer(obj_type="conversation", 
                                 text_func=lambda convo: " ".join([utt.text for utt in convo.get_chronological_utterance_list()][:5]),
                    )

In [None]:
bow_transformer.fit_transform(threads_corpus, selector=lambda convo: convo.meta['pair_id'] is not None)

In [None]:
paired_bow = PairedVectorPrediction(obj_type="conversation")

In [None]:
paired_bow.fit(threads_corpus)

In [None]:
paired_bow.summarize(threads_corpus)

In [None]:
paired_bow.get_coefs(feature_names=bow_transformer.get_vocabulary())