In [1]:
import os
# Move the working directory three levels up from the current working directory.
# NOTE(review): presumably done so the local convokit checkout is importable — confirm.
# The path is relative, so re-running this cell climbs another three levels.
os.chdir('../../..')

In [2]:
import convokit



In [3]:
from convokit import Corpus, download

In [4]:
# Fetch the dataset (download() reports when a local copy already exists), then load it.
corpus_path = download('subreddit-Cornell')
corpus = Corpus(filename=corpus_path)

Dataset already exists at /Users/calebchiam/.convokit/downloads/subreddit-Cornell


In [5]:
# Print corpus-level counts: users, utterances and conversations.
corpus.print_summary_stats()

Number of Users: 7568
Number of Utterances: 74467
Number of Conversations: 10744


## Some new Conversation functionality

In [6]:
# Look up a single conversation by its id; it is used in the examples that follow.
convo = corpus.get_conversation('o31u0')

In [7]:
# With no argument, each utterance in the reply tree is shown as its user's name,
# indented one level per reply depth.
convo.print_conversation_structure()

cchambo
    jklol
    djnap
    Brimwoodboy
        jklol


In [8]:
# Passing a function of the utterance controls what is printed per node;
# here the utterance id is shown instead of the user's name.
convo.print_conversation_structure(lambda utt: utt.id)

o31u0
    c3dzmtu
    c3e0ou0
    c3f7l5b
        c3feqc4


In [9]:
# Flat list of the conversation's Utterance objects; the timestamps in the output are ascending.
convo.get_chronological_utterance_list()

[Utterance({'id': 'o31u0', 'user': User([('name', 'cchambo')]), 'root': 'o31u0', 'reply_to': None, 'timestamp': 1325714498, 'text': '', 'meta': {'score': 27, 'top_level_comment': None, 'retrieved_on': -1, 'gilded': -1, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '/r/Cornell/comments/o31u0/cornell_scientists_create_hole_in_time_where/', 'author_flair_text': 'SNES 2015'}}),
 Utterance({'id': 'c3dzmtu', 'user': User([('name', 'jklol')]), 'root': 'o31u0', 'reply_to': 'o31u0', 'timestamp': 1325721216, 'text': 'wooo! aep!', 'meta': {'score': 3, 'top_level_comment': 'c3dzmtu', 'retrieved_on': 1428124647, 'gilded': 0, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '', 'author_flair_text': 'AEP 2011'}}),
 Utterance({'id': 'c3e0ou0', 'user': User([('name', 'djnap')]), 'root': 'o31u0', 'reply_to': 'o31u0', 'timestamp': 1325727478, 'text': "it's always nice to see Cornell in the news", 'meta': {'score': 2, 'top_level_comment': 'c3e0ou0', 're

In [10]:
# Same chronological listing, reduced to the id of each utterance's user.
chronological_utts = convo.get_chronological_utterance_list()
[utterance.user.id for utterance in chronological_utts]

['cchambo', 'jklol', 'djnap', 'Brimwoodboy', 'jklol']

In [11]:
# Returns a list of paths; each path is a list of Utterance objects starting at the root post.
convo.get_root_to_leaf_paths()

[[Utterance({'id': 'o31u0', 'user': User([('name', 'cchambo')]), 'root': 'o31u0', 'reply_to': None, 'timestamp': 1325714498, 'text': '', 'meta': {'score': 27, 'top_level_comment': None, 'retrieved_on': -1, 'gilded': -1, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '/r/Cornell/comments/o31u0/cornell_scientists_create_hole_in_time_where/', 'author_flair_text': 'SNES 2015'}}),
  Utterance({'id': 'c3e0ou0', 'user': User([('name', 'djnap')]), 'root': 'o31u0', 'reply_to': 'o31u0', 'timestamp': 1325727478, 'text': "it's always nice to see Cornell in the news", 'meta': {'score': 2, 'top_level_comment': 'c3e0ou0', 'retrieved_on': 1428125150, 'gilded': 0, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '', 'author_flair_text': 'Engineering 2015'}})],
 [Utterance({'id': 'o31u0', 'user': User([('name', 'cchambo')]), 'root': 'o31u0', 'reply_to': None, 'timestamp': 1325714498, 'text': '', 'meta': {'score': 27, 'top_level_comment': None, 'retriev

In [12]:
# Show each root-to-leaf path as the sequence of user ids along it.
for root_to_leaf in convo.get_root_to_leaf_paths():
    speaker_ids = [utterance.user.id for utterance in root_to_leaf]
    print(speaker_ids)

['cchambo', 'djnap']
['cchambo', 'Brimwoodboy', 'jklol']
['cchambo', 'jklol']


## Cumulative BoW

In [13]:
from convokit import Forecaster

Let's set up a forecasting task to predict whether a Reddit comment will have a positive score, i.e. more upvotes than downvotes.

In [14]:
# Attach the binary target to every utterance: 1 if its score is strictly positive, else 0.
for utterance in corpus.iter_utterances():
    has_positive_score = utterance.meta['score'] > 0
    utterance.add_meta('pos_score', int(has_positive_score))

In [15]:
# No model is passed, so Forecaster initializes its default (cumulative bag-of-words, per the log output).
# label_feat is set to the 'pos_score' metadata field added above, i.e. the target to predict.
# NOTE(review): skip_broken_convos presumably skips conversations with a broken reply structure — confirm.
forecaster = Forecaster(label_feat='pos_score', skip_broken_convos=True)

No model passed to Forecaster. Initializing default forecaster model: Cumulative Bag-of-words...
Initializing default unigram CountVectorizer...
Initializing default classification model (standard scaled logistic regression)


In [16]:
# Fit the forecaster's model on the corpus.
forecaster.fit(corpus)

Fitting cumulative BoW classification model...
Done.




In [17]:
# Run the fitted forecaster over the corpus; the call returns a Corpus (its repr is the cell output).
# NOTE(review): presumably this is the step that writes the 'forecast' / 'forecast_prob'
# utterance metadata read in later cells — confirm.
forecaster.transform(corpus)

<convokit.model.corpus.Corpus at 0x104ee0a58>

In [18]:
# Collect the per-utterance forecasts into a DataFrame
# (columns 'forecast' and 'forecast_prob', indexed by utt_id).
forecast_df = forecaster.summarize(corpus)

In [19]:
# (rows, columns): one row per utterance that received a forecast. This is fewer than the
# corpus's utterance count, so some utterances have no forecast.
forecast_df.shape

(63697, 2)

In [20]:
# Preview the first rows. NOTE(review): rows appear to be sorted by forecast_prob, highest first — confirm.
forecast_df.head()

Unnamed: 0_level_0,forecast,forecast_prob
utt_id,Unnamed: 1_level_1,Unnamed: 2_level_1
dqskvar,1.0,1.0
dqskx4k,1.0,1.0
dqteypw,1.0,1.0
dqskuoz,1.0,1.0
d0mte0j,1.0,1.0


In [21]:
# Last ten rows: the utterances with the lowest forecast_prob in the displayed ordering.
forecast_df.tail(10)

Unnamed: 0_level_0,forecast,forecast_prob
utt_id,Unnamed: 1_level_1,Unnamed: 2_level_1
dwkaek2,0.0,0.008834
dygvblg,0.0,0.007995
e8rjhfe,0.0,0.007228
e8se59n,0.0,0.00705
cfdzy7o,0.0,0.002103
dlpl133,0.0,0.001701
dbu9m5t,0.0,0.00165
cfdyxdt,0.0,0.001453
d54rl1r,0.0,6.6e-05
d54utw6,0.0,5e-06


Let's examine a Conversation that contains an utterance forecasted to have a non-positive score (forecast = 0).

In [37]:
# Inspect one utterance whose forecast is 0; its metadata now carries
# 'pos_score', 'forecast' and 'forecast_prob' alongside the original Reddit fields.
corpus.get_utterance('dpn8e4v')

Utterance({'id': 'dpn8e4v', 'user': User([('name', 'Trumpsamerican')]), 'root': '7a75x0', 'reply_to': 'dpm8anu', 'timestamp': 1510358978, 'text': "If you don't mind me asking, is your gpa over a 3.7? Because I heard that ILR's average GPA is a 3.5, and that's incredibly low given that History majors typically get 3.8+. I applied to ILR, btw.\n\nLastly, pertaining to the last portion of your statement, is it possible to take only HR classes, and things in that realm, and avoid history/law classes?", 'meta': {'score': 0, 'top_level_comment': 'dp95zls', 'retrieved_on': 1512515249, 'gilded': 0, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '/r/Cornell/comments/7a75x0/what_is_the_best_college_for_law_school/dpn8e4v/', 'author_flair_text': '', 'pos_score': 0, 'forecast': 0.0, 'forecast_prob': 0.3523093730147849}})

In [38]:
# .root holds the id of the conversation (root post) this utterance belongs to.
corpus.get_utterance('dpn8e4v').root

'7a75x0'

In [39]:
# Print the reply tree (by user name) of the conversation containing utterance 'dpn8e4v'.
target_utt = corpus.get_utterance('dpn8e4v')
target_convo = corpus.get_conversation(target_utt.root)
target_convo.print_conversation_structure()

Trumpsamerican
    IthacaisGorges_
    _vpl
    lyfehack
        Trumpsamerican
            byanilla
                Trumpsamerican
                    byanilla
                        Trumpsamerican
                            byanilla
                                Trumpsamerican
                                    byanilla
                                        Trumpsamerican
                                            byanilla
                    [deleted]
    mattezai


### Forecasted

In [40]:
# Same reply tree, labelling each utterance with its forecast (None where no forecast exists).
target_utt = corpus.get_utterance('dpn8e4v')
target_convo = corpus.get_conversation(target_utt.root)
target_convo.print_conversation_structure(lambda utt: str(utt.meta['forecast']))

None
    1.0
    1.0
    1.0
        1.0
            1.0
                0.0
                    1.0
                        0.0
                            1.0
                                1.0
                                    1.0
                                        1.0
                                            1.0
                    1.0
    1.0


### Actual

In [41]:
# Same reply tree, labelling each utterance with its actual label ('pos_score').
target_utt = corpus.get_utterance('dpn8e4v')
target_convo = corpus.get_conversation(target_utt.root)
target_convo.print_conversation_structure(lambda utt: str(utt.meta['pos_score']))

0
    1
    1
    1
        1
            1
                0
                    1
                        0
                            1
                                1
                                    1
                                        1
                                            1
                    1
    1


In [42]:
# Gather the forecast and the actual label of every utterance in one pass,
# so the two lists are aligned position by position.
forecasts, actual = [], []
for utt in corpus.iter_utterances():
    forecasts.append(utt.meta['forecast'])
    actual.append(utt.meta['pos_score'])

In [43]:
# Keep only utterances that received a forecast.
# Note the tuple order is (forecast, actual label), despite the variable's name.
y_true_pred = [
    (predicted, observed)
    for predicted, observed in zip(forecasts, actual)
    if predicted is not None
]

In [44]:
import numpy as np
from collections import Counter

In [45]:
# First element of each pair is the forecast.
y_pred = np.array([predicted for predicted, _observed in y_true_pred])

In [46]:
# Second element of each pair is the actual label.
y_true = np.array([observed for _predicted, observed in y_true_pred])

In [47]:
# Baseline accuracy: always predicting "positive" is right exactly as often as the
# label is 1, i.e. the mean of the 0/1 labels.
y_true.mean()

0.9413316168736361

In [48]:
# Achieved accuracy: fraction of utterances whose forecast matches the actual label.
(y_true == y_pred).mean()

0.9522112501373691

In [49]:
from sklearn.metrics import confusion_matrix

In [50]:
# Rows are actual labels (0, 1) and columns are predicted labels (0, 1).
confusion_matrix(y_true, y_pred)

array([[  863,  2874],
       [  170, 59790]])