In [1]:
from convokit import Corpus, download

In [2]:
# Fetch the r/Cornell subreddit dataset and load it as a Corpus.
# Per the log below, the download step reports when a local copy already exists.
corpus_path = download('subreddit-Cornell')
corpus = Corpus(filename=corpus_path)

Dataset already exists at /Users/calebchiam/.convokit/downloads/subreddit-Cornell


In [3]:
# Print a corpus-level overview; the output below lists the number of
# users, utterances and conversations.
corpus.print_summary_stats()

Number of Users: 7568
Number of Utterances: 74467
Number of Conversations: 10744


## Some new Conversation functionality

In [4]:
# Look up a single conversation by id; `convo` is reused by the cells that follow.
convo = corpus.get_conversation('o31u0')

In [5]:
# Print the reply tree, one indented line per utterance.
# With no argument, the output below labels each utterance with its author's id.
convo.print_conversation_structure()

cchambo
    jklol
    djnap
    Brimwoodboy
        jklol


In [6]:
# Same tree, but each node's label comes from the supplied function -
# here the utterance id.
convo.print_conversation_structure(lambda utt: utt.id)

o31u0
    c3dzmtu
    c3e0ou0
    c3f7l5b
        c3feqc4


In [7]:
# List the conversation's utterances (chronological order, going by the method
# name and the timestamps in the output).
# NOTE(review): this displays full Utterance reprs, so the output is long.
convo.get_chronological_utterance_list()

[Utterance({'obj_type': 'utterance', '_owner': <convokit.model.corpus.Corpus object at 0x130afd7d0>, 'meta': {'score': 27, 'top_level_comment': None, 'retrieved_on': -1, 'gilded': -1, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '/r/Cornell/comments/o31u0/cornell_scientists_create_hole_in_time_where/', 'author_flair_text': 'SNES 2015'}, '_id': 'o31u0', 'user': Speaker({'obj_type': 'user', '_owner': <convokit.model.corpus.Corpus object at 0x130afd7d0>, 'meta': {'num_posts': 10, 'num_comments': 29}, '_id': 'cchambo', '_name': 'cchambo'}), 'root': 'o31u0', 'reply_to': None, 'timestamp': 1325714498, 'text': ''}),
 Utterance({'obj_type': 'utterance', '_owner': <convokit.model.corpus.Corpus object at 0x130afd7d0>, 'meta': {'score': 3, 'top_level_comment': 'c3dzmtu', 'retrieved_on': 1428124647, 'gilded': 0, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '', 'author_flair_text': 'AEP 2011'}, '_id': 'c3dzmtu', 'user': Speaker({'obj_type': 

In [8]:
# A compact view of the same list: just the speaker id of each utterance.
chronological_utts = convo.get_chronological_utterance_list()
[reply.user.id for reply in chronological_utts]

['cchambo', 'jklol', 'djnap', 'Brimwoodboy', 'jklol']

In [9]:
# Every root-to-leaf path through the reply tree; the output is a list of
# lists of Utterance objects (verbose reprs).
convo.get_root_to_leaf_paths()

[[Utterance({'obj_type': 'utterance', '_owner': <convokit.model.corpus.Corpus object at 0x130afd7d0>, 'meta': {'score': 27, 'top_level_comment': None, 'retrieved_on': -1, 'gilded': -1, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '/r/Cornell/comments/o31u0/cornell_scientists_create_hole_in_time_where/', 'author_flair_text': 'SNES 2015'}, '_id': 'o31u0', 'user': Speaker({'obj_type': 'user', '_owner': <convokit.model.corpus.Corpus object at 0x130afd7d0>, 'meta': {'num_posts': 10, 'num_comments': 29}, '_id': 'cchambo', '_name': 'cchambo'}), 'root': 'o31u0', 'reply_to': None, 'timestamp': 1325714498, 'text': ''}),
  Utterance({'obj_type': 'utterance', '_owner': <convokit.model.corpus.Corpus object at 0x130afd7d0>, 'meta': {'score': 3, 'top_level_comment': 'c3dzmtu', 'retrieved_on': 1428124647, 'gilded': 0, 'gildings': None, 'subreddit': 'Cornell', 'stickied': False, 'permalink': '', 'author_flair_text': 'AEP 2011'}, '_id': 'c3dzmtu', 'user': Speaker({'obj_type'

In [10]:
# One printed line per root-to-leaf path, showing the speaker id at each hop.
for branch in convo.get_root_to_leaf_paths():
    speaker_ids = [reply.user.id for reply in branch]
    print(speaker_ids)

['cchambo', 'jklol']
['cchambo', 'djnap']
['cchambo', 'Brimwoodboy', 'jklol']


## Cumulative BoW

In [11]:
from convokit import Forecaster

Let's set up a forecasting task to predict whether a Reddit comment will have a positive score, i.e. upvotes > downvotes.

In [12]:
# Attach the binary target ('y') the forecaster will be fitted to:
# 1 when the comment's score is strictly positive, 0 otherwise.
for utterance in corpus.iter_utterances():
    has_positive_score = utterance.meta['score'] > 0
    utterance.add_meta('pos_score', int(has_positive_score))

In [14]:
# No model is passed, so Forecaster falls back to its default
# (cumulative bag-of-words, per the log below).
# Labels are read from the 'pos_score' metadata field of each utterance.
forecaster = Forecaster(
    label_func=lambda utt: utt.meta['pos_score'],
    skip_broken_convos=True,
)

No model passed to Forecaster. Initializing default forecaster model: Cumulative Bag-of-words...
Initializing default unigram CountVectorizer...
Initializing default classification model (standard scaled logistic regression)


In [15]:
# Train the forecaster on the corpus; the log below shows the cumulative
# bag-of-words classification model being fitted.
forecaster.fit(corpus)

Fitting cumulative BoW classification model...
Done.




In [None]:
# Run the fitted forecaster over the corpus.
# Later cells read utt.meta['forecast'] - presumably written by this call; TODO confirm.
forecaster.transform(corpus)

In [None]:
# Gather the forecasts into `forecast_df` for inspection in the next cells.
forecast_df = forecaster.summarize(corpus)

In [None]:
# Size of the summary table - assumes forecast_df is a pandas DataFrame
# (rows, columns); TODO confirm.
forecast_df.shape

In [None]:
# Peek at the first rows of the summary.
forecast_df.head()

In [None]:
# ...and at the last 10 rows.
forecast_df.tail(10)

Let's examine a Conversation containing an utterance that is forecasted to have a non-positive score (the `pos_score` label is 1 only when score > 0).

In [None]:
# Inspect one utterance by id; it is the example examined in the cells below.
corpus.get_utterance('dpn8e4v')

In [None]:
# `root` is used in the following cells as the id of the conversation
# this utterance belongs to.
corpus.get_utterance('dpn8e4v').root

In [None]:
# Reply tree of the conversation that contains utterance 'dpn8e4v'.
flagged_utt = corpus.get_utterance('dpn8e4v')
flagged_convo = corpus.get_conversation(flagged_utt.root)
flagged_convo.print_conversation_structure()

### Forecasted

In [None]:
# Same conversation tree, with each node labelled by its forecast value.
flagged_utt = corpus.get_utterance('dpn8e4v')
flagged_convo = corpus.get_conversation(flagged_utt.root)
flagged_convo.print_conversation_structure(lambda utt: str(utt.meta['forecast']))

### Actual

In [None]:
# Same conversation tree, with each node labelled by its true 'pos_score' label.
flagged_utt = corpus.get_utterance('dpn8e4v')
flagged_convo = corpus.get_conversation(flagged_utt.root)
flagged_convo.print_conversation_structure(lambda utt: str(utt.meta['pos_score']))

In [None]:
def _meta_values(key):
    """Return meta[key] for every utterance, in corpus iteration order."""
    return [utt.meta[key] for utt in corpus.iter_utterances()]


# Predicted and true labels as two parallel lists (one entry per utterance).
forecasts = _meta_values('forecast')
actual = _meta_values('pos_score')

In [None]:
# Keep only the utterances that received a forecast (forecast is not None).
# NOTE: despite the variable name, each tuple is ordered (forecast, actual),
# i.e. (predicted, true); the cells that unpack it rely on that order.
y_true_pred = [
    (predicted, observed)
    for predicted, observed in zip(forecasts, actual)
    if predicted is not None
]

In [None]:
import numpy as np
from collections import Counter

In [None]:
# First element of each (forecast, actual) pair: the predicted label.
y_pred = np.array([predicted for predicted, _ in y_true_pred])

In [None]:
# Second element of each (forecast, actual) pair: the true label.
y_true = np.array([observed for _, observed in y_true_pred])

In [None]:
# Baseline accuracy: a classifier that always predicts the positive class
# scores the fraction of positive labels, i.e. the mean of the 0/1 targets.
y_true.mean()

In [None]:
# Achieved accuracy: fraction of utterances whose forecast matches the true label.
is_correct = (y_true == y_pred)
is_correct.mean()

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Break the accuracy down by class. By scikit-learn's convention, rows are
# true labels and columns are predicted labels.
confusion_matrix(y_true=y_true, y_pred=y_pred)