We can use Hypergraph features for various predictive tasks:

In [2]:
import convokit
from convokit import Corpus, HyperConvo
import pickle

In [3]:
corpus = Corpus(filename=convokit.download("reddit-corpus-small"))

Dataset already exists at /Users/calebchiam/Documents/GitHub/Cornell-Conversational-Analysis-Toolkit/convokit/tensors/reddit-corpus-small


In [4]:
top_level_utterance_ids = [utt.id for utt in corpus.iter_utterances() if utt.id == utt.meta['top_level_comment']]

In [5]:
len(top_level_utterance_ids)

10000

In [6]:
threads_corpus = corpus.reindex_conversations(new_convo_roots=top_level_utterance_ids, 
                                              preserve_convo_meta=True,
                                              preserve_corpus_meta=False)

### Annotating dataset with predictive features: hyperconvo, volume, BoW, reply-tree

In [7]:
## annotating
hc = convokit.HyperConvo(prefix_len=10, min_thread_len=10, invalid_val=-1)
hc.fit_transform(threads_corpus)
feats = threads_corpus.random_conversation().meta["hyperconvo"]

In [8]:
list(feats.keys())[:3]

['max[indegree over c->c responses]',
 'argmax[indegree over c->c responses]',
 'norm.max[indegree over c->c responses]']

In [9]:
## volume is the number of unique users in the first 10 comments
for convo in threads_corpus.iter_conversations():
    convo.meta['volume'] = len(set([utt.user for utt in convo.get_chronological_utterance_list()[:10]]))

In [10]:
## reply-tree features (subset of hyperconvo features related to comment-to-comment features)
for convo in threads_corpus.iter_conversations():
    reply_tree_feats = {k: v for k, v in convo.meta['hyperconvo'].items() if "c->c" in k}
    convo.meta['reply-tree'] = reply_tree_feats

### Adding the predictive labels to the dataset

In [11]:
for convo in threads_corpus.iter_conversations():
    convo.meta['comment-growth'] = len(list(convo.iter_utterances())) >= 15
    
    convo_utts = convo.get_chronological_utterance_list()
    if len(convo_utts) >= 20:
        first_10_spkrs = len(set([utt.speaker.id for utt in convo_utts[:10]]))
        first_20_spkrs = len(set([utt.speaker.id for utt in convo_utts[:20]]))
        convo.meta['commenter-growth'] = (first_20_spkrs / first_10_spkrs) >= 2.0
    else:
        convo.meta['commenter-growth'] = None

In [13]:
from convokit import Classifier, Pairer

In [14]:
pairer_1 = Pairer(obj_type="conversation", 
                pairing_func=lambda convo: convo.meta['original_convo_id'],
                pos_label_func=lambda convo: convo.meta['comment-growth'],
                neg_label_func=lambda convo: not convo.meta['comment-growth'],
                pair_id_feat_name="pair_id_1",
                label_feat_name="pair_obj_1",
                pair_orientation_feat_name="pair_orientation_1"
               )

In [15]:
pairer_1.transform(threads_corpus)

<convokit.model.corpus.Corpus at 0x139e275d0>

In [16]:
pairer_2 = Pairer(obj_type="conversation", 
                pairing_func=lambda convo: convo.meta['original_convo_id'],
                pos_label_func=lambda convo: convo.meta['commenter-growth'],
                neg_label_func=lambda convo: not convo.meta['commenter-growth'],
                pair_id_feat_name="pair_id_2",
                label_feat_name="pair_obj_2",
                pair_orientation_feat_name="pair_orientation_2"
               )

In [17]:
pairer_2.transform(threads_corpus)

<convokit.model.corpus.Corpus at 0x139e275d0>

### Adding BoW vectors -- only on paired convos

In [26]:
from convokit import BoWTransformer

In [29]:
bow = BoWTransformer(obj_type="conversation", 
                     text_func=lambda convo: ' '.join([utt.text for utt in convo.get_chronological_utterance_list()[:10]]),
                     vector_name="bow_1"
                    )

Initializing default unigram CountVectorizer...


In [30]:
bow.fit_transform(threads_corpus, selector=lambda convo: convo.meta['pair_id_1'] is not None)

<convokit.model.corpus.Corpus at 0x139e275d0>

In [31]:
bow = BoWTransformer(obj_type="conversation", 
                     text_func=lambda convo: ' '.join([utt.text for utt in convo.get_chronological_utterance_list()[:10]]),
                     vector_name="bow_2"
                    )

Initializing default unigram CountVectorizer...


In [32]:
bow.fit_transform(threads_corpus, selector=lambda convo: convo.meta['pair_id_2'] is not None)

<convokit.model.corpus.Corpus at 0x139e275d0>

In [34]:
from convokit import PairedPrediction, PairedBoW

## Comment-growth cross-validated scores for different feature sets: hyperconvo, volume, reply-tree, BoW

In [19]:
pp = PairedPrediction(obj_type="conversation",
                      pred_feats=["hyperconvo"],
                      pair_id_feat_name="pair_id_1",
                      label_feat_name="pair_obj_1",
                      pair_orientation_feat_name="pair_orientation_1"
                     )
pp.summarize(threads_corpus)

Found 549 valid pairs.


0.5846994535519126

In [20]:
pp = PairedPrediction(obj_type="conversation",
                      pred_feats=["volume"],
                      pair_id_feat_name="pair_id_1",
                      label_feat_name="pair_obj_1",
                      pair_orientation_feat_name="pair_orientation_1"
                     )
pp.summarize(threads_corpus)

Found 549 valid pairs.


0.517304189435337

In [21]:
pp = PairedPrediction(obj_type="conversation",
                      pred_feats=["reply-tree"],
                      pair_id_feat_name="pair_id_1",
                      label_feat_name="pair_obj_1",
                      pair_orientation_feat_name="pair_orientation_1"
                     )
pp.summarize(threads_corpus)

Found 549 valid pairs.


0.6120218579234973

In [36]:
p_bow = PairedBoW(obj_type="conversation",
                  vector_name="bow_1",
                  pair_id_feat_name="pair_id_1",
                  label_feat_name="pair_obj_1",
                  pair_orientation_feat_name="pair_orientation_1"
                 )

In [37]:
p_bow.summarize(threads_corpus)

Found 549 valid pairs.


0.5264116575591985

## Commenter-growth cross-validated scores for different feature sets: hyperconvo, volume, reply-tree, BoW

In [22]:
pp = PairedPrediction(obj_type="conversation",
                      pred_feats=["hyperconvo"],
                      pair_id_feat_name="pair_id_2",
                      label_feat_name="pair_obj_2",
                      pair_orientation_feat_name="pair_orientation_2"
                     )
pp.summarize(threads_corpus)

Found 306 valid pairs.


0.5980392156862745

In [23]:
pp = PairedPrediction(obj_type="conversation",
                      pred_feats=["volume"],
                      pair_id_feat_name="pair_id_2",
                      label_feat_name="pair_obj_2",
                      pair_orientation_feat_name="pair_orientation_2"
                     )
pp.summarize(threads_corpus)

Found 306 valid pairs.


0.5915032679738562

In [24]:
pp = PairedPrediction(obj_type="conversation",
                      pred_feats=["reply-tree"],
                      pair_id_feat_name="pair_id_2",
                      label_feat_name="pair_obj_2",
                      pair_orientation_feat_name="pair_orientation_2"
                     )
pp.summarize(threads_corpus)

Found 306 valid pairs.


0.49673202614379086

In [38]:
p_bow = PairedBoW(obj_type="conversation",
                  vector_name="bow_2",
                  pair_id_feat_name="pair_id_2",
                  label_feat_name="pair_obj_2",
                  pair_orientation_feat_name="pair_orientation_2"
                 )

In [39]:
p_bow.summarize(threads_corpus)

Found 306 valid pairs.


0.5261437908496732