In [1]:
import os
# Move the working directory two levels up from where the kernel started.
# The start directory is remembered on the first run so that re-running this
# cell lands in the same place instead of climbing two more levels each time.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, "..", ".."))
os.getcwd()

'/Users/calebchiam/Documents/GitHub/Cornell-Conversational-Analysis-Toolkit'

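We can use the hypergraph features computed by `HyperConvo` for various predictive tasks, such as predicting whether a thread will grow in comments or in commenters: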

In [8]:
import convokit
from convokit import Corpus, HyperConvo
import pickle

In [9]:
# Download (or reuse the local copy of) the small Reddit corpus, then load it.
reddit_corpus_location = convokit.download("reddit-corpus-small")
corpus = Corpus(filename=reddit_corpus_location)

Dataset already exists at /Users/calebchiam/Documents/GitHub/Cornell-Conversational-Analysis-Toolkit/convokit/tensors/reddit-corpus-small


In [5]:
# Keep the ids of utterances that are their own 'top_level_comment',
# i.e. the comments that start a thread.
top_level_utterance_ids = [
    utterance.id
    for utterance in corpus.iter_utterances()
    if utterance.id == utterance.meta['top_level_comment']
]

In [6]:
# Number of top-level comments found; each one becomes a conversation root below.
len(top_level_utterance_ids)

10000

In [10]:
# Re-root the corpus so that every top-level comment starts its own conversation.
# Conversation-level metadata is carried over; corpus-level metadata is not.
threads_corpus = corpus.reindex_conversations(
    new_convo_roots=top_level_utterance_ids,
    preserve_convo_meta=True,
    preserve_corpus_meta=False,
)

In [14]:
# Compute hypergraph features for every conversation in the re-rooted corpus.
hc = HyperConvo(prefix_len=10, min_thread_len=10)
hc.fit_transform(threads_corpus)

# Look at the features stored on one arbitrarily chosen conversation.
sample_conversation = threads_corpus.random_conversation()
feats = sample_conversation.meta["hyperconvo"]

In [17]:
# Print speaker, utterance and conversation counts for the re-rooted corpus.
threads_corpus.print_summary_stats()

Number of Speakers: 116643
Number of Utterances: 288846
Number of Conversations: 10000


In [15]:
# Show the first few feature names of the sampled conversation.
feature_names = list(feats.keys())
feature_names[:3]

['max[indegree over c->c responses]',
 'argmax[indegree over c->c responses]',
 'norm.max[indegree over c->c responses]']

In [16]:
from collections import defaultdict

In [19]:
from convokit import Classifier

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
import random
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import train_test_split

random.seed(2019)

# NOTE(review): this cell reads `thread_roots_by_self_post`, `threads`, `thread_pfxs`
# and indexes `feats` by thread root id. None of these are built in that form by the
# cells shown before this one -- confirm they exist before relying on Run All.
# Assumptions taken from how the names are used below (TODO confirm):
#   - thread_roots_by_self_post: mapping whose values are iterables of thread root ids
#   - threads[root] / thread_pfxs[root]: dicts of comments with .user.name, .text, .id
#     (thread_pfxs presumably holds only the first comments of each thread)
#   - feats[root]: dict of feature name -> numeric value

# The "post-deleted" and "user-deleted" tasks are currently disabled.
TASKS = ["comment-growth", "commenter-growth"]

# Thread-length thresholds used for labelling.
COMMENT_GROWTH_POS_MIN_LEN = 15   # thread reached at least this many comments -> positive
COMMENT_GROWTH_NEG_LEN = 10       # thread stopped at exactly this many comments -> negative
COMMENTER_GROWTH_MIN_LEN = 20     # shorter threads are ignored for commenter-growth

TEST_FRACTION = 0.1
SPLIT_SEED = 42


def _num_participants(comments):
    """Return the number of distinct user names among the values of `comments`."""
    return len(set(c.user.name for c in comments.values()))


def _finite_or_zero(value):
    """Return `value` if it is finite; map NaN and +/-inf to 0."""
    return value if np.isfinite(value) else 0


def _thread_label(task, root):
    """Label the thread rooted at `root` for `task`.

    Returns True for a positive example, False for a negative example and
    None when the thread is not used for this task.
    Raises ValueError for a task name this cell does not know how to label.
    """
    thread = threads[root]
    if task == "comment-growth":
        if len(thread) >= COMMENT_GROWTH_POS_MIN_LEN:
            return True
        if len(thread) == COMMENT_GROWTH_NEG_LEN:
            return False
        return None
    if task == "commenter-growth":
        if len(thread) < COMMENTER_GROWTH_MIN_LEN:
            return None
        # positive when the full thread has at least twice the participants of its prefix
        return _num_participants(thread) >= _num_participants(thread_pfxs[root]) * 2
    raise ValueError("unrecognized task name: {!r}".format(task))


for task in TASKS:
    print("task {}".format(task))

    # Paired sampling: from every group of threads that has both a positive and a
    # negative candidate, draw one of each, so the two classes are always balanced.
    pos, neg = [], []
    for roots in thread_roots_by_self_post.values():
        has_pos, has_neg = [], []
        for root in roots:
            label = _thread_label(task, root)
            if label is True:
                has_pos.append(root)
            elif label is False:
                has_neg.append(root)

        if has_pos and has_neg:
            pos.append(random.choice(has_pos))
            neg.append(random.choice(has_neg))

    print("- {} positive, {} negative pts".format(len(pos), len(neg)))

    if not pos:
        # Nothing to split or fit; train_test_split would raise on an empty sample set.
        print("- no matched pairs for this task; skipping")
        continue

    # Build one row per sampled thread for each feature set.
    X, X_volume, X_reply, threads_text = [], [], [], []
    for root in pos + neg:
        thread_feats = feats[root]
        # sorted names give every row the same column order
        feat_names = sorted(thread_feats.keys())

        # hyperconvo features: everything except the TRIADS features, non-finite values zeroed
        X.append([_finite_or_zero(thread_feats[k]) for k in feat_names if "TRIADS" not in k])

        # volume baseline: number of distinct participants in thread_pfxs[root]
        X_volume.append([_num_participants(thread_pfxs[root])])

        # reply tree baseline: only the "c->c" features
        X_reply.append([_finite_or_zero(thread_feats[k]) for k in feat_names if "c->c" in k])

        # BOW baseline text. The filter only matters if the disabled "post-deleted" task
        # is re-enabled: the root post is then left out, since one could otherwise just
        # look for the string "[deleted]".
        threads_text.append(" ".join([u.text for u in thread_pfxs[root].values()
                                      if not (task == "post-deleted" and u.id == root)]))

    ys = [1]*len(pos) + [0]*len(neg)

    X, ys = np.array(X), np.array(ys)

    # Fit the same classifier on each feature set, always with the same split seed.
    for X_tmp, name in [(X, "hyperconvo"), (X_volume, "volume"), (X_reply, "reply tree"), (None, "BOW")]:
        if name == "BOW":
            text_train, text_test, y_train, y_test = train_test_split(
                threads_text, ys, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
            # the vocabulary is fit on the training text only
            cv = CountVectorizer(min_df=0.05, max_df=0.8)
            X_train = cv.fit_transform(text_train)
            X_test = cv.transform(text_test)
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                X_tmp, ys, test_size=TEST_FRACTION, random_state=SPLIT_SEED)

        clf = LogisticRegression(solver="liblinear")
        clf.fit(X_train, y_train)

        train_acc = clf.score(X_train, y_train)
        test_acc = clf.score(X_test, y_test)
        print("- {}: {:.4f} train, {:.4f} test".format(name, train_acc, test_acc))

task comment-growth
- 174 positive, 174 negative pts
- hyperconvo: 0.7252 train, 0.5143 test
- volume: 0.5687 train, 0.6000 test
- reply tree: 0.6102 train, 0.5143 test
- BOW: 0.9936 train, 0.4286 test
task commenter-growth
- 121 positive, 121 negative pts
- hyperconvo: 0.6866 train, 0.6800 test
- volume: 0.4747 train, 0.4000 test
- reply tree: 0.5668 train, 0.6000 test
- BOW: 0.9954 train, 0.4400 test
