In [1]:
import os

# Remember the directory the kernel started in. os.chdir("../..") is relative
# to the *current* directory, which this very cell changes, so re-running the
# cell used to climb two more levels every time. Anchoring on the remembered
# start directory makes the cell idempotent.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, "..", ".."))
os.getcwd()

'/Users/calebchiam/Documents/GitHub/Cornell-Conversational-Analysis-Toolkit'

We can use Hypergraph features for various predictive tasks:

In [2]:
import convokit
import pickle
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the Reddit corpus: the value returned by convokit.download("reddit-corpus")
# is handed to Corpus as the file to read.
corpus = convokit.Corpus(filename=convokit.download("reddit-corpus"))

Dataset already exists at /Users/calebchiam/.convokit/downloads/reddit-corpus


In [4]:
# Configure HyperConvo with prefix_len=10, min_thread_len=10 and include_root=False,
# then run it over the corpus. The per-thread features are later read from
# convo.meta["hyperconvo"] — presumably written by this transform; confirm in the convokit docs.
hc = convokit.HyperConvo(prefix_len=10, min_thread_len=10, include_root=False)
hc.fit_transform(corpus)

<convokit.model.corpus.Corpus at 0x11f895a58>

In [5]:
HYPERCONVO_CACHE_PATH = "hyperconvo_feats.p"
remake_cache = False

# Rebuild when asked to, and also when no cache exists yet (previously a
# missing file with remake_cache=False ended in FileNotFoundError).
if remake_cache or not os.path.exists(HYPERCONVO_CACHE_PATH):
    # Collect the features *before* opening the file for writing, so that a
    # failure while iterating the corpus cannot truncate an existing cache.
    hyperconvo_feats = {}
    for convo in corpus.iter_conversations():
        hyperconvo_feats.update(convo.meta["hyperconvo"])
    with open(HYPERCONVO_CACHE_PATH, "wb") as f:
        pickle.dump(hyperconvo_feats, f)
else:
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # cache file that this notebook (or another trusted source) produced.
    with open(HYPERCONVO_CACHE_PATH, "rb") as f:
        hyperconvo_feats = pickle.load(f)

In [6]:
# Full threads of the corpus as a mapping; the cells below iterate it as
# (top-level comment id -> thread) via threads.items().
# include_root=False presumably leaves the root post out of each thread — confirm in convokit docs.
threads = corpus.utterance_threads(include_root=False)

In [7]:
# Tally the threads that do / do not contain their own top-level-comment id as a key.
present = sum(1 for tlc_id, thread in threads.items() if tlc_id in thread)
missing = len(threads) - present

In [8]:
# Use only the first 10 comments in each thread
# (same call as for `threads`, with prefix_len=10 added).
thread_pfxs = corpus.utterance_threads(prefix_len=10, include_root=False)

In [9]:
# Number of entries in `threads`, shown as the cell output.
len(threads)

100000

In [10]:
from collections import defaultdict

# Group the top-level-comment ids by the `.root` of the first utterance in their thread.
thread_roots_by_self_post = defaultdict(list)
for top_level_comment, thread in threads.items():
    first_utterance = next(iter(thread.values()))
    thread_roots_by_self_post[first_utterance.root].append(top_level_comment)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Motif-pathway statistics from the fitted HyperConvo; later cells index the
# result by thread id (path_stats[thread_id]).
path_stats = hc.retrieve_motif_pathway_stats(corpus)

In [13]:
# Motif counts from the fitted HyperConvo; later cells index the result by
# thread id (motif_counts[thread_id]).
motif_counts = hc.retrieve_motif_counts(corpus)

In [14]:
# Motif instances per thread; iterated below as (thread id -> motif dict) pairs.
threads_motifs = hc.retrieve_motifs(corpus)

In [15]:
# Spot-check the motif dict of one hard-coded thread id; the output maps a
# motif-type name to a list of TriadMotif objects.
threads_motifs['dnppqdj']

{'NO_EDGE_TRIADS': [<convokit.hyperconvo.triadMotif.TriadMotif at 0x1c2b34f28>,
  <convokit.hyperconvo.triadMotif.TriadMotif at 0x1c2b34898>,
  <convokit.hyperconvo.triadMotif.TriadMotif at 0x1c2b347f0>,
  <convokit.hyperconvo.triadMotif.TriadMotif at 0x1c2b34630>,
  <convokit.hyperconvo.triadMotif.TriadMotif at 0x1c2b34128>,
  <convokit.hyperconvo.triadMotif.TriadMotif at 0x1c2b34198>,
  <convokit.hyperconvo.triadMotif.TriadMotif at 0x1c2b34048>,
  <convokit.hyperconvo.triadMotif.TriadMotif at 0x1c2b34160>],
 'SINGLE_EDGE_TRIADS': [<convokit.hyperconvo.triadMotif.TriadMotif at 0x1c2b343c8>,
  <convokit.hyperconvo.triadMotif.TriadMotif at 0x1c2b340b8>,
  <convokit.hyperconvo.triadMotif.TriadMotif at 0x11fb3feb8>,
  <convokit.hyperconvo.triadMotif.TriadMotif at 0x11fb3f3c8>],
 'INCOMING_TRIADS': [],
 'OUTGOING_TRIADS': [],
 'DYADIC_TRIADS': [<convokit.hyperconvo.triadMotif.TriadMotif at 0x11fb3fe80>,
  <convokit.hyperconvo.triadMotif.TriadMotif at 0x11fb3f668>,
  <convokit.hyperconvo.tr

In [16]:
# For each thread keep element [0] of what the (private) HyperConvo helper returns
# when called with trans=False; the result is used below as a per-thread feature dict.
latent_motif_count = {
    tid: hc._latent_motif_count(motifs, trans=False)[0]
    for tid, motifs in threads_motifs.items()
}

In [25]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import normalize, StandardScaler, Normalizer
from sklearn.linear_model import LogisticRegression

In [81]:
# first generate positive and negative examples based on task

def generate_pos_neg(task: str, post_to_thread_obj, threads, thread_pfxs):
    """Sample one positive and one negative thread per post for a prediction task.

    Every post contributes at most one (positive, negative) pair, chosen at
    random among its eligible threads, so the two returned lists always have
    the same length and are matched by position.

    Tasks:
        "comment-growth":   positive = thread with >= 15 comments,
                            negative = thread with exactly 10 comments.
        "commenter-growth": only threads with >= 20 comments are considered;
                            positive = the full thread has at least twice as
                            many distinct user names as its prefix in
                            ``thread_pfxs``, negative = otherwise.

    Args:
        task: name of the task (see above).
        post_to_thread_obj: mapping post id -> list of thread (root) ids.
        threads: mapping thread id -> mapping of the thread's comments.
        thread_pfxs: mapping thread id -> mapping of the thread's prefix
            comments (only read for "commenter-growth").

    Returns:
        ``(pos, neg)``: two equally long lists of thread ids.

    Raises:
        ValueError: if ``task`` is not one of the supported task names.
    """
    # Local import: the notebook only imports `random` in a later cell, so on
    # a fresh "Restart & Run All" this function used to fail with NameError.
    import random

    def distinct_users(thread):
        return len(set(c.user.name for c in thread.values()))

    # Each labeller returns True (positive), False (negative) or None (ignore).
    if task == "comment-growth":
        def label(root):
            n_comments = len(threads[root])
            if n_comments >= 15:
                return True
            if n_comments == 10:
                return False
            return None
    elif task == "commenter-growth":
        def label(root):
            if len(threads[root]) < 20:
                return None
            return distinct_users(threads[root]) >= distinct_users(thread_pfxs[root]) * 2
    else:
        # Previously an unknown task silently produced two empty lists, which
        # only surfaced later as an obscure failure in the training code.
        raise ValueError("unsupported task: {!r}".format(task))

    pos, neg = [], []
    for post_id, thread_roots in post_to_thread_obj.items():
        has_pos, has_neg = [], []
        for root in thread_roots:
            verdict = label(root)
            if verdict is None:
                continue
            (has_pos if verdict else has_neg).append(root)

        # Only posts offering both kinds of thread yield a pair. The positive
        # is drawn before the negative, matching the original draw order.
        if has_pos and has_neg:
            pos.append(random.choice(has_pos))
            neg.append(random.choice(has_neg))
    print("- {} positive, {} negative pts for {} task".format(len(pos), len(neg), task))

    return pos, neg


In [83]:
# Positive / negative thread ids for the comment-growth task.
pos_comment_growth, neg_comment_growth = generate_pos_neg(
    task="comment-growth",
    post_to_thread_obj=thread_roots_by_self_post,
    threads=threads,
    thread_pfxs=thread_pfxs,
)

- 1723 positive, 1723 negative pts for comment-growth task


In [82]:
# Positive / negative thread ids for the commenter-growth task.
pos_commenter_growth, neg_commenter_growth = generate_pos_neg(
    task="commenter-growth",
    post_to_thread_obj=thread_roots_by_self_post,
    threads=threads,
    thread_pfxs=thread_pfxs,
)

- 1121 positive, 1121 negative pts for commenter-growth task


In [None]:

# 1. Extract features for all of these positive and negative examples.

# 2. Construct the paired dataset, alternating the label between positive and negative pairs.

# 3. Finish with a StandardScaler + LogisticRegression pipeline.


In [148]:
def generate_paired_features(feats, pos, neg):
    """Turn matched (positive, negative) thread pairs into a difference dataset.

    For every pair the feature vectors of ``pos[i]`` and ``neg[i]`` are
    subtracted. The direction alternates between kept pairs so both classes
    stay balanced: label 1 gets ``pos - neg``, label 0 gets ``neg - pos``.

    Args:
        feats: mapping thread id -> mapping feature name -> numeric value.
        pos: ids of the positive threads.
        neg: ids of the negative threads, matched to ``pos`` by position
            (pairs are formed up to the shorter of the two lists).

    Returns:
        ``(X, y)`` as numpy arrays: one row of feature differences and one
        0/1 label per kept pair.

    Raises:
        KeyError: if a thread id is missing from ``feats`` or a thread lacks
            one of the reference feature names.
    """
    # Column order = key order of the first entry of `feats`; this is the same
    # order print_extreme_coefs uses to put names on the coefficients.
    feature_names = list(next(iter(feats.values()), {}))

    def as_vector(thread_id):
        # Look values up by name instead of trusting .values() order, so two
        # threads whose dicts were built in a different key order still line
        # up column by column.
        thread_feats = feats[thread_id]
        return np.array([thread_feats[name] for name in feature_names])

    X, y = [], []
    flip = True

    for pos_id, neg_id in zip(pos, neg):
        pos_feats = as_vector(pos_id)
        neg_feats = as_vector(neg_id)

        # Drop pairs holding NaN or +/-inf: inf - inf is NaN and the scaler
        # downstream rejects non-finite input.
        if not (np.isfinite(pos_feats).all() and np.isfinite(neg_feats).all()):
            continue

        if flip:
            y.append(1)
            diff = pos_feats - neg_feats
        else:
            y.append(0)
            diff = neg_feats - pos_feats
        X.append(diff)
        # Only kept pairs toggle the direction, which keeps the labels balanced.
        flip = not flip

    return np.array(X), np.array(y)

In [161]:
from copy import deepcopy

In [162]:
# Hypergraph features of every thread, extended in a private copy with its motif counts.
hyperconv_motif = deepcopy(hyperconvo_feats)
for tid in hyperconv_motif:
    hyperconv_motif[tid].update(motif_counts[tid])

In [165]:
# Hypergraph features of every thread, extended in a private copy with its motif-pathway stats.
hyperconv_paths = deepcopy(hyperconvo_feats)
for tid in hyperconv_paths:
    hyperconv_paths[tid].update(path_stats[tid])

In [169]:
# Hypergraph features of every thread, extended in a private copy with its latent motif counts.
hyperconv_latent = deepcopy(hyperconvo_feats)
for tid in hyperconv_latent:
    hyperconv_latent[tid].update(latent_motif_count[tid])

In [171]:
# Hypergraph features of every thread, extended in a private copy with all three
# motif feature sets, applied in this order: counts, pathway stats, latent counts.
hyperconv_motifall = deepcopy(hyperconvo_feats)
for tid, merged in hyperconv_motifall.items():
    for extra in (motif_counts, path_stats, latent_motif_count):
        merged.update(extra[tid])

In [214]:
def print_extreme_coefs(clf, feats):
    """Print the five largest and five smallest logistic-regression coefficients.

    Feature names are the keys of the first entry of ``feats``, in that
    entry's order; they are matched by position with the first coefficient
    row of the pipeline step named 'logreg'.
    """
    first_thread = next(iter(feats))
    names = list(feats[first_thread])
    weights = clf.named_steps['logreg'].coef_[0].tolist()

    assert len(names) == len(weights)
    ranked = sorted(zip(names, weights), key=lambda pair: pair[1], reverse=True)

    sections = (("TOP 5 FEATURES", ranked[:5]), ("BOTTOM 5 FEATURES", ranked[-5:]))
    for heading, rows in sections:
        print(heading)
        for name, weight in rows:
            print("{}: {:.3f}".format(name, weight))
        print()

In [215]:
import random
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import train_test_split

random.seed(2019)

# (display name, per-thread feature dict) for every feature set being compared.
# One table replaces eight copy-pasted X_*/y_* variable pairs and avoids the
# inner loop rebinding the outer X and y.
feature_sets = [
    ("hyperconv", hyperconvo_feats),
    ("motifcount", motif_counts),
    ("latentmotif", latent_motif_count),
    ("motifpaths", path_stats),
    ("hyperconv-motif", hyperconv_motif),
    ("hyperconv-paths", hyperconv_paths),
    ("hyperconv-latent", hyperconv_latent),
    ("hyperconvo-motifall", hyperconv_motifall),
]

# "post-deleted" and "user-deleted" are not implemented by generate_pos_neg.
for task in ["comment-growth", "commenter-growth"]:
    print("TASK: {}\n".format(task))

    # The same sampled pairs are reused for every feature set so the
    # accuracies are comparable within a task.
    pos, neg = generate_pos_neg(task, thread_roots_by_self_post, threads, thread_pfxs)

    for name, feats in feature_sets:
        X_pairs, y_pairs = generate_paired_features(feats, pos, neg)
        X_train, X_test, y_train, y_test = train_test_split(X_pairs, y_pairs, test_size=0.2, random_state=42)

        clf = Pipeline([("standardScaler", StandardScaler()), ("logreg", LogisticRegression(solver='liblinear'))])
        clf.fit(X_train, y_train)

        train_acc = clf.score(X_train, y_train)
        test_acc = clf.score(X_test, y_test)
        print("- {}: {:.4f} train, {:.4f} test".format(name, train_acc, test_acc))
        print_extreme_coefs(clf, feats)

TASK: comment-growth

- 1723 positive, 1723 negative pts for comment-growth task
- hyperconv: 0.6379 train, 0.6291 test
TOP 5 FEATURES
norm.max[indegree over C->C mid-thread responses]: 0.352
max[indegree over C->C responses]: 0.351
entropy[indegree over C->C responses]: 0.332
mean[outdegree over C->c responses]: 0.294
mean[outdegree over C->C responses]: 0.294

BOTTOM 5 FEATURES
mean-nonzero[outdegree over C->C responses]: -0.318
norm.max[outdegree over C->c mid-thread responses]: -0.365
norm.max[outdegree over C->C mid-thread responses]: -0.365
entropy[outdegree over C->c mid-thread responses]: -0.875
entropy[outdegree over C->C mid-thread responses]: -0.875

- motifcount: 0.6176 train, 0.6029 test
TOP 5 FEATURES
NO_EDGE_TRIADS: 0.623
DIRECIPROCAL_TRIADS: 0.245
OUTGOING_TRIADS: 0.205
DIRECTED_CYCLE_TRIADS: 0.162
SINGLE_EDGE_TRIADS: 0.142

BOTTOM 5 FEATURES
UNIDIRECTIONAL_TRIADS: 0.030
INCOMING_1TO3_TRIADS: 0.000
OUTGOING_RECIPROCAL_TRIADS: -0.028
DYADIC_TRIADS: -0.031
INCOMING_TRIADS

- hyperconv-latent: 0.6444 train, 0.5385 test
TOP 5 FEATURES
prop-nonzero[outdegree over C->c mid-thread responses]: 0.699
prop-nonzero[outdegree over C->C mid-thread responses]: 0.699
norm.max[indegree over C->C responses]: 0.697
norm.2nd-largest[indegree over C->C responses]: 0.489
max[indegree over c->c mid-thread responses]: 0.478

BOTTOM 5 FEATURES
2nd-largest[outdegree over C->c responses]: -0.432
2nd-largest[outdegree over C->C responses]: -0.432
mean-nonzero[indegree over C->C responses]: -0.432
max[indegree over C->C mid-thread responses]: -0.494
entropy[indegree over C->C responses]: -0.602

- hyperconvo-motifall: 0.6602 train, 0.5249 test
TOP 5 FEATURES
prop-nonzero[outdegree over C->c mid-thread responses]: 0.754
prop-nonzero[outdegree over C->C mid-thread responses]: 0.754
norm.max[indegree over C->C responses]: 0.709
prop-multiple[indegree over C->C responses]: 0.552
norm.2nd-largest[indegree over C->C responses]: 0.532

BOTTOM 5 FEATURES
2nd-largest[outdegree over C->c r

In [193]:
import random
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import train_test_split

random.seed(2019)


def _finite_or_zero(value):
    """Return `value` unchanged when it is finite, else 0 (covers NaN and +/-inf)."""
    return value if np.isfinite(value) else 0


# "post-deleted" and "user-deleted" are not implemented by generate_pos_neg.
for task in ["comment-growth", "commenter-growth"]:
    print("task {}".format(task))

    pos, neg = generate_pos_neg(task, thread_roots_by_self_post, threads, thread_pfxs)

    # The former call to generate_paired_hyperconv_features was removed here:
    # that name is not defined in this notebook and its result was overwritten
    # by the assignments below before being used.

    # make data from pos and neg
    X = []
    X_volume, X_reply, X_bow = [], [], []
    X_motifpath = []
    X_motifcount = []
    X_latentmotif = []
    threads_text = []
    for root in pos + neg:
        root_feats = hyperconvo_feats[root]
        # ordered set of feature values, with NaN / infinite entries zeroed
        feat_names = sorted(root_feats.keys())
        cleaned = [_finite_or_zero(root_feats[k]) for k in feat_names]
        X.append(cleaned)

        X_motifpath.append(list(path_stats[root].values()))
        X_motifcount.append(list(motif_counts[root].values()))
        X_latentmotif.append(list(latent_motif_count[root].values()))
        # volume baseline - number of distinct participants in the thread prefix
        X_volume.append([len(set(c.user.name for c in thread_pfxs[root].values()))])
        # reply tree baseline - the cleaned features whose name contains "c->c"
        X_reply.append([val for k, val in zip(feat_names, cleaned) if "c->c" in k])
        # BOW baseline text
        # don't consider root post for post-deleted task, since we could just look for the string "[deleted]"
        thread_text = " ".join([u.text for u in thread_pfxs[root].values()
                                if not (task == "post-deleted" and u.id == root)])
        threads_text.append(thread_text)

    # Unpaired setup: all positives first, then all negatives.
    ys = [1]*len(pos) + [0]*len(neg)

    X, ys = np.array(X), np.array(ys)
    X_volume = np.array(X_volume)
    X_reply = np.array(X_reply)
    X_motifpath = np.array(X_motifpath)
    X_motifcount = np.array(X_motifcount)
    X_latentmotif = np.array(X_latentmotif)
    X_hc_path = np.concatenate([X_motifpath, X], axis=1)
    X_hc_count = np.concatenate([X_motifcount, X], axis=1)
    X_hc_latent = np.concatenate([X_latentmotif, X], axis=1)
    X_all = np.concatenate([X_motifpath, X_motifcount, X_latentmotif, X], axis=1)

    # The volume baseline (X_volume, "volume") is computed above but currently
    # left out of the comparison.
    for X_tmp, name in [(X, "hyperconvo"),
                        (X_reply, "reply tree"),
                        (None, "BOW"),
                        (X_motifpath, "motifpaths"),
                        (X_motifcount, "motifcounts"),
                        (X_latentmotif, "latentmotifcounts"),
                        (X_hc_path, "hyperconv-motifpaths"),
                        (X_hc_count, "hyperconvo-motifcounts"),
                        (X_hc_latent, "hyperconvo-latentmotifs"),
                        (X_all, "hyperconvo-motifall")
                       ]:
        if name == "BOW":
            # The vocabulary is fitted on the training split only.
            text_train, text_test, y_train, y_test = train_test_split(threads_text, ys, test_size=0.1, random_state=42)
            cv = CountVectorizer(min_df=0.05, max_df=0.8)
            X_train = cv.fit_transform(text_train)
            X_test = cv.transform(text_test)
        else:
            X_train, X_test, y_train, y_test = train_test_split(X_tmp, ys, test_size=0.1, random_state=42)

        # Variants tried earlier and dropped: SelectPercentile(f_classif, 10)
        # feature selection, a GridSearchCV over logreg C / selection
        # percentile, and a bare LogisticRegression without the Normalizer.
        clf = Pipeline([("normalizer", Normalizer()), ("logreg", LogisticRegression(solver='liblinear'))])
        clf.fit(X_train, y_train)

        train_acc = clf.score(X_train, y_train)
        test_acc = clf.score(X_test, y_test)
        print("- {}: {:.4f} train, {:.4f} test".format(name, train_acc, test_acc))

task comment-growth
- 1723 positive, 1723 negative pts for comment-growth task
- hyperconvo: 0.5917 train, 0.5362 test
- reply tree: 0.5688 train, 0.5362 test


KeyboardInterrupt: 