In [1]:
import os
os.chdir("../..")

In [2]:
import convokit

In [3]:
from convokit import Corpus

In [4]:
fake_news_dir = '/sauna/fake-news'
# politics_dir = '/sauna/reddit_201810_raw/corpus/pokemontrades_banlist~-~politics/politics'

magicTCG_corpus = '/sauna/reddit_201810_raw/corpus/macosxapps~-~magicTCG/magicTCG/'

In [5]:
corpus = Corpus(filename=magicTCG_corpus)
# corpus = Corpus(filename=os.path.join(fake_news_dir, 'relationships_trunc_paired'))

In [6]:
# Start time: 01 Oct 2017
# End time: 01 Oct 2018
start_time = 1506816000
end_time = 1538352000

In [7]:
valid_convo_ids = [convo.id for convo in corpus.iter_conversations() if start_time <= convo.meta['timestamp'] <= end_time]

In [8]:
valid_convo_ids = set(valid_convo_ids)

In [9]:
len(valid_convo_ids)

52009

In [10]:
corpus.filter_conversations_by(lambda convo: convo.id in valid_convo_ids)

In [11]:
corpus.print_summary_stats()

Number of Users: 74943
Number of Utterances: 1699392
Number of Conversations: 52009


In [34]:
# corpus.dump('politeness_trunc', base_path=fake_news_dir)

In [12]:
threads = corpus.utterance_threads(include_root=False)

In [13]:
# Use only the first 10 comments in each thread
thread_pfxs = corpus.utterance_threads(prefix_len=10, include_root=False)

In [14]:
len(threads)

473570

In [15]:
from collections import defaultdict
thread_roots_by_self_post = defaultdict(list)
for top_level_comment, thread in threads.items():
    rt = thread[next(iter(thread))].root
    thread_roots_by_self_post[rt].append(top_level_comment)

In [16]:
# first generate positive and negative examples based on task
import random
def generate_pos_neg(task: str, post_to_thread_obj, threads, thread_pfxs):
    pos, neg = [], []
    if task == "comment-growth":
        for post_id, thread_roots in post_to_thread_obj.items():
            has_pos = [root for root in thread_roots if len(threads[root]) >= 15]
            has_neg = [root for root in thread_roots if len(threads[root]) == 10]
            
            if has_pos and has_neg:
                pos.append(random.choice(has_pos))
                neg.append(random.choice(has_neg))
    elif task == "commenter-growth":
        for post_id, thread_roots in post_to_thread_obj.items():
            has_pos, has_neg = [], []
            for root in thread_roots:
                if len(threads[root]) >= 20:
                    if len(set(c.user.name for c in threads[root].values())) >= \
                        len(set(c.user.name for c in thread_pfxs[root].values())) * 2:
                            has_pos.append(root)
                    else:
                        has_neg.append(root)
            if has_pos and has_neg:
                pos.append(random.choice(has_pos))
                neg.append(random.choice(has_neg))
    print("- {} positive, {} negative pts for {} task".format(len(pos), len(neg), task))
    
    return pos, neg

In [17]:
pos_comment_growth, neg_comment_growth = generate_pos_neg("comment-growth", 
                                                          thread_roots_by_self_post,
                                                          threads,
                                                          thread_pfxs
                                                         )

- 1850 positive, 1850 negative pts for comment-growth task


In [18]:
pos_commenter_growth, neg_commenter_growth = generate_pos_neg("commenter-growth", 
                                                          thread_roots_by_self_post,
                                                          threads,
                                                          thread_pfxs
                                                         )

- 1263 positive, 1263 negative pts for commenter-growth task


In [19]:
thread_to_convo = {thread_id: convo_id for convo_id, thread_ids in thread_roots_by_self_post.items() for thread_id in thread_ids}

In [20]:
commenter_growth_convos = set()
for thread_id in pos_commenter_growth + neg_commenter_growth:
    commenter_growth_convos.add(thread_to_convo[thread_id])

In [21]:
comment_growth_convos = set()
for thread_id in pos_comment_growth + neg_comment_growth:
    comment_growth_convos.add(thread_to_convo[thread_id])

In [22]:
paired_convos = comment_growth_convos.union(commenter_growth_convos)

In [41]:
corpus.filter_conversations_by(lambda convo: convo.id in paired_convos)

In [43]:
corpus.print_summary_stats()

Number of Users: 45942
Number of Utterances: 678291
Number of Conversations: 2594


In [44]:
corpus.dump('magicTCG_trunc_paired', base_path=fake_news_dir)

In [25]:
hc = convokit.HyperConvo(prefix_len=10, min_thread_len=10, include_root=False)
hyperconvo_feats = hc.retrieve_feats(corpus)

  "norm.max": lambda l: np.max(l) / np.sum(l),
  if len(l) > 1 else np.nan,
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  pk = 1.0*pk / np.sum(pk, axis=0)
  if len(l) > 1 else np.nan


In [26]:
path_stats = hc.retrieve_motif_pathway_stats(corpus)

In [27]:
motif_counts = hc.retrieve_motif_counts(corpus)

In [28]:
threads_motifs = hc.retrieve_motifs(corpus)

In [29]:
import pandas as pd

In [30]:
hyperconv_df = pd.DataFrame.from_dict(hyperconvo_feats, orient='index')
hyperconv_feat_names = list(hyperconv_df.columns)

In [31]:
path_stats_df = pd.DataFrame.from_dict(path_stats, orient='index')
columns = ['PATH-'+', '.join(filter(lambda x: type(x) == str, col)).strip() for col in path_stats_df.columns.values]
path_stats_df.columns = columns
path_feat_names = list(path_stats_df.columns)

In [32]:
motif_counts_df = pd.DataFrame.from_dict(motif_counts, orient='index')
motif_feat_names = list(motif_counts_df.columns)

In [33]:
def get_num_users(thread):
    return len(set(utt.user.name for utt in thread.values()))

thread_to_usercount = dict()
for thread_id in thread_pfxs:
    thread_to_usercount[thread_id] = {"num_users": get_num_users(thread_pfxs[thread_id])}

In [34]:
num_users_df = pd.DataFrame.from_dict(thread_to_usercount, orient='index')
num_users_feat = ['num_users']

In [35]:
feats_df = pd.concat([hyperconv_df, path_stats_df, motif_counts_df, num_users_df], axis=1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [36]:
feats_df.shape

(473570, 261)

In [37]:
# threads = {k: v for k, v in corpus.utterance_threads(include_root=False).items() if k in valid_threads}

In [38]:
# Use only the first 10 comments in each thread
# thread_pfxs = {k: v for k, v in corpus.utterance_threads(prefix_len=10, include_root=False).items() if k in valid_threads}

In [39]:
len(set(feats_df.index))

473570

In [40]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut 
from sklearn.model_selection import cross_val_score
import numpy as np

for task in ["comment-growth", "commenter-growth"]: #, "post-deleted", "user-deleted"
    print("TASK: {}\n".format(task))
    
    if task == "comment-growth":
        pos, neg = pos_comment_growth, neg_comment_growth
    elif task == "commenter-growth":
        pos, neg = pos_commenter_growth, neg_commenter_growth
#     pos, neg = generate_pos_neg(task, thread_roots_by_self_post, threads, thread_pfxs)
    for feature_set, name in [(hyperconv_feat_names, "hyperconvo"),
                        (hyperconv_feat_names + num_users_feat, "hyperconv-usercount"),
                        (motif_feat_names, "motif"),
                        (motif_feat_names, "motif-usercount"),
                        (path_feat_names, "motifpaths"),
                        (path_feat_names + num_users_feat, "motifpaths-usercount"),
                        (hyperconv_feat_names + motif_feat_names, "hyperconv-motif"),
                        (hyperconv_feat_names + path_feat_names, "hyperconv-paths"),
                        (hyperconv_feat_names + motif_feat_names + path_feat_names, "hyperconvo-motifall"),
                        (hyperconv_feat_names + motif_feat_names + path_feat_names + num_users_feat, "hyperconvo-motifall"),
                        (num_users_feat, "usercount")
                       ]:
        clf = Pipeline([("standardScaler", StandardScaler()), ("logreg", LogisticRegression(solver='liblinear'))])      
#         loo = LeaveOneOut()
        pp = convokit.PairedPrediction()
        X, y = pp._generate_paired_X_y(feats_df[feature_set], pos, neg)

#         clf.fit(X, y)
#         clf.score(X, y)
#         print(X.shape)
#         print(X[0])
#         print(y.shape)
        scores = cross_val_score(clf, X, y, cv=20)
        print("- {}, cv_accuracy: {:.4f}".format(name, scores.mean()))


#         print("Feature set: {}".format(name))
#         pp.fit_predict(feats_df[feature_set], pos, neg, test_size=0.2)
#         pp.print_extreme_coefs(feature_set, num_features=5)

TASK: comment-growth

Excluded 18 data point(s) that contained NaN values.
- hyperconvo, cv_accuracy: 0.5716
Excluded 18 data point(s) that contained NaN values.
- hyperconv-usercount, cv_accuracy: 0.5711
- motif, cv_accuracy: 0.5973
- motif-usercount, cv_accuracy: 0.5973
- motifpaths, cv_accuracy: 0.5968
- motifpaths-usercount, cv_accuracy: 0.5963
Excluded 18 data point(s) that contained NaN values.
- hyperconv-motif, cv_accuracy: 0.5945
Excluded 18 data point(s) that contained NaN values.
- hyperconv-paths, cv_accuracy: 0.5891
Excluded 18 data point(s) that contained NaN values.
- hyperconvo-motifall, cv_accuracy: 0.5902
Excluded 18 data point(s) that contained NaN values.
- hyperconvo-motifall, cv_accuracy: 0.5907
- usercount, cv_accuracy: 0.5865
TASK: commenter-growth

Excluded 11 data point(s) that contained NaN values.
- hyperconvo, cv_accuracy: 0.5568
Excluded 11 data point(s) that contained NaN values.
- hyperconv-usercount, cv_accuracy: 0.5591
- motif, cv_accuracy: 0.5422
- mo