In [1]:
import os

In [2]:
fake_news_dir = '/sauna/fake-news'
news_dir = '/sauna/reddit_201810_raw/corpus/newreddits_nsfw~-~news/news/'
donald_dir = '/sauna/reddit_201810_raw/corpus/TheTwoBeerQueers~-~The_Donald/The_Donald/'

In [2]:
import pandas as pd

In [3]:
os.chdir('/home/caleb/Cornell-Conversational-Analysis-Toolkit')

In [4]:
import convokit
from convokit import Corpus, Conversation, Utterance

In [8]:
donald_corpus = Corpus(filename=donald_dir)

In [23]:
news_corpus = Corpus(filename=news_dir)

In [7]:
def get_url_to_convo_dict(corpus: Corpus):
    url_to_convo = dict()
    for convo in corpus.iter_conversations():
        url = convo.meta['url']
        if url in url_to_convo:
            convo1_len = len(list(url_to_convo[url].iter_utterances()))
            convo2_len = len(list(convo.iter_utterances()))
            if convo2_len > convo1_len:
                url_to_convo[url] = convo
        else:
            url_to_convo[url] = convo
    return url_to_convo

In [8]:
url_to_convo_donald = get_url_to_convo_dict(donald_corpus)

In [15]:
url_to_convo_news = get_url_to_convo_dict(news_corpus)

In [22]:
len(url_to_convo_donald)

3514345

In [27]:
len(url_to_convo_news)

3543168

In [16]:
common_urls = set(url_to_convo_donald.keys() & url_to_convo_news.keys())

In [17]:
print("No. of common urls:", len(common_urls))

No. of common urls: 54022


In [32]:
list(common_urls)[:10]

['https://medium.com/@fightfortheftr/at-t-paid-200-000-to-trumps-attorney-michael-cohen-and-the-payments-stop-right-after-trump-s-3356687f4827',
 'http://www.breitbart.com/texas/2016/08/18/thousands-middle-eastern-illegal-immigrants-busted-forged-papers-border/',
 'https://www.bostonglobe.com/metro/2018/06/13/about-face-hospital-will-disperse-portraits-past-white-male-luminaries-put-focus-diversity/0pICgbpsw7QoHFFJQQEZOJ/story.html',
 'http://www.sgvtribune.com/government-and-politics/20170329/west-covina-walnut-leaders-say-sanctuary-state-bill-would-let-felons-go-free',
 'https://www.nytimes.com/2017/01/25/us/politics/cia-detainee-prisons.html?_r=0',
 'https://www.google.com/amp/s/amp.cnn.com/cnn/2017/08/24/us/charleston-active-shooter/index.html',
 'http://www.cbsnews.com/news/france-terrorist-act-thwarted-arrest-seven-strasbourg-marseille/',
 'https://nypost.com/2018/10/12/the-moment-a-kids-backpack-got-him-accused-of-sexual-assault/',
 'http://www.bbc.com/news/world-us-canada-38721

In [33]:
donald_convo_ids = {url_to_convo_donald[url].id for url in common_urls}
news_convo_ids = {url_to_convo_news[url].id for url in common_urls}

In [36]:
donald_corpus.filter_conversations_by(lambda convo: convo.id in donald_convo_ids)
news_corpus.filter_conversations_by(lambda convo: convo.id in news_convo_ids)

In [39]:
donald_corpus.dump('donald_corpus', base_path='/sauna/fake-news/fake-news-url-match')

In [40]:
news_corpus.dump('news_corpus', base_path='/sauna/fake-news/fake-news-url-match')

In [5]:
donald_corpus = Corpus(filename='/sauna/fake-news/fake-news-url-match/donald_corpus')

In [10]:
news_corpus = Corpus(filename='/sauna/fake-news/fake-news-url-match/news_corpus')

In [11]:
hc = convokit.HyperConvo(prefix_len=8, min_thread_len=8, include_root=False)

In [12]:
hc.fit_transform(donald_corpus)
hc.fit_transform(news_corpus)

  "norm.max": lambda l: np.max(l) / np.sum(l),
  if len(l) > 1 else np.nan,
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  pk = 1.0*pk / np.sum(pk, axis=0)
  if len(l) > 1 else np.nan


<convokit.model.corpus.Corpus at 0x7fa6b55ce650>

In [18]:
valid_convo_pairs = []
for url in common_urls:
    donald_convo = url_to_convo_donald[url]
    news_convo = url_to_convo_news[url]
    
    if "hyperconvo" in news_convo.meta and "hyperconvo" in donald_convo.meta:
        valid_convo_pairs.append((news_convo, donald_convo))


In [19]:
print("There are {} valid conversation pairs".format(len(valid_convo_pairs)))

There are 900 valid conversation pairs


In [21]:
# Subset corpus to conversations that have threads of sufficient length
donald_convo_trunc = set([x[1].id for x in valid_convo_pairs])
news_convo_trunc = set([x[0].id for x in valid_convo_pairs])

In [22]:
donald_corpus.filter_conversations_by(lambda convo: convo.id in donald_convo_trunc)
news_corpus.filter_conversations_by(lambda convo: convo.id in news_convo_trunc)

donald_corpus.dump('donald_corpus_trunc', base_path='/sauna/fake-news/fake-news-url-match')
news_corpus.dump('news_corpus_trunc', base_path='/sauna/fake-news/fake-news-url-match')

In [31]:
pos_convo_ids = [x[0].id for x in valid_convo_pairs] # news
neg_convo_ids = [x[1].id for x in valid_convo_pairs] # donald

In [25]:
import random

In [29]:
thread_pairs = []
for news_convo, donald_convo in valid_convo_pairs:
    news_thread_id = random.choice(list(news_convo.meta['hyperconvo'].keys()))
    donald_thread_id = random.choice(list(donald_convo.meta['hyperconvo'].keys()))
    thread_pairs.append((news_thread_id, donald_thread_id))

In [30]:
pos_thread_ids = [x for x, y in thread_pairs]
neg_thread_ids = [y for x, y in thread_pairs]

In [32]:
convo_to_thread = dict()
for idx in range(len(pos_convo_ids)):
    convo_to_thread[pos_convo_ids[idx]] = pos_thread_ids[idx]
    convo_to_thread[neg_convo_ids[idx]] = neg_thread_ids[idx]

In [41]:
corpus = news_corpus.merge(donald_corpus)



In [42]:
thread_ids = set(pos_thread_ids + neg_thread_ids)

In [43]:
path_stats = {k: v for k, v in hc.retrieve_motif_pathway_stats(corpus).items() if k in thread_ids}

In [44]:
threads_motifs = hc.retrieve_motifs(corpus)

In [45]:
latent_motif_count = {thread_id: hc._latent_motif_count(motif_dict, trans=False)[0] for thread_id, motif_dict in threads_motifs.items() if thread_id in thread_ids}

In [46]:
motif_counts = {k: v for k, v in hc.retrieve_motif_counts(corpus).items() if k in thread_ids}

In [49]:
hyperconvo_feats = {k: v for k, v in hc.retrieve_feats(corpus).items() if k in thread_ids}

In [50]:
# Use only the first 10 comments in each thread
thread_pfxs = corpus.utterance_threads(prefix_len=8, include_root=False)

def get_num_users(thread):
    return len(set(utt.user.name for utt in thread.values()))

thread_to_usercount = dict()
for thread_id in thread_pfxs:
    if thread_id in thread_ids:
        thread_to_usercount[thread_id] = {"num_users": get_num_users(thread_pfxs[thread_id])}

In [51]:
hyperconv_df = pd.DataFrame.from_dict(hyperconvo_feats, orient='index')
hyperconv_feat_names = list(hyperconv_df.columns)

In [52]:
path_stats_df = pd.DataFrame.from_dict(path_stats, orient='index')
columns = ['PATH-'+', '.join(filter(lambda x: type(x) == str, col)).strip() for col in path_stats_df.columns.values]
path_stats_df.columns = columns
path_feat_names = list(path_stats_df.columns)

In [53]:
motif_counts_df = pd.DataFrame.from_dict(motif_counts, orient='index')
motif_feat_names = list(motif_counts_df.columns)

In [54]:
latentmotif_df = pd.DataFrame.from_dict(latent_motif_count, orient='index')
latentmotif_df.columns = ['LATENT_'+c for c in latentmotif_df.columns]
latent_motif_feat_names = list(latentmotif_df.columns)

In [55]:
num_users_df = pd.DataFrame.from_dict(thread_to_usercount, orient='index')
num_users_feat = ['num_users']

In [58]:
hyperconv_df.shape

(1800, 144)

In [59]:
path_stats_df.shape

(1800, 100)

In [60]:
motif_counts_df.shape

(1800, 16)

In [61]:
latentmotif_df.shape

(1800, 16)

In [62]:
num_users_df.shape

(1800, 1)

In [63]:
feats_df.shape

(1800, 277)

In [64]:
feats_df = pd.concat([hyperconv_df, path_stats_df, motif_counts_df, latentmotif_df, num_users_df], axis=1, sort=False)

In [65]:
feats_df.shape

(1800, 277)

In [66]:
from convokit import PairedPrediction

In [68]:
random.seed(2019)
print("TASK: {}\n".format("Predicing if a thread is from r/news"))
    
for feature_set, name in [(hyperconv_feat_names, "hyperconvo"),
                    (hyperconv_feat_names + num_users_feat, "hyperconv-usercount"),
                    (latent_motif_feat_names, "latentmotif"),
                    (latent_motif_feat_names + num_users_feat, "latentmotif-usercount"),
                    (path_feat_names, "motifpaths"),
                    (path_feat_names + num_users_feat, "motifpaths-usercount"),
                    (hyperconv_feat_names + motif_feat_names, "hyperconv-motif"),
                    (hyperconv_feat_names + path_feat_names, "hyperconv-paths"),
                    (hyperconv_feat_names + latent_motif_feat_names, "hyperconv-latent"),
                    (hyperconv_feat_names + motif_feat_names + path_feat_names + latent_motif_feat_names, "hyperconvo-motifall"),
                    (num_users_feat, "usercount")
                   ]:
    pp = PairedPrediction()
    print("Feature set: {}".format(name))
    pp.fit_predict(feats_df[feature_set], pos_thread_ids, neg_thread_ids, test_size=0.2)
    pp.print_extreme_coefs(feature_set, num_features=5)

TASK: Predicing if a thread is from r/news

Feature set: hyperconvo
Excluded 23 data point(s) that contained NaN values.
Train accuracy: 0.6676
Test accuracy: 0.5966

TOP 5 FEATURES
mean-nonzero[outdegree over C->c responses]: 0.502
mean-nonzero[outdegree over C->C responses]: 0.502
entropy[outdegree over C->c mid-thread responses]: 0.485
entropy[outdegree over C->C mid-thread responses]: 0.485
norm.2nd-largest[outdegree over C->c mid-thread responses]: 0.422

BOTTOM 5 FEATURES
prop-nonzero[indegree over C->C mid-thread responses]: -0.587
max[indegree over C->C mid-thread responses]: -0.695
entropy[outdegree over C->c responses]: -0.701
entropy[outdegree over C->C responses]: -0.701
mean-nonzero[indegree over C->C mid-thread responses]: -0.768

Feature set: hyperconv-usercount
Excluded 23 data point(s) that contained NaN values.
Train accuracy: 0.6705
Test accuracy: 0.5909

TOP 5 FEATURES
entropy[outdegree over C->c mid-thread responses]: 0.497
entropy[outdegree over C->C mid-thread re

## Cumulative BoW paired

In [70]:
threads_text = dict()
for thread_id in thread_ids:
    threads_text[thread_id] = {"text": " ".join(utt.text for utt in thread_pfxs[thread_id].values())}

In [71]:
text_df = pd.DataFrame.from_dict(threads_text, orient='index')

In [74]:
from sklearn.model_selection import train_test_split
pos_neg_train, pos_neg_test = train_test_split(list(zip(pos_thread_ids, neg_thread_ids)), test_size=0.2, random_state=42)

In [75]:
pos_train = [x[0] for x in pos_neg_train]
neg_train = [x[1] for x in pos_neg_train] 
pos_test = [x[0] for x in pos_neg_test]
neg_test = [x[1] for x in pos_neg_test]

In [76]:
train_text_df = text_df.loc[pos_train + neg_train]
test_text_df = text_df.loc[pos_test + neg_test]

In [77]:
len(test_text_df)

360

In [199]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(min_df=0.01, max_df=0.7)

In [200]:
train_text_arr = cv.fit_transform(train_text_df['text'])
train_text_transform_df = pd.DataFrame(train_text_arr.toarray(), columns=cv.get_feature_names(), index=train_text_df.index)
train_text_transform_df.head()

Unnamed: 0,000,06,10,100,11,12,13,14,15,16,...,yesterday,yet,york,young,your,yourself,youtu,youtube,yup,zero
e3ka4ly,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
dzugzsa,0,0,0,0,0,0,0,2,0,0,...,0,2,0,2,7,0,0,0,0,0
d8vstww,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dfb128r,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
e8ho9zb,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [201]:
test_text_arr = cv.transform(test_text_df['text'])
test_text_transform_df = pd.DataFrame(test_text_arr.toarray(), columns=cv.get_feature_names(), index=test_text_df.index)   
test_text_transform_df.head()

Unnamed: 0,000,06,10,100,11,12,13,14,15,16,...,yesterday,yet,york,young,your,yourself,youtu,youtube,yup,zero
do7v72z,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
dwyzf3e,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
dflvadr,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dxxw1f7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
cvl1059,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [202]:
from pandas import DataFrame
import numpy as np
def _generate_paired_X_y(feats: DataFrame, pos_ids, neg_ids):

    X, y = [], []
    flip = True

    excluded = 0
    for idx in range(len(pos_ids)):
        pos_feats = np.array(feats.loc[pos_ids[idx]])
        neg_feats = np.array(feats.loc[neg_ids[idx]])

        if (np.isnan(pos_feats).any() or np.isnan(neg_feats).any()):
            excluded += 1
            continue

        if flip:
            y.append(1)
            diff = pos_feats - neg_feats
        else:
            y.append(0)
            diff = neg_feats - pos_feats

        X.append(diff)
        flip = not flip

    if excluded > 0:
        print("Excluded {} data point(s) that contained NaN values.".format(excluded))

    return np.array(X), np.array(y)

In [203]:
X_train, y_train = _generate_paired_X_y(train_text_transform_df, pos_train, neg_train)
X_test, y_test = _generate_paired_X_y(test_text_transform_df, pos_test, neg_test)

In [204]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
clf = Pipeline([("standardScaler", StandardScaler()), ("logreg", LogisticRegression(solver='liblinear'))])       

In [205]:
clf = clf.fit(X_train, y_train)

In [206]:
train_acc = clf.score(X_train, y_train)
test_acc = clf.score(X_test, y_test)
print("- {}: {:.4f} train, {:.4f} test".format("cumulative_bow", train_acc, test_acc))

- cumulative_bow: 1.0000 train, 0.7667 test


In [207]:
def print_extreme_coefs(clf, feature_names, num_features: int = 5):
    coefs = clf.named_steps['logreg'].coef_[0].tolist()

    assert len(feature_names) == len(coefs)

    feats_coefs = sorted(list(zip(feature_names, coefs)), key=lambda x: x[1], reverse=True)

    print()
    print("TOP {} FEATURES".format(num_features))
    for ft, coef in feats_coefs[:num_features]:
        print("{}: {:.3f}".format(ft, coef))
    print()
    print("BOTTOM {} FEATURES".format(num_features))
    for ft, coef in feats_coefs[-num_features:]:
        print("{}: {:.3f}".format(ft, coef))
    print()

In [208]:
print_extreme_coefs(clf, cv.get_feature_names(), num_features=20)


TOP 20 FEATURES
removed: 0.750
wing: 0.343
anything: 0.312
assume: 0.272
before: 0.262
states: 0.258
comes: 0.241
reference: 0.237
places: 0.232
people: 0.230
effect: 0.229
doesn: 0.229
worst: 0.225
focus: 0.222
ban: 0.216
saudi: 0.216
nonsense: 0.214
children: 0.212
blaming: 0.209
fox: 0.207

BOTTOM 20 FEATURES
quite: -0.203
total: -0.203
created: -0.203
please: -0.205
uses: -0.208
maga: -0.211
win: -0.212
sub: -0.215
special: -0.222
interview: -0.226
white: -0.236
now: -0.238
holding: -0.241
png: -0.242
leaders: -0.244
youtube: -0.244
soros: -0.250
kek: -0.281
pede: -0.286
cuck: -0.333

