In [1]:
import os

In [2]:
# Filesystem locations used by this notebook (absolute paths on the analysis machine).
# fake_news_dir: project directory; the dump cells further down write beneath it.
fake_news_dir = '/sauna/fake-news'
# Corpus directories handed to Corpus(filename=...) below.
# Presumably the r/news and r/The_Donald subreddit corpora -- confirm against the dataset layout.
news_dir = '/sauna/reddit_201810_raw/corpus/newreddits_nsfw~-~news/news/'
donald_dir = '/sauna/reddit_201810_raw/corpus/TheTwoBeerQueers~-~The_Donald/The_Donald/'

In [3]:
import pandas as pd

In [5]:
# Switch the working directory to a local ConvoKit checkout before importing convokit.
# NOTE(review): presumably so the checkout (not an installed release) is imported -- confirm.
# The path is machine-specific and will fail on any other host.
os.chdir('/home/caleb/Cornell-Conversational-Analysis-Toolkit')

In [7]:
import convokit
from convokit import Corpus, Conversation, Utterance

In [8]:
# Load the corpus stored under donald_dir.
donald_corpus = Corpus(filename=donald_dir)

In [23]:
# Load the corpus stored under news_dir.
news_corpus = Corpus(filename=news_dir)

In [15]:
def get_url_to_convo_dict(corpus: "Corpus"):
    """Map each conversation URL to the conversation with the most utterances.

    Args:
        corpus: object exposing ``iter_conversations()``; every yielded
            conversation must provide ``meta['url']`` and ``iter_utterances()``.

    Returns:
        dict mapping URL -> conversation. When several conversations share a
        URL, the one with strictly more utterances wins; on a tie the one
        encountered first is kept.

    Raises:
        KeyError: if a conversation's ``meta`` has no ``'url'`` entry.
    """
    url_to_convo = dict()
    # Utterance count of the conversation currently stored for a URL. Filled
    # lazily, so conversations with a unique URL are never counted at all.
    incumbent_len = dict()

    def _count_utterances(convo):
        # Stream the iterator instead of materialising a throwaway list.
        return sum(1 for _ in convo.iter_utterances())

    for convo in corpus.iter_conversations():
        url = convo.meta['url']
        if url not in url_to_convo:
            url_to_convo[url] = convo
            continue

        # Duplicate URL: count the incumbent once and reuse that number for
        # every later challenger rather than re-counting it each time.
        if url not in incumbent_len:
            incumbent_len[url] = _count_utterances(url_to_convo[url])

        challenger_len = _count_utterances(convo)
        if challenger_len > incumbent_len[url]:
            url_to_convo[url] = convo
            incumbent_len[url] = challenger_len
    return url_to_convo

In [16]:
# URL -> conversation lookup for donald_corpus (one conversation kept per URL).
url_to_convo_donald = get_url_to_convo_dict(donald_corpus)

In [26]:
# URL -> conversation lookup for news_corpus (one conversation kept per URL).
url_to_convo_news = get_url_to_convo_dict(news_corpus)

In [22]:
# Number of distinct URLs found in donald_corpus.
len(url_to_convo_donald)

3514345

In [27]:
# Number of distinct URLs found in news_corpus.
len(url_to_convo_news)

3543168

In [28]:
# URLs that appear as a key in both lookups, i.e. links present in both corpora.
common_urls = set(url_to_convo_donald).intersection(url_to_convo_news)

In [31]:
# Report how many URLs the two corpora share.
print("No. of common urls:", len(common_urls))

No. of common urls: 54022


In [32]:
# Peek at ten shared URLs. common_urls is a set, so which ten appear is arbitrary.
list(common_urls)[:10]

['https://medium.com/@fightfortheftr/at-t-paid-200-000-to-trumps-attorney-michael-cohen-and-the-payments-stop-right-after-trump-s-3356687f4827',
 'http://www.breitbart.com/texas/2016/08/18/thousands-middle-eastern-illegal-immigrants-busted-forged-papers-border/',
 'https://www.bostonglobe.com/metro/2018/06/13/about-face-hospital-will-disperse-portraits-past-white-male-luminaries-put-focus-diversity/0pICgbpsw7QoHFFJQQEZOJ/story.html',
 'http://www.sgvtribune.com/government-and-politics/20170329/west-covina-walnut-leaders-say-sanctuary-state-bill-would-let-felons-go-free',
 'https://www.nytimes.com/2017/01/25/us/politics/cia-detainee-prisons.html?_r=0',
 'https://www.google.com/amp/s/amp.cnn.com/cnn/2017/08/24/us/charleston-active-shooter/index.html',
 'http://www.cbsnews.com/news/france-terrorist-act-thwarted-arrest-seven-strasbourg-marseille/',
 'https://nypost.com/2018/10/12/the-moment-a-kids-backpack-got-him-accused-of-sexual-assault/',
 'http://www.bbc.com/news/world-us-canada-38721

In [33]:
# Ids of the conversations selected for each shared URL, one set per corpus.
# Sets give constant-time membership tests in the filter predicates that follow.
donald_convo_ids = {url_to_convo_donald[url].id for url in common_urls}
news_convo_ids = {url_to_convo_news[url].id for url in common_urls}

In [36]:
# Keep only the conversations whose id was selected for a shared URL.
# NOTE(review): the return values are discarded, so this relies on
# filter_conversations_by modifying the corpus in place -- confirm against the
# ConvoKit version in use. If so, re-running this cell is harmless, but earlier
# cells can no longer see the unfiltered corpora without reloading them.
donald_corpus.filter_conversations_by(lambda convo: convo.id in donald_convo_ids)
news_corpus.filter_conversations_by(lambda convo: convo.id in news_convo_ids)

In [39]:
# Persist the filtered corpus. The output directory is derived from fake_news_dir
# so the project root is spelled out in one place only; the resulting path is
# /sauna/fake-news/fake-news-url-match, the same as the previously hard-coded one.
donald_corpus.dump('donald_corpus', base_path=os.path.join(fake_news_dir, 'fake-news-url-match'))

In [40]:
# Persist the filtered corpus. The output directory is derived from fake_news_dir
# so the project root is spelled out in one place only; the resulting path is
# /sauna/fake-news/fake-news-url-match, the same as the previously hard-coded one.
news_corpus.dump('news_corpus', base_path=os.path.join(fake_news_dir, 'fake-news-url-match'))

In [57]:
# Configure the HyperConvo transformer.
# NOTE(review): presumably features are computed over the first 8 comments of
# threads with at least 8 comments, leaving out the root post -- confirm the
# parameter semantics in the ConvoKit HyperConvo documentation.
hc = convokit.HyperConvo(prefix_len=8, min_thread_len=8, include_root=False)

In [58]:
# Run HyperConvo over both filtered corpora; later cells read the per-thread
# features from each conversation's meta["hyperconvo"].
# The cell displays the return value of the second call (a Corpus object).
# NOTE(review): the numeric warning fragments captured in the output presumably
# come from feature statistics over degenerate inputs (they produce NaNs that
# are filtered out when the X/y rows are built) -- confirm.
hc.fit_transform(donald_corpus)
hc.fit_transform(news_corpus)

  "norm.max": lambda l: np.max(l) / np.sum(l),
  if len(l) > 1 else np.nan,
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  pk = 1.0*pk / np.sum(pk, axis=0)
  if len(l) > 1 else np.nan


<convokit.model.corpus.Corpus at 0x7ed806452490>

In [61]:
# Pair up, per shared URL, the news and The_Donald conversations that both
# received HyperConvo features (a "hyperconvo" entry in their meta).
# Iterating sorted(common_urls) rather than the raw set makes the pair order
# the same on every run: set order for strings changes between interpreter
# sessions, and later cells depend on this order (per-pair sampling and the
# alternating label assignment).
valid_convo_pairs = []
for url in sorted(common_urls):
    donald_convo = url_to_convo_donald[url]
    news_convo = url_to_convo_news[url]

    if "hyperconvo" in news_convo.meta and "hyperconvo" in donald_convo.meta:
        valid_convo_pairs.append((news_convo, donald_convo))


In [63]:
# Report how many URL-matched pairs have HyperConvo features on both sides.
print("There are {} valid conversation pairs".format(len(valid_convo_pairs)))

There are 900 valid conversation pairs


In [64]:
import random

In [69]:
# For every matched pair, sample one thread from each side and keep the two
# threads' HyperConvo feature entries as a (news, donald) tuple.
# A dedicated, seeded generator replaces the unseeded module-level
# random.choice: re-running this cell on the same valid_convo_pairs (in the
# same order) now selects the same threads, and the global random state is
# left untouched.
thread_rng = random.Random(42)
thread_pairs = []
for news_convo, donald_convo in valid_convo_pairs:
    news_threads = news_convo.meta['hyperconvo']
    donald_threads = donald_convo.meta['hyperconvo']
    news_thread_id = thread_rng.choice(list(news_threads))
    donald_thread_id = thread_rng.choice(list(donald_threads))
    thread_pairs.append((news_threads[news_thread_id], donald_threads[donald_thread_id]))

In [76]:
# Split the (news, donald) tuples into two parallel lists, one per community.
news_feats = [news_thread for news_thread, _ in thread_pairs]
donald_feats = [donald_thread for _, donald_thread in thread_pairs]

In [77]:
# One row per sampled thread; row i of news_df and row i of donald_df come from
# the same matched pair (both lists were built from thread_pairs in order).
news_df = pd.DataFrame(news_feats)
donald_df = pd.DataFrame(donald_feats)

In [83]:
# Sanity check: row count of the news feature table.
len(news_df)

900

In [84]:
import numpy as np

In [87]:
# Build a paired-difference dataset. For each matched pair with no NaN feature
# on either side, emit one row: alternately (news - donald) labelled 1 and
# (donald - news) labelled 0, so the two classes stay balanced.
X, y = [], []
flip = True
for i in range(len(news_df)):
    news_row = news_df.iloc[i]
    donald_row = donald_df.iloc[i]

    # Drop the pair entirely if either thread has a missing feature value.
    if np.isnan(news_row).any() or np.isnan(donald_row).any():
        continue

    label = 1 if flip else 0
    diff = (news_row - donald_row) if flip else (donald_row - news_row)
    y.append(label)
    X.append(diff)
    # Alternate only after a row was actually emitted.
    flip = not flip


In [90]:
# Tabular view of the difference rows collected in X (one row per kept pair).
X_df = pd.DataFrame(X)

In [91]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import normalize, StandardScaler, Normalizer

In [None]:
# Hold out 20% of the paired-difference rows for evaluation; the fixed
# random_state keeps the split repeatable. X_df holds the same rows as X and
# carries the feature names as column labels.
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

# Standardise the features, then fit a logistic regression on them.
clf = Pipeline([("standardScaler", StandardScaler()), ("logreg", LogisticRegression(solver='liblinear'))])
clf.fit(X_train, y_train)

# `name` and `feats` were used below without ever being assigned in this
# notebook; define them here so the cell runs on a fresh kernel.
name = "news vs. The_Donald"
feats = list(X_df.columns)

train_acc = clf.score(X_train, y_train)
test_acc = clf.score(X_test, y_test)
print("- {}: {:.4f} train, {:.4f} test".format(name, train_acc, test_acc))

# Inline report of the most extreme coefficients. It replaces a call to
# print_extreme_coefs, which is not defined or imported anywhere in this notebook.
# Label 1 was assigned to (news - donald) rows, so a positive coefficient marks
# a feature that tends to be larger in the news thread, a negative one a
# feature that tends to be larger in the The_Donald thread.
n_extreme = 5
coefs = pd.Series(clf.named_steps["logreg"].coef_[0], index=feats).sort_values()
print("Most negative coefficients (larger in The_Donald threads):")
print(coefs.head(n_extreme).to_string())
print("Most positive coefficients (larger in news threads):")
print(coefs.tail(n_extreme)[::-1].to_string())