In [1]:
import os

In [2]:
# Locations of the three datasets used in this notebook: the fake-news label
# set, and the r/news and r/The_Donald Reddit corpora.
fake_news_dir, news_dir, donald_dir = (
    '/sauna/fake-news',
    '/sauna/reddit_201810_raw/corpus/newreddits_nsfw~-~news/news/',
    '/sauna/reddit_201810_raw/corpus/TheTwoBeerQueers~-~The_Donald/The_Donald/',
)

In [4]:
os.chdir(fake_news_dir)

In [5]:
os.listdir()

['labels.csv',
 'articles.db',
 '.DS_Store',
 'titles.tar',
 'articles.tar',
 'articles',
 'titles']

In [6]:
import pandas as pd

In [7]:
df = pd.read_csv('labels.csv', index_col=0)

In [8]:
df.head(5)

Unnamed: 0,"NewsGuard, Does not repeatedly publish false content","NewsGuard, Gathers and presents information responsibly","NewsGuard, Regularly corrects or clarifies errors","NewsGuard, Handles the difference between news and opinion responsibly","NewsGuard, Avoids deceptive headlines","NewsGuard, Website discloses ownership and financing","NewsGuard, Clearly labels advertising","NewsGuard, Reveals who's in charge, including any possible conflicts of interest","NewsGuard, Provides information about content creators","NewsGuard, score",...,"Allsides, community_agree","Allsides, community_disagree","Allsides, community_label","BuzzFeed, leaning","PolitiFact, Pants on Fire!","PolitiFact, False","PolitiFact, Mostly False","PolitiFact, Half-True","PolitiFact, Mostly True","PolitiFact, True"
21stCenturyWire,,,,,,,,,,,...,,,,left,,,,,,
ABC News,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,95.0,...,8964.0,6949.0,somewhat agree,,,,,,,
AMERICAblog News,,,,,,,,,,,...,,,,left,,,,,,
Activist Post,,,,,,,,,,,...,,,,left,,,,,,
Addicting Info,,,,,,,,,,,...,,,,left,,,,,,


In [9]:
df.shape

(194, 57)

In [10]:
# Dropping rows with any na
df.dropna().shape

(0, 57)

In [11]:
# Keep only the columns that contain no missing values at all.
df2 = df.dropna(axis='columns', how='any')

In [12]:
df2.shape

(194, 0)

In [13]:
# Dropping rows with all na
df.dropna(how='all').shape

(155, 57)

In [14]:
# Dropping columns with all na
df[df.columns[~df.isnull().all()]].shape

(194, 56)

In [15]:
# Discard rows that are entirely empty, then columns that are entirely empty
# in what remains. The result is bound back to `df`.
df = (
    df
    .dropna(axis='index', how='all')
    .dropna(axis='columns', how='all')
)

In [16]:
df.shape

(155, 56)

In [17]:
df.head()

Unnamed: 0,"NewsGuard, Does not repeatedly publish false content","NewsGuard, Gathers and presents information responsibly","NewsGuard, Regularly corrects or clarifies errors","NewsGuard, Handles the difference between news and opinion responsibly","NewsGuard, Avoids deceptive headlines","NewsGuard, Website discloses ownership and financing","NewsGuard, Clearly labels advertising","NewsGuard, Reveals who's in charge, including any possible conflicts of interest","NewsGuard, Provides information about content creators","NewsGuard, score",...,"Allsides, community_agree","Allsides, community_disagree","Allsides, community_label","BuzzFeed, leaning","PolitiFact, Pants on Fire!","PolitiFact, False","PolitiFact, Mostly False","PolitiFact, Half-True","PolitiFact, Mostly True","PolitiFact, True"
21stCenturyWire,,,,,,,,,,,...,,,,left,,,,,,
ABC News,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,95.0,...,8964.0,6949.0,somewhat agree,,,,,,,
AMERICAblog News,,,,,,,,,,,...,,,,left,,,,,,
Activist Post,,,,,,,,,,,...,,,,left,,,,,,
Addicting Info,,,,,,,,,,,...,,,,left,,,,,,


**Notes**:
- Every row has missing data. Every column has missing data. 
- 39 rows have no data at all. 1 column has no data at all.
- NewsGuard, by inspection, is sometimes quite odd. Al Jazeera is deemed not credible (see overall_class.) Bipartisan Report is recorded as: Does not repeatedly publish false content, Gathers and presents information responsibly, Regularly corrects or clarifies errors, Handles the difference between news and opinion responsibly, Avoids deceptive headlines, Website discloses ownership and financing -- yet it somehow has one of the lowest credibility scores. In fact, the organizations that score the highest on these have 0s (i.e. False) for all these metrics, suggesting that the data has been flipped?
- We don't have by-article labels, only by-source labels. In other words, we will be assuming that all news from a site is either real or fake. 

To get a clear signal for whether a site has real or fake news, we will use the labels that are unequivocal about the site's classification. We also generally limit these to binary features, as the non-binary features are counts that are not straightforward to interpret.

**Fake news**
- Media Bias / Fact Check, fake_news (turns out that 0 sites have fake_news == 1)
- Media Bias / Fact Check, conspiracy
- Media Bias / Fact Check, pseudoscience

**Real news**
- NewsGuard, score: 100
- Media Bias / Fact Check, factual_reporting: 5

In [18]:
# A source is treated as fake news when any of these three
# Media Bias / Fact Check flags equals 1.
is_flagged_fake = df['Media Bias / Fact Check, fake_news'] == 1
is_flagged_conspiracy = df['Media Bias / Fact Check, conspiracy'] == 1
is_flagged_pseudoscience = df['Media Bias / Fact Check, pseudoscience'] == 1
fake_news_df = df[is_flagged_fake | is_flagged_conspiracy | is_flagged_pseudoscience]

In [19]:
fake_news_df.shape

(9, 56)

In [20]:
# Index labels of the fake-news rows, as a plain list.
fake_news_sites = [site for site in fake_news_df.index]

In [21]:
fake_news_sites

['Breitbart',
 'DC Gazette',
 'Daily Mail',
 'LewRockwell',
 'Newswars',
 'Pamela Geller Report',
 'The D.C. Clothesline',
 'The Gateway Pundit',
 'True Activist']

In [22]:
# A source is treated as real news when its NewsGuard score equals 100 or its
# Media Bias / Fact Check factual_reporting value equals 5.
has_newsguard_100 = df['NewsGuard, score'] == 100
has_factual_reporting_5 = df['Media Bias / Fact Check, factual_reporting'] == 5
real_news_df = df[has_newsguard_100 | has_factual_reporting_5]

In [23]:
real_news_df.shape

(30, 56)

In [24]:
# Index labels of the real-news rows, as a plain list.
real_news_sites = [site for site in real_news_df.index]

## Finding posts that appear in both subreddits

In [26]:
os.chdir('/home/caleb/Cornell-Conversational-Analysis-Toolkit')

In [27]:
from convokit import Corpus
import convokit

In [28]:
donald_corpus = Corpus(filename=donald_dir)

In [29]:
donald_corpus.print_summary_stats()

Number of Users: 617191
Number of Utterances: 38640598
Number of Conversations: 3830155


In [30]:
next(donald_corpus.iter_conversations())

Conversation({'_owner': <convokit.model.corpus.Corpus object at 0x7f2f6f6ee950>, '_id': '4cstwp', '_utterance_ids': ['4cstwp', 'd1l5e8t', 'd1l5egq', 'd1l5grh', 'd1l5h2s', 'd1l5h5z', 'd1l5mm4', 'd1l5mut', 'd1l5ocj', 'd1l5q5c', 'd1l5qsl', 'd1l5ssq', 'd1l5thm', 'd1l5ukw', 'd1l5vsv', 'd1l5vt6', 'd1l5vu7', 'd1l5wyn', 'd1l5x8q', 'd1l5y03', 'd1l5z1q', 'd1l615j', 'd1l65gw', 'd1l65or', 'd1l66fm', 'd1l692u', 'd1l6afl', 'd1l6nds', 'd1l6syz', 'd1l778w', 'd1l7aaj', 'd1l7dmi', 'd1l7q34', 'd1l8872', 'd1l8ihv', 'd1l8onm', 'd1l8rrl', 'd1lblj8', 'd1lbnhe', 'd1lc1b1', 'd1lchkh', 'd1leke9', 'd1lemam', 'd1lfz68', 'd1li0aq', 'd1li3d8', 'd1lit2w', 'd1lkela', 'd1lsh9u', 'd1ly9pn'], '_usernames': None, '_meta': {'title': 'Who is going to match my level of cringe?!', 'num_comments': 45, 'domain': 'imgur.com', 'timestamp': 1459468808, 'subreddit': 'The_Donald', 'gilded': 0, 'gildings': None, 'stickied': False, 'author_flair_text': 'NY'}})

In [31]:
# One (title, domain) pair per conversation, read from the conversation metadata.
donald_title_domains = []
for conversation in donald_corpus.iter_conversations():
    conversation_meta = conversation.meta
    donald_title_domains.append((conversation_meta['title'], conversation_meta['domain']))

In [32]:
len(donald_title_domains)

3830155

In [33]:
# Collapse identical (title, domain) pairs; the name is rebound from a list to a set.
donald_title_domains = {*donald_title_domains}

In [34]:
len(donald_title_domains)

3674684

In [35]:
fake_news_sites

['Breitbart',
 'DC Gazette',
 'Daily Mail',
 'LewRockwell',
 'Newswars',
 'Pamela Geller Report',
 'The D.C. Clothesline',
 'The Gateway Pundit',
 'True Activist']

In [36]:
# Hand-written domain for each entry of fake_news_sites, in the same order as
# that list is displayed.
fake_news_urls = {
    'breitbart.com',
    'dcgazette.com',
    'dailymail.co.uk',
    'lewrockwell.com',
    'newswars.com',
    'gellerreport.com',
    'dcclothesline.com',
    'thegatewaypundit.com',
    'trueactivist.com',
}

In [37]:
# Keep only the (title, domain) pairs whose domain is one of the fake-news domains.
filtered_donald_domains = {
    (title, domain)
    for title, domain in donald_title_domains
    if domain in fake_news_urls
}

In [38]:
len(filtered_donald_domains)

98334

In [40]:
from collections import Counter

In [41]:
# Tally how many of the filtered pairs belong to each domain.
domains_ctr = Counter(domain for _, domain in filtered_donald_domains)

Number of posts per website (after removing duplicate (title, domain) pairs; this does not always remove duplicate articles, because the URL is not part of the conversation metadata)

In [42]:
domains_ctr 

Counter({'breitbart.com': 61778,
         'thegatewaypundit.com': 18591,
         'dailymail.co.uk': 16675,
         'dcclothesline.com': 248,
         'lewrockwell.com': 276,
         'trueactivist.com': 43,
         'gellerreport.com': 301,
         'newswars.com': 404,
         'dcgazette.com': 18})

In [46]:
# Load two pickled frames and bind them to fake_news_df / real_news_df.
# NOTE: these names were previously bound to the filtered labels frames; after
# this cell they refer to the pickled frames instead, whose rows are read via a
# `Title` attribute in the following cell.
# The paths are relative, so they resolve against the current working directory.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading; only
# use it on files from a trusted source.
fake_news_df = pd.read_pickle('examples/fake-news/fake_news_df.pkl')
real_news_df = pd.read_pickle('examples/fake-news/real_news_df.pkl')

In [86]:
# defaultdict is not used in this cell; the import is kept in case other cells
# rely on it.
from collections import defaultdict

# Lower-cased titles of every row in each frame, collected into sets so that
# later membership checks are hash lookups. Iterating the `Title` column
# directly avoids iterrows(), which builds a Series for every row just to read
# one field.
fake_news_titles = {title.lower() for title in fake_news_df['Title']}
real_news_titles = {title.lower() for title in real_news_df['Title']}

In [76]:
# There are about 20000 fake news titles and 120000 real news titles

In [96]:
# Collect the distinct lower-cased r/The_Donald conversation titles that also
# occur in the fake-news or real-news title sets. A title found in the
# fake-news set is never checked against the real-news set.
fake_news_matches_donald, real_news_matches_donald = set(), set()
for conversation in donald_corpus.iter_conversations():
    lowered_title = conversation.meta['title'].lower()
    if lowered_title in fake_news_titles:
        fake_news_matches_donald.add(lowered_title)
        continue
    if lowered_title in real_news_titles:
        real_news_matches_donald.add(lowered_title)

In [88]:
# Ten most frequent entries of fake_news_matches.
# NOTE(review): fake_news_matches is not assigned in any cell shown in this
# notebook -- confirm where it comes from before re-running on a fresh kernel.
Counter(fake_news_matches).most_common(10)

[('breitbart.com', 119),
 ('i.redd.it', 102),
 ('self.The_Donald', 98),
 ('thegatewaypundit.com', 64),
 ('archive.is', 37),
 ('imgur.com', 35),
 ('infowars.com', 35),
 ('youtube.com', 24),
 ('dailycaller.com', 23),
 ('i.imgur.com', 22)]

In [89]:
# Ten most frequent entries of real_news_matches.
# NOTE(review): real_news_matches is not assigned in any cell shown in this
# notebook -- confirm where it comes from before re-running on a fresh kernel.
Counter(real_news_matches).most_common(10)

[('i.redd.it', 213),
 ('self.The_Donald', 101),
 ('thehill.com', 67),
 ('imgur.com', 41),
 ('i.imgur.com', 33),
 ('youtube.com', 32),
 ('bbc.com', 30),
 ('edition.cnn.com', 30),
 ('i.reddituploads.com', 27),
 ('cnn.com', 27)]

In [80]:
# Totals: print the number of entries in each match collection.
# NOTE(review): fake_news_matches / real_news_matches are not assigned in any
# cell shown in this notebook -- confirm their origin before relying on these
# counts.
print("Number of fake news matches (potential overlap): ", len(fake_news_matches))
print("Number of real news matches (potential overlap): ", len(real_news_matches))

Number of fake news matches (potential overlap):  760
Number of real news matches (potential overlap):  1178


In r/The_Donald:
- Match on lowercased string (possible overlaps): Fake (758), Real (1175)
- Match on lowercased punctuation escaped string (possible overlaps): Fake (760), Real (1178)
- Match on lowercased string (no overlaps): Fake (380), Real (642)

**Note:** overlaps occur because some titles may be posted multiple times.

In [97]:
news_corpus = Corpus(filename=news_dir)

In [98]:
# Collect the distinct lower-cased r/news conversation titles that also occur
# in the fake-news or real-news title sets; the fake-news set takes precedence
# when a title is in both.
fake_news_matches_news, real_news_matches_news = set(), set()
for conversation in news_corpus.iter_conversations():
    lowered_title = conversation.meta['title'].lower()
    if lowered_title in fake_news_titles:
        target_set = fake_news_matches_news
    elif lowered_title in real_news_titles:
        target_set = real_news_matches_news
    else:
        continue
    target_set.add(lowered_title)

In [100]:
len(fake_news_matches_news)

59

In [101]:
len(real_news_matches_news)

1942

In r/news:

- Match on lowercased string (no overlaps): Fake (59), Real (1942)

Fake news titles that appear in both r/The_Donald and r/news:

In [105]:
len(fake_news_matches_news.intersection(fake_news_matches_donald))

24

In [106]:
fake_news_matches_news.intersection(fake_news_matches_donald)

{'adam schiff spoofed with russian claim of nude trump pic',
 'calexit',
 'california',
 'california judge rules twitter can be sued for falsely advertising free speech',
 'climate change',
 'dc police respond to mass shooting threat at event in trump hotel',
 'draining the swamp',
 'fbi admits it used multiple spies to infiltrate trump campaign',
 'free at last',
 'leftist antifa terrorist arrested with bombs with plans to sell to friends to kill law enforcement',
 'live trump press conference',
 'michael avenatti locks account after admitting kavanaugh accuser might not come forward',
 'remembrance',
 'republican senators doxxed by someone in house shortly after questioning kavanaugh',
 'saudi arabia',
 'social media',
 'ted cruz defeats jimmy kimmel in charity basketball game',
 'texas teenager attacked for wearing make america great again hat',
 'trump administration seeks public comments on marijuana reclassification',
 'trump mulls pulling some federal agents from california',
 '

In [107]:
# Real news that appears in both r/news and r/The_Donald

In [108]:
len(real_news_matches_news.intersection(real_news_matches_donald))

143

## Let's condense r/The_Donald to its matches and proceed with a simpler task

In [113]:
# For every matched title, remember the r/The_Donald conversation with the
# largest num_comments. On a tie the conversation seen first is kept. At this
# point the dict values are conversation objects, not ids.
title_to_convoid_fake = {}
title_to_convoid_real = {}
for conversation in donald_corpus.iter_conversations():
    lowered_title = conversation.meta['title'].lower()
    if lowered_title in fake_news_matches_donald:
        best_by_title = title_to_convoid_fake
    elif lowered_title in real_news_matches_donald:
        best_by_title = title_to_convoid_real
    else:
        continue
    incumbent = best_by_title.get(lowered_title)
    if incumbent is None or conversation.meta['num_comments'] > incumbent.meta['num_comments']:
        best_by_title[lowered_title] = conversation

In [114]:
len(title_to_convoid_fake)

380

In [115]:
len(title_to_convoid_real)

642

In [119]:
# Replace each stored conversation with its id. Values that are already
# strings are left as they are, so running this cell a second time is a no-op
# instead of raising AttributeError on `str.id`.
title_to_convoid_fake = {
    title: convo if isinstance(convo, str) else convo.id
    for title, convo in title_to_convoid_fake.items()
}

In [121]:
# Replace each stored conversation with its id. Values that are already
# strings are left as they are, so running this cell a second time is a no-op
# instead of raising AttributeError on `str.id`.
title_to_convoid_real = {
    title: convo if isinstance(convo, str) else convo.id
    for title, convo in title_to_convoid_real.items()
}

In [None]:
os.chdir('./examples/fake-news')

In [117]:
# json is not imported by any other cell shown in this notebook, so import it
# here to keep the cell runnable on a fresh kernel.
import json

# Write both title -> conversation mappings as JSON files in the current
# working directory.
with open('donald_fake_titles.json', 'w') as f:
    json.dump(title_to_convoid_fake, f)

with open('donald_real_titles.json', 'w') as f:
    json.dump(title_to_convoid_real, f)

KeyboardInterrupt: 