In [30]:
import os

import ujson
from tqdm import tqdm
from peewee import SqliteDatabase, fn
from playhouse.shortcuts import model_to_dict

from database import Page

In [31]:
# Root directory of the dataset; other cells build file paths by string
# concatenation, so the trailing slash is required.
path_data = '/Volumes/ExternalSSD/FakeNewsRecognition/'

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Extract-the-reliable-sources" data-toc-modified-id="Extract-the-reliable-sources-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Extract the reliable sources</a></span></li><li><span><a href="#Insert-webhose-to-standard-Page-DB-format" data-toc-modified-id="Insert-webhose-to-standard-Page-DB-format-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Insert webhose to standard Page DB format</a></span></li></ul></div>

In [2]:
# Directory holding the raw webhose dump. Derived from `path_data` so the
# dataset root is configured in one place; the resulting string is unchanged.
# The trailing slash matters because file names are appended with `+`.
path = path_data + '624_webhose-2016-11_20170904081158/'

In [3]:
# Names of all entries in the raw dump directory. os.listdir returns them in
# arbitrary order, so anything that stops early depends on that order.
files = os.listdir(path)

In [8]:
# Tally how many articles each (site, language) pair contributes to the dump.
domain_language_counter = {}

with tqdm() as progress:
    for file_name in files:
        with open(path + file_name, 'r') as handle:
            # Each line of a dump file is parsed as one JSON article.
            for raw_line in handle:
                article = ujson.loads(raw_line)
                pair = (article['thread']['site_full'], article['language'])
                domain_language_counter[pair] = domain_language_counter.get(pair, 0) + 1

        # One progress tick per input file, not per article.
        progress.update()

499610it [06:18, 1320.41it/s]


In [12]:
# Rank the (domain, language) pairs by article count, most frequent first.
# sorted() returns a new list instead of sorting in place, so its result has
# to be bound to the name; otherwise the "_sorted" variable stays unsorted.
domain_language_counter_sorted = sorted(
    domain_language_counter.items(),
    key=lambda item: item[1],
    reverse=True,
)
domain_language_counter_sorted

[(('archive.org', 'english'), 50470),
 (('www.reuters.com', 'english'), 33991),
 (('answers.yahoo.com', 'english'), 24751),
 (('www.yahoo.com', 'english'), 22454),
 (('www.sfgate.com', 'english'), 22281),
 (('www.chron.com', 'english'), 18532),
 (('news.yahoo.com', 'english'), 16755),
 (('uk.finance.yahoo.com', 'english'), 12715),
 (('indianexpress.com', 'english'), 11672),
 (('www.huffingtonpost.com', 'english'), 10939),
 (('abcnews.go.com', 'english'), 8657),
 (('www.msn.com', 'english'), 7629),
 (('www.businessinsider.com', 'english'), 6681),
 (('uk.reuters.com', 'english'), 6574),
 (('www.theguardian.com', 'english'), 6370),
 (('ca.reuters.com', 'english'), 5771),
 (('www.wsj.com', 'english'), 5550),
 (('af.reuters.com', 'english'), 5416),
 (('www.nhl.com', 'english'), 5378),
 (('www.legacy.com', 'english'), 5301),
 (('tv.yahoo.com', 'english'), 5299),
 (('www.latimes.com', 'english'), 5167),
 (('www.washingtonpost.com', 'english'), 5074),
 (('sports.yahoo.com', 'english'), 5057),


# Extract the reliable sources

In [23]:
# Hand-picked whitelist of site domains treated as reliable sources.
# Order is kept exactly as originally written.
domains_reliable = [
    'www.reuters.com',
    'www.yahoo.com',
    'www.sfgate.com',
    'www.chron.com',
    'news.yahoo.com',
    'uk.finance.yahoo.com',
    'indianexpress.com',
    'www.huffingtonpost.com',
    'abcnews.go.com',
    'www.msn.com',
    'www.businessinsider.com',
    'uk.reuters.com',
    'www.theguardian.com',
    'ca.reuters.com',
    'www.wsj.com',
    'af.reuters.com',
    'www.nhl.com',
    'www.legacy.com',
    'www.latimes.com',
    'www.washingtonpost.com',
    'sports.yahoo.com',
    'www.forbes.com',
    'www.nytimes.com',
    'www.nydailynews.com',
    'www.bloomberg.com',
    'in.reuters.com',
    'www.usatoday.com',
    'www.ndtv.com',
    'music.yahoo.com',
    'www.cnbc.com',
    'ca.news.yahoo.com',
    'www.cbsnews.com',
    'www.marketwatch.com',
    'm.mlb.com',
    'nypost.com',
    'www.npr.org',
    'bleacherreport.com',
    'zeenews.india.com',
    'finance.yahoo.com',
    'feeds.reuters.com',
    'www.cbssports.com',
    'edition.cnn.com',
    'people.com',
    'www.cnet.com',
    'nz.sports.yahoo.com',
    'news.abs-cbn.com',
    'www.nba.com',
    'online.wsj.com',
    'de.finance.yahoo.com',
    'www.investing.com',
    'www.cnn.com',
    'www.eonline.com',
    'newsinfo.inquirer.net',
    'www.nbcnews.com',
    'www.politico.com',
    'profit.ndtv.com',
    'wiki.mozilla.org',
    'disneyworld.disney.go.com',
    'uk.news.yahoo.com',
    'video.foxnews.com',
    'au.finance.yahoo.com',
    'latino.foxnews.com',
    'nz.news.yahoo.com',
    'www.engadget.com',
    'www.vice.com',
    'www.wikihow.com',
    'money.cnn.com',
    'washpost.bloomberg.com',
    'au.be.yahoo.com',
    'www.realclearpolitics.com',
    'motherboard.vice.com',
    'au.news.yahoo.com',
    'www.nfl.com',
    'www.buzzfeed.com',
    'www.theatlantic.com',
    'www.foxnews.com',
]

In [24]:
# Set copy of the whitelist, used for the `domain in ...` membership tests.
domains_reliable_set = set(domains_reliable)

In [29]:
# Copy every article published by a whitelisted domain into one JSONL file
# (one JSON document per line).
with tqdm() as progress:
    with open(path_data + 'webhose/raw_reliable.jsonl', 'w') as out_raw_reliable:
        for file in files:
            with open(path + file, 'r') as in_file:
                # Walk every line, as the counting cell does. Taking only
                # readlines()[0] would drop any further articles in a file,
                # load the whole file into memory, and raise IndexError on an
                # empty file.
                for line in in_file:
                    article = ujson.loads(line)
                    domain = article['thread']['site_full']
                    if domain in domains_reliable_set:
                        out_raw_reliable.write(ujson.dumps(article) + '\n')

            # One progress tick per input file.
            progress.update()

499610it [11:07, 747.96it/s] 


In [21]:
# Collect a small sample of www.yahoo.com article URLs for manual inspection.
urls_sample = []
with tqdm() as progress:
    for file in files:
        with open(path + file, 'r') as in_file:
            line = in_file.readlines()[0]
            article = ujson.loads(line)
            domain = article['thread']['site_full']
            # The previous condition (`domain in domains_reliable_set 'www.yahoo.com'`)
            # was a syntax error; comparing against the single domain matches
            # the sample that was actually collected.
            # Optional extra filter that was disabled: len(article['text']) > 500
            if domain == 'www.yahoo.com':
                urls_sample.append(article['url'])
                # The check runs after the append, so the loop stops once 101
                # URLs have been collected.
                if len(urls_sample) > 100:
                    break

        progress.update()

# Display the last article that was parsed before the loop ended.
article

2608it [00:03, 722.29it/s]


{'author': '',
 'crawled': '2016-11-20T05:19:07.399+02:00',
 'entities': {'locations': [],
  'organizations': [{'name': 'getty images', 'sentiment': 'none'}],
  'persons': [{'name': 'eric mcc', 'sentiment': 'negative'},
   {'name': 'jane asher', 'sentiment': 'negative'},
   {'name': 'olivia munn', 'sentiment': 'none'},
   {'name': 'hailee steinfeld', 'sentiment': 'none'},
   {'name': 'julianne hough', 'sentiment': 'none'},
   {'name': 'jon kopaloff/filmmagic gigi hadid', 'sentiment': 'none'},
   {'name': 'ciara', 'sentiment': 'none'},
   {'name': 'robert downey jr.', 'sentiment': 'none'},
   {'name': 'matt bomer', 'sentiment': 'none'},
   {'name': 'chrissy teigen', 'sentiment': 'none'},
   {'name': 'nina dobrev', 'sentiment': 'none'},
   {'name': 'taraji p. henson', 'sentiment': 'none'},
   {'name': 'janelle monae', 'sentiment': 'none'},
   {'name': 'laura cavanaugh/filmmagic', 'sentiment': 'none'},
   {'name': 'karlie kloss', 'sentiment': 'none'},
   {'name': 'jay pharoah', 'sentiment

In [22]:
# Display the sampled article URLs.
urls_sample

['https://www.yahoo.com/news/britains-hull-grabs-lpga-tour-championship-lead-212834416--golf.html?ref=gs',
 'https://www.yahoo.com/tv/ruffled-trump-tweets-finger-wagging-174508165.html',
 'https://www.yahoo.com/music/ad-rock-joins-beastie-boys-203447300.html',
 'https://www.yahoo.com/news/melania-trump-not-moving-into-white-house-in-january-reports-181652896.html',
 'https://www.yahoo.com/news/haitians-vote-hoping-restore-constitutional-order-144522833.html?ref=gs',
 'https://www.yahoo.com/tv/saturday-night-live-ratings-dip-171057604.html',
 'https://www.yahoo.com/news/poacher-costa-sends-chelsea-top-table-183702470--sow.html?ref=gs',
 'https://www.yahoo.com/news/insigne-ends-goal-drought-brace-napolis-2-1-193341434.html?ref=gs',
 'https://www.yahoo.com/style/3-moves-guarantee-super-sexy-173002306.html',
 'https://www.yahoo.com/news/sweet-charity-theater-review-sutton-foster-galumphs-her-221450520.html',
 'https://www.yahoo.com/style/amazon-package-x-ray-going-184009116.html',
 'https:

In [17]:
# Character count of the text of whichever article was parsed last; `article`
# is left over from a loop cell, so the value depends on execution order.
len(article['text'])

412

# Insert webhose to standard Page DB format

In [51]:
# SQLite file that will hold the webhose articles in the Page format.
peewee_database_webhose = SqliteDatabase(path_data + 'webhose/news_cleaned_webhose.db')
# Point the imported Page model at this database by overwriting its metadata.
# NOTE(review): this goes through the private `_meta` attribute; newer peewee
# releases offer a public bind API - confirm against the installed version.
Page._meta.database = peewee_database_webhose
# NOTE(review): whether this raises when the table already exists depends on
# the peewee version - confirm before re-running against an existing file.
Page.create_table()

In [52]:
# Load the extracted reliable articles into the Page table in small batches.
batch_size = 50  # rows per INSERT statement / transaction
pages_to_insert = []
with tqdm() as progress:
    with open(path_data + 'webhose/raw_reliable.jsonl', 'r') as in_raw_reliable:
        for line in in_raw_reliable:
            article = ujson.loads(line)
            # Map the webhose article fields onto the Page columns.
            # NOTE(review): scraped_page_id and batch are constant placeholders
            # here - confirm the Page schema has no uniqueness constraint on them.
            pages_to_insert.append({
                'scraped_page_id': 0,
                'batch': 0,
                'domain': article['thread']['site_full'],
                'type': 'reliable',
                'url': article['url'],
                'content': article['text'],
                'scraped_at': article['crawled'],
                'title': article['title'],
                'authors': article['author']
            })
            progress.update()

            if len(pages_to_insert) >= batch_size:
                with peewee_database_webhose.atomic():
                    Page.insert_many(pages_to_insert).execute()
                pages_to_insert = []

# Flush the final partial batch. Skipped when nothing is pending (row count an
# exact multiple of batch_size, or an empty input file) so insert_many is never
# called with an empty row list.
if pages_to_insert:
    with peewee_database_webhose.atomic():
        Page.insert_many(pages_to_insert).execute()
    pages_to_insert = []

342351it [03:20, 1706.87it/s]
