In [None]:
# !pip install newspaper 

In [1]:
import requests
import argparse
import time
import json
import io
import gzip
import csv
import codecs
import sys
import newspaper
from bs4 import BeautifulSoup

In [2]:
def search_domain(domain):

    record_list = []

    print ("[*] Trying target domain: %s" % domain)
    for index in index_list:
        print ("[*] Trying index %s" % index)

        cc_url  = "http://index.commoncrawl.org/CC-MAIN-%s-index?" % index
        cc_url += "url=%s&matchType=domain&output=json" % domain

        response = requests.get(cc_url)

        if response.status_code == 200: # if the api call returns data successfully

            records = response.content.splitlines()
            count = 0
            for record in records:
                if count%1000 == 0:
                    record = json.loads(record)
                    if record["status"] == "200": # if the record contains the link to the required archive
                        record_list.append(record)
                count+=1

            print ("[*] Added %d results." % len(records))


    print ("[*] Found a total of %d hits." % len(record_list))

    return record_list

#
# Downloads a page from Common Crawl
#
def download_page(record):

    offset, length = int(record['offset']), int(record['length'])
    offset_end = offset + length - 1

    # We'll get the file via HTTPS so we don't need to worry about S3 credentials
    # Getting the file on S3 is equivalent however - you can request a Range
    prefix = 'https://commoncrawl.s3.amazonaws.com/'

    # We can then use the Range header to ask for just this set of bytes
    resp = requests.get(prefix + record['filename'], headers={'Range': 'bytes={}-{}'.format(offset, offset_end)})

    # The page is stored compressed (gzip) to save space
    # We can extract it using the GZIP library
    f = gzip.GzipFile(fileobj = io.BytesIO(resp.content))
##
##    # What we have now is just the WARC response, formatted:
    data = f.read().decode("utf-8")

    response = ""

    if len(data):
        try:
            warc, header, response = data.strip().split('\r\n\r\n', 2)
        except:
            pass
##
    return response

In [3]:
domain = "https://theonion.com/"

# list of available indices
index_list = ["2020-45"] #november/december 2020

record_list = search_domain(domain)

[*] Trying target domain: https://theonion.com/
[*] Trying index 2020-45
[*] Added 12315 results.
[*] Found a total of 10 hits.


In [4]:
d={}
for record in record_list:
    html_content = download_page(record)
    print ("[*] Retrieved %d bytes for %s" % (len(html_content),record['url']))
    article = newspaper.Article(url = ' ')
    article.set_html(html_content)
    article.parse()
    if len(article.text.split()) > 100: # to filter out non-news articles
        articleDetails = {'title': article.title, 'body': article.text}
        d[record['url']] = articleDetails
##    print ("content retrieved is ")
##    print(html_content)

[*] Retrieved 440813 bytes for https://www.theonion.com/
[*] Retrieved 222247 bytes for https://www.theonion.com/cat-treat-package-going-on-about-delicious-creamy-cent-1840832224
[*] Retrieved 220513 bytes for https://www.theonion.com/kfc-selling-sandwich-shaped-meteorite-1819563658
[*] Retrieved 217959 bytes for https://www.theonion.com/paula-broadwell-crashing-on-petraeus-family-s-couch-unt-1819574200
[*] Retrieved 220989 bytes for https://www.theonion.com/study-girls-internalize-gender-stereotypes-by-age-6-1819563455
[*] Retrieved 534479 bytes for https://www.theonion.com/the-week-in-pictures-week-of-march-2-2020-1842015833/slides/21
[*] Retrieved 214242 bytes for https://entertainment.theonion.com/fans-celebrate-vanna-white-s-first-show-as-guest-wheel-1840346038
[*] Retrieved 214775 bytes for https://local.theonion.com/pumpkin-spends-summer-getting-huge-to-avoid-being-picke-1845322943
[*] Retrieved 217025 bytes for https://politics.theonion.com/epa-chief-pruitt-welcomes-delegation-

In [5]:
with open('newsData.json', 'w') as fp:
    json.dump(d, fp)