# Scrape all news articles from the ABC News archives

http://www.abc.net.au/news/archive/

## Algorithm
1. Initiate first_date; base_url; page_number
2. For every date from first_date to today:
  1. Call the url http://www.abc.net.au/news/archive/yyyy,mm,dd?page=n
  2. If the last article link on the page has not been seen previously:
    1. For each article link in the page:
      1. Add the link text, link url and date to the dataframe list of discovered articles
    2. Increment page_number

## Finished

In [94]:
import sys
from bs4 import BeautifulSoup
import requests
import datetime
import json
import hashlib

In [21]:
# Static variables
# Date the day-by-day crawl starts from (the date loop in this notebook runs
# from here up to today).
first_date = datetime.date(2003, 2, 19)
# Prefix of every archive listing URL; make_archive_url appends
# 'yyyy,mm,dd?page=n' to it.
archive_base_url = 'http://www.abc.net.au/news/archive/'
# Site root that is prepended to the article hrefs found in archive listings.
abc_url = 'http://abc.net.au'

In [169]:
class ArticleLink:
    '''
    Represents an entry in ABC News's news archive article list.

    Attributes:
        title: The article title (from the link text)
        url: The URL of the linked article (None if it could not be parsed)
        timestamp: The posting timestamp of the article
        summary: A short summary of the article
        topics: A list of topics that this article relates to (None or
            empty if the article has no topics)
        md5: MD5 hex digest of the article URL, used for duplicate
            checking; None when url is None
        raw_list_element: (optional) the raw html list element that the
            ArticleLink was parsed from
    '''
    # Class-level defaults, kept so the attributes exist on the class itself.
    title = None
    url = None
    timestamp = None
    summary = None
    topics = None
    raw_list_element = None
    md5 = None

    def __init__(self, title, url, timestamp, summary, topics, raw_list_element=None):
        '''
        Store the parsed fields and derive md5 from url.

        Parameters:
            title, url, timestamp, summary, topics, raw_list_element:
                see the class docstring. Any of them may be None when the
                corresponding piece of markup could not be parsed.
        '''
        self.title = title
        self.url = url
        self.timestamp = timestamp
        self.summary = summary
        self.topics = topics
        self.raw_list_element = raw_list_element
        # Without a URL there is nothing to hash, so md5 stays None.
        self.md5 = None
        if self.url is not None:
            self.md5 = hashlib.md5(self._url_bytes(self.url)).hexdigest()

    @staticmethod
    def _url_bytes(url):
        '''
        Encode a URL string to the bytes that are hashed.

        latin-1 maps every code point 0-255 to the byte of the same value,
        so hashes of such URLs are unchanged from the previous
        one-byte-per-character scheme. URLs containing characters above
        255 (which used to raise ValueError) are encoded as UTF-8 instead.
        '''
        try:
            return url.encode('latin-1')
        except UnicodeEncodeError:
            return url.encode('utf-8')

    def __str__(self):
        '''Return a multi-line, tab-aligned summary (without the raw HTML).'''
        lines = [
            'Title:\t\t' + str(self.title),
            'URL:\t\t' + str(self.url),
            'Timestamp:\t' + str(self.timestamp),
            'Summary:\t' + str(self.summary),
            'MD5:\t\t' + str(self.md5),
            'Topics:',
        ]
        if self.topics:
            lines.extend('\t' + str(topic) for topic in self.topics)
        else:
            # Covers both "no topics parsed" (None) and an empty list.
            lines.append('\tNone')
        return '\n'.join(lines) + '\n'

    def __repr__(self):
        '''Return the md5 digest; 'None' when there is no URL to hash.'''
        # __repr__ must return a str, and md5 is None for URL-less entries.
        return str(self.md5)

    def full_string_representation(self):
        '''Return str(self) followed by the raw HTML list element.'''
        return str(self) + 'Raw HTML:\n' + str(self.raw_list_element)

In [189]:
def make_archive_url(date, page):
    '''
    Build the ABC News archive URL for one results page of one day.

    The result is archive_base_url followed by 'yyyy,mm,dd?page=n', e.g.
    http://www.abc.net.au/news/archive/yyyy,mm,dd?page=n

    Parameters:
    date: a date object giving the day whose archive listing is wanted
    page: a numeric page number (n) within that day's results
    '''
    # Month and day are zero-padded to two digits; the year is used as-is.
    date_part = ','.join([
        str(date.year),
        str(date.month).zfill(2),
        str(date.day).zfill(2),
    ])
    return archive_base_url + date_part + '?page=' + str(page)


def _parse_article_field(field_name, extractor, article, url):
    '''
    Run extractor(article) and return its result, or None if the markup
    the extractor expects is missing.

    A missing tag shows up as AttributeError (attribute access on None),
    TypeError (subscripting None) or KeyError (absent HTML attribute); all
    three are reported and turned into None so that one malformed entry
    does not abort the whole page.
    '''
    try:
        return extractor(article)
    except (AttributeError, TypeError, KeyError):
        print('!! Could not parse ' + field_name + ':\n',
              '   URL:', url, '\n',
              '   HTML:', article, '\n')
        return None


def _extract_topics(article):
    '''
    Return the list of topic names for an article list element, or None
    when the element carries no topics section.
    '''
    # Many articles are not tagged with topics
    if '<strong>Topics:</strong>' not in str(article):
        return None
    # Walk from the first <p> to the element that holds the topic links.
    topics_elem = article.p.next_sibling.next_element.next_element.next_element
    return [tag.text for tag in topics_elem.findAll('a')]


def get_article_links_from_page(url):
    '''
    Given a URL of an ABC News Archive page, returns a list of ArticleLinks.

    Fields that cannot be parsed from an entry are reported on stdout and
    left as None in the resulting ArticleLink.

    Parameters:
        url: the archive page URL to fetch

    Raises:
        requests.RequestException: if the request fails, times out or
            returns an HTTP error status
        AttributeError: if the page has no <ul class="article-index">
    '''
    # attempt to get the URL
    try:
        # A timeout keeps one stalled connection from hanging the crawl.
        get_result = requests.get(url, timeout=30)
        get_result.raise_for_status()
        html_response = get_result.text
    except requests.RequestException:
        print('Error requesting the following URL:', url, '\n', sys.exc_info()[0])
        raise

    # setup the parser
    soup = BeautifulSoup(html_response, 'lxml')

    article_list = soup.find('ul', {'class': 'article-index'})
    if article_list is None:
        # Same exception type that `.contents` on None used to raise, but
        # with a message that says what is actually wrong.
        raise AttributeError(
            'No <ul class="article-index"> element found at ' + str(url))

    result = []
    # Children of the list alternate between article elements and
    # whitespace strings such as '\n'.
    for article in article_list.contents:
        # Text nodes have no tag name; they are never articles, whatever
        # their length.
        if getattr(article, 'name', None) is None:
            continue
        # As before, elements with at most one child are skipped.
        if len(article) <= 1:
            continue
        # add to the resulting list of article links
        result.append(ArticleLink(
            _parse_article_field(
                'title', lambda a: a.h3.text.strip(), article, url),
            _parse_article_field(
                'link', lambda a: abc_url + str(a.h3.a['href']), article, url),
            _parse_article_field(
                'timestamp', lambda a: a.p.span.text, article, url),
            _parse_article_field(
                'summary', lambda a: a.p.next_sibling.string, article, url),
            _parse_article_field(
                'topics', _extract_topics, article, url),
            article))
    return result


def get_all_article_links_for_date(date):
    '''
    For a given date, get all ArticleLinks from ABC News Archive.

    Pages are requested in order starting at page 1. An invalid page number
    returns the final page again rather than a 404, so the search stops at
    the first article that has already been seen. It also stops when a page
    contributes no new articles at all, so an empty page cannot cause an
    endless loop.

    Parameters:
        date: A datetime.date object representing the requested date

    Returns:
        A dict mapping each article's md5 to its ArticleLink. At most one
        article without a URL is kept, under the key None.
    '''
    article_list = {}  # all articles found to date; key is md5, value is article object
    page = 1

    while True:
        print('------------', 'Page:', page, '------------')
        # get the article list for the page
        page_url = make_archive_url(date, page)
        print('--', page_url, '--')
        page_articles = get_article_links_from_page(page_url)

        new_on_page = 0
        duplicate_found = False
        for article in page_articles:
            if article.md5 is None:
                # No URL means no hash, so this entry cannot take part in
                # duplicate detection and must not end the search. Only the
                # first such entry can be stored under the None key.
                if None in article_list:
                    print('!!Skipping additional article without a URL for', str(date))
                else:
                    article_list[None] = article
                    new_on_page += 1
                    print(str(article))
                continue
            if article.md5 in article_list:
                print('!!Found ID', article.md5, 'before: Ceasing search for articles for', str(date))
                duplicate_found = True
                break
            # add it to the dict
            article_list[article.md5] = article
            new_on_page += 1
            print(str(article))

        if duplicate_found:
            break
        if new_on_page == 0:
            # Nothing new on this page: further pages cannot add anything.
            print('!!No new articles on page', page, ': Ceasing search for articles for', str(date))
            break
        page += 1
    return article_list



    

## Development

### Issues:
* (**fixed**) Topics would pick up the next article's title and topics if the article had none of its own.


In [None]:
def save_article_links_to_csv(start_date, end_date, meta_data_file_out, topics_file_out, index=0):
    '''
    Placeholder: not implemented yet. Does nothing and returns None.

    Intended behaviour: extract every ArticleLink in an inclusive date range
    and write it to two csv files, one holding the article metadata and one
    holding the topics/tags. Each article is to be given a unique numeric ID
    so that rows in the two files can be joined after extraction.

    Parameters:
        start_date: datetime.date, first date to extract (inclusive)
        end_date: datetime.date, last date to extract (inclusive)
        meta_data_file_out: open file object that receives the article
            metadata (everything except topic tags) in csv format
        topics_file_out: open file object that receives the article topic
            tags in csv format
        index: numeric ID to assign to the first article in the list
    '''
    return

## Testing

In [160]:
# ArticleLink.__str__ with every field populated
c = ArticleLink(
    title='title',
    url='http://www.abc.net.au/news/archive/2003,02,19?page=1',
    timestamp=datetime.date(2003, 2, 19),
    summary='this is the summary',
    topics=['t1', 't2'],
    raw_list_element='<html></html>',
)
print(c)

Title:		title
URL:		http://www.abc.net.au/news/archive/2003,02,19?page=1
Timestamp:	2003-02-19
Summary:	this is the summary
MD5:		b9f5fd0be3c350f8bbc7de978df6c146
Topics:
	t1
	t2



In [161]:
# ArticleLink.__str__ when the link could not be parsed (url is None, so is md5)
c = ArticleLink(
    title='title',
    url=None,
    timestamp=datetime.date(2003, 2, 19),
    summary='this is the summary',
    topics=['t1', 't2'],
    raw_list_element='<html></html>',
)
print(c)

Title:		title
URL:		None
Timestamp:	2003-02-19
Summary:	this is the summary
MD5:		None
Topics:
	t1
	t2



In [163]:
# ArticleLink.__str__ when the topics could not be parsed (topics is None)
c = ArticleLink(
    title='title',
    url='http://www.abc.net.au/news/archive/2003,02,19?page=1',
    timestamp=datetime.date(2003, 2, 19),
    summary='this is the summary',
    topics=None,
)
print(c)

Title:		title
URL:		http://www.abc.net.au/news/archive/2003,02,19?page=1
Timestamp:	2003-02-19
Summary:	this is the summary
MD5:		b9f5fd0be3c350f8bbc7de978df6c146
Topics:
	None



In [170]:
# full_string_representation: the __str__ text followed by the raw HTML,
# here with topics that could not be parsed (topics is None)
c = ArticleLink(
    title='title',
    url='http://www.abc.net.au/news/archive/2003,02,19?page=1',
    timestamp=datetime.date(2003, 2, 19),
    summary='this is the summary',
    topics=None,
    raw_list_element='<html></html>',
)
print(c.full_string_representation())

Title:		title
URL:		http://www.abc.net.au/news/archive/2003,02,19?page=1
Timestamp:	2003-02-19
Summary:	this is the summary
MD5:		b9f5fd0be3c350f8bbc7de978df6c146
Topics:
	None
Raw HTML:
<html></html>


In [107]:
# make_archive_url: build the archive URL for today's date, page 0
today = datetime.date.today()
print(make_archive_url(today, 0))

http://www.abc.net.au/news/archive/2016,09,07?page=0


In [8]:
# loop through all dates in range
# Skeleton of the crawl loop: steps date_index one day at a time from
# first_date up to and including today.
# NOTE(review): the recorded output below lists dates, so the cell presumably
# printed date_index when it was run -- confirm.

date_index = first_date
while date_index <= datetime.date.today():
    # do stuff
    date_index += datetime.timedelta(days=1)

2003-02-19
2003-02-20
2003-02-21
2003-02-22
2003-02-23
2003-02-24
2003-02-25
2003-02-26
2003-02-27
2003-02-28
2003-03-01
2003-03-02
2003-03-03
2003-03-04
2003-03-05
2003-03-06
2003-03-07
2003-03-08
2003-03-09
2003-03-10
2003-03-11
2003-03-12
2003-03-13
2003-03-14
2003-03-15
2003-03-16
2003-03-17
2003-03-18
2003-03-19
2003-03-20
2003-03-21
2003-03-22
2003-03-23
2003-03-24
2003-03-25
2003-03-26
2003-03-27
2003-03-28
2003-03-29
2003-03-30
2003-03-31
2003-04-01
2003-04-02
2003-04-03
2003-04-04
2003-04-05
2003-04-06
2003-04-07
2003-04-08
2003-04-09
2003-04-10
2003-04-11
2003-04-12
2003-04-13
2003-04-14
2003-04-15
2003-04-16
2003-04-17
2003-04-18
2003-04-19
2003-04-20
2003-04-21
2003-04-22
2003-04-23
2003-04-24
2003-04-25
2003-04-26
2003-04-27
2003-04-28
2003-04-29
2003-04-30
2003-05-01
2003-05-02
2003-05-03
2003-05-04
2003-05-05
2003-05-06
2003-05-07
2003-05-08
2003-05-09
2003-05-10
2003-05-11
2003-05-12
2003-05-13
2003-05-14
2003-05-15
2003-05-16
2003-05-17
2003-05-18
2003-05-19
2003-05-20

In [171]:
# Fetch a single archive page and print every parsed ArticleLink together
# with the raw HTML it was parsed from.
url1 = 'http://www.abc.net.au/news/archive/2003,02,19?page=2'

link_list = get_article_links_from_page(url1)
for a in link_list:
    print(a.full_string_representation())

Title: US, British aircraft attack Sth Iraq target
Topics (raw): <p class="topics">
<strong>Topics:</strong>
<a href="/news/topic/iraq">iraq</a>
</p>
tmp_topics: None
Title: Restraint order issued against Anti-Discrimination Commissioner
Topics (raw): <li><h3>
<a href="/news/2003-02-19/dying-korean-subway-passengers-phoned-for-help/2688468">Dying Korean subway passengers phoned for help</a>
</h3>
<p class="published">Posted <span class="timestamp">February 19, 2003 18:10:00</span></p><p>More tragic details are emerging about yesterday's South Korean subway inferno.</p></li>
tmp_topics: None
Title: Dying Korean subway passengers phoned for help
Topics (raw): <li><h3>
<a href="/news/2003-02-19/radioactive-spill-at-wmcs-olympic-dam-mine/2688466">Radioactive spill at WMC's Olympic Dam mine</a>
</h3>
<p class="published">Posted <span class="timestamp">February 19, 2003 18:10:00</span></p><p>Mining company WMC has reported a spill of 210 cubic metres of radioactive liquid at its Olympic Dam 

In [185]:
# Collect the ArticleLinks for a single date across all of its archive pages;
# links is a dict keyed by each article's md5.
date = datetime.date(2015, 1, 1)
links = get_all_article_links_for_date(date)

------------ Page: 1 ------------
-- http://www.abc.net.au/news/archive/2015,01,01?page=1 --
Title:		Asian Cup volunteers providing home ground help
Video
URL:		http://abc.net.au/news/2015-01-01/asian-cup-volunteers-providing-home-ground-help/5996608
Timestamp:	January 01, 2015 22:34:00
Summary:	The Asian Cup is the second oldest continental soccer championship in the world.
MD5:		f04594f4fd910ac918d918222aa4400c
Topics:
	volunteers
	soccer
	australia
	asia

Title:		Indian police accused of 'profiling' Muslims in terrorism drill video
URL:		http://abc.net.au/news/2015-01-01/indian-police-terrorism-drill-video-targeting-muslims-condemned/5996552
Timestamp:	January 01, 2015 22:28:25
Summary:	Police in the Indian prime minister's home state of Gujarat are condemned for staging an anti-terrorism exercise featuring "militants" dressed up as Muslims.
MD5:		548dd4dd91c6e7836cf299cc4d59d904
Topics:
	islam
	hinduism
	multiculturalism
	law-crime-and-justice
	police
	india
	asia

Title:		Scorcher

In [188]:
# Look up one scraped ArticleLink by its md5 key and show its title.
# The recorded output shows the title text ending in '\nVideo'.
links['428ac25a68e99f1e76af0865850fb11c'].title

'Sport in 90 Seconds\nVideo'