# Scrape all news articles from the ABC News archives

http://www.abc.net.au/news/archive/

## Algorithm
1. Initiate first_date; base_url; page_number
2. For every date from first_date to today:
  1. Call the url http://www.abc.net.au/news/archive/yyyy,mm,dd?page=n
  2. If the last article link on the page has not been seen previously:
    1. For each article link in the page:
      1. Add the link text, link url and date to the dataframe list of discovered articles
    2. Increment page_number

In [1]:
from bs4 import BeautifulSoup
import requests
import datetime


In [36]:
# Static variables

first_date = datetime.date(2003, 2, 19)
archive_base_url = 'http://www.abc.net.au/news/archive/'
abc_url = 'http://abc.net.au'

In [73]:
class ArticleLink:
    '''
    Represents an entry in ABC News's news archive article list.
    Attributes:
       title: The article title (from the link text)
       url: The URL of the linked article
       timestamp: The posting timestamp of the article
       summary: A short summary of the article
       topics: A list of topics that this article relates to
    '''
    title = None
    url = None
    timestamp = None
    summary = None
    topics = None
    
    
    def __init__(self):
        self.title = None
        self.url = None
        self.timestamp = None
        self.summary = None
        self.topics = None
    
    def __init__(self, title, url, timestamp, summary, topics):
        self.title = title
        self.url = url
        self.timestamp = timestamp
        self.summary = summary
        self.topics = topics
    
    def __repr__(self):
        return_date = ''
        if self.timestamp != None:
            return_date = time.strftime('%Y-%m-%dT%H:%M:%SZ', self.timestamp)
        
        result = (
            'Title:\t' + str(self.title) + '\n' +
            'URL:\t' + str(self.url) + '\n' +
            'Timestamp:\t' + str(return_date) + '\n' +
            'Summary:\t' + str(self.summary) + '\n' +
            'Topics:\n'
        )
        for topic in self.topics:
            result = result + '\t' + str(topic) + '\n'
    
    def to_json():
        return None
    
    

In [37]:
def make_archive_url(date, page):
    '''
    Given a date object and a page number (n), returns the url for the ABC News archive in the following format:
    http://www.abc.net.au/news/archive/yyyy,mm,dd?page=n
    
    Parameters:
    date: a date object
    page: a numeric representing the page number of the results
    '''
    dd = ('0' + str(date.day))[-2:]    # forces the day into a dd format (with leading zero if needed)
    mm = ('0' + str(date.month))[-2:]  # forces the month into an mm format (with leading zero if needed)
    yyyy = str(date.year)
    p = '?page=' + str(page)
    return archive_base_url + yyyy + ',' + mm + ',' + dd + p

def url_ok(url):
    return requests.get(url).status_code == requests.codes.ok



In [6]:
today = datetime.date.today()
print(make_archive_url(today, 3))

http://www.abc.net.au/news/archive/2016,08,27?page=3


In [37]:
# The ABC site returns the last page of results when `page` exceeds the number of pages
print(url_works('http://www.abc.net.au/news/archive/2003,02,28?page=99'))
print(requests.get('http://www.abc.net.au/news/archive/2003,02,28?page=99').status_code)

True
200


In [8]:
# loop through all dates in range

date_index = first_date
while date_index <= datetime.date.today():
    # do stuff
    date_index += datetime.timedelta(days=1)

2003-02-19
2003-02-20
2003-02-21
2003-02-22
2003-02-23
2003-02-24
2003-02-25
2003-02-26
2003-02-27
2003-02-28
2003-03-01
2003-03-02
2003-03-03
2003-03-04
2003-03-05
2003-03-06
2003-03-07
2003-03-08
2003-03-09
2003-03-10
2003-03-11
2003-03-12
2003-03-13
2003-03-14
2003-03-15
2003-03-16
2003-03-17
2003-03-18
2003-03-19
2003-03-20
2003-03-21
2003-03-22
2003-03-23
2003-03-24
2003-03-25
2003-03-26
2003-03-27
2003-03-28
2003-03-29
2003-03-30
2003-03-31
2003-04-01
2003-04-02
2003-04-03
2003-04-04
2003-04-05
2003-04-06
2003-04-07
2003-04-08
2003-04-09
2003-04-10
2003-04-11
2003-04-12
2003-04-13
2003-04-14
2003-04-15
2003-04-16
2003-04-17
2003-04-18
2003-04-19
2003-04-20
2003-04-21
2003-04-22
2003-04-23
2003-04-24
2003-04-25
2003-04-26
2003-04-27
2003-04-28
2003-04-29
2003-04-30
2003-05-01
2003-05-02
2003-05-03
2003-05-04
2003-05-05
2003-05-06
2003-05-07
2003-05-08
2003-05-09
2003-05-10
2003-05-11
2003-05-12
2003-05-13
2003-05-14
2003-05-15
2003-05-16
2003-05-17
2003-05-18
2003-05-19
2003-05-20

In [4]:
# get a page for a date
# find the article links
url = 'http://www.abc.net.au/news/archive/2003,02,28?page=1'
html = ''

try:
    html_response = requests.get(url).text
except Exception as e:
    raise e
else:
    html = html_response

# print(html)
soup = BeautifulSoup(html, 'lxml')
# look for <div class="section" id="article-index">
article_list = soup.find('ul', {'class' : 'article-index'})


Every second entry in article_list.contents is an '\n', starting at index=0. The alternating ones are article links.

In [50]:
print('Length of article_list.contents: ', len(article_list.contents))
print('article_list.contents[1]:\n', article_list.contents[1])
article_list.contents[0]

Length of article_list.contents:  51
article_list.contents[1]:
 <li><h3>
<a href="/news/2003-02-28/dali-graphics-drenched-in-uk-gallery-flooding/1222774">Dali graphics drenched in UK gallery flooding</a>
</h3>
<p class="published">Posted <span class="timestamp">February 28, 2003 23:30:00</span></p><p>Graphics by surrealist Salvador Dali have been damaged by a gallery sprinkler system, a spokeswoman for the Dali Universe exhibition in the UK said.</p>
<p class="topics">
<strong>Topics:</strong>
<a href="/news/topic/arts-and-entertainment">arts-and-entertainment</a>,


	
	<a href="/news/topic/contemporary-art">contemporary-art</a>,


	
	<a href="/news/topic/digital-multimedia">digital-multimedia</a>,


	
	<a href="/news/topic/united-kingdom">united-kingdom</a>
</p></li>


'\n'

In [58]:
print('Link text:\t', article_list.contents[1].h3.text.strip())
print('Link URL:\t', article_list.contents[1].h3.a['href'])
print('Timestamp:\t', article_list.contents[1].p.span.text)
print('Summary:\t', article_list.contents[1].p.next_sibling.string)
print('Topics:\t', article_list.contents[1].p.next_sibling.next_sibling)


Link text:	 Dali graphics drenched in UK gallery flooding
Link URL:	 /news/2003-02-28/dali-graphics-drenched-in-uk-gallery-flooding/1222774
Timestamp:	 February 28, 2003 23:30:00
Summary:	 Graphics by surrealist Salvador Dali have been damaged by a gallery sprinkler system, a spokeswoman for the Dali Universe exhibition in the UK said.
Topics:	 



[<a href="/news/topic/arts-and-entertainment">arts-and-entertainment</a>,
 <a href="/news/topic/contemporary-art">contemporary-art</a>,
 <a href="/news/topic/digital-multimedia">digital-multimedia</a>,
 <a href="/news/topic/united-kingdom">united-kingdom</a>]

In [62]:
[tag.text for tag in article_list.contents[1].p.next_sibling.next_element.next_element.next_element.findAll('a')]

['arts-and-entertainment',
 'contemporary-art',
 'digital-multimedia',
 'united-kingdom']

In [74]:
a = ArticleLink()

TypeError: __init__() missing 5 required positional arguments: 'title', 'url', 'timestamp', 'summary', and 'topics'