# Scrape all news articles from the ABC News archives

http://www.abc.net.au/news/archive/

## Algorithm
1. Initiate first_date; base_url; page_number
2. For every date from first_date to today:
  1. Call the url http://www.abc.net.au/news/archive/yyyy,mm,dd?page=n
  2. If the last article link on the page has not been seen previously:
    1. For each article link in the page:
      1. Add the link text, link url and date to the dataframe list of discovered articles
    2. Increment page_number

## Finished

In [8]:
import sys
from bs4 import BeautifulSoup
import requests
import datetime
import json
import hashlib
import csv
import time

In [9]:
# Static variables
first_date = datetime.date(2003, 2, 19)
archive_base_url = 'http://www.abc.net.au/news/archive/'
abc_url = 'http://abc.net.au'

In [26]:
class ArticleLink:
    '''
    Represents an entry in ABC News's news archive article list.
    Attributes:
       title: The article title (from the link text)
       url: The URL of the linked article
       timestamp: The posting timestamp of the article
       summary: A short summary of the article
       topics: A list of topics that this article relates to
       md5: MD5 hash of the article link for duplicate checking
       raw_list_element: (optional) the raw html list element that the 
           ArticleLink was parsed from
    '''
    title = None
    url = None
    timestamp = None
    summary = None
    topics = None
    raw_list_element = None
    md5 = None
    
    
    def __init__(self, title, url, timestamp, summary, topics, raw_list_element=None):
        self.title = title
        self.url = url
        self.timestamp = timestamp
        self.summary = summary
        self.topics = topics
        self.raw_list_element = raw_list_element
        if self.url != None:
            b = bytearray()
            b.extend(map(ord, self.url))
            self.md5 = hashlib.md5(b).hexdigest()
    
    def __str__(self):
        result = (
            'Title:\t\t' + str(self.title) + '\n' +
            'URL:\t\t' + str(self.url) + '\n' +
            'Timestamp:\t' + str(self.timestamp) + '\n' +
            'Summary:\t' + str(self.summary) + '\n' +
            'MD5:\t\t' + str(self.md5) + '\n' +
            'Topics:\n'
        )
        if self.topics == None or len(self.topics) == 0:
            result = result + '\tNone\n'
        else:
            for topic in self.topics:
                result = result + '\t' + str(topic) + '\n'
        return result
    
    def __repr__(self):
        return self.md5
    
    def full_string_representation(self):
        '''
        Returns the extended string representation of the ArticleLink object.
        In addition to what is returned by str(), the full raw HTML of the
        article list item is appended to the string.
        '''
        return str(self) + 'Raw HTML:\n' + str(self.raw_list_element)
    
    def to_metadata_list(self):
        '''
        Returns the metadata elements of the ArticleLink object as a list:
        [title, url, timestamp, summary, md5]
        Does not include the topics, which are a list of their own.
        '''
        return [self.title, self.url, self.timestamp, self.summary, self.md5]
    
    def to_dict(self):
        '''
        Returns the ArticleLink object as a dictionary:
        {
         'title': title, 
         'url': url, 
         'timestamp': timestamp, 
         'summary': summary, 
         'md5': md5,
         'topics': [topics]
        }
        '''
        return {
            'title': self.title, 
            'url': self.url, 
            'timestamp': self.timestamp, 
            'summary': self.summary, 
            'md5': self.md5,
            'topics': self.topics 
        }
    
    def to_dict_with_index(self, index):
        '''
        Returns the ArticleLink object as a dictionary:
        {
         'index': index,
         'title': title, 
         'url': url, 
         'timestamp': timestamp, 
         'summary': summary, 
         'md5': md5,
         'topics': [topics]
        }
        '''
        return {
            'index': index,
            'title': self.title, 
            'url': self.url, 
            'timestamp': self.timestamp, 
            'summary': self.summary, 
            'md5': self.md5,
            'topics': self.topics 
        }
    


In [27]:
def make_archive_url(date, page):
    '''
    Given a date object and a page number (n), returns the url for the ABC News archive in the following format:
    http://www.abc.net.au/news/archive/yyyy,mm,dd?page=n
    
    Parameters:
    date: a date object
    page: a numeric representing the page number of the results
    '''
    dd = ('0' + str(date.day))[-2:]    # forces the day into a dd format (with leading zero if needed)
    mm = ('0' + str(date.month))[-2:]  # forces the month into an mm format (with leading zero if needed)
    yyyy = str(date.year)
    p = '?page=' + str(page)
    return archive_base_url + yyyy + ',' + mm + ',' + dd + p


def get_article_links_from_page(url):
    '''
    Given a URL of an ABC News Archive page, returns a list of ArticleLinks.
    '''
    result = []
    
    # attempt to get the URL
    try:
        get_result = requests.get(url)
        get_result.raise_for_status()
        html_response = get_result.text
    except:
        print('Error requesting the following URL:', url, '\n', sys.exc_info()[0])
        raise
    
    
    # setup the parser
    soup = BeautifulSoup(html_response, 'lxml')
    
    # get the articles. Note that every second may be a blank ()'\n').
    article_list = soup.find('ul', {'class' : 'article-index'})   
    # print('Length of article_list.contents: ', 
    #       len([a for a in article_list.contents if len(a)>1]))
    for article in article_list.contents:
        if len(article) > 1:
            # initiate the article elements as None incase we can't parse them
            tmp_title = None
            tmp_link = None
            tmp_timestamp = None
            tmp_summary = None
            tmp_topics = None
            tmp_list_elem = None
            # parse the article meta-data from the list of links
            try:
                tmp_list_elem = article
            except AttributeError:
                print('!! Could not parse raw_article:\n',
                     '   URL:', url, '\n',
                     '   HTML:', article, '\n')
            try:
                tmp_title = article.h3.text.strip()
            except AttributeError:
                print('!! Could not parse title:\n',
                     '   URL:', url, '\n',
                     '   HTML:', article, '\n')
            try:
                tmp_link = abc_url + str(article.h3.a['href'])
            except AttributeError:
                print('!! Could not parse link:\n',
                     '   URL:', url, '\n',
                     '   HTML:', article, '\n')
            try:
                tmp_timestamp = article.p.span.text
            except AttributeError:
                print('!! Could not parse timestamp:\n',
                     '   URL:', url, '\n',
                     '   HTML:', article, '\n')
            try:
                tmp_summary = article.p.next_sibling.string
            except AttributeError:
                print('!! Could not parse summary:\n',
                     '   URL:', url, '\n',
                     '   HTML:', article, '\n')
            try:
                # Many articles are not tagged with topics
                if '<strong>Topics:</strong>' in str(article):
                    tmp_topics = [tag.text for tag in 
                              article.p.next_sibling.next_element.next_element.next_element.findAll('a')]
            except AttributeError:
                print('!! Could not parse topics:\n',
                     '   URL:', url, '\n',
                     '   HTML:', article, '\n')
            # add to the resultign list of article links
            result.append(ArticleLink(
                    tmp_title, tmp_link, tmp_timestamp, tmp_summary, tmp_topics, tmp_list_elem))
    return result


def get_all_article_links_for_date(date):
    '''
    For a given date, get all ArticleLinks from ABC News Archive and return them as a
        dict using the md5 of the ArticleLink as the key and the ArticleLink itself as 
        the value.
    Parameters:
        date: A datetime.date object representing the requested date
    '''
    article_list = {} # all articles found to date; key is md5, value is article object

    # loop through all pages for a single date
    no_duplicates = True # will be set to False when the first duplicate is found
    page = 1

    while no_duplicates:
        print('------------', 'Page:', page, '------------')
        # get the article list for the page
        print('--', make_archive_url(date, page), '--')
        page_articles = get_article_links_from_page(make_archive_url(date, page))
        # add any article not already seen
        # this is done because an invalid page# returns the final page rather than a 404
        for article in page_articles:
            if article.md5 in article_list:
                print('!!Found ID', article.md5, 'before: Ceasing search for articles for', str(date))
                no_duplicates = False
                break
            else:
                # add it to the dict
                article_list[article.md5] = article
                print(str(article.md5))
        page += 1
    return article_list



    

## Development

### Issues:
* (**fixed**) Topics will pick up the next article's title and topics if there are none of it's own.


In [67]:
# def write_articles_to_csv(article_dict,
#                           metadata_file_out,
#                           topics_file_out,
#                           start_IDs_at=0):
#     '''
#     Writes the provided dictionary of ArticleLinks to two separate csv files:
#         * metadata (everything but topics) are written to metadata_file_out
#         * topics are written to topics_file_out
#     Each record in the csv is written with a unique ID (starting as specified)
#         to allow joining the two csv files after writing.
    
#     Parameters:
#         * article_dict:
#             A dict of ArticleLinks, using the md5 of the ArticleLink as the key 
#             and the ArticleLink itself as the value.
#         * metadata_file_out:
#             File name for the file containing the ArticleLink metadata.
#         * topics_file_out:
#             File name for the file containing the ArticleLink topics.
#         * start_IDs_at:
#             Starting point for the unique ArticleLink IDs. Default = 0.
#     '''
#     index_id = index
#     metadata_output_fields = ['index', 'title', 'url', 'timestamp', 'summary', 'md5']
                
#     # prepare the output files: meta_out for the metadata, topics_out for the topics
#     with open(metadata_file_out, 'w', newline='') as meta_out, open(topics_file_out, 'w', newline='') as topics_out:
#         meta_writer = csv.DictWriter(meta_out, 
#                                      fieldnames=metadata_output_fields, 
#                                      extrasaction='ignore')
#         topics_writer = csv.writer(topics_out)

def save_article_links_to_csv(start_date, 
                              end_date, 
                              metadata_file_out, 
                              topics_file_out, 
                              index=0,
                              pause_after_day=60):
    '''
    Extract all ArticleLinks between a specified date range (inclusive) and save 
    them to two csv files: one for the metadata and one for the topics/tags.
    A unique ID (numeric) is recorded for each article, allowing the metadata and the
    topics csvs to be linked after extraction.
    Parameters:
        start_date: a datetime.date object representing the desired start date (inclusive)
        end_date: a datetime.date object representing the desired end date (inclusive).
        metadata_file_out: file name to which is written the article metadata 
            (excluding topic tags) in csv format
        topics_file_out: file name to which is recorded the article topic tags in csv format
        index: the starting id field (numeric) for the article list. Default=0
        pause_after_day: seconds to pause for after getting all articles for a single day.
            Default=60
    '''
    
    index_id = index
    metadata_output_fields = ['index', 'title', 'url', 'timestamp', 'summary', 'md5']
                
    # prepare the output files: meta_out for the metadata, topics_out for the topics
    with open(metadata_file_out, 'w', newline='') as meta_out, open(topics_file_out, 'w', newline='') as topics_out:
        meta_writer = csv.DictWriter(meta_out, 
                                     fieldnames=metadata_output_fields, 
                                     extrasaction='ignore')
        topics_writer = csv.writer(topics_out)
        # for every date in the range
        date_index = start_date
        while date_index <= datetime.date.today():
            # get the day's ArticleLinks
            tmp_articles = get_all_article_links_for_date(date_index)
            # prepare them as dicts with the addition of an index 
            # starting from the specified index value
            articles = [article.to_dict_with_index(i) 
                        for (i, article) 
                        in enumerate(tmp_articles.values(), start=index_id)]
            # prepare the topics into a list of [index, topic] where the index
            # is repeated for each topic in the topic list for a single ArticleLink
            tmp_topics = [article.topics for article in articles]
            topics_rows = []
            for i in range(len(tmp_topics)):
                if tmp_topics[i] != None:
                    for j in range(len(tmp_topics[i])):
                        topics_rows.append([i, tmp_topics[i][j]])
                else:
                    topics_rows.append([i, tmp_topics[i]])
            # write the header once only for each set
            if date_index == start_date:
                meta_writer.writeheader()
                topics_writer.writerow(['index', 'topic'])
            # write all the rows at once
            meta_writer.writerows(articles)
            topics_writer.writerows(topics)
            # increment the date_index the right way and pause for politeness
            date_index += datetime.timedelta(days=1)
            time.sleep(pause_after_day)
    
    return None

## Testing

In [14]:
# __str__ Print an ArticleLink string
c = ArticleLink('title', 
                'http://www.abc.net.au/news/archive/2003,02,19?page=1', 
                datetime.date(2003, 2, 19), 
                'this is the summary', 
                ['t1','t2'],
               '<html></html>')
print(str(c))

Title:		title
URL:		http://www.abc.net.au/news/archive/2003,02,19?page=1
Timestamp:	2003-02-19
Summary:	this is the summary
MD5:		b9f5fd0be3c350f8bbc7de978df6c146
Topics:
	t1
	t2



In [15]:
c.to_dict()

{'md5': 'b9f5fd0be3c350f8bbc7de978df6c146',
 'summary': 'this is the summary',
 'timestamp': datetime.date(2003, 2, 19),
 'title': 'title',
 'topics': ['t1', 't2'],
 'url': 'http://www.abc.net.au/news/archive/2003,02,19?page=1'}

In [24]:
c.to_dict_with_index(1)

{'index': 1,
 'md5': 'b9f5fd0be3c350f8bbc7de978df6c146',
 'summary': 'this is the summary',
 'timestamp': datetime.date(2003, 2, 19),
 'title': 'title',
 'topics': ['t1', 't2'],
 'url': 'http://www.abc.net.au/news/archive/2003,02,19?page=1'}

In [28]:
l = [c, c, c]
[article.to_dict_with_index(i) for (i, article) in enumerate(l, start=0)]

[{'index': 0,
  'md5': 'b9f5fd0be3c350f8bbc7de978df6c146',
  'summary': 'this is the summary',
  'timestamp': datetime.date(2003, 2, 19),
  'title': 'title',
  'topics': ['t1', 't2'],
  'url': 'http://www.abc.net.au/news/archive/2003,02,19?page=1'},
 {'index': 1,
  'md5': 'b9f5fd0be3c350f8bbc7de978df6c146',
  'summary': 'this is the summary',
  'timestamp': datetime.date(2003, 2, 19),
  'title': 'title',
  'topics': ['t1', 't2'],
  'url': 'http://www.abc.net.au/news/archive/2003,02,19?page=1'},
 {'index': 2,
  'md5': 'b9f5fd0be3c350f8bbc7de978df6c146',
  'summary': 'this is the summary',
  'timestamp': datetime.date(2003, 2, 19),
  'title': 'title',
  'topics': ['t1', 't2'],
  'url': 'http://www.abc.net.au/news/archive/2003,02,19?page=1'}]

In [161]:
# __str__ Print an ArticleLink string, with failed link parse
c = ArticleLink('title', 
                None, 
                datetime.date(2003, 2, 19), 
                'this is the summary', 
                ['t1','t2'],
               '<html></html>')
print(str(c))

Title:		title
URL:		None
Timestamp:	2003-02-19
Summary:	this is the summary
MD5:		None
Topics:
	t1
	t2



In [163]:
# __str__ Print an ArticleLink string, with failed topics parse
c = ArticleLink('title', 
                'http://www.abc.net.au/news/archive/2003,02,19?page=1', 
                datetime.date(2003, 2, 19), 
                'this is the summary', 
                None)
print(str(c))

Title:		title
URL:		http://www.abc.net.au/news/archive/2003,02,19?page=1
Timestamp:	2003-02-19
Summary:	this is the summary
MD5:		b9f5fd0be3c350f8bbc7de978df6c146
Topics:
	None



In [170]:
# __str__ Print an ArticleLink string, with failed topics parse
c = ArticleLink('title', 
                'http://www.abc.net.au/news/archive/2003,02,19?page=1', 
                datetime.date(2003, 2, 19), 
                'this is the summary', 
                None,
               '<html></html>')
print(c.full_string_representation())

Title:		title
URL:		http://www.abc.net.au/news/archive/2003,02,19?page=1
Timestamp:	2003-02-19
Summary:	this is the summary
MD5:		b9f5fd0be3c350f8bbc7de978df6c146
Topics:
	None
Raw HTML:
<html></html>


In [107]:
# make_archive_url
today = datetime.date.today()
print(make_archive_url(today, 0))

http://www.abc.net.au/news/archive/2016,09,07?page=0


In [8]:
# loop through all dates in range

date_index = first_date
while date_index <= datetime.date.today():
    # do stuff
    date_index += datetime.timedelta(days=1)

2003-02-19
2003-02-20
2003-02-21
2003-02-22
2003-02-23
2003-02-24
2003-02-25
2003-02-26
2003-02-27
2003-02-28
2003-03-01
2003-03-02
2003-03-03
2003-03-04
2003-03-05
2003-03-06
2003-03-07
2003-03-08
2003-03-09
2003-03-10
2003-03-11
2003-03-12
2003-03-13
2003-03-14
2003-03-15
2003-03-16
2003-03-17
2003-03-18
2003-03-19
2003-03-20
2003-03-21
2003-03-22
2003-03-23
2003-03-24
2003-03-25
2003-03-26
2003-03-27
2003-03-28
2003-03-29
2003-03-30
2003-03-31
2003-04-01
2003-04-02
2003-04-03
2003-04-04
2003-04-05
2003-04-06
2003-04-07
2003-04-08
2003-04-09
2003-04-10
2003-04-11
2003-04-12
2003-04-13
2003-04-14
2003-04-15
2003-04-16
2003-04-17
2003-04-18
2003-04-19
2003-04-20
2003-04-21
2003-04-22
2003-04-23
2003-04-24
2003-04-25
2003-04-26
2003-04-27
2003-04-28
2003-04-29
2003-04-30
2003-05-01
2003-05-02
2003-05-03
2003-05-04
2003-05-05
2003-05-06
2003-05-07
2003-05-08
2003-05-09
2003-05-10
2003-05-11
2003-05-12
2003-05-13
2003-05-14
2003-05-15
2003-05-16
2003-05-17
2003-05-18
2003-05-19
2003-05-20

In [30]:
url1 = 'http://www.abc.net.au/news/archive/2003,02,19?page=2'

link_list = get_article_links_from_page(url1)
for a in link_list:
    print(a.full_string_representation())

Title:		US, British aircraft attack Sth Iraq target
URL:		http://abc.net.au/news/2003-02-19/us-british-aircraft-attack-sth-iraq-target/2688472
Timestamp:	February 19, 2003 18:16:00
Summary:	There has been another attack by American and British aircraft on a target in southern Iraq.
MD5:		41dc72a7ba702cd04a24e527d214de1c
Topics:
	iraq
Raw HTML:
<li><h3>
<a href="/news/2003-02-19/us-british-aircraft-attack-sth-iraq-target/2688472">US, British aircraft attack Sth Iraq target</a>
</h3>
<p class="published">Posted <span class="timestamp">February 19, 2003 18:16:00</span></p><p>There has been another attack by American and British aircraft on a target in southern Iraq.</p>
<p class="topics">
<strong>Topics:</strong>
<a href="/news/topic/iraq">iraq</a>
</p></li>
Title:		Restraint order issued against Anti-Discrimination Commissioner
URL:		http://abc.net.au/news/2003-02-19/restraint-order-issued-against-anti-discrimination/2688478
Timestamp:	February 19, 2003 18:16:00
Summary:	A Hobart magistr

In [63]:
date = datetime.date(2015, 1, 1)
links = get_all_article_links_for_date(date)

------------ Page: 1 ------------
-- http://www.abc.net.au/news/archive/2015,01,01?page=1 --
Title:		Asian Cup volunteers providing home ground help
Video
URL:		http://abc.net.au/news/2015-01-01/asian-cup-volunteers-providing-home-ground-help/5996608
Timestamp:	January 01, 2015 22:34:00
Summary:	The Asian Cup is the second oldest continental soccer championship in the world.
MD5:		f04594f4fd910ac918d918222aa4400c
Topics:
	volunteers
	soccer
	australia
	asia

Title:		Indian police accused of 'profiling' Muslims in terrorism drill video
URL:		http://abc.net.au/news/2015-01-01/indian-police-terrorism-drill-video-targeting-muslims-condemned/5996552
Timestamp:	January 01, 2015 22:28:25
Summary:	Police in the Indian prime minister's home state of Gujarat are condemned for staging an anti-terrorism exercise featuring "militants" dressed up as Muslims.
MD5:		548dd4dd91c6e7836cf299cc4d59d904
Topics:
	islam
	hinduism
	multiculturalism
	law-crime-and-justice
	police
	india
	asia

Title:		Scorcher

In [64]:
links

{'021a2b0d61821bf2dbbf384cfcf3d9dd': 021a2b0d61821bf2dbbf384cfcf3d9dd,
 '0372cb9e17e51c81ad89e48c7a4a69ca': 0372cb9e17e51c81ad89e48c7a4a69ca,
 '03f80732b9b11c7634decc1c56b29efb': 03f80732b9b11c7634decc1c56b29efb,
 '0487ed6158605b026f56af1054f59024': 0487ed6158605b026f56af1054f59024,
 '054e0f562e68898ab33bd5f7919a0c64': 054e0f562e68898ab33bd5f7919a0c64,
 '07d5a3542601e7bd267f8c6e737d10f9': 07d5a3542601e7bd267f8c6e737d10f9,
 '0b6aae68ffc91a058c33d1d8e8434783': 0b6aae68ffc91a058c33d1d8e8434783,
 '12e8d87d6ca6d2695de6f0850c5299b4': 12e8d87d6ca6d2695de6f0850c5299b4,
 '170eff37205ecac4e42c0483de40145d': 170eff37205ecac4e42c0483de40145d,
 '179e011759bfe35bab8b234f13ea8a85': 179e011759bfe35bab8b234f13ea8a85,
 '190840e843cf3ee58894e10a220867b0': 190840e843cf3ee58894e10a220867b0,
 '1cd3565e1e377914d8c7521e0a6fa2b4': 1cd3565e1e377914d8c7521e0a6fa2b4,
 '20fefb54696e8505226b80b22ff5c776': 20fefb54696e8505226b80b22ff5c776,
 '224d7c004c5491a16aee84dca9503980': 224d7c004c5491a16aee84dca9503980,
 '228e

In [188]:
links['428ac25a68e99f1e76af0865850fb11c'].title

'Sport in 90 Seconds\nVideo'

In [59]:
#get this to [(id1, topic1), (id1, topic2), (id2, topic1)] etc
topics = [article.topics for article in link_list]
# print(topics)
# print(topics[0])
# print(len(topics))
# 
for i in range(len(topics)):
    if topics[i] != None:
        for j in range(len(topics[i])):
            print(i, topics[i][j])
    else:
        print(i, topics[i])

list = []
for i in range(len(topics)):
    if topics[i] != None:
        for j in range(len(topics[i])):
            list.append([i, topics[i][j]])
    else:
        list.append([i, topics[i]])
list

0 iraq
1 None
2 None
3 None
4 government-and-politics
4 health
5 elections
5 nsw
6 None
7 business-economics-and-finance
8 None
9 information-and-communication
9 science-and-technology
10 business-economics-and-finance
10 health
11 act
12 health
12 science-and-technology
13 act
14 government-and-politics
15 sailing
16 australian-football-league
17 elections
17 nsw
18 None
19 None
20 government-and-politics
20 iraq
21 None
22 government-and-politics
23 government-and-politics
24 None


[[0, 'iraq'],
 [1, None],
 [2, None],
 [3, None],
 [4, 'government-and-politics'],
 [4, 'health'],
 [5, 'elections'],
 [5, 'nsw'],
 [6, None],
 [7, 'business-economics-and-finance'],
 [8, None],
 [9, 'information-and-communication'],
 [9, 'science-and-technology'],
 [10, 'business-economics-and-finance'],
 [10, 'health'],
 [11, 'act'],
 [12, 'health'],
 [12, 'science-and-technology'],
 [13, 'act'],
 [14, 'government-and-politics'],
 [15, 'sailing'],
 [16, 'australian-football-league'],
 [17, 'elections'],
 [17, 'nsw'],
 [18, None],
 [19, None],
 [20, 'government-and-politics'],
 [20, 'iraq'],
 [21, None],
 [22, 'government-and-politics'],
 [23, 'government-and-politics'],
 [24, None]]

In [69]:
first_day_articles = get_all_article_links_for_date(first_date)


------------ Page: 1 ------------
-- http://www.abc.net.au/news/archive/2003,02,19?page=1 --
Title:		Air NZ staff in Aust strike for pay rise
URL:		http://abc.net.au/news/2003-02-19/air-nz-staff-in-aust-strike-for-pay-rise/2688530
Timestamp:	February 19, 2003 23:10:00
Summary:	Air New Zealand says industrial action by its Australian-based staff early next week will cause minimal disruption.
MD5:		99daf21a008a9f78eea5a49a20dd8e6a
Topics:
	business-economics-and-finance
	pacific

Title:		States may be forced to label ethanol fuel
URL:		http://abc.net.au/news/2003-02-19/states-may-be-forced-to-label-ethanol-fuel/2688526
Timestamp:	February 19, 2003 23:06:00
Summary:	The Federal Government says all petrol pumps could be labelled within weeks
MD5:		fef4fc35247dc55194769bfaf46bc100
Topics:
	business-economics-and-finance
	environment

Title:		Korean subway fire: 314 still missing
URL:		http://abc.net.au/news/2003-02-19/korean-subway-fire-314-still-missing/2688528
Timestamp:	February 19, 2003

In [75]:
# prepare them as dicts with the addition of an index 
# starting from the specified index value
article_list = [article.to_dict_with_index(i) 
    for (i, article) 
    in enumerate(first_day_articles.values(), start=0)]
metadata_output_fields = ['index', 'title', 'url', 'timestamp', 'summary', 'md5']
                
# prepare the output files: meta_out for the metadata, topics_out for the topics
with open('test_metadata_file_out.csv', 'w', newline='') as meta_out:
    meta_writer = csv.DictWriter(meta_out, 
                                 fieldnames=metadata_output_fields, 
                                 extrasaction='ignore')
    meta_writer.writerows(article_list)

In [76]:
save_article_links_to_csv(datetime.date(2003,2,19),
                          datetime.date(2003,2,19),
                          'meta_out.csv',
                          'topics_out.csv')

------------ Page: 1 ------------
-- http://www.abc.net.au/news/archive/2003,02,19?page=1 --
Title:		Air NZ staff in Aust strike for pay rise
URL:		http://abc.net.au/news/2003-02-19/air-nz-staff-in-aust-strike-for-pay-rise/2688530
Timestamp:	February 19, 2003 23:10:00
Summary:	Air New Zealand says industrial action by its Australian-based staff early next week will cause minimal disruption.
MD5:		99daf21a008a9f78eea5a49a20dd8e6a
Topics:
	business-economics-and-finance
	pacific

Title:		States may be forced to label ethanol fuel
URL:		http://abc.net.au/news/2003-02-19/states-may-be-forced-to-label-ethanol-fuel/2688526
Timestamp:	February 19, 2003 23:06:00
Summary:	The Federal Government says all petrol pumps could be labelled within weeks
MD5:		fef4fc35247dc55194769bfaf46bc100
Topics:
	business-economics-and-finance
	environment

Title:		Korean subway fire: 314 still missing
URL:		http://abc.net.au/news/2003-02-19/korean-subway-fire-314-still-missing/2688528
Timestamp:	February 19, 2003

AttributeError: 'dict' object has no attribute 'topics'