In [69]:
import json
import time
from scraper import getheadlines, getheadlines_JS
from textanalyser import generatesentiment
import boto
from time import gmtime, strftime

In [70]:
def _publish_public_json(bucket, key_name, payload):
    """Store ``payload`` (a string) in ``bucket`` under ``key_name`` with a public-read ACL."""
    key = bucket.new_key(key_name)
    key.set_contents_from_string(payload)
    key.set_acl('public-read')


def generateheadlinefile(source_file, output_file, s3key, scraperfn):
    """Scrape headlines, score their sentiment and publish the result to S3.

    For every entry of the JSON list in ``source_file`` the headlines are
    fetched with ``scraperfn`` and passed one by one to ``generatesentiment``.
    The collected data is uploaded to the ``data.newsemote`` bucket three ways:

    * ``<s3key>/<YYYY-MM-DD>.json`` -- snapshot named after the current UTC date,
    * ``<s3key>/latest.json``       -- copy of the same snapshot,
    * ``<s3key>/last7.json``        -- JSON list with the names of the (up to)
      seven most recently modified snapshot keys under ``<s3key>/``.

    All three keys are made ``public-read``.

    Args:
        source_file: Path to a JSON file holding a list of objects; each object
            is read for its ``"name"``, ``"url"`` and ``"xpath"`` entries.
        output_file: Unused. Retained so existing callers keep working; the
            local-file dump it used to feed is disabled.
        s3key: Folder-like key prefix inside the bucket (e.g. ``"au"``).
        scraperfn: Callable ``(url, xpath) -> iterable of headlines``.

    Returns:
        None. The function works through its S3 side effects and prints the
        per-source sentiments and the final file list.
    """
    with open(source_file) as data_file:
        sources = json.load(data_file)

    # NOTE: "time" is the local time in the locale's "%c" format, whereas the
    # snapshot file name below is derived from the UTC date.
    data = {"time": time.strftime("%c"), "news-data": []}

    for source in sources:
        headlines = scraperfn(source["url"], source["xpath"])
        sentiments = [generatesentiment("headline", headline) for headline in headlines]
        print(sentiments)
        data["news-data"].append({
            "name": source["name"],
            "sentiments": sentiments,
        })

    # Serialise once; the dated snapshot and latest.json carry the same body.
    payload = json.dumps(data)
    folder = s3key + '/'
    dated_name = folder + strftime("%Y-%m-%d", gmtime()) + '.json'
    latest_name = folder + 'latest.json'
    last7_name = folder + 'last7.json'

    s3 = boto.connect_s3()
    # NOTE(review): create_bucket is used here to get a handle on the bucket;
    # presumably it returns the existing bucket when the caller already owns
    # it -- confirm, or switch to get_bucket.
    bucket = s3.create_bucket('data.newsemote')

    _publish_public_json(bucket, dated_name, payload)
    _publish_public_json(bucket, latest_name, payload)

    # Build the "last 7 snapshots" index. Both bookkeeping files must be left
    # out (the index previously listed itself because it was compared against
    # the bare name 'last7.json'), and the prefix carries a trailing slash so
    # that sibling folders sharing the same leading characters are not listed.
    bookkeeping = (latest_name, last7_name)
    snapshots = [k for k in bucket.list(prefix=folder) if k.name not in bookkeeping]
    snapshots.sort(key=lambda k: k.last_modified, reverse=True)
    filelist = [k.name for k in snapshots[:7]]
    print(filelist)

    _publish_public_json(bucket, last7_name, json.dumps(filelist))

In [71]:
# Publish one sentiment snapshot per region. Each row is
# (sources file, local output name, S3 folder, scraper function).
for sources_path, local_name, region, scraper in (
    ('news_sources_au.json', 'data-au.json', "au", getheadlines),
    ('news_sources_us.json', 'data-us.json', 'us', getheadlines_JS),
):
    generateheadlinefile(sources_path, local_name, region, scraper)

[{'headline': "Donald Trump Syria chemical attack 'has changed my view of Assad' ", 'sentiment': {'neg': 0.237, 'neu': 0.763, 'pos': 0.0, 'compound': -0.4767}, 'v_neg': [], 's_neg': [{'word': 'attack', 'score': -2.1}], 'v_pos': [], 's_pos': []}, {'headline': "Tinned spaghetti pizza New Zealand's prime minister shocks with 'monstrous' recipe ", 'sentiment': {'neg': 0.206, 'neu': 0.794, 'pos': 0.0, 'compound': -0.3818}, 'v_neg': [], 's_neg': [{'word': 'shocks', 'score': -1.6}], 'v_pos': [], 's_pos': []}, {'headline': "Centrelink scandal Welfare advocates say 'income averaging' creating incorrect debts ", 'sentiment': {'neg': 0.221, 'neu': 0.611, 'pos': 0.168, 'compound': -0.1779}, 'v_neg': [], 's_neg': [{'word': 'scandal', 'score': -1.9}], 'v_pos': [], 's_pos': [{'word': 'creating', 'score': 1.2}]}, {'headline': 'Qantas Non-stop Sydney-to-London flights could happen by 2022 ', 'sentiment': {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, 'v_neg': [], 's_neg': [], 'v_pos': [], 's_po