In [25]:
import requests
import json
from os import makedirs
from os.path import join, exists
from datetime import date, timedelta

In [26]:
# Guardian Content API search endpoint. HTTPS is used because the API key is
# sent as a query parameter and must not travel in plaintext.
API_ENDPOINT = 'https://content.guardianapis.com/search'

# The key is read from a local text file so it never appears in the notebook.
with open('guardian-api.txt', 'r') as f:
    API_KEY = f.read().strip()

# Directory that receives one JSON file per downloaded day.
DATA_DIR = join('data', 'guardian_articles')
makedirs(DATA_DIR, exist_ok=True)

In [27]:
# Base query sent with every search request. The empty-string entries
# ('from-date', 'to-date', 'page') are placeholders that the download loop
# overwrites before each call.
search_params = dict([
    ('from-date', ''),
    ('to-date', ''),
    ('order-by', 'newest'),
    ('show-fields', 'all'),
    ('page-size', 50),
    ('api-key', API_KEY),
    ('page', ''),
])

# Example search query:
# https://content.guardianapis.com/search?q=debate%20AND%20economy&tag=politics/politics&from-date=2014-01-01&api-key=test


In [28]:
# Download the search results for every day in [start_date, end_date]
# (inclusive) and store them as one JSON file per day under DATA_DIR.
# Days whose file already exists are skipped, so the cell can be re-run
# to resume an interrupted download.
start_date = date(2017, 6, 1)
end_date = date(2018, 3, 30)
delta = end_date - start_date

# Seconds to wait on the API before giving up, so a single stalled
# connection cannot hang the whole multi-month download.
REQUEST_TIMEOUT = 30

for i in range(delta.days + 1):
    dt = start_date + timedelta(i)
    dtstr = dt.strftime('%Y-%m-%d')
    filename = join(DATA_DIR, dtstr + '.json')

    # A file on disk marks the day as already downloaded.
    if exists(filename):
        continue

    search_params['from-date'] = dtstr
    search_params['to-date'] = dtstr
    current_page = 1
    total_pages = 1  # placeholder; replaced by the count in the first response
    pages_data = []

    # Collect the results of every page for this day.
    while current_page <= total_pages:
        print("getting page no .... ", current_page)
        search_params['page'] = current_page
        response = requests.get(
            API_ENDPOINT, params=search_params, timeout=REQUEST_TIMEOUT
        )
        # Fail with a clear HTTP error instead of an opaque KeyError below;
        # error payloads may not contain the 'response' key.
        response.raise_for_status()
        data = response.json()
        results = data['response']['results']
        total_pages = data['response']['pages']
        pages_data.extend(results)
        current_page += 1

    # Serialize before opening the file: if serialization fails, no empty
    # file is left behind to be mistaken for a completed day on the next run.
    payload = json.dumps(pages_data, indent=2)
    print("Writing to {} ...".format(filename))
    with open(filename, 'w') as f:
        f.write(payload)
            
            
    

getting page no ....  1
getting page no ....  2
getting page no ....  3
getting page no ....  4
getting page no ....  5
getting page no ....  6
Writing to data/articles/2017-06-01.json ...
getting page no ....  1
getting page no ....  2
getting page no ....  3
getting page no ....  4
getting page no ....  5
getting page no ....  6
Writing to data/articles/2017-06-02.json ...
getting page no ....  1
getting page no ....  2
getting page no ....  3
getting page no ....  4
Writing to data/articles/2017-06-03.json ...
getting page no ....  1
getting page no ....  2
getting page no ....  3
getting page no ....  4
Writing to data/articles/2017-06-04.json ...
getting page no ....  1
getting page no ....  2
getting page no ....  3
getting page no ....  4
getting page no ....  5
Writing to data/articles/2017-06-05.json ...
getting page no ....  1
getting page no ....  2
getting page no ....  3
getting page no ....  4
getting page no ....  5
Writing to data/articles/2017-06-06.json ...
getting pa