News site: Get the news articles published on March 11-12. Decide which news site you want to get articles from. Make sure you get the following information:

- date
- title
- full article
- author


In [1]:
import json
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup

In [2]:
BASE_URL = 'https://content.guardianapis.com/search'

page_size = 50
QUERY_JSON = {
    'from-date': '2021-03-11',
    'to-date': '2021-03-12',
    'type': 'article',
    'page-size': str(page_size),
    'show-fields': 'byline',
    'show-blocks': 'body',
    'api-key': 'test'
}

QUERY_STRING = ''
first = True
for key, val in QUERY_JSON.items():
    QUERY_STRING += ('?' if first else '&') + key + '=' + val
    first = False
print(QUERY_STRING)

?from-date=2021-03-11&to-date=2021-03-12&type=article&page-size=50&show-fields=byline&show-blocks=body&api-key=test


In [3]:
'''
- date
- title
- full article
- author
'''

def get_article_json(news):
    date = news['webPublicationDate']
    title = news['webTitle']
    author = news['fields']['byline'] if 'fields' in news else 'anonymous'

    # Article
    blocks = news['blocks']           
    body = blocks['body']
    article_html = ''
    for block in range(blocks['totalBodyBlocks']):
        article_html += body[block]['bodyHtml'] + '\n'

    article = BeautifulSoup(article_html, "lxml").text

    article_json = {
        'date': date,
        'title': title,
        'full_article': article,
        'author': author
    }

    return article_json

In [4]:
html = requests.get(BASE_URL + QUERY_STRING)
res_json = json.loads(html.content)['response']

# Get Current Page
total_articles = res_json['total']
pages = res_json['pages'] + 1

articles_json = []

articles = res_json['results']
# 1st iteration
for i in range(page_size):
    articles_json.append(get_article_json(articles[i]))

# 2nd onwards
for page in range(2, pages):
    new_html = requests.get(BASE_URL + QUERY_STRING + '&page=' + str(page))
    new_res_json = json.loads(new_html.content)['response']
    
    start_index = new_res_json['startIndex']
    page_size = min(page_size, total_articles - start_index + 1)
    
    articles = new_res_json['results']
    
    for i in range(page_size):
        articles_json.append(get_article_json(articles[i]))

# articles_json

In [5]:
# verification
print(total_articles)
assert len(articles_json) == total_articles, 'Number of articles are not the same'

420


In [6]:
# Write to file
with open('march_11_2021-march_12_2021_the_guardian_articles.json', 'w') as fp:
    fp.write(json.dumps(articles_json, indent=4))
    fp.close()