### Guardian API access

You will need to apply for a free developer key in order to use the API fully within Jupyter:

[Guardian Open Platform - Getting started](https://open-platform.theguardian.com/access/)

You can explore what is possible with the API here:

[Guardian Open Platform - explore](https://open-platform.theguardian.com/explore/)

In [None]:
#import required libraries
import requests
import json
import re
import time

In [None]:
# read the personal API key from its private file, dropping surrounding whitespace
with open('private/guardian_key.txt', 'r') as key_file:
    raw_key = key_file.read()
key = raw_key.strip()
# display only the length as a sanity check, not the key itself
len(key)

In [None]:
# build the search URL from the query settings below
# urlencode percent-encodes each value, so search terms containing spaces,
# quotes or '&' cannot corrupt the query string.
from urllib.parse import urlencode

base_url = 'https://content.guardianapis.com/'
search_string = "ukraine"
production_office = "aus"
from_date = "2023-03-01"

# Everything except the API key; kept separate so it can be printed safely.
query_params = {
    'q': search_string,
    'production-office': production_office,
    'from-date': from_date,
    'show-fields': 'body',
}
full_url = base_url + "search?" + urlencode({**query_params, 'api-key': key})

# Show the URL without the key: slicing to a fixed length leaked the first
# characters of the key whenever the rest of the URL was short.
print(base_url + "search?" + urlencode(query_params))

In [None]:
# get data from server
# The timeout stops the cell from hanging indefinitely if the server never
# answers; raise_for_status() reports an HTTP error response here instead of
# as a KeyError on the 'response' lookup below.
response = requests.get(full_url, timeout=30)
response.raise_for_status()
resp_data = response.json()['response']
resp_data

In [None]:
# number of result pages the API reports for this query
num_pages = resp_data['pages']
num_pages

In [None]:
# Matches a single HTML tag. DOTALL lets '.' cross newlines so a tag whose
# attributes are wrapped over several lines is still removed.
_HTML_TAG = re.compile(r'<.*?>', re.DOTALL)


def articles_from_page_results(page_results):
    """Convert one page of search results into a title -> plain-text mapping.

    Args:
        page_results: iterable of result dicts. Each must have the keys
            'webPublicationDate' and 'webTitle'; the article HTML is read
            from result['fields']['body'] when present.

    Returns:
        dict mapping "<webTitle> [<webPublicationDate>]" to the article body
        with HTML tags removed. A result without a body maps to ''.

    Raises:
        KeyError: if a result lacks 'webPublicationDate' or 'webTitle'.
    """
    articles = {}
    for result in page_results:
        article_date = result['webPublicationDate']
        article_title = f"{result['webTitle']} [{article_date}]"
        # Tolerate results that come back without 'fields' or 'body' (or with
        # None there): keep the title with empty text rather than aborting
        # the whole page with a KeyError.
        article_html = (result.get('fields') or {}).get('body') or ''
        articles[article_title] = _HTML_TAG.sub('', article_html)
    return articles

In [None]:
def get_all_articles_for_response(response_json, full_url, delay=1, timeout=30):
    """Collect the articles from every result page of a search.

    The first page is taken from ``response_json`` (already fetched by the
    caller); pages 2..N are requested by appending ``&page=<n>`` to
    ``full_url``.

    Args:
        response_json: the 'response' object of the first search request;
            its 'pages', 'total' and 'results' keys are read.
        full_url: the search URL used for the first request, without a
            ``page`` parameter.
        delay: seconds to wait before each follow-up request.
        timeout: seconds to wait for each follow-up HTTP response.

    Returns:
        dict of article title -> article text, merged across all pages.
        Articles with the same title key overwrite each other.

    Raises:
        requests.HTTPError: if a follow-up page request returns an HTTP
            error status.
    """
    total_pages = response_json['pages']
    total_articles = response_json['total']
    print(f"Fetching {total_articles} articles from {total_pages} pages...")
    all_articles = {}
    all_articles.update(articles_from_page_results(response_json['results']))
    print("Added articles for page: 1")

    for page in range(2, total_pages + 1):
        # Pause before each request so we're not hitting the API too hard;
        # sleeping up front avoids a pointless wait after the final page.
        time.sleep(delay)
        print("Getting articles from API for page:", page)
        page_response = requests.get(full_url + f"&page={page}", timeout=timeout)
        # Stop on an HTTP error response instead of failing later with a
        # KeyError on the missing 'response' key.
        page_response.raise_for_status()
        page_data = page_response.json()['response']
        print("Processing results for page:", page_data['currentPage'])
        page_articles = articles_from_page_results(page_data['results'])
        print(f"Fetched {len(page_articles)} articles.")
        all_articles.update(page_articles)
        print("Added articles for page:", page)
        print(f"Status: {len(all_articles)} articles.")

    print(f"FINISHED: Fetched {len(all_articles)} articles.")
    return all_articles


In [None]:
# collect the articles from every result page (the first page is already in resp_data)
my_articles = get_all_articles_for_response(resp_data,full_url)

In [None]:
# summarise what was fetched: the article count, then one title per line
print("Total Articles:",len(my_articles))
# only titles are printed, so iterate the keys rather than unpacking unused text
for title in my_articles:
    print(title)

In [None]:
import os

file_path = "data/"
file_name = "ukraine_articles.json"

# Create the output folder on first run; open() fails if it does not exist.
if file_path:
    os.makedirs(file_path, exist_ok=True)

# os.path.join copes with file_path given with or without a trailing slash;
# json.dump writes straight to the file instead of building one big string.
with open(os.path.join(file_path, file_name), 'w', encoding='utf-8') as fp:
    json.dump(my_articles, fp)