### Imports and API Key

In [9]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import os
import re
from alive_progress import alive_bar

# The Guardian API key.
# Read from the GUARDIAN_API_KEY environment variable when it is set, so the
# secret does not have to live in the notebook. The literal is kept only as a
# backward-compatible fallback.
# NOTE(review): a key committed to a notebook should be treated as leaked —
# rotate it and drop the fallback once the environment variable is in place.
keyG = os.environ.get("GUARDIAN_API_KEY", "fad78733-31a0-4ea7-8823-ba815b578899")


### Useful functions and API query setup

In [10]:
# Directories
# "__file__" is a plain string here (not the module attribute), so abspath()
# resolves it against the current working directory; ROOT_DIR is therefore the
# directory the kernel was started in, and PARENT_DIR is one level above it.
ROOT_DIR = os.path.split(os.path.abspath("__file__"))[0]
PARENT_DIR = os.path.split(ROOT_DIR)[0]

# Applies every substitution of a {substring: replacement} dict to a text
def replace_all(text, dic):
    """Return `text` after applying each `dic` substitution in turn.

    Replacements are applied sequentially in the dict's iteration order, so a
    later entry can act on the output of an earlier one.
    """
    cleaned = text
    for target in dic:
        cleaned = cleaned.replace(target, dic[target])
    return cleaned


# Function that returns the number of articles in the current API query page
def numArticlesInPage(json):
    """Return how many articles the given search-response page holds.

    Parameters
    ----------
    json : dict
        Decoded API response. Reads ``json["response"]["total"]`` and
        ``json["response"]["startIndex"]``; ``json["response"]["pageSize"]``
        is used when present and otherwise assumed to be 200.

    Returns
    -------
    int
        The page size for a full page, the remainder for the last page, and
        0 when the query matched nothing.
    """
    response = json["response"]
    total = response["total"]

    # An empty result set has no articles; without this guard the arithmetic
    # below could report a positive count and make callers index past the end.
    if total <= 0:
        return 0

    # Use the page size reported by the response rather than a hard-coded
    # value; fall back to 200 when the field is absent.
    page_size = response.get("pageSize", 200)

    # The "+ 1" treats startIndex as the 1-based position of the page's
    # first article within the whole result set.
    remaining = total - response["startIndex"] + 1
    return max(0, min(page_size, remaining))


# Query setup function
def guardian(page):
    """Request one page of Guardian search results for the Ukraine tag.

    The query asks for articles tagged ``world/ukraine`` published from
    2022-02-01 onwards, newest first, 200 per page, with the article body
    included as a field.

    Parameters
    ----------
    page : int
        Page number to request.

    Returns
    -------
    requests.Response
        The HTTP response; call ``.json()`` on it to get the payload.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within the timeout.
    """
    response = requests.get(
        "https://content.guardianapis.com/search",
        # Passing the query as a dict lets requests handle URL encoding.
        params={
            "api-key": keyG,
            "from-date": "2022-02-01",
            "type": "article",
            "page": page,
            "tag": "world/ukraine",
            "order-by": "newest",
            "show-fields": "body",
            "page-size": 200,
        },
        # Without a timeout a stalled connection would block the notebook forever.
        timeout=30,
    )
    # Fail here with the HTTP status instead of later with an opaque KeyError
    # when the error payload lacks the expected "response" key.
    response.raise_for_status()
    return response


# Boilerplate substrings to strip from article bodies; each maps to the empty
# string so that replace_all() simply deletes it.
rep = dict.fromkeys(
    (
        "Sign up to First Edition, our free daily newsletter – every weekday morning at 7am",
        "Sign up to First Edition, our free daily newsletter – every weekday at 7am BST",
        "Sign up to receive Guardian Australia’s fortnightly Rural Network email newsletter",
        "Sign up for the Rural Network email newsletter Join the Rural Network group on Facebook to be part of the community",
        "Sign up to the daily Business Today email or follow Guardian Business on Twitter at @BusinessDesk",
        "Photograph:",
        "Related:",
    ),
    "",
)

# Probe query to fetch the result-set size before scraping.
# Page 1 is requested because it exists for every non-empty result set; a
# fixed higher page number breaks as soon as the query has fewer pages.
json_guardian = guardian(1).json()
numPages = json_guardian["response"]["pages"]
numArticles = json_guardian["response"]["total"]

print(f"-> There are {numArticles} articles in {numPages} pages for the period and tags specified.")

-> There are 2660 articles in 14 pages for the period and tags specified.


### API Query

In [13]:
# Accumulators: one entry per article, kept in lockstep across the four lists
urls = []
titles = []
bodies = []
dates = []

# Patterns compiled once instead of on every article
author_suffix = re.compile(r"\s*\|.*$")  # "| Author" tail of a headline, plus the whitespace before it
line_breaks = re.compile(r"[\t\r\n]")

# Scraper
with alive_bar(numArticles, title="-> API Query", spinner="dots_waves", bar="smooth", force_tty=True) as bar:

    # Going through all pages available for the query
    for page in range(1, numPages + 1):

        json_guardian = guardian(page).json()

        # Iterate over the articles actually returned rather than a computed
        # count, so a result set that changes between requests cannot cause
        # an out-of-range index.
        for article in json_guardian["response"]["results"]:

            urls.append(article["webUrl"])
            dates.append(article["webPublicationDate"])

            # Removing authors from titles
            titles.append(author_suffix.sub("", article["webTitle"]))

            # The body arrives as HTML: reduce it to plain text, drop the
            # boilerplate substrings, then remove tabs and line breaks.
            body = BeautifulSoup(article["fields"]["body"], "html.parser").get_text()
            body = replace_all(body, rep)
            bodies.append(line_breaks.sub("", body))
            bar()

# Transforming fetched info to dataframe
data = pd.DataFrame({"URL": urls, "Date": dates, "Title": titles, "Text": bodies})

# Saving to csv (the row index is written as the first, unnamed column)
data_dir = os.path.join(PARENT_DIR, "data")
os.makedirs(data_dir, exist_ok=True)
data.to_csv(os.path.join(data_dir, "guardian.csv"), index=True, header=True)

print(f"-> {len(data)} articles fetched successfully!")

-> API Query |████████████████████████████████████████| 2660/2660 [100%] in 40.4s (65.78/s)                             
-> 2660 articles fetched successfully!


In [12]:
# Preview: the first 25 fetched articles
data.iloc[:25]

Unnamed: 0,URL,Date,Title,Text
0,https://www.theguardian.com/world/2022/jun/17/...,2022-06-17T16:43:26Z,France and Turkey propose rival plans to get g...,Rival plans to export Ukraine’s vitally needed...
1,https://www.theguardian.com/politics/2022/jun/...,2022-06-17T15:57:58Z,Boris Johnson promises Ukraine UK-led troop tr...,Boris Johnson has announced that the UK will o...
2,https://www.theguardian.com/tv-and-radio/2022/...,2022-06-17T14:11:10Z,UK is asked to host Eurovision in 2023 after U...,The UK has been offered the chance to host Eur...
3,https://www.theguardian.com/world/2022/jun/17/...,2022-06-17T12:15:51Z,EU says Ukraine should be given candidate stat...,The European Commission has said Ukraine shoul...
4,https://www.theguardian.com/world/2022/jun/17/...,2022-06-17T11:51:35Z,Third American volunteer fighter reported miss...,A third American volunteer fighting in Ukraine...
5,https://www.theguardian.com/world/2022/jun/17/...,2022-06-17T00:31:37Z,Russia-Ukraine war: what we know on day 114 of...,The EU’s executive arm has recommended that U...
6,https://www.theguardian.com/world/2022/jun/16/...,2022-06-16T16:59:27Z,"Scholz, Macron and Draghi vow support for Ukra...","The leaders of France, Germany and Italy have ..."
7,https://www.theguardian.com/law/2022/jun/16/ru...,2022-06-16T16:28:07Z,Russian spy caught trying to infiltrate war cr...,A Russian spy tried and failed to secure an in...
8,https://www.theguardian.com/world/2022/jun/16/...,2022-06-16T15:16:27Z,Hundreds of Ukrainian refugees left homeless i...,Hundreds of Ukrainian families have been left ...
9,https://www.theguardian.com/world/2022/jun/16/...,2022-06-16T13:13:29Z,UK aims sanctions at Russians accused of abduc...,A fresh wave of sanctions against Russia has b...
