### Imports and API Key

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import os
import re
from alive_progress import alive_bar

# The Guardian API key.
# Prefer the GUARDIAN_API_KEY environment variable so a personal key can be
# supplied without editing the notebook; the literal is kept only as a
# backward-compatible fallback (also used when the variable is set but empty).
# NOTE(review): a key stored in a shared notebook is effectively public --
# consider rotating it and removing the fallback.
keyG = os.environ.get("GUARDIAN_API_KEY") or "fad78733-31a0-4ea7-8823-ba815b578899"


### Useful functions and API query setup

In [2]:
# Function that applies every substring replacement listed in a dict
def replace_all(text, dic):
    """Return `text` after applying each `old -> new` pair from `dic`.

    Replacements are applied one after another in the dict's iteration
    order, so a later pair also sees the output of earlier pairs.
    """
    cleaned = text
    for needle in dic:
        cleaned = cleaned.replace(needle, dic[needle])
    return cleaned


# Function that returns the number of articles in the current API query page
def numArticlesInPage(json):
    """Return how many articles the given search-response page should hold.

    Parameters
    ----------
    json : dict
        Decoded API response. Reads ``json["response"]["total"]`` and
        ``json["response"]["startIndex"]``; ``json["response"]["pageSize"]``
        is used when present, otherwise a page size of 200 is assumed.
        (The parameter name shadows the ``json`` module inside this function;
        it is kept for backward compatibility.)

    Returns
    -------
    int
        ``min(page size, total - startIndex + 1)``, never negative, and 0
        when ``total`` is 0 or less.
    """
    response = json["response"]
    total = response["total"]

    # An empty result set must yield 0; the plain formula below would give 1
    # when both total and startIndex are 0.
    if total <= 0:
        return 0

    page_size = response.get("pageSize", 200)

    # The "+ 1" treats startIndex as the 1-based position of the page's first item.
    remaining = total - response["startIndex"] + 1
    return max(0, min(page_size, remaining))


# Query setup function
def guardian(page, timeout=30):
    """Request one page of the Guardian content search and return the response.

    Parameters
    ----------
    page : int
        Page number to request.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout, so a stalled
        connection raises instead of blocking the notebook indefinitely.

    Returns
    -------
    requests.Response
        The raw response; callers decode it with ``.json()``.
    """
    # Let requests build and encode the query string instead of concatenating it by hand.
    query = {
        "api-key": keyG,
        "from-date": "2022-02-01",
        "type": "article",
        "page": page,
        "tag": "world/ukraine",
        "order-by": "newest",
        "show-fields": "body",
        "page-size": 200,
    }
    return requests.get(
        "https://content.guardianapis.com/search",
        params=query,
        timeout=timeout,
    )


# Dict of undesirable substrings: every phrase below maps to "" so that
# replace_all() strips it from the article text.
rep = dict.fromkeys(
    (
        "Sign up to First Edition, our free daily newsletter – every weekday morning at 7am",
        "Sign up to First Edition, our free daily newsletter – every weekday at 7am BST",
        "Sign up to receive Guardian Australia’s fortnightly Rural Network email newsletter",
        "Sign up for the Rural Network email newsletter Join the Rural Network group on Facebook to be part of the community",
        "Sign up to the daily Business Today email or follow Guardian Business on Twitter at @BusinessDesk",
        "Photograph:",
        "Related:",
    ),
    "",
)

# Instancing a query to fetch basic information.
# Page 1 is requested because the page count and total are read from the
# response itself; asking for a fixed later page (previously 14) only works
# while the query still has at least that many pages.
json_guardian = guardian(1).json()
numPages = json_guardian["response"]["pages"]
numArticles = json_guardian["response"]["total"]

print(f"-> There are {numArticles} articles in {numPages} pages for the period and tags specified.")

-> There are 2655 articles in 14 pages for the period and tags specified.


### API Query

In [None]:
# Instancing: one list per output column, filled in lockstep
urls = []
titles = []
bodies = []
dates = []

# Scraper
with alive_bar(numArticles, title="-> API Query", spinner="dots_waves", bar="smooth", force_tty=True) as bar:

    # Going through all pages available for the query
    for page in range(1, numPages + 1):

        json_guardian = guardian(page).json()

        # Iterate over the results the page actually contains rather than a
        # computed count, so a mismatch between the two cannot raise IndexError.
        for article in json_guardian["response"]["results"]:

            urls.append(article["webUrl"])
            titles.append(article["webTitle"])
            dates.append(article["webPublicationDate"])

            # Fall back to an empty body if an item lacks "fields"/"body",
            # keeping the four lists the same length.
            body_html = article.get("fields", {}).get("body", "")
            body_text = BeautifulSoup(body_html, "html.parser").get_text()

            body_text = replace_all(body_text, rep)  # replacing substrings
            body_text = re.sub(r"[\t\r\n]", "", body_text)  # removing line breaks
            bodies.append(body_text)
            bar()

# Build the result table from the four collected lists (one column each)
dict_data = dict(URL=urls, Date=dates, Title=titles, Text=bodies)
data = pd.DataFrame(data=dict_data)

# Make sure the data folder exists under the working directory, then write
# the table there with both the row index and the header row included
os.makedirs(os.getcwd() + "/data", exist_ok=True)
data.to_csv("data/guardian.csv", header=True, index=True)

print(f"-> {len(data)} articles fetched successfully!")

In [4]:
# Preview the scraped dataset: show up to the first 25 rows
data.head(25)

Unnamed: 0,URL,Date,Title,Text
0,https://www.theguardian.com/world/2022/jun/17/...,2022-06-17T00:31:37Z,Russia-Ukraine war: what we know on day 114 of...,Hundreds of civilians sheltering at the Azot ...
1,https://www.theguardian.com/world/2022/jun/16/...,2022-06-16T16:59:27Z,"Scholz, Macron and Draghi vow support for Ukra...","The leaders of France, Germany and Italy have ..."
2,https://www.theguardian.com/law/2022/jun/16/ru...,2022-06-16T16:28:07Z,Russian spy caught trying to infiltrate war cr...,A Russian spy tried and failed to secure an in...
3,https://www.theguardian.com/world/2022/jun/16/...,2022-06-16T15:16:27Z,Hundreds of Ukrainian refugees left homeless i...,Hundreds of Ukrainian families have been left ...
4,https://www.theguardian.com/world/2022/jun/16/...,2022-06-16T13:13:29Z,UK aims sanctions at Russians accused of abduc...,A fresh wave of sanctions against Russia has b...
5,https://www.theguardian.com/world/2022/jun/16/...,2022-06-16T09:55:42Z,Plan needed to make Russia pay reparations to ...,Western and Ukrainian rhetoric claiming Russia...
6,https://www.theguardian.com/world/2022/jun/16/...,2022-06-16T04:00:49Z,‘It was worse than hell’: life in Mariupol und...,A month after the end of the siege at Mariupol...
7,https://www.theguardian.com/us-news/2022/jun/1...,2022-06-16T00:12:21Z,Two US volunteers in Ukraine feared taken pris...,Two American volunteers in Ukraine have gone m...
8,https://www.theguardian.com/world/2022/jun/16/...,2022-06-16T00:01:33Z,Russia-Ukraine war: what we know on day 113 of...,French President Emmanuel Macron praised the ...
9,https://www.theguardian.com/uk-news/2022/jun/1...,2022-06-15T22:53:21Z,Ex-Russian football captain Igor Denisov conde...,"Igor Denisov, the former captain of Russia’s n..."
