In [None]:
import re
import shutil
import time
import timeit
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import requests
from IPython.core.interactiveshell import InteractiveShell
from wasabi import Printer

# Display the value of every top-level expression in a cell, not only the last
# one (IPython's default is "last_expr").
InteractiveShell.ast_node_interactivity = "all"


# Shared wasabi printer; the download loop below reports progress via msg.info().
msg = Printer()


In [None]:
def retrieve_and_format_request(**kwargs):
    """Send a GET request built from keyword arguments and return the response.

    Every keyword argument except ``base_url`` is sent as a query-string
    parameter. The parameters are handed to ``requests`` through ``params=``
    rather than being concatenated by hand, so values are percent-encoded
    (a title containing ``&`` no longer splits the query) and a list value is
    expanded into ``key=value`` pairs instead of being sent as its Python repr.

    Args:
        base_url: Optional endpoint URL. Defaults to
            ``https://api.trove.nla.gov.au/v2/result?``.
        **kwargs: Query parameters, passed through by name (names such as
            ``l-year`` must be supplied via ``**dict`` unpacking).

    Returns:
        The ``requests.Response`` object returned by ``requests.get``; the
        caller is responsible for decoding it (e.g. ``.json()``).
    """
    # copy so the caller's mapping is never mutated, then split off the endpoint
    params = dict(kwargs)
    base_url = params.pop("base_url", "https://api.trove.nla.gov.au/v2/result?")

    # let requests build and encode the query string
    return requests.get(base_url, params=params)


def get_articles(title_name, title_id, trove_key, year):
    """Fetch newspaper article records for one publication and one year.

    Builds a query against the Trove v2 ``result`` endpoint (zone
    ``newspaper``, ``state=qld``, ``n=200``, article text included) and sends
    it through ``retrieve_and_format_request``.

    Args:
        title_name: Publication name, used as the search text (``q``).
        title_id: Publication identifier, sent as the ``l-title`` filter.
        trove_key: API key, sent as ``key``.
        year: Year to restrict results to, sent as the ``l-year`` filter.

    Returns:
        The value found at ``response -> zone[0] -> records -> article`` in
        the decoded JSON, or ``False`` when that path is absent from the
        response.
    """
    query = {
        "base_url": "https://api.trove.nla.gov.au/v2/result?",
        "state": "qld",
        "zone": "newspaper",
        # plain string: wrapping it in a list sent the literal text "['<name>']"
        "q": title_name,
        "key": trove_key,
        "encoding": "json",
        "n": "200",
        "l-year": year,
        "l-title": title_id,
        "include": "articletext",
    }
    res = retrieve_and_format_request(**query).json()
    try:
        return res["response"]["zone"][0]["records"]["article"]
    except (KeyError, IndexError, TypeError):
        # only a missing/oddly-shaped payload means "no articles"; anything
        # else (e.g. KeyboardInterrupt) must not be swallowed
        return False


def snake_case_single(string):
    """Convert a free-text string to a snake_case identifier.

    The string is split into word tokens and individual punctuation
    characters. Tokens shorter than two characters (which includes every
    punctuation token) and the filler words in ``ignore`` are dropped; the
    remaining tokens are lower-cased and joined with underscores.

    Args:
        string: Text to convert.

    Returns:
        The snake_case form, or an empty string when no token survives.
    """
    ignore = {"and", "&"}
    parts = []
    # raw string: "\w" / "\s" in a normal literal are invalid escape sequences
    for token in re.findall(r"\w+|[^\w\s]", string):
        lowered = token.lower()
        # compare the lower-cased token so "And" is ignored just like "and";
        # the length test stays on the original token
        if len(token) >= 2 and lowered not in ignore:
            parts.append(lowered)
    return "_".join(parts)


def format_time(start, end):
    """Render the interval between two timestamps as ``HH:MM:SS.ss``.

    ``start`` and ``end`` are numbers of seconds (the notebook passes
    ``timeit.default_timer()`` readings). Hours and minutes are zero-padded to
    two digits; seconds are shown with two decimals.
    """
    elapsed = end - start
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{int(hours):0>2}:{int(minutes):0>2}:{seconds:05.2f}"


## Retrieve list of QLD newspapers

In [None]:
# NOTE(review): `trove_key` is not defined in any cell visible in this
# notebook; it must be set before this cell runs.
qld_newspapers = {
    "base_url": "https://api.trove.nla.gov.au/v2/newspaper/titles?",
    "state": "qld",
    "key": trove_key,
    "encoding": "json",
    "n": "200",
    "include": "years",
}

# Fail loudly on an HTTP error status instead of letting it surface later as a
# confusing KeyError / JSON decode error.
qld_newspapers_response = retrieve_and_format_request(**qld_newspapers)
qld_newspapers_response.raise_for_status()
qld_newspapers_res = qld_newspapers_response.json()["response"]["records"][
    "newspaper"
]

qld_publications = (
    pd.DataFrame.from_records(qld_newspapers_res)
    # parse the date columns into timestamps
    .assign(endDate=lambda x: x.endDate.apply(pd.to_datetime))
    .assign(startDate=lambda x: x.startDate.apply(pd.to_datetime))
    # difference of calendar years, computed column-wise rather than row by row
    .assign(num_operating_years=lambda x: x.endDate.dt.year - x.startDate.dt.year)
)

# save out, creating the target directory first so the write cannot fail with
# FileNotFoundError on a fresh checkout
qld_publications_path = Path("../datasets/qld_publications.csv")
qld_publications_path.parent.mkdir(parents=True, exist_ok=True)
qld_publications.to_csv(qld_publications_path, index=False)


## For each QLD publication, retrieve up to 200 newspaper articles for each year of operation

In [None]:
save_dir_prefix = Path("../datasets/publications/")

# Throttle settings: pause once this many requests have been sent since the
# last pause.
REQUESTS_BEFORE_PAUSE = 95
PAUSE_SECONDS = 30
request_counter = 0

# for each publication (title)
for _, publication in qld_publications.iterrows():
    # the text before the first "(" is used as the publication name
    title_prefix = publication["title"].split("(")[0]

    # create a fresh save dir; anything previously saved under this name is
    # removed first
    # NOTE(review): two titles sharing the same text before "(" map to the same
    # directory, so the later one would delete the earlier one's files —
    # confirm the names are unique.
    save_dir = save_dir_prefix / snake_case_single(title_prefix)
    if save_dir.exists():
        shutil.rmtree(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)
    msg.info(f"Created directory: {save_dir}")

    before = timeit.default_timer()
    # for each operating year of the publication; `+ 1` because range() excludes
    # its stop value and the final year of operation must be fetched too
    first_year = publication["startDate"].year
    last_year = publication["endDate"].year
    for year in range(first_year, last_year + 1):
        # Check before every request (not once per publication) and use >= so
        # the pause cannot be skipped when the counter steps past the limit.
        if request_counter >= REQUESTS_BEFORE_PAUSE:
            msg.info(f"Sleeping for {PAUSE_SECONDS} seconds to prevent throttling..")
            time.sleep(PAUSE_SECONDS)
            request_counter = 0  # and reset

        res = get_articles(
            title_prefix.strip(), publication["id"], trove_key, str(year)
        )

        if res:
            # one CSV per year, named <save_dir>/<year>.csv
            article_year_save_path = save_dir / str(year)
            (
                pd.DataFrame(res)
                .drop_duplicates(subset=["articleText"])
                # flatten the nested `title` record into two plain columns
                .assign(title_name=lambda x: x.title.apply(lambda y: y["value"]))
                .assign(title_id=lambda x: x.title.apply(lambda y: y["id"]))
                .to_csv(f"{article_year_save_path.as_posix()}.csv", index=False)
            )

        # increment request counter
        request_counter += 1
    after = timeit.default_timer()
    msg.info(
        f"Finished collecting articles for {title_prefix} in {format_time(before, after)}"
    )
