# core

> Corpress functions. You can call any of these functions directly, but use the `corpress` function if you want to gather data and output a corpus in one step.

In [1]:
#| default_exp core

In [2]:
#| hide
from nbdev.showdoc import *

In [3]:
#| export
import requests
from bs4 import BeautifulSoup
import logging
import time
import os
import glob
import json
import html
import pandas as pd
import csv
from slugify import slugify

# Configure the root logger: INFO level and above, each message prefixed with a timestamp and its level name
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

In [4]:
#| hide
# Base directory for the files that the hidden test cells in this notebook read and write
basepath_for_test_data = '../test_data/'

In [5]:
#| hide
# Test cases for get_api_url, keyed by scenario name. Each case holds:
#   url           - the URL passed to get_api_url
#   endpoint_type - the endpoint type passed to get_api_url
#   endpoint_url  - the value get_api_url is expected to return (None when no route is expected)
#   description   - text printed when the case is run
urls_for_testing = {
    'has_link': {
        'url': 'https://adho.org/',
        'endpoint_type': 'posts',
        'endpoint_url': 'https://adho.org/wp-json/wp/v2/posts',
        'description': 'Example URL with a <link> tag broadcasting the REST API endpoint' 
    }, 
    'no_link': {
        'url': 'https://www.whitehouse.gov/',
        'endpoint_type': 'posts',
        'endpoint_url': 'https://www.whitehouse.gov/wp-json/wp/v2/posts',
        'description': 'Example URL with no <link> tag and no REST API endpoint' 
    }, 
    'api_link': {
        'url': 'https://adho.org/wp-json/',
        'endpoint_type': 'pages',
        'endpoint_url': 'https://adho.org/wp-json/wp/v2/pages',
        'description': 'Example URL, which is the REST API endpoint' 
    }, 
    'error_status': {
        'url': 'https://httpstat.us/403',
        'endpoint_type': 'posts',
        'endpoint_url': None,
        'description': 'Example URL, which returns an error status code' 
    }, 
}


In [24]:
#| export
def get_api_url(url: str, # the URL of the WordPress website 
                endpoint_type: str = 'posts', # posts or pages
                headers: dict = None, # optional headers for requests
                ): # None if no endpoint detected, otherwise returns the endpoint URL
    """Queries a URL to get the REST API route for the endpoint type provided.

    The route is resolved in this order:

    1. If `url` already ends with the route (`wp/v2/posts` or `wp/v2/pages`) it is returned unchanged and no request is made.
    2. If `url` returns JSON with a `routes` mapping, the `self` link of the route is returned.
    3. If the response is not JSON, or the route cannot be read from `routes`, the markup is searched for a
       `<link rel="https://api.w.org/">` tag and the route is appended to its `href`.
    4. If no such link is found, the route is guessed as `<url>/wp-json/<route>`.

    None is returned when `endpoint_type` is not posts or pages, when the request fails
    (error status, timeout or connection error), or when the URL returns JSON without a `routes` mapping.
    """

    if not headers:
        headers = {}

    if endpoint_type == 'posts':
        endpoint = 'wp/v2/posts'
    elif endpoint_type == 'pages':
        endpoint = 'wp/v2/pages'
    else:
        logging.error('The endpoint must be posts or pages.')
        return None

    # The URL is already the route: nothing to look up
    if url.endswith(endpoint):
        logging.info(f'URL {url} appears to be REST API {endpoint_type} route')
        return url

    endpoint_url = None

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        try:
            json_data = response.json()
            if isinstance(json_data, dict) and 'routes' in json_data:
                endpoint_url = json_data['routes']['/' + endpoint]['_links']['self'][0]['href']
                logging.info('URL is REST API endpoint')
                logging.info(f'Extracted {endpoint_type} route {endpoint_url}')
            else:
                logging.warning(f'{url} returned JSON without a routes mapping, no {endpoint_type} route detected')
        except (requests.JSONDecodeError, KeyError, IndexError, TypeError):
            # Not JSON, or the route could not be read from the JSON: fall back to the markup
            soup = BeautifulSoup(response.content, 'lxml')
            link = soup.find('link', rel="https://api.w.org/")
            api_root = link.get('href') if link else None
            if api_root:
                if not api_root.endswith('/'):
                    api_root = api_root + '/'
                endpoint_url = api_root + endpoint
                logging.info('Found REST API endpoint link')
                logging.info(f'Setting {endpoint_type} route {endpoint_url}')
            else:
                # Last resort: assume the default location of the REST API below the site URL
                site_root = url if url.endswith('/') else url + '/'
                endpoint_url = site_root + 'wp-json/' + endpoint
                logging.info('No REST API endpoint link in markup')
                logging.info(f'Guessing {endpoint_type} route based on URL {endpoint_url}')
    except requests.HTTPError as e:
        logging.error(f'{url} returned status code {response.status_code}')
    except (requests.Timeout, requests.ConnectionError) as e:
        # Same failure types that get_json handles; report them instead of raising
        logging.error(f'Error requesting {url} ({e})')

    return endpoint_url


In [25]:
#| hide
# Resolve the posts route for adho.org; the log output shows which detection path was taken
endpoint_url = get_api_url('https://adho.org/', 'posts')

2024-08-23 14:33:46 - INFO - Found REST API endpoint link
2024-08-23 14:33:46 - INFO - Setting posts route https://adho.org/wp-json/wp/v2/posts


In [26]:
#| hide
# Run every scenario in urls_for_testing and compare the detected route with the expected one
for case in urls_for_testing.values():
    print(f"Testing {case['description']}: {case['url']}")
    detected = get_api_url(case['url'], case['endpoint_type'])
    assert detected == case['endpoint_url']
    print()
    # pause between scenarios so the test sites are not hit in quick succession
    time.sleep(1)

Testing Example URL with a <link> tag broadcasting the REST API endpoint: https://adho.org/


2024-08-23 14:33:50 - INFO - Found REST API endpoint link
2024-08-23 14:33:50 - INFO - Setting posts route https://adho.org/wp-json/wp/v2/posts



Testing Example URL with no <link> tag and no REST API endpoint: https://www.whitehouse.gov/


2024-08-23 14:33:52 - INFO - No REST API endpoint link in markup
2024-08-23 14:33:52 - INFO - Guessing posts route based on URL https://www.whitehouse.gov/wp-json/wp/v2/posts



Testing Example URL, which is the REST API endpoint: https://adho.org/wp-json/


2024-08-23 14:33:55 - INFO - URL is REST API endpoint
2024-08-23 14:33:55 - INFO - Extracted pages route https://adho.org/wp-json/wp/v2/pages



Testing Example URL, which returns an error status code: https://httpstat.us/403


2024-08-23 14:33:57 - ERROR - https://httpstat.us/403 returned status code 403





In [15]:
#| export
def get_json(endpoint_url: str, # the URL of the WordPress REST API endpoint
             endpoint_type: str = 'posts', # the type of data to download
             headers: dict = None, # optional headers for requests
             params: dict = None, # optional parameters to pass to the API
             json_save_path: str = None, # path to save the JSON data 
             seconds_between_requests: int = 5, # number of seconds to wait between requests, must be at least 1
             max_pages: int = None # maximum number of pages to download
            ) -> bool: # True if successful, False otherwise 
    """Download and save JSON data from a specific REST API endpoint.

    Pages are requested one at a time, starting at page 1, and each response body is written to
    `<json_save_path>/<endpoint_type>-<page>.json`. The page number is zero-padded to the width of
    the total page count, which is read from the `X-WP-TotalPages` header of the first successful
    response. Downloading stops after the last page, or after `max_pages` pages when that is set.

    Returns False when a required argument is missing or invalid, when a request returns an error
    status, when the total page count cannot be read, or after more than three consecutive
    timeouts/connection errors. The `params` dict passed by the caller is not modified.
    """

    if not endpoint_url:
        logging.error('No endpoint URL provided')
        return False

    if seconds_between_requests < 1:
        seconds_between_requests = 1
        logging.warning('Setting minimum seconds between requests to 1 as value provided is less than 1')

    # Work on a copy: the page number is added below and must not leak into the caller's dict
    params = dict(params) if params else {}

    if not headers:
        headers = {}

    if not json_save_path:
        logging.error('No path provided to save JSON data')
        return False

    if not os.path.exists(json_save_path):
        os.makedirs(json_save_path)
        logging.info(f'Created JSON save path: {json_save_path}')
    else:
        logging.info(f'Using JSON save path: {json_save_path}')

    if endpoint_type not in ('posts', 'pages'):
        logging.error('The endpoint must be posts or pages.')
        return False

    if max_pages is not None:
        logging.info(f'Max pages to retrieve from API is set: {max_pages}')

    has_more = True
    page = 1
    total_pages = None  # unknown until the first successful response
    digits = 1
    consecutive_errors = 0

    while has_more:
        try:
            params['page'] = page
            r = requests.get(endpoint_url, params=params, headers=headers)

            logging.info(f'Downloading {r.url}')
            r.raise_for_status()

            if total_pages is None:
                try:
                    total_pages = int(r.headers['X-WP-TotalPages'])
                except (KeyError, ValueError):
                    logging.error(f'No valid X-WP-TotalPages header in response from {endpoint_url}. Exiting.')
                    return False
                logging.info(f'Total pages to retrieve is {total_pages}')
                # width used to zero-pad page numbers so that file names sort in page order
                digits = len(str(total_pages))

            filename = os.path.join(json_save_path, f'{endpoint_type}-{page:0{digits}}.json')

            with open(filename, 'wb') as f:
                f.write(r.content)

            page += 1
            if page > total_pages:
                has_more = False

            if max_pages is not None and page > max_pages:
                has_more = False

            consecutive_errors = 0
        except requests.HTTPError as e:
            logging.error(f'Error downloading page {page} from {endpoint_url}')
            logging.error(f'Status code: {r.status_code}')
            if page == 1:
                logging.error('It appears that this website does not provide access to the REST API. Exiting.')
            else:
                logging.error('Exiting based on status code error. If this is a 403 or 400, it may be that the website is refusing repeated access to their REST API.')
            return False
        except (requests.Timeout, requests.ConnectionError) as e:
            # transient failure: the same page is retried on the next iteration
            logging.error(f'Error downloading page {page} ({e}) from {endpoint_url}')
            consecutive_errors += 1
            if consecutive_errors > 3:
                return False

        # only wait when another request will follow
        if has_more:
            time.sleep(seconds_between_requests)

    return True



In [16]:
#| hide
json_save_path = os.path.join(basepath_for_test_data, 'json/')

# start from an existing, empty download directory
if not os.path.exists(json_save_path):
    os.makedirs(json_save_path)
for leftover in os.listdir(json_save_path):
    os.remove(os.path.join(json_save_path, leftover))

# download the first two pages of posts; the call must report success
params = {}
download_ok = get_json(endpoint_url='https://adho.org/wp-json/wp/v2/posts',
                       params=params,
                       json_save_path=json_save_path,
                       max_pages=2)
assert download_ok == True

# exactly two files, named after the endpoint type and the zero-padded page number
assert sorted(os.listdir(json_save_path)) == ['posts-01.json', 'posts-02.json']


2024-08-23 14:19:01 - INFO - Using JSON save path: ../test_data/json/
2024-08-23 14:19:01 - INFO - Max pages to retrieve from API is set: 2
2024-08-23 14:19:03 - INFO - Downloading https://adho.org/wp-json/wp/v2/posts?page=1
2024-08-23 14:19:03 - INFO - Total pages to retrieve is 21
2024-08-23 14:19:10 - INFO - Downloading https://adho.org/wp-json/wp/v2/posts?page=2


In [17]:
#| export
def create_corpus(corpus_format: str = 'txt', # format of the corpus files, txt or csv
                  json_save_path: str = None, # path to JSON data 
                  corpus_save_path: str = None, # path to save corpus in txt format
                  csv_save_file: str = None, # path to CSV file to output corpus in CSV format (or metadata if txt corpus)
                  include_title_in_text: bool = True # include the title in the text file 
                 ) -> bool: # True if successful, False if there are errors parsing the JSON 
    """Create a corpus from downloaded JSON data in txt or csv format.

    Every `*.json` file in `json_save_path` is read in sorted file name order. For each item the title
    is HTML-unescaped and the rendered content is reduced to plain text.

    - txt format: one text file per item is written to `corpus_save_path`; if `csv_save_file` is given
      it receives one metadata row per item, ending with the text file name.
    - csv format: `csv_save_file` is required and receives one row per item, ending with the text.

    Rows are written in the order of the header: date, datetime, type, id, title, link, then filename or text.

    Returns False for invalid arguments, a JSON file that cannot be decoded, or items that lack the
    expected keys. Any other exception is logged and re-raised.
    """

    if corpus_format == 'txt':
        columns = ['date', 'datetime', 'type', 'id', 'title', 'link', 'filename']
        csv_file_type = 'metadata'
    elif corpus_format == 'csv':
        columns = ['date', 'datetime', 'type', 'id', 'title', 'link', 'text']
        csv_file_type = 'corpus'
    else:
        logging.error('Corpus format must be txt or csv')
        return False

    logging.info(f'Creating corpus in {corpus_format} format')

    if not json_save_path:
        logging.error('No path provided to json data')
        return False
    if not os.path.exists(json_save_path):
        logging.error('Path to JSON data does not exist')
        return False

    if corpus_format == 'txt':
        if not corpus_save_path:
            logging.error('No corpus save path provided')
            return False
        if not os.path.exists(corpus_save_path):
            os.makedirs(corpus_save_path)
            logging.info(f'Created corpus save path: {corpus_save_path}')
        else:
            logging.info(f'Using corpus save path: {corpus_save_path}')

    if corpus_format == 'csv' and not csv_save_file:
        logging.error('No path provided to save CSV corpus')
        return False

    if csv_save_file:
        # directory that will hold the CSV file; empty when csv_save_file is a bare file name
        csv_save_path = os.path.dirname(csv_save_file)
        if csv_save_path and not os.path.exists(csv_save_path):
            os.makedirs(csv_save_path)
            logging.info(f'Created path to save CSV corpus: {csv_save_path}')

    fw = None
    writer = None
    try:
        # if csv_save_file is provided (regardless of format) create it and write the header row
        if csv_save_file:
            logging.info(f'Creating CSV file for {csv_file_type}: {csv_save_file}')
            # newline='' lets the csv module control line endings
            fw = open(csv_save_file, 'w', encoding='utf-8', newline='')
            writer = csv.writer(fw)
            writer.writerow(columns)

        # sorted so that rows are written in a reproducible order
        file_list = sorted(glob.glob(os.path.join(json_save_path, '*.json')))
        for file in file_list:
            with open(file, 'r', encoding='utf-8') as f:
                logging.info(f"Processing JSON: {os.path.basename(file)}")
                data = json.load(f)

            for article in data:
                title = html.unescape(article['title']['rendered'])
                filename = f"{article['date'][0:10]}-{article['type']}-{article['id']}-{slugify(title, max_length=100)}.txt"
                soup = BeautifulSoup(article['content']['rendered'], 'lxml')
                content = soup.get_text().strip()

                if writer is not None:
                    # the last column holds the text for a csv corpus, the text file name for txt metadata
                    last_value = content if corpus_format == 'csv' else filename
                    writer.writerow([article['date'][0:10], article['date'], article['type'], article['id'], title, article['link'], last_value])

                if corpus_format == 'txt':
                    with open(os.path.join(corpus_save_path, filename), 'w', encoding='utf-8') as txtfile:
                        if include_title_in_text:
                            txtfile.write(title + '\n\n')
                        txtfile.write(content)

    except json.JSONDecodeError as e:
        logging.error(f'Exception (JSONDecodeError) - error decoding JSON file: {os.path.basename(file)}')
        return False
    except KeyError as e:
        logging.error(f'Exception (KeyError) - indicating unexpected JSON file content: {os.path.basename(file)}')
        return False
    except Exception as e:
        logging.error(f'Exception - {e} - exiting by raising error ...')
        raise
    finally:
        # close the CSV file on every exit path
        if fw is not None:
            fw.close()

    return True
   


In [18]:
#| hide

json_save_path = os.path.join(basepath_for_test_data, 'json/')
json_corrupt_save_path = os.path.join(basepath_for_test_data, 'json_corrupt/')
corpus_save_path = os.path.join(basepath_for_test_data, 'txt/')
metadata_csv_save_file = os.path.join(basepath_for_test_data, 'metadata/metadata.csv')
corpus_csv_save_file = os.path.join(basepath_for_test_data, 'corpus/corpus.csv')

# start from empty output directories; create them first because they may not exist yet
# (the cleanup in these test cells leaves them empty)
for test_dir in (corpus_save_path,
                 os.path.dirname(metadata_csv_save_file),
                 os.path.dirname(corpus_csv_save_file)):
    os.makedirs(test_dir, exist_ok=True)
    for file in os.listdir(test_dir):
        os.remove(os.path.join(test_dir, file))

assert create_corpus(
        corpus_format = 'txt',
        json_save_path = json_save_path, 
        corpus_save_path = corpus_save_path) == True

# check that there are 20 files in the test directory
assert len(os.listdir(corpus_save_path)) == 20

# corrupt JSON file should return False
assert create_corpus(
        corpus_format = 'txt',
        json_save_path = json_corrupt_save_path, 
        corpus_save_path = corpus_save_path) == False

2024-08-23 14:19:15 - INFO - Creating corpus in txt format
2024-08-23 14:19:15 - INFO - Using corpus save path: ../test_data/txt/
2024-08-23 14:19:15 - INFO - Processing JSON: posts-02.json
2024-08-23 14:19:15 - INFO - Processing JSON: posts-01.json
2024-08-23 14:19:15 - INFO - Creating corpus in txt format
2024-08-23 14:19:15 - INFO - Using corpus save path: ../test_data/txt/
2024-08-23 14:19:15 - INFO - Processing JSON: posts-001.json
2024-08-23 14:19:15 - ERROR - Exception (JSONDecodeError) - error decoding JSON file: posts-001.json


In [19]:
#| hide
json_save_path = os.path.join(basepath_for_test_data, 'json/')
corpus_save_path = os.path.join(basepath_for_test_data, 'txt/')
metadata_csv_save_file = os.path.join(basepath_for_test_data, 'metadata/metadata.csv')
corpus_csv_save_file = os.path.join(basepath_for_test_data, 'corpus/corpus.csv')

# start from empty output directories; create them first because they may not exist yet
# (the cleanup in these test cells leaves them empty)
for test_dir in (corpus_save_path,
                 os.path.dirname(metadata_csv_save_file),
                 os.path.dirname(corpus_csv_save_file)):
    os.makedirs(test_dir, exist_ok=True)
    for file in os.listdir(test_dir):
        os.remove(os.path.join(test_dir, file))

# test CSV returns True
assert create_corpus(corpus_format='csv', 
                     json_save_path=json_save_path, 
                     corpus_save_path=None, 
                     csv_save_file=corpus_csv_save_file) == True

assert os.path.exists(corpus_csv_save_file)

# test returns True
assert create_corpus(corpus_format='txt', 
                     json_save_path=json_save_path, 
                     corpus_save_path=corpus_save_path, 
                     csv_save_file=metadata_csv_save_file) == True

assert os.path.exists(metadata_csv_save_file)



2024-08-23 14:19:15 - INFO - Creating corpus in csv format
2024-08-23 14:19:15 - INFO - Creating CSV file for corpus: ../test_data/corpus/corpus.csv
2024-08-23 14:19:15 - INFO - Processing JSON: posts-02.json
2024-08-23 14:19:15 - INFO - Processing JSON: posts-01.json
2024-08-23 14:19:15 - INFO - Creating corpus in txt format
2024-08-23 14:19:15 - INFO - Using corpus save path: ../test_data/txt/
2024-08-23 14:19:15 - INFO - Creating CSV file for metadata: ../test_data/metadata/metadata.csv
2024-08-23 14:19:15 - INFO - Processing JSON: posts-02.json
2024-08-23 14:19:15 - INFO - Processing JSON: posts-01.json


In [20]:
#| export
def result_reporting(result: dict, # the result dictionary
                     output: bool = True # output the results
                     ) -> dict: # returns the result dictionary
    """Outputs the results of the corpress process.

    When `output` is true the result is shown as a two-column (Key, Value) table: rendered with
    `display` where that name is available (e.g. in a Jupyter notebook), printed otherwise.
    When `output` is false nothing is shown. The result dictionary is returned unchanged either way.
    """

    if output:
        df = pd.DataFrame(list(result.items()), columns=['Key', 'Value'])

        try: # if in a Jupyter notebook
            display(df)
        except NameError: # display is not defined outside IPython/Jupyter
            print(df)

    return result
    

In [21]:
#| export
def corpress(url: str, # the URL of the WordPress website 
            endpoint_type: str = 'posts', # posts or pages
            headers: dict = None, # optional headers for requests
            params: dict = None, # optional parameters to pass to the API
            corpus_format: str = 'txt', # format of the corpus files, txt or csv
            json_save_path: str = None, # path to save the JSON data 
            corpus_save_path: str = None, # path to save the corpus in txt format
            csv_save_file: str = None, # path to CSV file to output corpus in CSV format (or metadata if txt corpus)
            seconds_between_requests: int = 5, # number of seconds to wait between requests
            max_pages: int = None, # maximum number of pages to download
            include_title_in_text: bool = True, # option to include the title in the text file 
            output: bool = True # option to output the results of the process
            ) -> dict: # dictionary with results of each stage of the process and the number of texts in the corpus
    """Run the whole pipeline: find the REST API route, download the JSON and build the corpus.

    The stages run in order (get_api_url, get_json, create_corpus). The returned dictionary records
    which stages succeeded; processing stops at the first stage that fails and the dictionary is
    passed through result_reporting before it is returned.
    """

    # every stage flag starts as False and is switched on once that stage has succeeded
    result = dict(
        url=url,
        endpoint_url=None,
        headers=headers,
        params=None,
        get_api_url=False,
        get_json=False,
        create_corpus=False,
        corpus_format=corpus_format,
        corpus_save_path=corpus_save_path,
        csv_save_file=csv_save_file,
        corpus_texts_count=0,
    )

    # stage 1: find the endpoint
    endpoint_url = get_api_url(url, endpoint_type, headers)
    if not endpoint_url:
        logging.error('No endpoint URL detected. Exiting.')
        return result_reporting(result, output)
    result['endpoint_url'] = endpoint_url
    result['get_api_url'] = True

    # keep a snapshot of the parameters as they were passed in
    if isinstance(params, dict):
        result['params'] = params.copy()

    # stage 2: download the JSON pages
    downloaded = get_json(endpoint_url, endpoint_type, headers, params, json_save_path, seconds_between_requests, max_pages)
    if downloaded == False:
        logging.error('Error downloading data. Exiting.')
        return result_reporting(result, output)
    result['get_json'] = True

    # stage 3: turn the JSON into a corpus
    created = create_corpus(corpus_format, json_save_path, corpus_save_path, csv_save_file, include_title_in_text)
    if created == False:
        logging.error('Error creating corpus')
        return result_reporting(result, output)
    result['create_corpus'] = True

    # count the texts: files in the corpus directory (txt) or data rows in the CSV file (csv)
    if corpus_format == 'txt':
        result['corpus_texts_count'] = len(os.listdir(corpus_save_path))
    elif corpus_format == 'csv':
        result['corpus_texts_count'] = len(pd.read_csv(csv_save_file))

    return result_reporting(result, output)

In [22]:
#| hide

json_save_path = os.path.join(basepath_for_test_data, 'json/')
corpus_save_path = os.path.join(basepath_for_test_data, 'txt/')
metadata_csv_save_file = os.path.join(basepath_for_test_data, 'metadata/metadata.csv')
corpus_csv_save_file = os.path.join(basepath_for_test_data, 'corpus/corpus.csv')

def _reset_test_dirs():
    """Create the test directories if they are missing and delete the files they contain."""
    for test_dir in (json_save_path,
                     corpus_save_path,
                     os.path.dirname(metadata_csv_save_file),
                     os.path.dirname(corpus_csv_save_file)):
        os.makedirs(test_dir, exist_ok=True)
        for file in os.listdir(test_dir):
            os.remove(os.path.join(test_dir, file))

# cleanup test directories
_reset_test_dirs()

# test a site that has no endpoint
result = corpress(url = 'https://www.whitehouse.gov/', 
                endpoint_type='posts',
                corpus_format='txt',
                json_save_path = json_save_path, 
                corpus_save_path = corpus_save_path, 
                max_pages=2)

assert result['get_api_url'] == True
assert result['get_json'] == False
assert result['create_corpus'] == False
assert result['corpus_format'] == 'txt'
assert result['corpus_save_path'] == corpus_save_path
assert result['csv_save_file'] == None
assert result['corpus_texts_count'] == 0

# cleanup test directories
_reset_test_dirs()

# test with search params and csv format
result = corpress(url = 'https://adho.org/', 
                endpoint_type='pages',
                params={'search': 'conference'},
                corpus_format='csv',
                json_save_path = json_save_path, 
                corpus_save_path = corpus_save_path, 
                csv_save_file = corpus_csv_save_file,
                max_pages=2)

assert result['get_api_url'] == True
assert result['get_json'] == True
assert result['create_corpus'] == True
assert result['corpus_format'] == 'csv'
assert result['corpus_save_path'] == corpus_save_path
assert result['csv_save_file'] == corpus_csv_save_file
assert result['corpus_texts_count'] == 20

# cleanup test directories
_reset_test_dirs()

# test txt format with metadata file
result = corpress(url = 'https://adho.org/', 
                endpoint_type='posts',
                corpus_format='txt',
                json_save_path = json_save_path, 
                corpus_save_path = corpus_save_path, 
                csv_save_file = metadata_csv_save_file,
                max_pages=2)

assert result['get_api_url'] == True
assert result['get_json'] == True
assert result['create_corpus'] == True
assert result['corpus_format'] == 'txt'
assert result['corpus_save_path'] == corpus_save_path
assert result['csv_save_file'] == metadata_csv_save_file
assert result['corpus_texts_count'] == 20

# cleanup test directories
_reset_test_dirs()

2024-08-23 14:19:15 - INFO - No REST API endpoint link in markup
2024-08-23 14:19:15 - INFO - Guessing posts route based on URL https://www.whitehouse.gov/wp-json/wp/v2/posts
2024-08-23 14:19:15 - INFO - Using JSON save path: ../test_data/json/
2024-08-23 14:19:15 - INFO - Max pages to retrieve from API is set: 2
2024-08-23 14:19:16 - INFO - Downloading https://www.whitehouse.gov/wp-json/wp/v2/posts?page=1
2024-08-23 14:19:16 - ERROR - Error downloading page 1 from https://www.whitehouse.gov/wp-json/wp/v2/posts
2024-08-23 14:19:16 - ERROR - Status code: 403
2024-08-23 14:19:16 - ERROR - It appears that this website does not provide access to the REST API. Exiting.
2024-08-23 14:19:16 - ERROR - Error downloading data. Exiting.


Unnamed: 0,Key,Value
0,url,https://www.whitehouse.gov/
1,endpoint_url,https://www.whitehouse.gov/wp-json/wp/v2/posts
2,headers,
3,params,
4,get_api_url,True
5,get_json,False
6,create_corpus,False
7,corpus_format,txt
8,corpus_save_path,../test_data/txt/
9,csv_save_file,


2024-08-23 14:19:17 - INFO - Found REST API endpoint link
2024-08-23 14:19:17 - INFO - Setting pages route https://adho.org/wp-json/wp/v2/pages
2024-08-23 14:19:17 - INFO - Using JSON save path: ../test_data/json/
2024-08-23 14:19:17 - INFO - Max pages to retrieve from API is set: 2
2024-08-23 14:19:20 - INFO - Downloading https://adho.org/wp-json/wp/v2/pages?search=conference&page=1
2024-08-23 14:19:20 - INFO - Total pages to retrieve is 5
2024-08-23 14:19:27 - INFO - Downloading https://adho.org/wp-json/wp/v2/pages?search=conference&page=2
2024-08-23 14:19:32 - INFO - Creating corpus in csv format
2024-08-23 14:19:32 - INFO - Creating CSV file for corpus: ../test_data/corpus/corpus.csv
2024-08-23 14:19:32 - INFO - Processing JSON: pages-2.json
2024-08-23 14:19:32 - INFO - Processing JSON: pages-1.json


Unnamed: 0,Key,Value
0,url,https://adho.org/
1,endpoint_url,https://adho.org/wp-json/wp/v2/pages
2,headers,
3,params,{'search': 'conference'}
4,get_api_url,True
5,get_json,True
6,create_corpus,True
7,corpus_format,csv
8,corpus_save_path,../test_data/txt/
9,csv_save_file,../test_data/corpus/corpus.csv


2024-08-23 14:19:34 - INFO - Found REST API endpoint link
2024-08-23 14:19:34 - INFO - Setting posts route https://adho.org/wp-json/wp/v2/posts
2024-08-23 14:19:34 - INFO - Using JSON save path: ../test_data/json/
2024-08-23 14:19:34 - INFO - Max pages to retrieve from API is set: 2
2024-08-23 14:19:35 - INFO - Downloading https://adho.org/wp-json/wp/v2/posts?page=1
2024-08-23 14:19:35 - INFO - Total pages to retrieve is 21
2024-08-23 14:19:42 - INFO - Downloading https://adho.org/wp-json/wp/v2/posts?page=2
2024-08-23 14:19:47 - INFO - Creating corpus in txt format
2024-08-23 14:19:47 - INFO - Using corpus save path: ../test_data/txt/
2024-08-23 14:19:47 - INFO - Creating CSV file for metadata: ../test_data/metadata/metadata.csv
2024-08-23 14:19:47 - INFO - Processing JSON: posts-02.json
2024-08-23 14:19:47 - INFO - Processing JSON: posts-01.json


Unnamed: 0,Key,Value
0,url,https://adho.org/
1,endpoint_url,https://adho.org/wp-json/wp/v2/posts
2,headers,
3,params,
4,get_api_url,True
5,get_json,True
6,create_corpus,True
7,corpus_format,txt
8,corpus_save_path,../test_data/txt/
9,csv_save_file,../test_data/metadata/metadata.csv


In [23]:
#| hide
# Run nbdev's export step for this project's notebooks
import nbdev; nbdev.nbdev_export()