In [None]:
import json
import os
import requests
import datetime
import numpy as np
import pandas as pd
import glob

In [None]:
# Target directory for the logged API JSON files; it is created if it does not exist yet.
data_dir = 'rw_api_archive'
# exist_ok=True removes the check-then-create race of `exists()` + `mkdir()`,
# and makedirs also copes with a nested data_dir such as 'archive/rw'.
os.makedirs(data_dir, exist_ok=True)

In [None]:
# Endpoints of the RW API to archive: dataset, layer and widget.
# Note the 'env' and 'application' params in the request URL string built in the download loop.
api_list = [
    'dataset',
    'layer',
    'widget',
]

# Upper bound on the number of pages requested per endpoint.
MAX_ITER = 20

## Handle pagination
def follow_pagination(link_obj):
    """Decide how to continue paging from a response's ``links`` object.

    Parameters
    ----------
    link_obj : dict
        The ``links`` member of a paginated response. It must carry ``self``
        and ``last`` entries, plus ``next`` when the current page is not the
        last one.

    Returns
    -------
    tuple of (int, str)
        ``(1, next_url)`` when another page should be fetched,
        ``(0, '')`` when ``self`` equals ``last`` (final page),
        ``(-1, '')`` when ``link_obj`` is not a dict or lacks the entries
        needed to make that decision.
    """
    if not isinstance(link_obj, dict):
        return (-1, '')
    # A dict without 'self'/'last' is just as malformed as a non-dict; report it
    # through the same status code instead of leaking a KeyError to the caller.
    if 'self' not in link_obj or 'last' not in link_obj:
        return (-1, '')
    if link_obj['self'] == link_obj['last']:
        return (0, '')
    next_url = link_obj.get('next')
    if not next_url:
        # Not on the last page, yet there is nowhere to go next.
        return (-1, '')
    return (1, next_url)

## Loop through each endpoint, follow pagination, and archive the combined results as JSON
# Seconds to wait on the server; without a timeout a stalled connection blocks the cell forever.
REQUEST_TIMEOUT = 60

for endpoint in api_list:

    data = []
    current_url = f'http://api.resourcewatch.org/v1/{endpoint}?page[size]=1000&env=production&application=rw'

    for i in range(MAX_ITER):
        # Print before the request so a hang or failure can be traced to its URL
        print(current_url)
        with requests.get(current_url, timeout=REQUEST_TIMEOUT) as r:
            if not r.ok:
                raise ValueError(f'API request failed: {current_url}')
            ds = r.json()

        # Explicit checks instead of `assert`, which is skipped when Python runs with -O
        if not isinstance(ds, dict) or 'data' not in ds or 'links' not in ds:
            raise ValueError(f'API response is missing "data" or "links": {current_url}')

        # Add each page of results together
        data.extend(ds['data'])

        # Stop if there are no more results
        code, link = follow_pagination(ds['links'])
        if code == 1:
            current_url = link
        elif code == 0:
            print("Last page reached")
            break
        elif code == -1:
            raise TypeError('links object in API response malformed')
        else:
            raise ValueError('pagination response malformed or not understood')
    else:
        # The loop ran out of iterations without reaching the last page. Writing the
        # file anyway would archive a silently truncated copy of the endpoint.
        raise RuntimeError(
            f'{endpoint}: last page not reached after {MAX_ITER} requests; increase MAX_ITER'
        )

    # Timestamp used in the archive filename, one file per endpoint per run
    d8 = datetime.datetime.now().strftime("%Y-%m-%d-%H%M%S")

    with open(os.path.join(data_dir, f'{endpoint}_{d8}.json'), 'w') as fp:
        json.dump(data, fp)


In [None]:
# Select the most recent matching JSON file in the target directory for each endpoint and load it
def _load_latest_archive(directory, pattern):
    """Load the newest JSON file in ``directory`` whose name matches ``pattern``.

    Parameters
    ----------
    directory : str
        Directory to search.
    pattern : str
        Glob pattern for the file name, e.g. ``'layer*.json'``.

    Returns
    -------
    tuple of (str, object)
        Path of the selected file and its parsed JSON content.

    Raises
    ------
    FileNotFoundError
        If no file in ``directory`` matches ``pattern``.
    """
    search = os.path.join(directory, pattern)
    candidates = glob.glob(search)
    if not candidates:
        # max() on an empty sequence would only say "max() arg is an empty sequence"
        raise FileNotFoundError(f'No archived file matches {search}')
    # Modification time is the last content write on every platform; ctime is
    # creation time on Windows but last metadata change on Unix.
    newest = max(candidates, key=os.path.getmtime)
    with open(newest) as json_file:
        return newest, json.load(json_file)


recent_dataset_dl, rw_dataset_data = _load_latest_archive(data_dir, 'datas*.json')
recent_layer_dl, rw_layer_data = _load_latest_archive(data_dir, 'layer*.json')
recent_widget_dl, rw_widget_data = _load_latest_archive(data_dir, 'widge*.json')

In [None]:
# Substring to look for; it is matched against the string form of every archived JSON object
subs = 'ene_028_access_clean_cooking' #<-- SET SUBSTRING HERE

# Keep each object whose str() representation contains the substring anywhere
datasets_using = [record for record in rw_dataset_data if subs in str(record)]
layers_using = [record for record in rw_layer_data if subs in str(record)]
widgets_using = [record for record in rw_widget_data if subs in str(record)]

In [None]:
# Print an admin edit link for every matching object, grouped by object type
litems = [datasets_using, layers_using, widgets_using]
print(subs)
for kind, matches in zip(['datasets', 'layers', 'widgets'], litems):
    print(kind)
    for match in matches:
        edit_url = 'https://resourcewatch.org/admin/data/{}/{}/edit'.format(kind, match['id'])
        print(edit_url)