# Table of Contents

In [2]:
import requests
import json
import pandas as pd
from pprint import pprint
from multiprocessing import Pool

This script retrieves all the datasets of the selected application, together with their layers and widgets, and checks whether each of them is working properly.  

Finally, it writes a CSV file with a status report for the datasets that are failing.  

| Column name | Description |
|:---|:---|
| connector_provider  | dataset connector provider |
| connector_url  | connector URL, if available |
| connector_url_status | connector URL [status code](https://en.wikipedia.org/wiki/List_of_HTTP_status_codes) |
| dataset_id  | API dataset id |
| dataset_name  | dataset name |
| dataset_sql_error | explanation of the SQL endpoint error, if available |
| dataset_sql_status  | SQL endpoint [status code](https://en.wikipedia.org/wiki/List_of_HTTP_status_codes) |
| n_layers | number of associated layers |
| n_widgets | number of associated widgets |

In [53]:
def req_error_handler(url):
    '''
    Request ``url`` with GET and report the outcome as a status code
    following: https://en.wikipedia.org/wiki/List_of_HTTP_status_codes

    url: address to request. ``None`` or an empty string is reported as a
         failure without sending anything.

    Returns a ``(response, status)`` tuple:
      response: the ``requests.Response`` when the server answered,
                otherwise ``None``.
      status:   the HTTP status code of the answer, or 408 when no answer
                was obtained (missing/malformed url, timeout, connection
                error...).

    Only request failures are converted into a status code; anything else
    (for instance a KeyboardInterrupt) propagates to the caller.
    '''
    if not url:
        # Nothing to request: report it like any other unanswered request.
        return None, 408

    try:
        response = requests.get(url, timeout=20)
    except requests.HTTPError as error:
        # An HTTPError may carry the offending response; use its code if so.
        failed = error.response
        return None, (failed.status_code if failed is not None else 408)
    except requests.RequestException:
        # Timeouts, connection errors, invalid urls, too many redirects...
        return None, 408

    return response, response.status_code

# Query used for providers whose data is reachable through a plain sql select.
_GEOMETRY_QUERY = 'https://api.resourcewatch.org/v1/query/{0}?sql=select * from {1} limit 1'

# Test query per provider; {0} is the dataset id and {1} the table name.
# 'wms' is absent on purpose: it has no query endpoint and is checked
# through the urls of its layers instead.
_PROVIDER_QUERIES = {
    'gee': 'https://api.resourcewatch.org/v1/query/{0}?sql=select st_metadata(the_raster_webmercator) from "{1}" limit 1',
    'cartodb': _GEOMETRY_QUERY,
    'featureservice': _GEOMETRY_QUERY,
    'nexgddp': 'https://api.resourcewatch.org/v1/query/{0}?sql=select * from {1} where year=1960 limit 1',
    'rasdaman': 'https://api.resourcewatch.org/v1/query/{0}?wcps=FOR c in ({1}) return 1',
    'csv': _GEOMETRY_QUERY,
    'tsv': _GEOMETRY_QUERY,
    'json': _GEOMETRY_QUERY,
}


def _report_row(dataset, sql_status, sql_error, url_status, connector_url):
    '''Build the report row (one value per csv column) of a failing dataset.'''
    attributes = dataset['attributes']
    return {
        'dataset_id': dataset['id'],
        'dataset_name': attributes.get('name'),
        'dataset_sql_status': sql_status,
        'dataset_sql_error': sql_error,
        'connector_provider': attributes['provider'],
        'connector_url_status': url_status,
        'connector_url': connector_url,
        'n_layers': len(attributes.get('layer') or []),
        'n_widgets': len(attributes.get('widget') or []),
    }


def _wms_layer_url(layer):
    '''Return the url stored in the layer config body, or None if there is none.'''
    config = (layer.get('attributes') or {}).get('layerConfig') or {}
    body = config.get('body')
    return body.get('url') if isinstance(body, dict) else None


def f(dataset, check_url=None):
    '''
    Check one dataset and describe its failure, if any.

    1º check if the sql query fails
    2º if 1º fails checks the provider url fail and catch the errors.
    'wms' datasets have no query endpoint: the url of each of their layers
    is requested instead and the first failing one is reported.

    dataset: dataset record as a dict with an 'id' and an 'attributes' dict
             (provider, name, tableName, connectorUrl, layer, widget).
    check_url: optional callable that takes a url and returns a
               (response, status) tuple; defaults to req_error_handler.

    Returns a dict with the report columns when the dataset is failing.
    Returns None when it works or when its provider is not handled.
    '''
    if check_url is None:
        check_url = req_error_handler

    attributes = dataset['attributes']
    provider = attributes['provider']
    # A missing and an empty connector url are both reported as None.
    connector_url = attributes.get('connectorUrl') or None

    if provider == 'wms':
        for layer in attributes.get('layer') or []:
            response, status = check_url(_wms_layer_url(layer))
            if response is None or status != 200:
                url_status = status if response is not None else 408
                return _report_row(dataset, None, None, url_status, connector_url)
        return None

    query = _PROVIDER_QUERIES.get(provider)
    if query is None:
        # Unknown provider: nothing can be checked.
        return None

    table_name = attributes.get('tableName')
    if provider == 'gee' and str(table_name or '').startswith('ft:'):
        # Tables named 'ft:...' are queried with the plain select instead
        # of the raster metadata query.
        query = _GEOMETRY_QUERY

    # Compare with None explicitly: a requests.Response is falsy for
    # 4xx/5xx answers, so plain truthiness would be misleading here.
    response, sql_status = check_url(query.format(dataset['id'], table_name))
    if response is not None and sql_status == 200:
        return None

    _, url_status = check_url(connector_url)
    sql_error = response.text if response is not None else None
    return _report_row(dataset, sql_status, sql_error, url_status, connector_url)



In [54]:
def dataFrame(l,application):
    '''
    Turn the per-dataset check results into a table and export it to csv.

    l: check results, one per dataset; each is either a dict holding the
       report columns or None, and None entries are left out.
    application: name used for the output file, '<application>.csv'.

    Always returns the string 'done'.
    '''
    columns = (
        'dataset_id',
        'dataset_name',
        'dataset_sql_status',
        'dataset_sql_error',
        'connector_provider',
        'connector_url_status',
        'connector_url',
        'n_layers',
        'n_widgets',
    )
    # Keep only the datasets that reported a failure.
    failing = [entry for entry in l if entry is not None]
    table = {
        column: [entry[column] for entry in failing]
        for column in columns
    }
    report = pd.DataFrame(table)
    report.to_csv(application + '.csv')
    return 'done'

In [55]:
def main(n, application):
    '''
    Check every saved dataset of an application and write the status report.

    n: number of worker processes used to check the datasets in parallel
    application: api application we want to check its datasets

    The datasets are downloaded together with their widgets and layers,
    each one is checked with ``f`` and the results are handed to
    ``dataFrame``, which writes '<application>.csv'.

    Raises requests.RequestException when the dataset list cannot be
    downloaded (connection problem or HTTP error status).
    '''
    # The page size is far larger than needed; presumably this makes the
    # api return every dataset in a single page - TODO confirm.
    url = ("https://api.resourcewatch.org/v1/dataset?application=" + application
           + "&status=saved&includes=widget,layer&page[size]=14914800")
    try:
        r = requests.get(url)
        # Fail here with a clear HTTP error instead of later on a missing
        # 'data' key or an undecodable body.
        r.raise_for_status()
    except requests.RequestException as error:
        print("Unexpected error:", error)
        raise

    dataset_list = r.json()['data']
    # The context manager releases the worker processes once map is done.
    with Pool(n) as p:
        l = p.map(f, dataset_list)
    dataFrame(l, application)


In [56]:
# Check the datasets of the 'prep' application with 20 worker processes.
# The guard prevents worker processes that re-import this module from
# launching the run again.
if __name__ == '__main__':
    main(20, 'prep')

In [58]:
# Check the datasets of the 'rw' application with 20 worker processes.
# The guard prevents worker processes that re-import this module from
# launching the run again.
if __name__ == '__main__':
    main(20, 'rw')