## References
- https://janakiev.com/blog/python-filesystem-analysis/

- dependencies:
    - optional (possible future extension): pip install persist-queue[extra] ( https://github.com/peter-wangxu/persist-queue )
    - pip install rq ( https://python-rq.org/ ) 
    
- related work
    - https://github.com/cedadev/facet-scanner
    - https://github.com/cedadev/ceda-elasticsearch-tools

# Elasticsearch ingester

## Work Plan: 

- implement project readers in pool_tools
   - every reader returns Elasticsearch ingest items
- implement directory walker, generating ingest requests
   - ingest request is input to project reader
- track index status information in index itself    
- use http://python-rq.org/ to queue ingest items
- use parallel workers to run Elasticsearch ingest items
   - each worker maintains an Elasticsearch endpoint


In [None]:
# For development: reload edited packages automatically.
# Loading the extension alone does not enable reloading; `%autoreload 2`
# switches on reloading of all modules before each cell is executed.
%load_ext autoreload
%autoreload 2

In [1]:
import os
from pindex import pool_tools

In [2]:
# Directory below the pool root that is used for this test run.
# Shorter alternative path:
#   ipsl_test = "CMIP/IPSL/IPSL-CM6A-LR/1pctCO2/r1i1p1f1/"
ipsl_test = "CMIP/IPSL/IPSL-CM6A-LR/1pctCO2/r1i1p1f1/Ofx/masscello/gn/"

# Root of the CMIP6 pool; here it is mounted via sshfs.
# On mistral: prefix = "/work/ik1017/"
prefix = "/work/ik1017/CMIP6/data/CMIP6/"

# Alternative entry point: pool_tools.index("cmip6", prefix + ipsl_test)
pool_tools.tqueue("cmip6", "_1", prefix + ipsl_test)

Project handler initialized: /work/ik1017/CMIP6/data/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/1pctCO2/r1i1p1f1/Ofx/masscello/gn/
['masscello', 'Ofx', 'IPSL-CM6A-LR', '1pctCO2', 'r1i1p1f1', 'gn']
dict_keys(['variable', 'table', 'model', 'experiment', 'member', 'grid', 'time'])
opening:  /work/ik1017/CMIP6/data/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/1pctCO2/r1i1p1f1/Ofx/masscello/gn/v20180727/masscello_Ofx_IPSL-CM6A-LR_1pctCO2_r1i1p1f1_gn.nc
{'masscello_Ofx_IPSL-CM6A-LR_1pctCO2_r1i1p1f1_gn.nc': {'variable': 'masscello', 'table': 'Ofx', 'model': 'IPSL-CM6A-LR', 'experiment': '1pctCO2', 'member': 'r1i1p1f1', 'grid': 'gn', 'file_name': 'masscello_Ofx_IPSL-CM6A-LR_1pctCO2_r1i1p1f1_gn.nc', 'project': 'cmip6', 'dataset_id': '/work/ik1017/CMIP6/data/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/1pctCO2/r1i1p1f1/Ofx/masscello/gn/v20180727', 'tracking_id': 'hdl:21.14100/1300474a-47a2-4029-88e1-dd3957b3a537', 'contact': 'ipsl-cmip6@listes.ipsl.fr', 'st_size': 8762397, 'st_atime': 1581081059.0, 'st_mtime': 1539027566.0, 'st_ctime': 1539

## Test queries

In [None]:
# Evaluates to 0.6 (presumably seconds per item when 500 items take
# 5 minutes — TODO confirm what is being estimated).
5 * 60 / 500

In [None]:
# Evaluates to 86400 (presumably the number of seconds in one day).
60 * 60 * 24

## Tests with parallel indexing approaches

In [None]:
from persistqueue import Queue

# Smoke test for persist-queue: open a queue stored under /tmp/tst1,
# enqueue one dict and two plain strings, then fetch one item back
# (the fetched item is shown as the cell output).
q = Queue("/tmp/tst1")
in1 = {"a": "a1", "b": "b1"}
for entry in (in1, "b", "c"):
    q.put(entry)
q.get()

In [5]:
from persistqueue import Queue
# Open the queue stored under /tmp/cmip6 (presumably filled by the
# pool_tools.tqueue("cmip6", ...) call — TODO confirm).
tq = Queue("/tmp/cmip6")
# Two consecutive reads: the first result is discarded, only the second
# one is shown as the cell output.
tq.get()
tq.get()

{'variable': 'tsl',
 'table': 'Lmon',
 'model': 'IPSL-CM6A-LR',
 'experiment': '1pctCO2',
 'member': 'r1i1p1f1',
 'grid': 'gr',
 'time': '185001-199912',
 'file_name': 'tsl_Lmon_IPSL-CM6A-LR_1pctCO2_r1i1p1f1_gr_185001-199912.nc',
 'project': 'cmip6',
 'dataset_id': '/work/ik1017/CMIP6/data/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/1pctCO2/r1i1p1f1/Lmon/tsl/gr/v20180727',
 'stime': '185001',
 'etime': '199912',
 'tracking_id': 'hdl:21.14100/fb831079-b93b-493d-a7f0-8e03160a9bc7',
 'contact': 'ipsl-cmip6@listes.ipsl.fr',
 'st_size': 555091391,
 'st_atime': 1581077226.0,
 'st_mtime': 1539030087.0,
 'st_ctime': 1539030087.0}

In [4]:
import os

# Collect the paths of the immediate sub-directories of the test directory
# (prefix and ipsl_test are set in an earlier cell) and display them.
with os.scandir(prefix + ipsl_test) as entries:
    dirpaths = [entry.path for entry in entries if entry.is_dir()]
dirpaths

['/work/ik1017/CMIP6/data/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/1pctCO2/r1i1p1f1/Ofx/masscello/gn/v20180727']

In [None]:
# Elasticsearch Query DSL request bodies for manual test searches.

# All three match clauses must hold: variable, model and time range.
q_string = {
    "query": {
        "bool": {
            "must": [
                {"match": {"variable": "tas"}},
                {"match": {"model": "GFDL-CM4"}},
                {"match": {"time": "015101-025012"}},
            ],
        },
    },
}

# Match on start time and model.  This query has its own name so that it is
# not overwritten by the `r_string` assignment that follows.
r_stime_string = {
    "query": {
        "bool": {
            "must": [
                {"match": {"stime": "055101"}},
                {"match": {"model": "GFDL-CM4"}},
            ],
        },
    },
}

# Exact-term filter on the end time, expressed as a bool/filter clause.
# NOTE(review): verify the value format against the indexed `etime` values.
r_string = {
    "query": {
        "bool": {
            "filter": [
                {"term": {"etime": "19991231"}},
            ],
        },
    },
}

# Same end-time filter without a bool wrapper.  A bare "filter" key is not a
# query clause and cannot sit directly under "query", so the term filter is
# wrapped in a constant_score query.
t_string = {
    "query": {
        "constant_score": {
            "filter": {"term": {"etime": "19991231"}},
        },
    },
}

# Exact-term filter on a single tracking id.
tr_id_string = {
    "query": {
        "bool": {
            "filter": [
                {"term": {"tracking_id": "hdl:21.14100/bb6bcc55-8d61-4622-b85f-818b37f5dd2b"}},
            ],
        },
    },
}



In [None]:
import pprint

# Run the tracking-id query against the "cmip6" index and pretty-print the
# list of hits from the response.
# NOTE(review): `es` is not defined in any visible cell; it is presumably an
# Elasticsearch client created elsewhere — confirm before running.
pp = pprint.PrettyPrinter(indent=4)
res = es.search(index="cmip6", body=tr_id_string)
hits = res["hits"]["hits"]
pp.pprint(hits)

In [None]:
import os

import redis

# Create the Redis client used by the following cells.
# The password is read from the REDIS_PASSWORD environment variable so that
# no credential is stored in the notebook; set it before starting the kernel.
# If the variable is unset, password=None is passed and no password is sent.
r = redis.Redis(password=os.environ.get("REDIS_PASSWORD"))


In [None]:
# Round-trip check against the Redis server: store the key 'foo', read it
# back and print the returned value.
r.set('foo', 'bar')
value = r.get('foo')
print(value)

In [None]:
# Ask the Redis server to write its dataset to disk (redis-py `save()`;
# presumably the blocking SAVE command — confirm against the redis-py docs).
r.save()