# Run notebooks

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from glob import glob

import papermill as pm

import util
import data_access
import intake
import cache_utils

## Get control structure

In [3]:
# Load the control structure through the project helper; everything below
# (data sources, project, per-notebook settings) is read from this dict.
control = util.get_control_dict()
# Bare expression so the notebook displays the loaded structure.
control

{'data_sources': {'casename': 'gcp-cases',
  'path_to_cat_json': '/glade/u/home/eromashkova/codes/diag-sandbox/catalog/gcp-cases.json',
  'subset': {'case': ['g.e22.GOMIPECOIAF_JRA-1p4-2018.TL319_g17.GCB_2022.BCRC']},
  'cache_metadata_path': '/glade/u/home/eromashkova/codes/my-cesm-experiment-extended/notebooks/cache_metadata',
  'cache_data_path': '/glade/u/home/eromashkova/codes/my-cesm-experiment-extended/notebooks/cache_data'},
 'output_root': './_computed-notebooks',
 'project': 'NCGD0011',
 'book_toc': {'format': 'jb-book',
  'root': 'description',
  'parts': [{'caption': 'Sea Surface Height', 'chapters': [{'file': 'ssh'}]},
   {'caption': 'MOC', 'chapters': [{'glob': 'moc-view*'}]}]},
 'book_config_keys': {'title': 'My CESM Experiment',
  'repository': {'url': 'https://github.com/matt-long/my-cesm-experiment',
   'path_to_book': 'notebooks',
   'branch': 'main'}},
 'compute_notebooks': {'description': {'use_cluster': False,
   'parameter_groups': {'none': {'casename': 'gcp-case

In [4]:
# NOTE(review): presumably prepares the book layout from the control
# structure (it prints lists of notebook files) — confirm in util.setup_book.
util.setup_book()

['description.ipynb', 'ssh.ipynb', 'POP_MOC_extract_cat.ipynb', 'moc-view.ipynb']
['ssh.ipynb']
['ssh.ipynb']
[]
['ssh.ipynb']
['description.ipynb', 'ssh.ipynb']
copy files []


## Make a directory for temporary catalogs

In [5]:
# Create ./temp_data relative to the current working directory; exist_ok=True
# makes re-running this cell a no-op when the directory is already there.
os.makedirs("temp_data", exist_ok=True)

## Spin up a cluster

The notebooks are configured to connect to this cluster.

In [6]:
# Create the cluster through the project helper, passing the control
# structure's 'project' value.
cluster = util.get_Cluster(project=control['project'])
# Request a size of 32 (assumes a Dask-style cluster where scale(n) sets the
# number of workers — TODO confirm against util.get_Cluster).
cluster.scale(32)
# Bare expression so the notebook displays the cluster summary.
cluster

Perhaps you already have a cluster running?
Hosting the HTTP server on port 43624 instead


0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/eromashkova/proxy/43624/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.12.206.48:46273,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/eromashkova/proxy/43624/status,Total threads: 0
Started: Just now,Total memory: 0 B


## Run all the notebooks

In [7]:
# Resolve the output directory, the full catalog, and the cache locations
# from the control structure.
# NOTE(review): the control dict displayed earlier in this notebook lists
# 'output_root'; 'output_dir' is presumably added by util.get_control_dict
# (the display is truncated) — confirm.
output_dir = control['output_dir']
full_cat_path = control['data_sources']['path_to_cat_json']
full_cat = intake.open_esm_datastore(full_cat_path)
cache_metadata_path = control['data_sources']['cache_metadata_path']
cache_data_path = control['data_sources']['cache_data_path']

# Defaults for the case where no top-level subset is configured: the
# downstream cells read both names unconditionally, so they must always be
# bound. Without a subset the notebooks are pointed at the full catalog.
# NOTE(review): assumes cache_utils accepts an empty dict as "no first
# subset" — confirm against cache_utils.gen_df_query / make_sidecar_entry.
first_subset_kwargs = {}
cat_path = full_cat_path

# Initial subsetting of the full catalog, e.g. to only use certain cases.
if 'subset' in control['data_sources']:
    first_subset_kwargs = control['data_sources']['subset']
    cat_subset = full_cat.search(**first_subset_kwargs)
    # Catalog name = file name of the full catalog up to its first '.',
    # with "_subset" appended (e.g. gcp-cases.json -> gcp-cases_subset).
    cat_subset_name = full_cat_path.split("/")[-1].split('.')[0] + "_subset"
    cat_subset.serialize(directory="temp_data", name=cat_subset_name, catalog_type="file")
    # Absolute path of the JSON file that serialize() just wrote.
    cat_path = os.path.join(os.getcwd(), "temp_data", cat_subset_name + ".json")

Successfully wrote ESM catalog json file to: file:///glade/u/home/eromashkova/codes/my-cesm-experiment-extended/notebooks/temp_data/gcp-cases_subset.json


### Organizing notebook controls into three dictionaries (precompute, regular, dependent):

In [8]:
# Split the configured notebooks into three groups, each keyed by notebook name:
#   precompute_nbs - notebooks that some other notebook names as its 'dependency'
#   regular_nbs    - notebooks with no 'dependency' that nothing depends on
#   dependent_nbs  - notebooks that declare a 'dependency'
precompute_nbs = dict()
regular_nbs = dict()
dependent_nbs = dict()

for nb, info in control['compute_notebooks'].items():
    if 'dependency' in info:
        dependent_nbs[nb] = info

        called_nb = info['dependency']

        # Register the dependency once. The "needed by" list is (re)created
        # only on first sight so that several notebooks sharing the same
        # dependency are all recorded instead of the last one overwriting
        # the others. The entry is the same dict object as in `control`.
        if called_nb not in precompute_nbs:
            precompute_nbs[called_nb] = control['compute_notebooks'][called_nb]
            precompute_nbs[called_nb]["needed by"] = list()
        precompute_nbs[called_nb]["needed by"].append(nb)

    else:
        regular_nbs[nb] = info

# A precompute notebook has no 'dependency' of its own, so the loop above may
# also have filed it under regular_nbs; drop it there so it only runs once.
for key in precompute_nbs:
    regular_nbs.pop(key, None)
        
        

In [9]:
# Print the precompute, regular, and dependent groups in that order, with a
# "--" line between consecutive groups.
for position, group in enumerate((precompute_nbs, regular_nbs, dependent_nbs)):
    if position:
        print("--")
    print(group)

{'POP_MOC_extract_cat': {'use_cluster': True, 'kernel_name': 'diag-sandbox-moc-pop-tools', 'subset': {'frequency': 'month_1', 'component': 'ocn', 'variable': 'MOC'}, 'parameter_groups': {'none': {'casename': 'gcp-cases'}}, 'needed by': ['moc-view']}}
--
{'description': {'use_cluster': False, 'parameter_groups': {'none': {'casename': 'gcp-cases'}}, 'kernel_name': 'diag-sandbox'}, 'ssh': {'use_cluster': True, 'subset': {'frequency': 'month_1', 'component': 'ocn', 'variable': 'SSH'}, 'parameter_groups': {'none': {'casename': 'gcp-cases'}}, 'kernel_name': 'diag-sandbox'}}
--
{'moc-view': {'use_cluster': True, 'kernel_name': 'diag-sandbox-moc-pop-tools', 'dependency': 'POP_MOC_extract_cat', 'parameter_groups': {'26n': {'casename': 'gcp-cases', 'title_param': '26n'}}}}


## Calculating precompute notebooks

In [10]:
# Run every precompute notebook once per parameter group, unless the cache
# metadata already has a matching entry. Either way the resulting asset path
# is stored on precompute_nbs[nb]["asset_path"] for the dependent notebooks.
for nb, info in precompute_nbs.items():

    parameter_groups = info['parameter_groups']
    # Read but not acted on in this cell; kept so a missing key still fails loudly.
    use_cluster = info['use_cluster']

    # Optional per-notebook settings; both fall back to an empty dict.
    subset_kwargs = info.get('subset', {})
    default_params = info.get('default_params', {})

    for key, parms in parameter_groups.items():

        input_path = f'{nb}.ipynb'
        # The 'none' group writes to <nb>.ipynb; any other group gets a -<key> suffix.
        output_path = (
            f'{output_dir}/{nb}-{key}.ipynb'
            if key != 'none' else f'{output_dir}/{nb}.ipynb'
        )

        # Look for an existing cache entry for this notebook/catalog/subset/params.
        result_df = cache_utils.gen_df_query(
            cache_metadata_path,
            input_path,
            full_cat_path,
            first_subset=first_subset_kwargs,
            second_subset=subset_kwargs,
            params=parms,
        )

        if not result_df.empty:
            # If multiple matches exist, take the first row (FIX LATER).
            # Positional access: the query result is not guaranteed to carry
            # index label 0, which a label-based .loc[0, ...] would require.
            asset_path = result_df['assets'].iloc[0]
            precompute_nbs[nb]["asset_path"] = asset_path
            print("Fetching result from cache")

        else:

            nb_api = pm.inspect_notebook(input_path)

            # TODO: validate parameter and API

            asset_path = cache_utils.make_filename(cache_data_path, input_path, full_cat_path) + ".nc"

            if nb_api:
                # Group parameters override the notebook's defaults; the
                # runner-provided entries are added last.
                parms_in = dict(**default_params)
                parms_in.update(dict(**parms))
                parms_in['path_to_cat'] = cat_path
                parms_in['cluster_scheduler_address'] = cluster.scheduler_address
                parms_in['subset_kwargs'] = subset_kwargs
                parms_in['asset_path'] = asset_path
            else:
                # Inspection returned nothing truthy: inject no parameters.
                parms_in = {}

            print(f'executing {input_path}')
            o = pm.execute_notebook(
                input_path=input_path,
                output_path=output_path,
                kernel_name=info['kernel_name'],
                parameters=parms_in,
                engine_name='md_jinja',
                jinja_data=parms,
            )

            # Record the new asset in the cache metadata only after the
            # notebook call has returned.
            cache_utils.make_sidecar_entry(
                cache_metadata_path,
                input_path,
                full_cat_path,
                asset_path=asset_path,
                first_subset=first_subset_kwargs,
                second_subset=subset_kwargs,
                params=parms,
            )

            # This can only properly handle one save per notebook (FIX LATER):
            # a later parameter group overwrites the stored asset path.
            precompute_nbs[nb]["asset_path"] = asset_path

0843dd943931ceb494e6b043a9c199561a47b4a08c6bf49e4bb63b3f5e957d32
Fetching result from cache


## Calculating regular notebooks

In [11]:
# Execute each regular notebook once per parameter group.
for nb, info in regular_nbs.items():

    parameter_groups = info['parameter_groups']
    use_cluster = info['use_cluster']

    # Optional settings: use the configured value when present, else an empty dict.
    subset_kwargs = info['subset'] if 'subset' in info else {}
    default_params = info['default_params'] if 'default_params' in info else {}

    for key, parms in parameter_groups.items():

        input_path = f'{nb}.ipynb'
        # The 'none' group keeps the plain notebook name; other groups get a -<key> suffix.
        if key != 'none':
            output_path = f'{output_dir}/{nb}-{key}.ipynb'
        else:
            output_path = f'{output_dir}/{nb}.ipynb'

        # Parameters are injected only when inspection returns something truthy.
        nb_api = pm.inspect_notebook(input_path)

        # TODO: validate parameter and API

        if nb_api:
            # Defaults first, then the group's own values, then the
            # runner-provided entries (same insertion order as before).
            parms_in = dict(**default_params)
            parms_in.update(dict(**parms))
            parms_in.update(
                path_to_cat=cat_path,
                cluster_scheduler_address=cluster.scheduler_address,
                subset_kwargs=subset_kwargs,
            )
        else:
            parms_in = {}

        print(f'executing {input_path}')
        o = pm.execute_notebook(
            input_path=input_path,
            output_path=output_path,
            kernel_name=info['kernel_name'],
            parameters=parms_in,
            engine_name='md_jinja',
            jinja_data=parms,
        )

  from .autonotebook import tqdm as notebook_tqdm


executing description.ipynb


Executing: 100%|██████████| 2/2 [00:01<00:00,  1.54cell/s]


executing ssh.ipynb


Executing: 100%|██████████| 11/11 [00:13<00:00,  1.19s/cell]


## Calculating notebooks with dependencies

In [12]:
# Execute each notebook that declares a dependency, handing it the asset path
# recorded for the precompute notebook it depends on.
for nb, info in dependent_nbs.items():

    # Asset path stored on the precompute entry by the precompute cell.
    called_nb = info['dependency']
    dependent_asset_path = precompute_nbs[called_nb]["asset_path"]

    parameter_groups = info['parameter_groups']
    use_cluster = info['use_cluster']

    # Optional settings: use the configured value when present, else an empty dict.
    subset_kwargs = info['subset'] if 'subset' in info else {}
    default_params = info['default_params'] if 'default_params' in info else {}

    for key, parms in parameter_groups.items():

        input_path = f'{nb}.ipynb'
        # No suffix for the 'none' group, otherwise "-<key>".
        suffix = '' if key == 'none' else f'-{key}'
        output_path = f'{output_dir}/{nb}{suffix}.ipynb'

        # Parameters are injected only when inspection returns something truthy.
        nb_api = pm.inspect_notebook(input_path)

        # TODO: validate parameter and API

        parms_in = {}
        if nb_api:
            # Defaults first, then the group's own values, then the
            # runner-provided entries.
            parms_in.update(dict(**default_params))
            parms_in.update(dict(**parms))
            parms_in['path_to_cat'] = cat_path
            parms_in['cluster_scheduler_address'] = cluster.scheduler_address
            parms_in['subset_kwargs'] = subset_kwargs
            parms_in['asset_path'] = dependent_asset_path

        print(f'executing {input_path}')
        o = pm.execute_notebook(
            input_path=input_path,
            output_path=output_path,
            kernel_name=info['kernel_name'],
            parameters=parms_in,
            engine_name='md_jinja',
            jinja_data=parms,
        )
    
    

executing moc-view.ipynb


Executing: 100%|██████████| 10/10 [00:04<00:00,  2.45cell/s]


In [13]:
# Close the cluster created earlier in this notebook now that all notebook
# runs have finished.
cluster.close()

