Dask Approach to Non-SQL PTM Queries
--

Animate the overall field

First pass: lightly smoothed, subtidal?

In [1]:
import postproc_dask as post
import six
# Re-import postproc_dask so edits to the module are picked up without restarting the kernel.
six.moves.reload_module(post)
# NOTE(review): presumably adjusts malloc settings for this process -- confirm in postproc_dask.
post.config_malloc()

In [2]:
# Shut down any dask client left over from an earlier run of this notebook.
# On a fresh kernel `client` is not defined yet, hence the NameError guard.
try:
    client.close()
except NameError:
    pass

In [3]:
import multiprocessing.popen_spawn_posix #  https://github.com/dask/distributed/issues/4168
import dask
import dask.dataframe as dd
import dask.bag as db

In [4]:
from dask.distributed import Client
# Start a dask.distributed client with 20 single-threaded workers.
client=Client(n_workers=20,threads_per_worker=1)
# Display the cluster summary as the cell output.
client.cluster

VBox(children=(HTML(value='<h2>LocalCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    …

In [5]:
import matplotlib.pyplot as plt
import conc_figure
import six
import stompy.plot.cmap as scmap
from stompy.spatial import proj_utils
from matplotlib import cm
# Start from matplotlib's reversed CMRmap colormap.
cmap=cm.CMRmap_r
# NOTE(review): presumably restricts the colormap to the 0.03-1.0 portion of its range -- confirm in stompy.plot.cmap.
cmap=scmap.cmap_clip(cmap,0.03,1.0)

%matplotlib notebook

In [6]:
import os
import glob
import numpy as np
import pandas as pd
import xarray as xr
import re

from stompy.grid import unstructured_grid
from stompy import utils, memoize
from stompy.model.fish_ptm import ptm_config, ptm_tools
from stompy.model.suntans import sun_driver
from scipy.stats import spearmanr

import stompy.plot.cmap as scmap
from scipy import stats
import seaborn as sns
# Load the gradient stored in 'turbo.cpt' through stompy's colormap helper.
turbo=scmap.load_gradient('turbo.cpt')

Overall Process
===

1. SUNTANS hydro runs
2. SUNTANS average output
3. ptm-formatted average output
4. PTM runs
5. Load data

The top-level query is something like *generate a map of concentrations for...*

filter on:
 - sources $x$
 - settling classes $y$
 - vertical positions $z$
 - horizontal positions $h$

weighted by

 - loading data 
 - age
 
mapped by one of ...

 - bounding box
 - put on hydro grid
 - put on regular grid

and possibly smoothed.

In [7]:
# Experiment-level configuration: a small dict of paths, patterns and intervals.
# These settings describe the 'new' run.
cfg={
    'ptm_base_dir':"/opt2/sfb_ocean/ptm/all_source_022b",
    'sun_base_dir':"/opt2/sfb_ocean/suntans/runs",
    'ptm_output_interval':np.timedelta64(1,'h'),
}
# Glob patterns for the individual PTM and SUNTANS run directories.
cfg['ptm_run_patt']=os.path.join(cfg['ptm_base_dir'],"chunk??","20??????")
cfg['sun_patt']=os.path.join(cfg['sun_base_dir'],"merged_022_20??????")

# Expand each pattern into a sorted list of matching paths and record it in cfg.
ptm_run_paths=sorted(glob.glob(cfg['ptm_run_patt']))
cfg['ptm_run_paths']=ptm_run_paths

sun_paths=sorted(glob.glob(cfg['sun_patt']))
cfg['sun_paths']=sun_paths

In [8]:
# Load the grid from the first SUNTANS run's ptm-formatted average output.
hydro_path=sun_paths[0]
# The context manager releases the netcdf handle even if parsing the grid raises.
with xr.open_dataset(os.path.join(hydro_path,"ptm_average.nc_0000.nc")) as ptm_ds:
    grid=unstructured_grid.UnstructuredGrid.read_ugrid(ptm_ds,dialect='fishptm')

# distribute to workers ahead of time.
grid_d=client.scatter(grid)
cfg['grid_d']=grid_d # too far?

In [9]:
# Smoothing matrix built from the grid; applied later via Msmooth.dot(...) on per-cell values.
# So far this is only used locally.  Slow to compute (15s)
Msmooth=grid.smooth_matrix()

INFO:utils:63004/99089


In [9]:
# Re-import postproc_dask again to pick up any edits made since it was first loaded.
six.moves.reload_module(post)

<module 'postproc_dask' from '/home/rusty/src/microplastic_sfbay/postprocess/postproc_dask.py'>

In [10]:
# Build the load data locally, scatter it to the workers, and keep the scattered handle in cfg.
# Or could make this delayed and have it execute on each client?
load_data_d=client.scatter(post.get_load_data())
cfg['load_data_d']=load_data_d


In [11]:
def total_area(g):
    """Return the sum of the cell areas reported by `g.cells_area()`."""
    cell_areas=g.cells_area()
    return cell_areas.sum()

# Wrap total_area as a dask delayed call on the scattered grid, and keep the delayed object in cfg.
total_area_d=dask.delayed(total_area)(grid_d)
cfg['total_area_d']=total_area_d


In [12]:
# Hydro timestamps, loaded from the SUNTANS run paths and kept in cfg.
cfg['hydro_timestamps']=post.load_hydro_timestamps(sun_paths)


In [13]:
# Build the result of post.bc_ds locally, scatter it to the workers, and keep the scattered handle in cfg.
bc_ds_d=client.scatter(post.bc_ds(cfg=cfg))
cfg['bc_ds_d']=bc_ds_d

In [14]:
# Cell areas of the grid, kept in the notebook's own process.
areas=grid.cells_area()

Next steps
--


1. Recreate some of the figures from before, including on-grid smoothing.  
  a. Sample plot: from the powerpoint. 2017-08-30 to 2017-09-14. Surface particles
     max age of 10 days.   *This plot is similar -- not exactly the same, but close
     enough to rule out fundamental errors.*
2. Pull out manta samples as before. Maybe skip putting it on the grid, just
   query a radius.
  a. Implement in this notebook
  b. Move all of this to a module.

In [None]:
# Inspect distribution around 21Aug2017
# Could specify additional criteria here
rec_DATE="2017-08-21"

# Fetch the particles for that date; cache=False is passed, presumably to bypass any cached result -- confirm in postproc_dask.
df=post.particles_for_date(rec_DATE,cfg=cfg,cache=False)

In [None]:
# Directory that receives the per-day concentration netcdf files and rendered frames.
frame_dir='surface-frames-v00'
# exist_ok avoids the check-then-create race and makes the cell safely re-runnable.
os.makedirs(frame_dir,exist_ok=True)

In [None]:
# One entry per calendar day, 2017-08-01 through 2018-06-29 (the stop date is excluded).
days=np.arange(np.datetime64("2017-08-01"),np.datetime64("2018-06-30"),np.timedelta64(1,'D'))

# The same days rendered as strings.
days_str=list(map(str,days))

In [None]:
# Render one two-panel frame per day in `days`.  The gridded concentration for each
# day is cached as netcdf in frame_dir and reused when present; the rendered frame is
# saved as a png alongside it.
fig,axs=plt.subplots(1,2,figsize=(9,6))

storm_factor=0.02
tau_d=30 # e-folding time, in days, of the age-based decay weight below

for date in days:
    dt=utils.to_datetime(date)
    date_s=dt.strftime('%Y-%m-%d')

    conc_fn=os.path.join(frame_dir,
                         f"conc-storm{storm_factor}-tau{tau_d}-{dt.strftime('%Y%m%dT%H%M')}.nc")
    fig_fn=conc_fn.replace('.nc','.png')

    if os.path.exists(conc_fn):
        # Read the cached result fully into memory and release the file handle;
        # otherwise one netcdf file per frame stays open for the life of the loop.
        with xr.open_dataset(conc_fn) as ds_cached:
            ds_conc=ds_cached.load()
    else:
        df=post.particles_for_date(date_s,cfg=cfg,cache=False)

        # Exponential decay with particle age (time since release).
        age=df['time'] - df['rel_time']
        tau=np.timedelta64(tau_d,'D')
        decay=np.exp( -age/tau )

        group_weight=post.group_weights(df,storm_factor)

        # Per-particle contribution that gets aggregated onto the grid.
        df['count']=group_weight*df['weight_time'] * decay * df['mp_per_particle']

        ds_conc=post.particles_to_conc(df,grid,'count')
        ds_conc.to_netcdf(conc_fn)

    ds_smooth=ds_conc.copy()
    # 2 iterations seems good for removing particle noise
    for _ in range(2):
        ds_smooth['conc']=('cell',), Msmooth.dot(ds_smooth.conc.values)

    # Start each frame from an empty figure with two fresh axes.
    fig.clf()
    axs=[fig.add_subplot(1,2,1),
         fig.add_subplot(1,2,2)]

    fig.subplots_adjust(left=0,right=1,top=1,bottom=0,wspace=0)

    cf1=conc_figure.BayConcFigure(ds_smooth,grid=grid,fig=fig,ax=axs[1],
                                  cax_loc=[0.82,0.37,0.02,0.45])
    cf2=conc_figure.CoastalConcFigure(ds_smooth,grid=grid,fig=fig,ax=axs[0],
                                     cax_loc=None)
    for ax in axs:
        # Clear any text artists already on the axes (presumably added by the
        # conc_figure classes).  Axes.texts is read-only in matplotlib >=3.5, so
        # remove the artists one at a time instead of assigning an empty list.
        for txt in list(ax.texts):
            txt.remove()

    # Date label on the left panel, then fix the extent of each panel.
    axs[0].text(0.5,0.9,date_s,transform=axs[0].transAxes,fontsize=15)
    axs[0].axis([441742., 591985., 4076042., 4276366.])
    axs[1].axis([522130., 582108., 4146136, 4226106.])
    plt.draw()
    fig.savefig(fig_fn,dpi=200)
    plt.pause(0.01)

In [None]:
# Write one netcdf of gridded concentration per hour into out_dir.  Particles are
# queried a day at a time, weighted once for the whole day, then binned by hour.
# Hours whose output file already exists are skipped.
out_dir="hourly-out-v00"
os.makedirs(out_dir,exist_ok=True)

storm_factor=0.02
tau_d=30 # e-folding time, in days, of the age-based decay weight below

# The hour/minutes are important to cast correctly.
# NOTE(review): arange excludes its stop value, so the last entry is 2018-06-29T00:00
# and the final day-long interval processed below ends there -- confirm that is intended.
intervals=np.arange(np.datetime64("2017-08-01T00:00"),
                    np.datetime64("2018-06-30T00:00"),
                    np.timedelta64(1,'D'))

one_hour=np.timedelta64(3600,'s')

for t_min,t_max in zip(intervals[:-1],intervals[1:]):
    # Hourly bin edges *including* t_max.  Without the extra step arange stops at
    # 23:00, and pairing consecutive edges would silently drop the last hour of the day.
    hours=np.arange(t_min,t_max+one_hour,one_hour)
    targets=[]
    for h_min,h_max in zip(hours[:-1],hours[1:]):
        dt=utils.to_datetime(h_min)
        date_s=dt.strftime("%Y%m%dT%H%M")
        conc_fn=os.path.join(out_dir,f"surface-{date_s}.nc")
        if not os.path.exists(conc_fn):
            targets.append([h_min,h_max,conc_fn])
    if not targets:
        print(f"Skip {t_min} - {t_max}")
        continue # nothing to do

    print(f"Processing {t_min} - {t_max}")

    # One query per day; the hourly subsets are cut from its result below.
    criteria=dict(t_min=t_min,t_max=t_max,
                  category='nonfiber',
                  z_below_surface=0.095,
                  age_max=np.timedelta64(60,'D'))
    part_d=post.query_particles(criteria,cfg=cfg)
    df=part_d.compute()

    # Weights computed on the whole group.
    age=df['time'] - df['rel_time']
    tau=np.timedelta64(tau_d,'D')
    decay=np.exp( -age/tau )

    group_weight=post.group_weights(df,storm_factor)

    df['count']=group_weight* decay * df['mp_per_particle']

    for h_min,h_max,conc_fn in targets:
        # Half-open window [h_min,h_max) so each particle record lands in exactly one hour.
        sel=(df['time']>=h_min).values & (df['time']<h_max).values
        df_sel=df[sel]
        ds_conc=post.particles_to_conc(df_sel,grid,'count')
        # Record the window and weighting parameters alongside the result.
        ds_conc['time']=(),h_min
        ds_conc['time_max']=(),h_max
        ds_conc['storm_factor']=(),storm_factor
        ds_conc['tau_d']=(),tau_d
        ds_conc.to_netcdf(conc_fn)

Processing 2017-08-01T00:00 - 2017-08-02T00:00
Will repartition with 42 partitions
