
Using the SQL database approach, compare manta data with model results, varying
the averaging period and the max age.

**v01:** try adapting this to each category separately, see if the model shows difference in loss rates.


Manta data is already adjusted by category, and the category-level data is in the csv.

Load data...

In [21]:
import os
import time
from collections import defaultdict
import six

import logging as log
import glob
import re

import seaborn as sns
from matplotlib import colors, cm
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
from matplotlib.colors import LogNorm

import numpy as np

import pandas as pd
import xarray as xr

from sqlite3 import dbapi2 as sql

from stompy.grid import unstructured_grid
from stompy import utils, memoize
from stompy.spatial import proj_utils 
from stompy.model.data_comparison import calc_metrics
from stompy.plot import plot_wkb
from stompy.model import data_comparison
import stompy.plot.cmap as scmap

import postprocess_v00 as post

%matplotlib notebook

In [22]:
ll2utm=proj_utils.mapper('WGS84','EPSG:26910')

In [23]:
# Load the manta data, clean it up
manta=pd.read_csv('manta_summary-v02.csv')
manta.head()

Unnamed: 0,SampleID,SAMPLE LOCATION,DATE,TYPE,LAT START,LONG START,LAT END,LONG END,TOTAL DIST (M),START FLOW,...,part_per_m3_std,part_per_m2_std,part_per_m3_raw_std,part_per_m2_raw_std,part_per_m3_nofiber,part_per_m2_nofiber,part_per_m3_raw_nofiber,part_per_m2_raw_nofiber,x,y
0,CB-4-Manta-21Aug2017,CB4,2017-08-21 00:00:00,Manta,37.915567,-122.441217,37.941933,-122.419983,3473.323262,,...,0.224191,0.021298,0.442172,0.042006,0.015526,0.001475,0.024841,0.00236,550040.331073,4198063.0
1,SPB3-Manta-21Aug2017,SPB3,2017-08-21 00:00:00,Manta,38.023072,-122.371583,38.042133,-122.321883,66678.22227,9113.0,...,1.817781,0.172689,1.928407,0.183199,0.296678,0.028184,0.301706,0.028662,557329.971756,4209634.0
2,SUB1-Manta-21Aug2017,SUB1,2017-08-21 00:00:00,Manta,38.107067,-122.056283,38.096533,-122.064917,1393.740434,128369.0,...,,,0.0,0.0,0.032671,0.003104,0.039205,0.003724,582363.556136,4217527.0
3,SPB2-Manta-21Aug2017,SPB2,2017-08-21 00:00:00,Manta,38.051283,-122.42175,38.023,-122.428117,3194.008847,220135.0,...,0.210549,0.020002,0.402882,0.038274,0.021966,0.002087,0.034288,0.003257,550464.04632,4210092.0
4,CB9-Manta-22Aug2017,CB9,2017-08-22 00:00:00,Manta,37.687233,-122.290917,37.6985,-122.298433,1416.65638,290000.0,...,0.085947,0.008165,0.174186,0.016548,0.085947,0.008165,0.091677,0.008709,562183.425087,4171973.0


In [24]:
manta.columns

Index(['SampleID', 'SAMPLE LOCATION', 'DATE', 'TYPE', 'LAT START',
       'LONG START', 'LAT END', 'LONG END', 'TOTAL DIST (M)', 'START FLOW',
       'END FLOW', 'area_km2', 'volume_m3', 'FibersYN', 'Season', 'time_pt',
       'time_local', 'time_utc', 'lat', 'lon', 'volume_l', 'area_m2',
       'count_preblank_std', 'count_preblank_nofiber', 'Fiber', 'Fiber Bundle',
       'Film', 'Foam', 'Fragment', 'Sphere', 'Fiber_adj', 'Fiber Bundle_adj',
       'Fragment_adj', 'Foam_adj', 'Film_adj', 'Sphere_adj', 'count_std',
       'count_nofiber', 'part_per_m3_std', 'part_per_m2_std',
       'part_per_m3_raw_std', 'part_per_m2_raw_std', 'part_per_m3_nofiber',
       'part_per_m2_nofiber', 'part_per_m3_raw_nofiber',
       'part_per_m2_raw_nofiber', 'x', 'y'],
      dtype='object')

In [237]:
manta['part_per_m2_nofiber']

0     0.001475
1     0.028184
2     0.003104
3     0.002087
4     0.008165
5     0.003313
6     0.167969
7     0.019350
8     0.029461
9     0.129766
10    0.009974
11    0.004888
12    0.015727
13    0.121830
14    0.006237
15    0.100369
16    0.081736
17    0.000334
18    0.005392
19    0.005493
20    0.004348
21    0.005818
22    0.000000
23    0.000045
24    0.002150
25    0.001026
26    0.018591
27    0.002819
28    0.001191
29    0.000426
        ...   
35    0.032179
36    0.076309
37    0.083536
38    0.062406
39    0.005564
40    0.011862
41    0.017004
42    0.199062
43    2.914498
44    0.061145
45    0.115487
46    0.009575
47    0.021445
48    0.050986
49    0.022715
50    0.011560
51    0.010717
52    0.036479
53    0.003271
54    0.003133
55    0.000740
56    0.005118
57    0.009542
58    0.003743
59    0.004057
60    0.012858
61    0.017765
62    0.002712
63    0.002263
64    0.000976
Name: part_per_m2_nofiber, Length: 65, dtype: float64

In [251]:
recon=manta.copy()

recon['recon']=(manta['Fiber Bundle_adj']+manta['Foam_adj'] + manta['Fragment_adj'] + manta['Sphere_adj'] + manta['Film_adj']) / manta['area_m2']

#recon.loc[:, ['recon','part_per_m2_nofiber']]
np.allclose( recon['recon'],recon['part_per_m2_nofiber'])

True

In [70]:
import sql_common
six.moves.reload_module(sql_common)

# try to run everything that can be cached and re-used through
# here. 

<module 'sql_common' from '/home/rusty/src/microplastic_sfbay/postprocess/sql_common.py'>

In [181]:
#PtmSet=sql_common.PtmSet
#dbs=glob.glob("/opt2/sfb_ocean/ptm/all_source/*/ptm_and_grid.db")

PtmSet=sql_common.PtmSetNew
dbs=glob.glob("/opt2/sfb_ocean/ptm/all_source_021b/20*/ptm_and_grid.db")

dbs.sort()

ptm_set=PtmSet(databases=dbs)

In [182]:
# Get all of the group names so loads can be premapped
group_names=[]
for db in ptm_set.databases:
    con=ptm_set.db_to_con(db)
    curs=con.cursor()
    curs.execute("select name from ptm_group group by name")
    group_names.append( np.array([t[0] for t in curs.fetchall()],'S100') )
group_names=np.unique(np.concatenate(group_names)) # ensures sort, too
print(f"{len(group_names)} groups in total")

5831 groups in total


In [28]:
# For each trawl, I want to query all of the particles 
# that were "nearby" at a "near time", and get their
# release time, cell, std mass, and nofiber mass.
# from there, it's postprocessing to see what subset or
# weighting of those gives the best agreement with
# part_per_m2_std and part-per_m2_raw

In [316]:
# Is it possible to run the query once, and save enough info about the
# particles to apply loading scenarios after the fact?

def query_particles(t_start,t_stop,
                    z_filter="and loc.z_from_surface>-0.2",
                    max_age=np.timedelta64(20,'D'), # could go to 30
                    grp_filter=""):
    epoch_start=int( utils.to_unix( t_start ) )
    epoch_stop =int( utils.to_unix( t_stop ) )
    max_age_clause=f"and (loc.time-rel.time)<{int(max_age/np.timedelta64(1,'s'))}"

    # Start with a global query -- how long does it take, and how
    # many rows?
    # So rather than multiplying out the loads here, 
    # record rel.volume/rel.count to get rel_part_volume
    # 
    query=f"""
      select rel.volume / rel.count as rel_part_volume,
             loc.time as time,
             rel.time as rel_time,
             loc.cell as cell,
             grp.name as grp,
             loc.z_from_surface as z_from_surface,
             loc.z_from_bed as z_from_bed
        from particle_loc as loc, particle as p, ptm_release as rel, 
             ptm_group as grp
        where loc.time>={epoch_start} and loc.time < {epoch_stop}
          and loc.particle_id=p.id
          and p.release_id=rel.id
          and rel.group_id=grp.id
          and loc.cell>=0
          {max_age_clause}
          {z_filter}
          {grp_filter}"""
    print(query)    
    
    query_dtype=[('rel_part_volume',np.float64),
                 ('time',np.int32),
                 ('rel_time',np.int32),
                 ('cell',np.int32),
                 ('group','S100'),
                 ('z_from_surface',np.float64),
                 ('z_from_bed',np.float64)]

    all_data=[]
    all_data.append( np.zeros(0,dtype=query_dtype) )

    for db in ptm_set.databases:
        con=ptm_set.db_to_con(db)
        curs=con.cursor()
        curs.execute(query)
        data=curs.fetchall()
        print(f"{db} {len(data)} rows")
        if len(data)==0: continue
        adata=np.array(data)

        adata2=np.zeros(len(adata),dtype=query_dtype)
        adata2['rel_part_volume']=adata[:,0]
        adata2['time']=adata[:,1].astype(np.int32)
        adata2['rel_time']=adata[:,2].astype(np.int32)
        adata2['cell']=adata[:,3].astype(np.int32)
        adata2['group']=adata[:,4]
        adata2['z_from_surface']=adata[:,5]
        adata2['z_from_bed']=adata[:,6]
        all_data.append(adata2)

    combined=np.concatenate( all_data )
    return combined

In [184]:
res=query_particles(t_start=np.datetime64("2017-11-04 12:00:00"),
                    t_stop =np.datetime64("2017-11-04 13:00:00"),
                    max_age=np.timedelta64(1,'D'))


      select rel.volume / rel.count as rel_part_volume,
             loc.time as time,
             rel.time as rel_time,
             loc.cell as cell,
             grp.name as grp
        from particle_loc as loc, particle as p, ptm_release as rel, 
             ptm_group as grp
        where loc.time>=1509796800 and loc.time < 1509800400
          and loc.particle_id=p.id
          and p.release_id=rel.id
          and rel.group_id=grp.id
          and loc.cell>=0
          and (loc.time-rel.time)<86400
          and loc.z_from_surface>-0.2
          
/opt2/sfb_ocean/ptm/all_source_021b/20170720/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170730/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170809/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170819/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170829/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170908/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source

In [185]:
# Load the loads
loads=xr.open_dataset("../loads/plastic_loads-7classes-v05.nc")
loads

<xarray.Dataset>
Dimensions:            (category: 6, pathway: 2, source: 9, w_s: 7)
Coordinates:
  * pathway            (pathway) object 'effluent' 'stormwater'
  * category           (category) object 'Fiber' 'Fiber Bundle' ... 'Sphere'
  * w_s                (w_s) float64 -0.05 -0.005 -0.0005 0.0 0.0005 0.005 0.05
  * source             (source) object 'CCCSD' 'EBDA' ... 'SUNN' 'stormwater'
Data variables:
    n_blank_particles  (pathway, category) int64 ...
    n_blank_samples    (pathway, category) int64 ...
    blank_rate         (pathway, category) float64 ...
    total_volume       (source) float64 ...
    n_samples          (source) float64 ...
    count_w_s          (source, category, w_s) int32 ...
    count_no_w_s       (source, category) int32 ...
    conc_raw           (source, category, w_s) float64 ...
    source_pathway     (source) object ...
    conc               (source, category, w_s) float64 ...
    conc_noclip        (source, category, w_s) float64 ...
    conc_

In [198]:
# res: released_part_volume particles/m3, time of observation, release_time, cell index,
# and the group.
# loads: source, category, w_s => particles/l
# Note that wastewater should be scaled up by 1/0.70
# stormwater is maybe close enough now that there are so many sources.
behavior_to_ws=sql_common.behavior_to_ws
# allow for unicode or byte strings
for k in list(behavior_to_ws.keys()):
    behavior_to_ws[k.encode()]=behavior_to_ws[k]

source_map=defaultdict(lambda:'stormwater')                                                                                                                
source_map['cccsd']='CCCSD'                                                                                                                                
source_map['sunnyvale']='SUNN'                                                                                                                             
source_map['fs']='FSSD'                                                                                                                                    
source_map['palo_alto']='PA'                                                                                                                               
source_map['san_jose']='SJ'                                                                                                                                
source_map['src000']='EBDA'                                                                                                                                
source_map['src001']='EBMUD'                                                                                                                               
source_map['src002']='SFPUC'                                                                                                                               

# these we don't actually use                                                                                                                              
source_map['petaluma']='SKIP'                                                                                                                              

# These shouldn't be used, but including just to be sure                                                                                                   
# that if they somehow show up, they won't contaminate                                                                                                     
# stormwater.                                                                                                                                              
source_map['SacRiver']='DELTA'                                                                                                                             
source_map['SJRiver']='DELTA'                                                                                                                              

def particle_load_mapping(load_da,group_names,wastewater_scale=1/0.70,stormwater_scale=1.0):
    # Here generate a mapping of group to release particle/m3.
    # tuples of group_name, particles/m3.                                                                                                                      
    # can omit rows with 0 concentration.                                                                                                                      
    
    # numpy strings have to be preallocated at max string length.
    # thus S100.  S is a few times faster for numpy to deal with
    # than U.
    mapping=np.zeros(len(group_names),
                     dtype=[('group','S100'),('conc',np.float64)])
    mapping['group'][:]=group_names
    
    for grp_i, group_name in enumerate(group_names):
        # group ~ Source_Name_behavior_relYYYYMMDD
        # and note the 'b' here, since using S-types and not U-types
        m=re.match(b'(.*)_(down\d+|up\d+|none)(_rel.*)?',group_name)                                                                                         
        source=m.group(1)  
        behavior=m.group(2)
        rel_time=m.group(3) # may be missing
        w_s=behavior_to_ws[behavior]                                                                                                                           

        source_name=source_map[source]                                                                                                                         
        if source_name in ['DELTA','SKIP']:                                                                                                                    
            conc=0.0                                                                                                                              
        else:                                                                                                                                                  
            conc=load_da.sel(source=source_name,w_s=w_s).item()                                                                                           

        if source_name=='stormwater':                                                                                                
            conc*=stormwater_scale                                                                                                                             
        else:                                                                                                                                                  
            conc*=wastewater_scale                                                                                                                             

        # load netcdf is in particles/l, but we want to set the calculation                                                                                    
        # up as particle/m3. Updated 2019-11-17                                                                                                                
        conc*=1000                                                                                                                                                                                                                                                                                       
        mapping['conc'][grp_i]=conc
        # print(f"{source:15s}  {behavior:9s} => {source_name:15s} {conc:.4f}")
    return mapping

def particle_counts_from_load(res,mapping,**kw):
    """
    load_da: xr.DataArray giving microparticle/m3 load concentrations
    by source and w_s.
    res: numpy struct array with group: <Source>_<w_s>_<release date>
      and rel_part_volume: the inflow volume represented by each PTM particle.
    return an array, same size as res, giving the inflow count of microparticles
      for each PTM particle. 
    """
    # This may still be slow:
    grp_idx=np.searchsorted(mapping['group'],res['group'])

    # microparticles/m3 in the release for each PTM particle
    load_conc=mapping['conc'][grp_idx]
    # microparticles/PTM particle.
    load_particles=load_conc * res['rel_part_volume']
    return load_particles


In [187]:
out_dir="manta_sets_20200212b"
# now the 021b runs.

os.makedirs(out_dir,exist_ok=True)

for idx,rec in manta.iterrows():
    #fn=os.path.join(out_dir,f"{rec.SampleID}.npy")
    # since we're just pulling out whole days, cache by day.
    # and pull through the end of that day
    fn=os.path.join(out_dir,f"v01-{rec.DATE[:10]}.npy")
    
    if os.path.exists(fn):
        print(f"{fn} exists. Skipping")
        continue

    # pull a generous buffer of particles here, and narrow
    if 1:
        t_sample=np.datetime64(rec.DATE)
        # want to be able to, after the fact, query a full 25h tidal cycle
        # centered on the actual time of a sample that could fall anywhere
        # in this day.
        t_start=t_sample+np.timedelta64(8,'h') - np.timedelta64(12,'h')
        # and go for a tidal day
        t_stop =t_sample+np.timedelta64(8+24,'h') + np.timedelta64(13,'h')
        query_n_steps=25 # how many hours are included in the query.
        
    combined=query_particles(t_start,t_stop)
    np.save(fn,combined)



      select rel.volume / rel.count as rel_part_volume,
             loc.time as time,
             rel.time as rel_time,
             loc.cell as cell,
             grp.name as grp
        from particle_loc as loc, particle as p, ptm_release as rel, 
             ptm_group as grp
        where loc.time>=1503259200 and loc.time < 1503435600
          and loc.particle_id=p.id
          and p.release_id=rel.id
          and rel.group_id=grp.id
          and loc.cell>=0
          and (loc.time-rel.time)<1728000
          and loc.z_from_surface>-0.2
          
/opt2/sfb_ocean/ptm/all_source_021b/20170720/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170730/ptm_and_grid.db 963115 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170809/ptm_and_grid.db 1272522 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170819/ptm_and_grid.db 513390 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170829/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170908/ptm_and_grid.db 0 rows
/opt2/sfb_oc

/opt2/sfb_ocean/ptm/all_source_021b/20170730/ptm_and_grid.db 566290 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170809/ptm_and_grid.db 1248633 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170819/ptm_and_grid.db 1031670 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170829/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170908/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170918/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170928/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20171008/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20171018/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20171028/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20171107/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20171117/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20171127/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20171207/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_

/opt2/sfb_ocean/ptm/all_source_021b/20180317/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180327/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180406/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180416/ptm_and_grid.db 0 rows
manta_sets_20200212b/v01-2017-09-12.npy exists. Skipping
manta_sets_20200212b/v01-2017-09-12.npy exists. Skipping
manta_sets_20200212b/v01-2017-09-12.npy exists. Skipping

      select rel.volume / rel.count as rel_part_volume,
             loc.time as time,
             rel.time as rel_time,
             loc.cell as cell,
             grp.name as grp
        from particle_loc as loc, particle as p, ptm_release as rel, 
             ptm_group as grp
        where loc.time>=1505246400 and loc.time < 1505422800
          and loc.particle_id=p.id
          and p.release_id=rel.id
          and rel.group_id=grp.id
          and loc.cell>=0
          and (loc.time-rel.time)<1728000
          and loc.z_from_surface>-0.2
    

/opt2/sfb_ocean/ptm/all_source_021b/20171008/ptm_and_grid.db 202301 rows
/opt2/sfb_ocean/ptm/all_source_021b/20171018/ptm_and_grid.db 1981588 rows
/opt2/sfb_ocean/ptm/all_source_021b/20171028/ptm_and_grid.db 2274172 rows
/opt2/sfb_ocean/ptm/all_source_021b/20171107/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20171117/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20171127/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20171207/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20171217/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20171227/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180106/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180116/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180126/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180205/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180215/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_

/opt2/sfb_ocean/ptm/all_source_021b/20171207/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20171217/ptm_and_grid.db 1057932 rows
/opt2/sfb_ocean/ptm/all_source_021b/20171227/ptm_and_grid.db 2658400 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180106/ptm_and_grid.db 1997580 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180116/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180126/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180205/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180215/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180225/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180307/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180317/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180327/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180406/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180416/ptm_and_grid.db 0 rows
manta_sets_20200212b/v01-2018-

/opt2/sfb_ocean/ptm/all_source_021b/20180307/ptm_and_grid.db 1611448 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180317/ptm_and_grid.db 2248450 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180327/ptm_and_grid.db 898951 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180406/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20180416/ptm_and_grid.db 0 rows
manta_sets_20200212b/v01-2018-03-29.npy exists. Skipping
manta_sets_20200212b/v01-2018-03-29.npy exists. Skipping
manta_sets_20200212b/v01-2018-03-29.npy exists. Skipping

      select rel.volume / rel.count as rel_part_volume,
             loc.time as time,
             rel.time as rel_time,
             loc.cell as cell,
             grp.name as grp
        from particle_loc as loc, particle as p, ptm_release as rel, 
             ptm_group as grp
        where loc.time>=1522353600 and loc.time < 1522530000
          and loc.particle_id=p.id
          and p.release_id=rel.id
          and rel.group_id=grp.id
          and loc.cell>=0

In [188]:
g=ptm_set.grid()

In [148]:
# just preload all of the particle data:
# going to use a good bit of memory...
#def load_particles(date):
#    particle_fn=os.path.join(out_dir,f"v01-{date[:10]}.npy")
#    particles=np.load(particle_fn)
#    return particles

In [189]:
@memoize.memoize(lru=60)
def load_particles_with_load(date,mapping):
    particle_fn=os.path.join(out_dir,f"v01-{date[:10]}.npy")
    particles=np.load(particle_fn)
    
    grp_idx=np.searchsorted(mapping['group'],particles['group'])
    # Be sure matches are exact!
    assert np.all( mapping['group'][grp_idx]==particles['group'])
    
    load_conc=mapping['conc'][grp_idx]
    # microparticles/PTM particle.
    load_particles=load_conc * particles['rel_part_volume']
    p=utils.recarray_add_fields(particles,[ ('load_count',load_particles)])
    return p

In [190]:
@memoize.memoize()
def stencil0(c):
    stencil=np.zeros(g.Ncells(),np.float64)
    stencil[c]=1.0
    return stencil

@memoize.memoize()
def stencil1(c):
    return ptm_set.smooth(stencil0(c))
   

In [191]:
import time

# which particles will be counted relative to the time of the manta
# observation itself.
vers='20200212e' # 25 h, centered on sample
hours_min=-12
hours_max= 13

# previous attempts
#vers='20191205e' # 6 h, centered on sample. not as clear, but 1d is still the winner.
#hours_min=-3
#hours_max= 3

#vers='20191205f' # 4 h, centered on sample. not great, and 0.5d is the winner.
#hours_min=-2
#hours_max= 2

def extract_for_sample(rec,mapping,smooth=0,tau_s=-1):
    """
    rec: a DataFrame row from the manta data
    load_da: xr.DataArray with particle/m3 load concentrations by source, w_s.
    """
    t0=time.time()
    def tok(msg=''):
        print(f"{msg} elapsed {time.time()-t0}")

    # This is where a different wastewater_scale or stormwater_scale
    # could be supplied  with some effort.
    particles=load_particles_with_load(rec.DATE,mapping)
    
    age_s=particles['time']-particles['rel_time']
    if tau_s>0:
        aged_counts=np.exp(-age_s/tau_s) * particles['load_count']
    else:
        aged_counts=load_counts
        
    # time of observation relative to the time of the manta sample.
    relative_time_h=(particles['time']-utils.to_unix(rec.time_utc))/3600.0
    aged_counts[relative_time_h<hours_min]=0.0
    aged_counts[relative_time_h>hours_max]=0.0
    nsteps=hours_max-hours_min

    cell_counts=np.bincount( particles['cell'], weights=aged_counts,
                             minlength=g.Ncells())
    # This sets the units -- particles per m2
    cell_conc=cell_counts/g.cells_area()/nsteps

    c_select=ptm_set.grid().select_cells_nearest(np.r_[rec.x,rec.y])
    stencil=stencil0(c_select)
    M=ptm_set.Msmooth()
    for s in range(smooth):
        stencil=M.dot(stencil)
    conc=(stencil*cell_conc).sum()
    return conc

In [236]:
manta.columns

Index(['SampleID', 'SAMPLE LOCATION', 'DATE', 'TYPE', 'LAT START',
       'LONG START', 'LAT END', 'LONG END', 'TOTAL DIST (M)', 'START FLOW',
       'END FLOW', 'area_km2', 'volume_m3', 'FibersYN', 'Season', 'time_pt',
       'time_local', 'time_utc', 'lat', 'lon', 'volume_l', 'area_m2',
       'count_preblank_std', 'count_preblank_nofiber', 'Fiber', 'Fiber Bundle',
       'Film', 'Foam', 'Fragment', 'Sphere', 'Fiber_adj', 'Fiber Bundle_adj',
       'Fragment_adj', 'Foam_adj', 'Film_adj', 'Sphere_adj', 'count_std',
       'count_nofiber', 'part_per_m3_std', 'part_per_m2_std',
       'part_per_m3_raw_std', 'part_per_m2_raw_std', 'part_per_m3_nofiber',
       'part_per_m2_nofiber', 'part_per_m3_raw_nofiber',
       'part_per_m2_raw_nofiber', 'x', 'y'],
      dtype='object')

In [281]:
print(loads.category.values)
loads['conc'].sum(dim='w_s').sum(dim='source')

['Fiber' 'Fiber Bundle' 'Film' 'Foam' 'Fragment' 'Sphere']


<xarray.DataArray 'conc' (category: 6)>
array([3.254372, 0.048556, 0.098039, 0.104429, 5.373341, 0.034279])
Coordinates:
  * category  (category) object 'Fiber' 'Fiber Bundle' ... 'Fragment' 'Sphere'

In [290]:
# These need to be consistent with each other:
# full dataset:

if 0:
    #   spearman is almost constant.
    #   log_r shows mostly a smooth=2 signal, but not that much variation.
    #   r has a great pattern. but the values are about 0.25, and it's almost
    #     certainly driven by a few large values, so not that convincing.
    manta_field='part_per_m2_std'
    load_da=loads.conc.sum(dim='category')
    sub_vers=vers+"std"
if 0:
    # non-fibers:
    #   quite similar to previous code.  tau=2 from pearson or log-r.
    #   amp suggests 1.0d, but the manta size class issue easily justifies
    #   saying this could be 2.0days.
    manta_field='part_per_m2_nofiber'
    # TODO: the above actually includes Fiber Bundles, which probably ought to
    # be omitted with fibers.  for some data, Fiber Bundles were not counted 
    # separate from fibers.
    load_sel=(loads.category.values != 'Fiber')&(loads.category.values!='Fiber Bundle')
    load_da=loads.isel(category=load_sel).conc.sum(dim='category')
    sub_vers=vers+"nofiber"
if 0:
    # films - pretty terrible.  amplitude suggests 3-4 days.
    # correlations are screwy, and show the worst correlations at
    # that same time scale.
    manta_field='part_per_m2_film'
    manta[manta_field]=manta['Fiber_adj']/manta['area_m2']
    load_sel=(loads.category.values == 'Film')
    load_da=loads.isel(category=load_sel).conc.sum(dim='category')
    sub_vers=vers+"film"
if 0:
    # fragments - this one does look quite similar to the nofiber
    # data.  amplitude ~ 1 day, spearman ~ 2 days
    manta_field='part_per_m2_fragment'
    manta[manta_field]=manta['Fragment_adj']/manta['area_m2']
    load_sel=(loads.category.values == 'Fragment')
    load_da=loads.isel(category=load_sel).conc.sum(dim='category')
    sub_vers=vers+"fragment"
if 0:
    # just fibers!  spearman says shortest tau, least smoothing
    # log(amp) says a bit less than 1 day.
    # wilmott says 0.5 days
    manta_field='part_per_m2_fiber'
    manta[manta_field]=(manta['Fiber_adj']+manta['Fiber Bundle_adj'])/manta['area_m2']
    load_sel=(loads.category.values == 'Fiber')|(loads.category.values=='Fiber Bundle')
    load_da=loads.isel(category=load_sel).conc.sum(dim='category')
    sub_vers=vers+"fiber"
if 0:
    # foam - spearman says full smoothing, shortest tau
    # amplitude says 1-2 days
    manta_field='part_per_m2_foam'
    manta[manta_field]=manta['Foam_adj']/manta['area_m2']
    load_sel=(loads.category.values == 'Foam')
    load_da=loads.isel(category=load_sel).conc.sum(dim='category')
    sub_vers=vers+"foam"
if 1:
    # sphere
    manta_field='part_per_m2_sphere'
    manta[manta_field]=manta['Sphere_adj']/manta['area_m2']
    load_sel=(loads.category.values == 'Sphere')
    load_da=loads.isel(category=load_sel).conc.sum(dim='category')
    sub_vers=vers+"sphere"
    
smooths=[10] # abbreviated to speed things up.
taus=np.array([0.5,1,2,3,4,5,7.5,10,15,20]) * 86400.
#smooths=[5]
#taus=taus[2:3]

mapping=particle_load_mapping(load_da,group_names)

all_results=[]
for smooth in smooths:
    for tau in taus:
        # 1 or 2 smooth makes a big improvement in spearman.
        # nothing helps r, bias or amp.
        pred=[]
        for idx,rec in utils.progress(manta.iterrows()):
            if np.isnan(rec[manta_field]):
                # e.g. sample did not test for fibers, but manta_field and
                # loads include fibers.
                x=np.nan
            else:
                x=extract_for_sample(rec,mapping=mapping,smooth=smooth,tau_s=tau)
            pred.append(x)
        pred=np.array(pred)
        obs=manta[manta_field]

        valid=np.isfinite(obs)
        pred=pred[valid]
        obs=obs[valid]
        
        metrics=calc_metrics( xr.DataArray(pred), xr.DataArray(obs))
        metrics['smooth']=smooth
        metrics['tau']=tau
        metrics['dataset']=sub_vers
        
        metrics['log_r']=np.corrcoef( np.log10(pred.clip(1e-6)), np.log10(obs.clip(1e-6)))[1,0]
        all_results.append(metrics)

df=pd.DataFrame(all_results)
df.to_csv(f"parameter-scan-{sub_vers}.csv")

INFO:utils:13
INFO:utils:23
INFO:utils:32
INFO:utils:42
INFO:utils:51
INFO:utils:61
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:34
INFO:utils:56
INFO:utils:33
INFO:utils:55
INFO:utils:33
INFO:utils:55
INFO:utils:33
INFO:utils:55
INFO:utils:33
INFO:utils:55
INFO:utils:33
INFO:utils:55
INFO:utils:33
INFO:utils:55
INFO:utils:33
INFO:u

In [291]:
df=pd.read_csv(f'parameter-scan-{sub_vers}.csv')
df['tau_d']=df.tau/86400.

In [292]:
field='spearman_rho'

as_matrix=df.set_index(['smooth','tau_d'])[field].unstack()

fig,ax=plt.subplots(1,1)
hm=sns.heatmap(as_matrix, annot=True)
ax.set_xlabel(r'$\tau$ (days)',fontsize=14)
ax.set_ylabel('Smoothing iterations',fontsize=14)
cax=fig.axes[1]
cax.set_ylabel(r'Spearman $\rho$',fontsize=14)
plt.subplots_adjust(bottom=0.15,top=0.95,right=0.9)
fig.savefig(f'manta_compare_spearman_rho-{sub_vers}.png',dpi=150)

<IPython.core.display.Javascript object>

In [293]:
df['log_amp']=np.log10(df.amp)
field='log_amp'

as_matrix=df.set_index(['smooth','tau_d'])[field].unstack()

fig,ax=plt.subplots(1,1)

hm=sns.heatmap(as_matrix, annot=True,cmap='coolwarm',ax=ax,
               cbar_kws=dict(label=field),vmin=-1,vmax=1)
plt.subplots_adjust(bottom=0.15,top=0.95,right=0.9)
ax.set_xlabel(r'$\tau$ (days)',fontsize=14)
ax.set_ylabel('Smoothing iterations',fontsize=14)
cax=fig.axes[1]
cax.set_ylabel('log$_{10}$(amp)',fontsize=14)
fig.savefig(f'manta_compare_log10amp-{sub_vers}.png',dpi=150)


<IPython.core.display.Javascript object>

In [294]:
field='log_r'

as_matrix=df.set_index(['smooth','tau_d'])[field].unstack()

plt.figure()
hm=sns.heatmap(as_matrix**2, annot=True,cbar_kws=dict(label='$r^2$ for log transformed'))
plt.subplots_adjust(bottom=0.25)

fig.savefig(f'manta_compare_log_r2-{sub_vers}.png',dpi=150)


<IPython.core.display.Javascript object>

In [295]:
field='wilmott'

as_matrix=df.set_index(['smooth','tau_d'])[field].unstack()

plt.figure()
hm=sns.heatmap(as_matrix, annot=True,cbar_kws=dict(label=field))
plt.subplots_adjust(bottom=0.25)

<IPython.core.display.Javascript object>

In [296]:
field='r'

as_matrix=df.set_index(['smooth','tau_d'])[field].unstack()

plt.figure()
hm=sns.heatmap(as_matrix, annot=True,cbar_kws=dict(label=field))
plt.subplots_adjust(bottom=0.25)

<IPython.core.display.Javascript object>

In [297]:
field='murphy'

as_matrix=df.set_index(['smooth','tau_d'])[field].unstack()

plt.figure()
hm=sns.heatmap(as_matrix, annot=True,cbar_kws=dict(label=field))
plt.subplots_adjust(bottom=0.25)

<IPython.core.display.Javascript object>

In [213]:
# Scatter for a specific case:
manta_field='part_per_m2_std'
load_da=loads.conc.sum(dim='category')

smooth=20
tau=2 * 86400.
mapping=particle_load_mapping(load_da,group_names)

all_results=[]
pred=[]
for idx,rec in utils.progress(manta.iterrows()):
    if np.isnan(rec[manta_field]):
        # e.g. sample did not test for fibers, but manta_field and
        # loads include fibers.
        x=np.nan
    else:
        x=extract_for_sample(rec,mapping=mapping,smooth=smooth,tau_s=tau)
    pred.append(x)
pred=np.array(pred)
obs=manta[manta_field]

valid=np.isfinite(obs)
pred=pred[valid]
obs=obs[valid]

INFO:utils:46


In [214]:
fig,ax=plt.subplots(1,1)
ax.plot(obs,pred,'g.')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7fb03596c940>]

In [215]:
np.corrcoef(obs,pred)

array([[1.        , 0.28083222],
       [0.28083222, 1.        ]])

In [None]:

# for baseline reference, what concentration do I get
# from this directly?
cell_counts=np.zeros(g.Ncells(),np.float64)
for count,time,rel_time,cell in combined:
    cell_counts[int(cell)]+=count
nsteps=4 # [-2,+2] hours
cell_conc=cell_counts/g.cells_area()/nsteps

In [None]:
fig=plt.figure()
ax=plt.gca()

clim=[1e-4,100]
ccoll=g.plot_cells(values=cell_conc.clip(clim[0]),
                   norm=LogNorm(vmin=clim[0],vmax=clim[1],clip=True),
                   cmap='jet',ax=ax)
ccoll.set_clim(clim)
ax.axis('equal')

In [None]:
# relate that back to the sample:
c_select=ptm_set.grid().select_cells_nearest(np.r_[rec.x,rec.y])

stencil=np.zeros(g.Ncells(),np.float64)
stencil[c_select]=1.0
stencil1=ptm_set.smooth(stencil)
conc0=(stencil*cell_conc).sum()
conc1=(stencil1*cell_conc).sum()

print(f"No smoothing, predicted: {conc0:.5f} particles/m2")
print(f"Smoothed, predicted      {conc1:.5f} particles/m2")
print(f"Observed, no-fiber       {rec['part_per_m2_nofiber']:.5f} particles/m2")

In [None]:
fig,ax=plt.subplots(1,1,figsize=(5,4))

ax.loglog( obs, pred, 'g.',ms=7 )

ax.set_ylabel('Model')
ax.set_xlabel('Observed')
fig.tight_layout()
fig.savefig('scatter-depression.png',dpi=150)

In [None]:
pred=manta.apply(lambda rec: extract_for_sample(rec,smooth=10,tau_s=2*86400),
                 axis=1)
obs=manta['part_per_m2_nofiber']

fig,ax=plt.subplots(1,1,figsize=(5,4))

ax.loglog( obs, pred, 'g.',ms=7 )

ax.set_ylabel('Model')
ax.set_xlabel('Observed')
fig.tight_layout()
fig.savefig('scatter-betterest.png',dpi=150)

In [301]:
# See effective vertical distribution in South Bay.
# Choose a point out in the middle of South Bay
plt.figure()
g.plot_edges()
p=np.array([563573,4.1678e6])
plt.plot( p[:1],p[1:],'ro')
plt.axis( (542485.2055258009, 585698.5796708226, 4140994.463422078, 4193349.128251624) )

<IPython.core.display.Javascript object>

(542485.2055258009, 585698.5796708226, 4140994.463422078, 4193349.128251624)

In [317]:
res=query_particles(np.datetime64('2018-02-01T00:00'),
                    np.datetime64('2018-02-02T00:00'),
                    z_filter="",
                    max_age=np.timedelta64(5,'D'),
                    grp_filter="")


      select rel.volume / rel.count as rel_part_volume,
             loc.time as time,
             rel.time as rel_time,
             loc.cell as cell,
             grp.name as grp,
             loc.z_from_surface as z_from_surface,
             loc.z_from_bed as z_from_bed
        from particle_loc as loc, particle as p, ptm_release as rel, 
             ptm_group as grp
        where loc.time>=1517443200 and loc.time < 1517529600
          and loc.particle_id=p.id
          and p.release_id=rel.id
          and rel.group_id=grp.id
          and loc.cell>=0
          and (loc.time-rel.time)<172800
          
          
/opt2/sfb_ocean/ptm/all_source_021b/20170720/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170730/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170809/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170819/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source_021b/20170829/ptm_and_grid.db 0 rows
/opt2/sfb_ocean/ptm/all_source

In [334]:
cells=utils.dist(g.cells_centroid(),p)<2000
print(f"{cells.sum()} selected within 2000 m of sample point")

335 selected within 2000 m of sample point


In [335]:
res_spatial=res[ cells[res['cell']]]
print(f"{len(res_spatial)} particles within those cells")

6486 particles within those cells


In [336]:
z_from_surface=res_spatial['z_from_surface']
z_from_bed=res_spatial['z_from_bed']
sigma=z_from_surface/(z_from_bed-z_from_surface)
plt.figure()
scat=plt.scatter( z_from_surface,z_from_bed,30,sigma,cmap='jet')
plt.colorbar(scat)

<IPython.core.display.Javascript object>

<matplotlib.colorbar.Colorbar at 0x7fb0144db550>

In [337]:
plt.figure()
plt.hist(sigma,bins=np.linspace(-1,0,25))

<IPython.core.display.Javascript object>

(array([1136.,  569.,  329.,  266.,  230.,  222.,  206.,  192.,  200.,
         168.,  182.,  200.,  171.,  179.,  179.,  171.,  180.,  181.,
         199.,  215.,  223.,  245.,  279.,  364.]),
 array([-1.        , -0.95833333, -0.91666667, -0.875     , -0.83333333,
        -0.79166667, -0.75      , -0.70833333, -0.66666667, -0.625     ,
        -0.58333333, -0.54166667, -0.5       , -0.45833333, -0.41666667,
        -0.375     , -0.33333333, -0.29166667, -0.25      , -0.20833333,
        -0.16666667, -0.125     , -0.08333333, -0.04166667,  0.        ]),
 <a list of 24 Patch objects>)

In [338]:
np.unique(res_spatial['group'])

array([b'src000_down50000_rel20180126', b'src000_down5000_rel20180126',
       b'src000_down500_rel20180126', b'src000_none_rel20180126',
       b'src000_up50000_rel20180126', b'src000_up5000_rel20180126',
       b'src000_up500_rel20180126', b'src001_up500_rel20180126'],
      dtype='|S100')

In [339]:
import seaborn as sns

In [386]:
fig,ax=plt.subplots(1,1)

behaviors=['_down50000_',
           '_down5000_',
           '_down500_',
           '_none_',
           '_up500_',
           '_up5000_',
           '_up50000_'][::-1]
blabels={'_down50000_':'50',
       '_down5000_':'5',
       '_down500_':'0.5',
       '_none_':'0.0',
       '_up500_':'-0.5',
       '_up5000_':'-5',
       '_up50000_':'-50'}

sigma_by_b=[]

for b in behaviors:
    sel=np.array([b.encode() in r['group'] for r in res_spatial])
    sigma_by_b.append( sigma[sel] )

from scipy.stats import gaussian_kde
from matplotlib import cm

for bi,(b,sig) in enumerate(zip(behaviors,sigma_by_b)):
    k=gaussian_kde(sig)
    z=np.linspace(-1,0,100)
    k_val=k(z) + k(-2-z) + k(-z)
    color=cm.coolwarm( bi/float(len(behaviors)-1.0))
    ax.plot(k_val,z,label=blabels[b],color=color,lw=2.5)
ax.axis(ymin=-1,ymax=0)
ax.legend(title='$w_s$ mm s$^{-1}$',fontsize=12)
ax.set_ylabel('$\sigma$',fontsize=16)
ax.set_xlabel('Density',fontsize=16)
plt.setp(ax.get_xticklabels(),visible=0)
fig.savefig('sigma-distribution.png')

<IPython.core.display.Javascript object>