Refactor interpolator, plus minor changes (#15)
* New class for CML interpolator

* This class uses interpolator objects derived from a general point-to-grid interpolator class

* Currently, an IDW and a Kriging interpolator class are implemented

* Kriging functions are included but do not yet work reliably

* KDTreeIDW reuses the calculated distance matrix if possible

* KDTree IDW was sped up (~10x) using numba

* Moved coverage calculations to their own file

* Added decorator for deprecation warnings

* Added a whats-new.rst doc file

* Added notebook and example data for spatial interpolation
cchwala committed Sep 6, 2017
1 parent 496fd99 commit d7cea63
Showing 16 changed files with 929 additions and 109 deletions.
60 changes: 60 additions & 0 deletions docs/whats-new.rst
@@ -0,0 +1,60 @@
What's New
==========

v0.2.0 (unreleased)
-------------------

Backward Incompatible Changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- Complete rewrite of the interpolator classes. The old interpolator class
  `spatial.interpol.Interpolator()` is deprecated. New interpolator base classes
  for IDW and Kriging have been added, together with a convenience interpolator
  for CML data. Usage is showcased in a new example notebook.

- Some old functionality has moved to separate files.
   * resampling to a given `DatetimeIndex` is now available in `util.temporal`
     and will soon be removed from the `validation.validator.Validation()` class.
   * calculation of wet-dry error is now in module `validation.stats`
   * calculation of spatial coverage with CMLs was moved to the function
     `spatial.coverage.calc_coverage_mask()`.
   * error metrics for the performance evaluation of wet-dry classification are
     now in `validation.stats`. Errors are now returned with meaningful names
     as namedtuples (see the sketch below).
     `validation.validator.calc_wet_dry_error()` is now deprecated.
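
  To illustrate the namedtuple-based error reporting, here is a hedged
  sketch (function and field names are hypothetical, not the actual
  `validation.stats` API)::

      from collections import namedtuple

      # Hypothetical result type; the actual field names may differ.
      WetDryError = namedtuple('WetDryError',
                               ['false_wet_rate', 'missed_wet_rate'])

      def calc_wet_dry_error(reference, predicted):
          # reference, predicted: sequences of wet (True) / dry (False) flags
          false_wet = sum(p and not r for r, p in zip(reference, predicted))
          missed_wet = sum(r and not p for r, p in zip(reference, predicted))
          n = float(len(reference))
          return WetDryError(false_wet / n, missed_wet / n)

      err = calc_wet_dry_error([True, False, True], [True, True, False])
      print(err.false_wet_rate, err.missed_wet_rate)  # 0.333... 0.333...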

Enhancements
~~~~~~~~~~~~

- Read and write to and from multiple cmlh5 files (#12)

- Improved `NaN` handling in `wet` indicator for baseline determination

- Speed-up of KDTreeIDW using numba and by reusing
  previously calculated variables

- Added example notebook for baseline determination

- Added data set of 75 CMLs (with fake locations)

- Added example notebook to show usage of the new interpolator classes

- Added decorator to mark deprecated code (a minimal sketch follows)
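
  A minimal sketch of such a deprecation decorator (the actual
  implementation in `pycomlink` may differ)::

      import functools
      import warnings

      def deprecated(message=''):
          """Mark a function as deprecated and warn on every call."""
          def decorator(func):
              @functools.wraps(func)
              def wrapper(*args, **kwargs):
                  warnings.warn('%s is deprecated. %s' % (func.__name__, message),
                                DeprecationWarning, stacklevel=2)
                  return func(*args, **kwargs)
              return wrapper
          return decorator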

Bug fixes
~~~~~~~~~

- `setup.py` now reads all package subdirectories correctly

- Force integers for shape in `nans` helper function in `stft` module

- Always use the first value of the `dry_stop` timestamp list in the `stft`
  module. The old code did not work for a list of length 1 and would have
  failed if `dry_stop` had been a scalar value. We now assume that we always
  get a list of values (which should be true for `mlab.find`).


v0.1.1
------

No info for older versions.
287 changes: 287 additions & 0 deletions notebooks/Spatial interpolation.ipynb

Large diffs are not rendered by default.

Binary file added notebooks/example_data/75_cmls.h5
Binary file not shown.
4 changes: 1 addition & 3 deletions pycomlink/__init__.py
@@ -11,6 +11,4 @@
import vis
import spatial
import validation



import util
6 changes: 6 additions & 0 deletions pycomlink/io/examples.py
@@ -13,3 +13,9 @@ def read_one_cml():
data_path = get_example_data_path()
fn = 'one_cml.h5'
return read_from_cmlh5(path.join(data_path, fn))[0]


def read_cml_list():
data_path = get_example_data_path()
fn = '75_cmls.h5'
return read_from_cmlh5(path.join(data_path, fn))
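
A minimal usage sketch for the new example-data reader (assuming `pycomlink`
is installed and the `io` subpackage exposes `examples`):

import pycomlink as pycml

# Returns a list of 75 Comlink objects read from the bundled 75_cmls.h5
cml_list = pycml.io.examples.read_cml_list()
print(len(cml_list))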
4 changes: 3 additions & 1 deletion pycomlink/spatial/__init__.py
@@ -9,4 +9,6 @@
# Licence: The MIT License
#----------------------------------------------------------------------------

import interpol
import idw
import interpolator
73 changes: 73 additions & 0 deletions pycomlink/spatial/coverage.py
@@ -0,0 +1,73 @@
import shapely as sh
import shapely.geometry
import shapely.ops
import geopandas


def calc_coverage_mask(cml_list, xgrid, ygrid, max_dist_from_cml):
""" Generate a coverage mask with a certain area around all CMLs
Parameters
----------
cml_list : list
List of Comlink objects
xgrid : array
2D matrix of x locations
ygrid : array
2D matrix of y locations
max_dist_from_cml : float
Maximum distance from a CML path that should be considered as
covered. The units must be the same as for the coordinates of the
CMLs. Hence, if lat-lon is used in decimal degrees, this unit has
also to be used here. Note that the different scaling of lat-lon
degrees for higher latitudes is not accounted for.
Returns
-------
grid_points_covered_by_cmls : array of bool
2D array with size of `xgrid` and `ygrid` with True values where
the grid point is within the area considered covered.
"""

    # TODO: Add option to do this for each time step, based on the
    #       available CMLs in self.df_cml_R, i.e. excluding those
    #       with NaN.

# Build a polygon for the area "covered" by the CMLs
# given a maximum distance from their individual paths
cml_lines = []
for cml in cml_list:
cml_lines.append(
sh.geometry.LineString([
[cml.metadata['site_a_longitude'],
cml.metadata['site_a_latitude']],
[cml.metadata['site_b_longitude'],
cml.metadata['site_b_latitude']]])
.buffer(max_dist_from_cml, cap_style=1))

cml_dil_union = sh.ops.cascaded_union(cml_lines)
# Build a geopandas object for this polygon
gdf_cml_area = geopandas.GeoDataFrame(
geometry=geopandas.GeoSeries(cml_dil_union))

# Generate a geopandas object for all grid points
sh_grid_point_list = [sh.geometry.Point(xy) for xy
in zip(xgrid.flatten(),
ygrid.flatten())]
gdf_grid_points = geopandas.GeoDataFrame(
geometry=sh_grid_point_list)

# Find all grid points within the area covered by the CMLs
points_in_cml_area = geopandas.sjoin(gdf_grid_points,
gdf_cml_area,
how='left')

# Generate a Boolean grid with shape of xgrid (and ygrid)
# indicating which grid points are within the area covered by CMLs
grid_points_covered_by_cmls = (
(~points_in_cml_area.index_right.isnull())
.values.reshape(xgrid.shape))

return grid_points_covered_by_cmls
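
A short usage sketch for `calc_coverage_mask` (the grid extent and the
`max_dist_from_cml` value below are illustrative, chosen for lat-lon
coordinates in decimal degrees):

import numpy as np
from pycomlink.io.examples import read_cml_list
from pycomlink.spatial.coverage import calc_coverage_mask

cml_list = read_cml_list()

# Build a 2D lon-lat grid; the extent here is made up for illustration
xgrid, ygrid = np.meshgrid(np.linspace(11.0, 12.0, 100),
                           np.linspace(47.0, 48.0, 100))

# Grid points within 0.05 degrees of any CML path count as covered
mask = calc_coverage_mask(cml_list, xgrid, ygrid, max_dist_from_cml=0.05)
print(mask.shape, mask.sum())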
139 changes: 139 additions & 0 deletions pycomlink/spatial/idw.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import numpy as np
from scipy.spatial import cKDTree as KDTree
from numba import jit


class Invdisttree(object):
""" inverse-distance-weighted interpolation using KDTree:
Copied from http://stackoverflow.com/questions/3104781/
inverse-distance-weighted-idw-interpolation-with-python
Usage granted by original author here:
https://github.com/scipy/scipy/issues/2022#issuecomment-296373506
    invdisttree = Invdisttree( X )  -- data point coordinates
    interpol = invdisttree( q, z, nnear=3, eps=0, p=1, weights=None )
interpolates z from the 3 points nearest each query point q;
For example, interpol[ a query point q ]
finds the 3 data points nearest q, at distances d1 d2 d3
and returns the IDW average of the values z1 z2 z3
(z1/d1 + z2/d2 + z3/d3)
/ (1/d1 + 1/d2 + 1/d3)
= .55 z1 + .27 z2 + .18 z3 for distances 1 2 3
q may be one point, or a batch of points.
eps: approximate nearest, dist <= (1 + eps) * true nearest
p: use 1 / distance**p
    weights: optional multipliers for 1 / distance**p, one per data point
stat: accumulate wsum, wn for average weights
How many nearest neighbors should one take ?
a) start with 8 11 14 .. 28 in 2d 3d 4d .. 10d; see Wendel's formula
b) make 3 runs with nnear= e.g. 6 8 10, and look at the results --
|interpol 6 - interpol 8| etc., or |f - interpol*| if you have f(q).
I find that runtimes don't increase much at all with nnear -- ymmv.
p=1, p=2 ?
p=2 weights nearer points more, farther points less.
In 2d, the circles around query points have areas ~ distance**2,
so p=2 is inverse-area weighting. For example,
(z1/area1 + z2/area2 + z3/area3)
/ (1/area1 + 1/area2 + 1/area3)
= .74 z1 + .18 z2 + .08 z3 for distances 1 2 3
Similarly, in 3d, p=3 is inverse-volume weighting.
Scaling:
if different X coordinates measure different things, Euclidean distance
can be way off. For example, if X0 is in the range 0 to 1
but X1 0 to 1000, the X1 distances will swamp X0;
rescale the data, i.e. make X0.std() ~= X1.std() .
A nice property of IDW is that it's scale-free around query points:
if I have values z1 z2 z3 from 3 points at distances d1 d2 d3,
the IDW average
(z1/d1 + z2/d2 + z3/d3)
/ (1/d1 + 1/d2 + 1/d3)
is the same for distances 1 2 3, or 10 20 30 -- only the ratios matter.
In contrast, the commonly-used Gaussian kernel exp( - (distance/h)**2 )
is exceedingly sensitive to distance and to h.
"""
# anykernel( dj / av dj ) is also scale-free
# error analysis, |f(x) - idw(x)| ? todo: regular grid, nnear ndim+1, 2*ndim

def __init__(self, X, leafsize=10, stat=0):
self.X = X
self.tree = KDTree(X, leafsize=leafsize) # build the tree
self.stat = stat
self.wn = 0
self.wsum = None
self.q = None

def __call__(self, q, z, nnear=6, eps=0, p=1, weights=None):
# nnear nearest neighbours of each query point --
assert len(self.X) == len(z), "len(X) %d != len(z) %d" % (len(self.X), len(z))

if nnear <= 1:
raise ValueError('`nnear` must be greater than 1')

q = np.asarray(q)
z = np.asarray(z)
qdim = q.ndim
if qdim == 1:
q = np.array([q])
if self.wsum is None:
self.wsum = np.zeros(nnear)

# Do not recalculate the distance matrix if it has already
# been calculated for the same parameters
if (hasattr(self, 'q') and np.array_equal(q, self.q) and
hasattr(self, 'nnear') and np.array_equal(nnear, self.nnear) and
hasattr(self, 'eps') and np.array_equal(eps, self.eps)
):
# Do nothing here
# print 'reuse `distances`'
pass
else:
self.distances, self.ix = self.tree.query(q, k=nnear, eps=eps)
self.q = q
self.nnear = nnear
self.eps = eps

self.z = z

        if weights is None:
            # one multiplier per data point (indexed via `ix` below)
            weights = np.ones(len(self.X))

interpol, self.wn, self.wsum = _numba_idw_loop(
distances=self.distances,
ixs=self.ix,
z=self.z,
z_shape=z[0].shape,
p=p,
weights=weights,
wn=self.wn,
wsum=self.wsum)

return interpol if qdim > 1 else interpol[0]


@jit(nopython=True)
def _numba_idw_loop(distances, ixs, z, z_shape, p, weights, wn, wsum):
interpol = np.zeros((len(distances),) + z_shape)
jinterpol = 0
for i in range(len(distances)):
dist = distances[i]
ix = ixs[i]
if dist[0] < 1e-10:
wz = z[ix[0]]
        else:  # weight z values by 1/dist --
w = 1 / dist**p
w *= weights[ix] # >= 0
w /= np.sum(w)
wz = np.dot(w, z[ix])
wn += 1
wsum += w
interpol[jinterpol] = wz
jinterpol += 1
return interpol, wn, wsum
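
A self-contained usage sketch for `Invdisttree` with synthetic data (the
sample values and grid below are made up for illustration):

import numpy as np
from pycomlink.spatial.idw import Invdisttree

# Scattered data points and values
X = np.random.rand(50, 2) * 10           # data point coordinates
z = np.sin(X[:, 0]) + np.cos(X[:, 1])    # values at the data points

# Query points on a regular grid
xi, yi = np.meshgrid(np.linspace(0, 10, 20), np.linspace(0, 10, 20))
q = np.column_stack([xi.ravel(), yi.ravel()])

idw = Invdisttree(X)
zi = idw(q, z, nnear=6, p=2)             # inverse-square distance weighting
zi = zi.reshape(xi.shape)                # back to the 2D grid shape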
