In [None]:
%reload_ext autoreload
%autoreload 2

import os
import sys
from pathlib import Path
import warnings
# warnings.filterwarnings('ignore')  # ignore warnings to avoid flooding the gridsearch output with repetitive messages (works for single cpu)
warnings.filterwarnings("ignore", category=UserWarning)  # could also filter with message='...'
os.environ["PYTHONWARNINGS"] = "ignore"  # ignore warnings to avoid flooding the gridsearch output with repetitive messages (works for parallel)
# If running on the phy-server, the local project modules are only found when their directory
# is on sys.path; the notebook is also moved into that directory (the data paths used in this
# notebook are written relative to it).
_server_analysis_dir = "/silod7/lenz/MPSchleiSediments/analysis/"
if os.path.isdir(_server_analysis_dir):  # only act when the server directory really exists
    if _server_analysis_dir not in sys.path:  # avoid piling up duplicates when the cell is re-run
        sys.path.append(_server_analysis_dir)
    try:
        os.chdir(_server_analysis_dir)
    except OSError as err:  # best effort: stay in the current directory, but say so
        warnings.warn(f"Could not change into {_server_analysis_dir}: {err}", RuntimeWarning)

from joblib import Parallel, delayed
from tqdm import tqdm
from ipywidgets import interact, fixed, widgets
import numpy as np
import pandas as pd
import xarray as xr
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
import libpysal as lps
import esda
import rasterio as rio
from rasterio.plot import show
from rasterio.mask import mask
import hvplot.pandas  # noqa
import hvplot.xarray
import holoviews as hv
import seaborn as sns
hv.extension('bokeh')
import altair as alt
alt.data_transformers.disable_max_rows()
from matplotlib import pyplot as plt

from settings import Config
import geo, geo_io
from helpers import tqdm_joblib
from cv import get_performance, performance, loocv_interp
from plots import scatter_chart, model_pred_bars, gridplot

## Data import

In [None]:
# Create the geodataframe of the Schlei outline via the project helper
# (presumably read from a geojson file inside geo_io.get_schlei -- confirm there).
# `poly` is reused below for Voronoi clipping, grid construction and area normalisation.
poly = geo_io.get_schlei()
# poly.hvplot(crs=Config.baw_epsg, tiles=True)  # optional quick visual check of the outline

In [None]:
## Read predicted data from model run
savestamp = '20231004_153340' #500rep run:'20231005_014730'  #'20230926_021644'  # '20230823_111917'  # '20230403_233901'   '20230501_172522'

# Sort the matches so the chosen file is deterministic if several files share the savestamp,
# and fail with an explicit message instead of a bare IndexError if nothing matches.
prediction_files = sorted(Path('../data/exports/models/predictions').glob(f'{savestamp}*.csv'))
if not prediction_files:
    raise FileNotFoundError(f'No prediction file matching "{savestamp}*.csv" found in ../data/exports/models/predictions')
f = prediction_files[0]

# The target variable name is the second-to-last underscore-separated token of the file name.
target = f.name.split('_')[-2]

# Build point geometries from the LON / LAT columns (EPSG:4326) and reproject to the project CRS.
station_table = pd.read_csv(f)
station_data = gpd.GeoDataFrame(station_table, geometry=gpd.points_from_xy(station_table.LON, station_table.LAT), crs='EPSG:4326')
station_data = station_data.to_crs(Config.baw_epsg)

In [None]:
## Optional switch: uncomment to check what would be interpolated if only the originally measured data points were available (i.e. no modelled stations)
# station_data = station_data[~station_data.Concentration_observed.isna()].reset_index(drop=True)

## Optional switch: uncomment to interpolate with the single-model predictions instead of the ensemble predictions
## (written with a single .loc assignment; chained indexing like df[col][mask] = ... may write to a copy and trigger SettingWithCopyWarning)
# station_data.loc[station_data[f'{target}_observed'].isna(), target] = station_data.loc[station_data[f'{target}_observed'].isna(), f'{target}_predictedBySingleModel']

## Geospatial interpolation

In [None]:
## Simplest interpolation: each station's value is assigned to its whole Voronoi cell (tessellation clipped to the Schlei outline)
station_points = list(zip(station_data.geometry.x, station_data.geometry.y))
schlei_outline = poly.geometry[0].geoms[0]
voronoi_df, _ = lps.cg.voronoi_frames(station_points, clip=schlei_outline)
voronoi_df.crs = station_data.crs

# Per-station budget: cell area -> sediment mass in the considered layer -> MP number and MP mass.
# NOTE(review): the column assignments rely on voronoi_df and station_data sharing the same index order.
station_data['Voronoi_Area'] = voronoi_df.geometry.area
station_data['Voronoi_Sed_Mass'] = station_data['Voronoi_Area'] * Config.sediment_layer_depth * station_data['SedDryBulkDensity']
station_data['Voronoi_MP_Num'] = station_data['Voronoi_Sed_Mass'] * station_data['Concentration']
station_data['Voronoi_MP_Mass'] = station_data['Voronoi_Sed_Mass'] * station_data['MassConcentration']
print(f'Based on Voronoi-regions: total MP in upper {Config.sediment_layer_depth*100} cm of Schlei sediments: {np.round(station_data.Voronoi_MP_Num.sum() / 1e12, 1)} Trillion particles with a combined mass of {np.round(station_data.Voronoi_MP_Mass.sum() / 1e12, 1)} tons')

# Map: Voronoi cells coloured by the target variable, overlaid with the station points.
voronoi_df[target] = station_data[target]
voronoi_layer = voronoi_df.hvplot(c=target, cmap='viridis', cnorm='eq_hist', line_alpha=0, crs=Config.baw_epsg, frame_width=600)
hover_columns = [col for col in station_data.columns if 'WWTP' not in col]
station_layer = station_data.hvplot(color='white', fill_color='black', size=30, hover_cols=hover_columns)
station_map = voronoi_layer * station_layer
station_map.opts(frame_width=1000, frame_height=700, active_tools = ['wheel_zoom'])

# station_data = station_data.loc[
#       (station_data.Sample!='S08')
#     & (station_data.Sample!='S10')
    # & (station_data.Sample!='20170425_G20')
    # & (station_data.Sample!='S05')
    # & (station_data.Sample!='S32')
# ].reset_index(drop=True)

### Gridded interpolations

In [None]:
## Choose interpolation tool:

        ## Any of: ['numpy_simple_idw',
        ##          'numpy_rbf_idw',
        ##          'scipy_rbf_idw',
        ##          'scipy_griddata',
        ##          'pykrige_ordkrig',
        ##          'skgstat_ordkrig',
        ##          'pygmt_xyz2grd',
        ##          'load_external_interpol',
        ##         ]


tool = 'skgstat_ordkrig'  # must be in keys of Config.interpolation_methods
if tool not in Config.interpolation_methods:
    # Name the rejected value and the valid choices so the typo can be fixed without opening the settings file.
    raise KeyError(f'Chosen tool {tool!r} does not exist... Try again! Available tools: {list(Config.interpolation_methods)}')

In [None]:
## Build the interpolation grid over the Schlei outline
res = Config.interpolation_resolution  # grid resolution (pixel size in m)
grid_spec = geo.make_grid(poly, res, round_grid_coords=True)
xgrid, ygrid, xmin, ymin, xmax, ymax = grid_spec

# Per-cell geometry used later to turn concentrations into absolute amounts
cell_area = res ** 2  # m², cell width * cell height
cell_sedVol = cell_area * Config.sediment_layer_depth  # m³ of sediment in the considered layer per cell
print(f'grid bounds: xmin={xmin}, ymin={ymin}, xmax={xmax}, ymax={ymax}, grid shape: width: {xgrid.shape[1]}, height: {xgrid.shape[0]}')

#### Run the interpolation

In [None]:
## Variables to interpolate:
vars = ['Concentration', 'MassConcentration', 'SedDryBulkDensity']  # NOTE(review): this name shadows the builtin `vars`

### Single CPU calculation
# grids = {var : geo.interclip(station_data, var, xgrid, ygrid, poly, tool) for var in tqdm(vars)}  # dict of grids of the interpolated variables

### Multi CPU calculation (may crash due to RAM overflow for high grid resolutions)
# One worker per variable; for some reason variogram plots don't show up when running parallelised.
gridding_jobs = (delayed(geo.interclip)(station_data, var, xgrid, ygrid, poly, tool) for var in vars)
with tqdm_joblib(tqdm(desc="Parallel gridding...", total=len(vars))) as progress_bar:
    gridded = Parallel(n_jobs=len(vars))(gridding_jobs)
grids = dict(zip(vars, gridded))  # variable name -> interpolated grid

### Display maps

In [None]:
# ## Plot grid in matplotlib
# plt.imshow(target_clipped, cmap='terrain', interpolation='nearest')
# plt.show()

## Plot grid as interactive hvplot incl. coastline boundary
# Only the particle-number concentration grid is shown here; the other grids are exported in the saving section.
hvgrid = gridplot(grids['Concentration'], xgrid, ygrid)
hvgrid

## Summarising interpolation results

In [None]:
# Sample the generated MP grids at the station locations and store them next to the other
# concentration columns (fixed column positions 4 and 8, as before).
# DataFrame.insert raises a ValueError if the column already exists, so a column left over
# from a previous run of this cell is dropped first -- this keeps the cell re-runnable.
for interp_var, col_pos in (('Concentration', 4), ('MassConcentration', 8)):
    interp_col = f'{interp_var}_interpolated'
    if interp_col in station_data.columns:
        station_data = station_data.drop(columns=interp_col)
    station_data.insert(col_pos, interp_col, geo.sample_array_raster(grids[interp_var], xmin, ymax, res, station_data))

# Combine sediment and MP concentration grids into abundance grids.
# NaN cells are turned into 0 by nan_to_num so that they do not contribute to the totals.
sedMass_grid = np.nan_to_num(grids['SedDryBulkDensity']) * cell_sedVol  # mass of sediment in each cell in kg, calculated from (interpolated) sediment dry bulk density (kg m⁻³) * volume of sediment per cell (m³)
MPnum_grid = np.nan_to_num(grids['Concentration']) * sedMass_grid  # number of MP particles per cell
num_total = MPnum_grid.sum()
MPmass_grid = np.nan_to_num(grids['MassConcentration']) * sedMass_grid  # MP mass per cell
mass_total = MPmass_grid.sum()

In [None]:
## Performance of the predictions / interpolations against the training data the model has already seen.
## OBS: this is an overly optimistic score and says nothing about how well the model predicts new data points. Use the scores from the NCV for that.
## The gap between this score and the NCV score shows that the typical MP study scenario (low sample number) makes a cross-validated score a necessity.

performance_controls = dict(
    df=fixed(station_data),
    target=['Concentration', 'MassConcentration'],
    kind=['predicted', 'interpolated'],
    with_outliers=False,
)
interact(get_performance, **performance_controls)

In [None]:
# Text summary of the gridded MP budget.
# The particle-number section always reads the 'Concentration' column: using `target` here would
# report mass concentrations under a "particles / kg" label whenever the loaded prediction file
# belongs to the MassConcentration target.
print('\n', '='*100, '\n')

print('Summary of MP particles numbers:')

print(f'Total MP particle numbers in upper {Config.sediment_layer_depth*100} cm of Schlei sediments: {np.round(num_total / 1e12, 1)} Trillion')
print(f'MP particle number per m² and cm sediment depth: {round(num_total / poly.area[0] / (Config.sediment_layer_depth * 100))}')
print(f'Schlei-wide MP number concentration as ratio of total MP / total sed mass: {round(num_total / sedMass_grid.sum())} particles / kg')
print(f'Schlei-wide MP number concentration as mean of grid data (i.e. assuming equal sediment mass in all cells): {round(np.nanmean(grids["Concentration"]))} particles / kg')
print(f'Schlei-wide MP number concentration as mean of ALL stations (observed + predicted): {round(station_data["Concentration"].mean())} particles / kg')
print(f'Schlei-wide MP number concentration as mean of OBSERVED stations only: {round(station_data.loc[station_data.Type=="observed", "Concentration"].mean())} particles / kg')

print('\n', '='*100, '\n')

# Same set of figures for the MP mass
print('Summary of MP masses:')
print(f'Total MP mass in upper {Config.sediment_layer_depth*100} cm of Schlei sediments: {np.round(mass_total / 1e12, 1)} tons')
print(f'MP mass per m² and cm sediment depth: {round(mass_total / poly.area[0] / (Config.sediment_layer_depth * 100))} µg')
print(f'Schlei-wide MP mass concentation as ratio of total MP mass / total sed mass: {round(mass_total / sedMass_grid.sum())} µg / kg')
print(f'Schlei-wide MP mass concentation as mean of grid data (i.e. assuming equal sediment mass in all cells): {round(np.nanmean(grids["MassConcentration"]))} µg / kg')
print(f'Schlei-wide MP mass concentation as mean of ALL stations (observed + predicted): {round(station_data["MassConcentration"].mean())} µg / kg')
print(f'Schlei-wide MP mass concentation as mean of OBSERVED stations only: {round(station_data.loc[station_data.Type=="observed", "MassConcentration"].mean())} µg / kg')

print('\n', '='*100, '\n')

In [None]:
# Interactive bar chart of the model predictions per station; only the target variable is selectable,
# the data frame and the y-domain are passed through unchanged.
pred_bar_controls = dict(
    df=fixed(station_data),
    target=['Concentration', 'MassConcentration'],
    domain=fixed(None),
)
interact(model_pred_bars, **pred_bar_controls)
          
# interact(
#     model_pred_bars,
#         df = fixed(station_data),
#         target = ['Concentration', 'MassConcentration'],
#         domain = widgets.FloatLogSlider(
#             value=1,
#             base=10,
#             min=-2, # max exponent of base
#             max=2, # min exponent of base
#             step=0.1, # exponent step
#             description='Y-axis scale',
#             disabled=False,
#             continuous_update=True,
#             orientation='horizontal',
#             )
#         )


In [None]:
# Interactive scatter plot comparing observed / predicted / interpolated values at the observed stations.
observed_stations = station_data.loc[(station_data.Type=='observed')]
axis_choices = [
    'Concentration_observed', 'Concentration_predicted', 'Concentration_interpolated',
    'MassConcentration_observed', 'MassConcentration_predicted', 'MassConcentration_interpolated',
]
interact(
    scatter_chart,
    df=fixed(observed_stations),
    x=list(axis_choices),  # same options for both axes, each dropdown gets its own list
    y=list(axis_choices),
    color='outlier_excl',
    labels=[None, 'Sample'],
    reg=['linear', 'pow', 'exp'],
    reg_groups=fixed(False),
    equal_axes=fixed(False),
    identity=fixed(True),
    linref=False,
    linref_slope=1.0,
    linref_intercept=fixed(0),
    mix_lines=fixed(False),
    xtransform=fixed(False),
    ytransform=fixed(False),
    xscale=['log', 'linear'],
    yscale=['log', 'linear'],
    title=fixed(''),
    width=fixed(400),
    height=fixed(400),
    incl_params=fixed(False),
)

## Saving interpolated grid

In [None]:
## Saving raster to tiff file and reading it back in as rasterio dataset
# Make sure the export folder exists before anything is written into it.
Path('../data/exports/models/predictions/interpolated').mkdir(parents=True, exist_ok=True)
for var_name, grid in grids.items():
    fp = f'../data/exports/models/predictions/interpolated/{savestamp}_{var_name}_{tool}_{int(res)}x{int(res)}'
    # Build the tags on a copy: writing 'var_name' into Config.interpolation_methods[tool] itself would
    # leave the shared settings permanently altered for every later use of the same tool.
    raster_tags = {**Config.interpolation_methods[tool], 'var_name': var_name}
    geo_io.grid_save(grid, fp+'.tif', (xmin, ymax), tags=raster_tags)
    hvplot.save(gridplot(grid, xgrid, ygrid), fp+'.html', resources='INLINE')  # Saving hvplot of grid to interactive html
    # Read the file back in as a sanity check: print the stored tags and show the raster.
    with rio.open(fp+'.tif') as rasta:
        # out_img, out_transform = mask(rasta, poly.geometry[0].geoms, nodata=np.nan)
        print(rasta.tags())
        show(rasta, cmap='terrain', norm='log')

#### LOOCV of interpolation

In [None]:
# Config.interpolation_methods[tool]['plot'] = False  # turn off plotting in case the interpolation method wants to plot...
# Leave-one-out cross-validation of the interpolation for the target variable.
# 72 was the hard-coded worker count; cap it at the CPUs of the current machine so that
# smaller machines are not oversubscribed with worker processes.
loocv_jobs = min(72, os.cpu_count() or 1)
station_data[f'{target}_LOOCV'] = loocv_interp(station_data, target, xgrid, ygrid, res, poly, tool, n_jobs=loocv_jobs, verbose=0)

print('\n', f'Performance calculation is based on {station_data.shape[0]} ŷhat-vs-y pairs.')
performance(station_data.loc[:, target], station_data.loc[:, f'{target}_LOOCV'])

In [None]:
# Scatter plot of the LOOCV estimates against the target values.
# Rows without a LOOCV value are excluded; the column is addressed by name instead of
# "last column", which would silently pick a different column as soon as any other column is appended.
loocv_col = f'{target}_LOOCV'
scatter_chart(
    df = station_data.loc[station_data[loocv_col].notna()],
    x = target,
    y = loocv_col,
    color = 'outlier_excl',
    equal_axes=True, identity = True,
    incl_params=False, width=400, height=400
    )

## Gridding other variables

In [None]:
# gridf_orig = gpd.GeoDataFrame({target: grids['Concentration'].ravel()}, geometry=gpd.points_from_xy(xgrid.ravel(), ygrid.ravel()), crs=Config.baw_epsg)
# gridf = gridf_orig.loc[~gridf_orig[target].isna()].copy()

In [None]:
# gridf = geo.get_depth(gridf, label='Depth')
# gridf.loc[gridf.Depth < 0, 'Depth'] = 0  # removing artefacts from sampling from one grid to the other

In [None]:
# gridf['Dist_Land'] = geo.get_distance_to_shore(gridf.to_crs('EPSG:4326').geometry.x, gridf.to_crs('EPSG:4326').geometry.y)

In [None]:
# gridf.plot(
#     marker='o', markersize=0.1,
#     column="Depth",
#     cmap='terrain_r',
#     legend=True,
#     # scheme="quantiles",
#     figsize=(15, 10),
#     missing_kwds={
#         "color": "lightgrey",
#         # "edgecolor": "red",
#         # "hatch": "///"
#         "label": "Missing values",
#     },
# )

In [None]:
# gridf_orig['Depth'] = gridf.Depth  # values will only be inserted where indices match
# gridf_orig['Dist_Land'] = gridf.Dist_Land  # values will only be inserted where indices match
# Depth_grid = gridf_orig.Depth.values.reshape(xgrid.shape)  # turn geodataframe back into grid array
# Dist_Land_grid = gridf_orig.Dist_Land.values.reshape(xgrid.shape)  # turn geodataframe back into grid array

In [None]:
## Save to file...
# geo_io.grid_save(Depth_grid,f'../data/Depth_epsg{Config.baw_epsg}_{res}x{res}.tif', (xmin, ymax))
# geo_io.grid_save(Dist_Land_grid,f'../data/Dist_Land_epsg{Config.baw_epsg}_{res}x{res}.tif', (xmin, ymax))
# gridf.to_csv(f'../data/grid_{res}m_ForPredictions.csv')

## ESDA

In [None]:
# w =  lps.weights.Voronoi.from_dataframe(station_data)  # generating spatial weights object
# w.transform = 'r'  # making weights row-standardised

In [None]:
# ## Check if a property is spatially auto-correlated
# property = target  # 'Depth', 'Dist_Land', 'Concentration', etc...

# np.random.seed(12345)
# mi = esda.moran.Moran(station_data[property], w)
# print(f"Moran's I: {mi.I}")
# sns.kdeplot(mi.sim, fill=True)
# plt.vlines(mi.I, 0, 1, color='r')
# plt.vlines(mi.EI, 0,1)
# plt.xlabel(f"Moran's I (p = {mi.p_sim})")

In [None]:
# interact(
#     scatter_chart,
#         df = fixed(station_data),
#         x = station_data.select_dtypes([np.number]).columns,
#         y = station_data.select_dtypes([np.number]).columns,
#         color = 'outlier_excl', labels=[None, 'Sample'],
#         reg=['linear', 'pow', 'exp'], reg_groups=fixed(False),
#         equal_axes=fixed(False), identity = fixed(False),
#         linref=fixed(False), linref_slope=fixed(1.0), linref_intercept=fixed(0), mix_lines=fixed(False),
#         xtransform=fixed(False), ytransform=fixed(False), xscale = ['log', 'linear'], yscale = ['log', 'linear'],
#         title=fixed(''), width = fixed(400), height = fixed(400),
#         incl_params=fixed(False)
#         )