# Check the quality of the input data

This notebook can be used to check the quality of all the input data used in the SFINCS model. The validity of the model results can be derived from these insights.

#### Import packages

In [10]:
# package import
import os
from pathlib import Path
import sys
import pandas as pd
import rasterio

import cartopy.crs as ccrs
import cartopy.io.img_tiles as cimgt
import geopandas as gpd
import hydromt
import numpy as np
import matplotlib.pyplot as plt
from hydromt.log import setuplog
from hydromt_sfincs import SfincsModel
from rasterio.plot import show

# local script imports
from sfincs_utils import run_sfincs, create_sfincs_model_archive

#### Read all input data

In [14]:
gauging_locations = gpd.read_file("../data/gtsm_codec_reanalysis_hourly_v1/gauging_location.geojson").to_crs(epsg=4326)
print("Gauging locations read")
gauging_timeseries = pd.read_csv("../data/gtsm_codec_reanalysis_hourly_v1/validationtimeseries.csv")
print("Gauging timeseries read")
with rasterio.open('../data/gebco.tif') as src:
    gebco_data = src.read(1)
    gebco_nodata = src.nodata
    gebco_clean_data = gebco_data[gebco_data != gebco_nodata]
    gebco_mean, gebco_std = np.mean(gebco_clean_data), np.std(gebco_clean_data)
print("Gebco data read")
with rasterio.open('../data/MRLC_landcover.tiff') as src:
    landcover_data = src.read(1)
    landcover_nodata = src.nodata
    landcover_clean_data = landcover_data[landcover_data != landcover_nodata]
    landcover_mean, landcover_std = np.mean(landcover_clean_data), np.std(landcover_clean_data)
print("Landcover data read")
landcover_mapping = pd.read_csv("../data/MRLC_landcover_mapping.csv")
print("Landcover mapping read")
osm_land_areas = gpd.read_file("../data/osm_landareas.gpkg").to_crs(epsg=4326)
print("OSM land areas read")
with rasterio.open('../data/topography_Savannah_10m.tif') as src:
    DEM_3D_10m_data = src.read(1)
    DEM_3D_10m_nodata = src.nodata
    DEM_3D_10m_clean_data = DEM_3D_10m_data[DEM_3D_10m_data != DEM_3D_10m_nodata]
    DEM_3D_10m_mean, DEM_3D_10m_std = np.mean(DEM_3D_10m_clean_data), np.std(DEM_3D_10m_clean_data)
print("DEM data read")
# with rasterio.open('../data/topography_Savannah_CoNED_G.tiff') as src:
#     CoNED_G_data = src.read(1)
#     CoNED_G_nodata = src.nodata
#     CoNED_G_clean_data = CoNED_G_data[CoNED_G_data != CoNED_G_nodata]
#     CoNED_G_mean, CoNED_G_std = np.mean(CoNED_G_clean_data), np.std(CoNED_G_clean_data)
# print("Georga DEM data read")
# with rasterio.open('../data/topography_Savannah_CoNED_SC.tiff') as src:
#     CoNED_SC_data = src.read(1)
#     CoNED_SC_nodata = src.nodata
#     CoNED_SC_clean_data = CoNED_SC_data[CoNED_SC_data != CoNED_SC_nodata]
#     CoNED_SC_mean, CoNED_SC_std = np.mean(CoNED_SC_clean_data), np.std(CoNED_SC_clean_data)
# print("South Carolina DEM data read")
model_domain = gpd.read_file("../model_domain/model_domain_savannah.geojson").to_crs(epsg=4326)
print("Model domain read")
obs_points = gpd.read_file("../model_domain/obs_points_savannah.geojson").to_crs(epsg=4326)
print("Observation points read")
wl_mask = gpd.read_file("../model_domain/waterlevel_mask_savannah.geojson").to_crs(epsg=4326)
print("Waterlevel mask read")

Gauging locations read
Gauging timeseries read
Gebco data read
Landcover data read
Landcover mapping read
OSM land areas read
DEM data read
Model domain read
Observation points read
Waterlevel mask read


#### Visualise inputs

In [12]:
gebco_missing_mask = (gebco_data == gebco_nodata)
gebco_outlier_mask = (np.abs(gebco_data - gebco_mean) > 3 * gebco_std) & (gebco_data != gebco_nodata)

landcover_missing_mask = (landcover_data == landcover_nodata)
landcover_outlier_mask = (np.abs(landcover_data - landcover_mean) > 3 * landcover_std) & (landcover_data != landcover_nodata)

DEM_3D_10m_missing_mask = (DEM_3D_10m_data == DEM_3D_10m_nodata)
DEM_3D_10m_outlier_mask = (np.abs(DEM_3D_10m_data - DEM_3D_10m_mean) > 3 * DEM_3D_10m_std) & (DEM_3D_10m_data != DEM_3D_10m_nodata)

# CoNED_G_missing_mask = (CoNED_G_data == CoNED_G_nodata)
# CoNED_G_outlier_mask = (np.abs(CoNED_G_data - CoNED_G_mean) > 3 * CoNED_G_std) & (CoNED_G_data != CoNED_G_nodata)

# CoNED_SC_missing_mask = (CoNED_SC_data == CoNED_SC_nodata)
# CoNED_SC_outlier_mask = (np.abs(CoNED_SC_data - CoNED_SC_mean) > 3 * CoNED_SC_std) & (CoNED_SC_data != CoNED_SC_nodata)

def find_nans_and_outliers(df_in, date_col, val_col):
    df = df_in.copy()
    df[date_col] = pd.to_datetime(df[date_col])
    df = df.set_index(date_col)
    freq = pd.infer_freq(df[date_col])
    full_date_range = pd.date_range(start=df[date_col].min(), 
                                    end=df[date_col].max(), 
                                    freq=freq)
    df_reindexed = df.reindex(full_date_range)

    nans = df_reindexed[df_reindexed[val_col].isna()]
    missing_dates = full_date_range.difference(df[date_col])

    Q1 = df[val_col].quantile(0.25)
    Q3 = df[val_col].quantile(0.75)
    IQR = Q3 - Q1
    bottom_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR

    outliers = df[(df[' (m)'] < bottom_limit) | (df[' (m)'] > upper_limit)]

    return df_reindexed, nans, bottom_limit, upper_limit

gauging_reindexed, gauging_nans, gauging_bot, gauging_up = find_nans_and_outliers(gauging_timeseries, "date and time ", " (m)")

aantal_rijen = 12
fig_height = aantal_rijen * 4

fig, ax = plt.subplots(aantal_rijen, 3, figsize=(20, fig_height), constrained_layout=True)

ax[0,0].set_title("Gebco")
ax[0,0].imshow(gebco_data, cmap='viridis')
ax[0,1].imshow(gebco_missing_mask)
ax[0,2].imshow(gebco_outlier_mask)

ax[1,0].imshow(landcover_data, cmap='viridis')
ax[1,1].imshow(landcover_missing_mask)
ax[1,2].imshow(landcover_outlier_mask)

ax[2,0].imshow(DEM_3D_10m_data, cmap='viridis')
ax[2,1].imshow(DEM_3D_10m_missing_mask)
ax[2,2].imshow(DEM_3D_10m_outlier_mask)

# ax[3,0].imshow(CoNED_G_data, cmap='viridis')
# ax[3,1].imshow(CoNED_G_missing_mask)
# ax[3,2].imshow(CoNED_G_outlier_mask)

# ax[4,0].imshow(CoNED_SC_data, cmap='viridis')
# ax[4,1].imshow(CoNED_SC_missing_mask)
# ax[4,2].imshow(CoNED_SC_outlier_mask)

ax[3,0].plot(gauging_reindexed["date and time "], gauging_reindexed[" (m)"], label="Waterlevel")
ax[3,0].scatter(gauging_nans.index, (gauging_reindexed[" (m)"].min() * len(gauging_nans)), color="red", label="Holes in dataset, s=20, zorder=5")
ax[3,0].axhline(gauging_bot, color="gray")
ax[3,0].axhline(gauging_up, color="gray")
ax[3,0].legend()

plt.show()

KeyError: 'date and time '

In [None]:
gauging_timeseries.columns

Index(['date and time ', ' (m)'], dtype='object')