# NRW Groundwater Data - OpenHygrisC Data Engineering

OpenHygrisC Data: https://www.opengeodata.nrw.de/produkte/umwelt_klima/wasser/grundwasser/hygrisc/

Download gw station data: https://www.opengeodata.nrw.de/produkte/umwelt_klima/wasser/grundwasser/hygrisc/OpenHygrisC_gw-messstellen-messwerte_EPSG25832_CSV.zip


## My local problem with the env var PROJ_LIB

In [337]:
# Work around an installation problem that occurs when OSGeo4W is installed:
# PROJ_LIB is redirected to the PROJ data directory of the active conda environment.
import os

# Use the canonical upper-case names. Environment variable names are only
# case-insensitive on Windows; elsewhere os.environ['proj_lib'] raises KeyError.
proj_lib = os.environ.get("PROJ_LIB")
print(proj_lib)
conda_prefix = os.environ.get("CONDA_PREFIX")
if conda_prefix:
    conda_proj_dir = os.path.join(conda_prefix, "Library", "share", "proj")
    # Only redirect PROJ_LIB when this (Windows-style) conda layout really exists,
    # otherwise the variable would point to a non-existent directory.
    if os.path.isdir(conda_proj_dir):
        os.environ["PROJ_LIB"] = conda_proj_dir
proj_lib = os.environ.get("PROJ_LIB")
print(proj_lib)

C:\Users\rb\Anaconda3\envs\geo\Library\share\proj
C:\Users\rb\Anaconda3\envs\geo\Library\share\proj


## Imports

In [19]:
import pandas as pd
import geopandas as gpd

## Data

In [20]:
data_in_dir = r"../data/OpenGeodata.NRW/OpenHygrisC/OpenHygrisC_gw-messstellen-messwerte_EPSG25832_CSV/"

In [77]:
gw_station_fname = r"opendata.gw_messstelle.csv"
gw_quality_fname = r"opendata.gw_chemischer_messwert.csv"

In [341]:
# Full paths (directory + file name) of the station and the quality CSV
gw_station_pfname = f"{data_in_dir}{gw_station_fname}"
gw_quality_pfname = f"{data_in_dir}{gw_quality_fname}"
print(f"Stationsdaten:  {gw_station_pfname:s}")
print(f"Qualitätsdaten: {gw_quality_pfname:s}")

Stationsdaten:  ../data/OpenGeodata.NRW/OpenHygrisC/OpenHygrisC_gw-messstellen-messwerte_EPSG25832_CSV/opendata.gw_messstelle.csv
Qualitätsdaten: ../data/OpenGeodata.NRW/OpenHygrisC/OpenHygrisC_gw-messstellen-messwerte_EPSG25832_CSV/opendata.gw_chemischer_messwert.csv


## GW Station Data


In [342]:
# Load the station master data, indexed by sl_nr.
# e32/n32 are pinned to text: the precision classification further down uses the
# .str accessor on them, which would fail if pandas inferred a numeric dtype.
df = pd.read_csv(gw_station_pfname, sep=";", index_col=["sl_nr"], dtype={"e32": str, "n32": str})

In [348]:
df.head(10)

Unnamed: 0_level_0,messstelle_id,name,e32,n32,gw_stockwerk,grundstueck,gemeinde_id,gwhorizont_id,gwhorizont,gwleiter_id,...,beobachtung_wasserstand,eigentuemer,betreiber,filterlaenge_cm,sumpfrohrlaenge_cm,ausbaudurchmesser_mm,historischer_ruhe_wsp,einbaulaenge_cm,oberkante_filter_cm,unterkante_filter_cm
sl_nr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
67530,32505929,UWB-Ddorf 01285,343064,5678019,1.0,,05111000,,,,...,-,Stadt Düsseldorf ...,Stadt Düsseldorf ...,,,,,,,
51044,10446746,60GP012303,292077,5645349,,privat,NL000882,5,Zwischenmittel,,...,-,Prov. Limburg (NL) ...,Prov. Limburg (NL) ...,200.0,,,,16893.0,-3333.0,-3533.0
51070,87005323,58BP024606,287141,5684635,,privat,NL001640,6D,Neurather Sand,,...,-,Prov. Limburg (NL) ...,Prov. Limburg (NL) ...,500.0,300.0,,,32667.0,-29083.0,-29583.0
68442,80000290,Krinsend 0079 neu,309490,5678060,1.0,,05166024,,,,...,-,Land NRW ...,,,,,,1517.0,4832.0,4832.0
68518,86583852,WG102GM93-3,316741,5680237,,privat,05166032,,,,...,-,NiederrheinWasser GmbH (ehem. Niederheinwerke)...,NiederrheinWasser GmbH (ehem. Niederheinwerke)...,200.0,,80.0,,5400.0,1413.0,1213.0
61945,87005694,Weeze alt,305351,5722580,1.0,oeffentlich,05154064,,,,...,-,Land NRW ...,,600.0,0.0,125.0,,796.0,1721.0,1121.0
67603,96005816,SPE 10,456549,5716532,,privat,05974016,,,,...,-,Fa. Spenner Zement ...,Fa. Spenner Zement ...,,,,,6000.0,5191.0,5191.0
69654,80865604,Cargill M1,337378,5690675,1.0,,05114000,,,,...,-,"Cargill Deutschland GmbH, Krefeld ...","Cargill Deutschland GmbH, Krefeld ...",,,,,,,
68338,76398006,Lanzerath_BN_766_EB,367793,5622538,1.0,,05314000,,,,...,-,,,,,,,1610.0,3790.0,3790.0
18990,80201581,SCHMALENEND 201,309762,5682944,1.0,oeffentlich,05166032,,,,...,-,Land NRW ...,Land NRW ...,200.0,100.0,125.0,,1065.0,3927.0,3727.0


In [347]:
num_total = df.shape[0]
df.shape

(71120, 38)

## Challenge: Coordinate obfuscation

In [109]:
# Classify the coordinate precision of every station in the new column "genau":
#      1 -> both coordinates fully numeric (1 m precision)
#    100 -> coordinates available but not numeric, i.e. obscured with "XX" in the
#           two least significant digits (100 m precision)
#   -999 -> no usable coordinates
df["genau"] = -999

# The .str methods return NaN for missing values; "== True" turns those into False.
e32_is_numeric = df["e32"].str.isnumeric() == True
n32_is_numeric = df["n32"].str.isnumeric() == True

# Some stations don't have coordinates: the strings are either NaN (null) or a
# short placeholder such as "xx". Both axes are checked so that a record with
# only one usable coordinate is never treated as located.
idx_coords_missing = (
    df["e32"].isnull() | df["n32"].isnull()
    | (df["e32"].str.len() < 6) | (df["n32"].str.len() < 6)
)
df.loc[idx_coords_missing, "genau"] = -999

# 1 m precision requires BOTH coordinates to be numeric. Checking only e32 would
# let a record with an obscured n32 end up in this mask and in the 100 m mask.
idx_coords_1m_prec = ~idx_coords_missing & e32_is_numeric & n32_is_numeric
df.loc[idx_coords_1m_prec, "genau"] = 1

# If coordinate data is available but not numeric, the numbers have been obscured
# with "XX" for the two least significant decimals.
idx_coords_100m_prec = ~idx_coords_missing & ~(e32_is_numeric & n32_is_numeric)
df.loc[idx_coords_100m_prec, "genau"] = 100


In [110]:
# Check that every record has been assigned to exactly one precision class
genau_codes = df["genau"]
num_of_1m_prec = int((genau_codes == 1).sum())
num_of_100m_prec = int((genau_codes == 100).sum())
num_of_no_prec = int((genau_codes == -999).sum())

num_check = sum((num_of_1m_prec, num_of_100m_prec, num_of_no_prec))

print(f"total num of recs:                        {num_total:6d}")
print(f"number of recs with 1m coord precision:   {num_of_1m_prec:6d}")
print(f"number of recs with 100m coord precision: {num_of_100m_prec:6d}")
print(f"number of recs with no coords:            {num_of_no_prec:6d}")
print(f"check sum:                                {num_check:6d}")

# The three classes together must account for all stations
assert num_check == num_total, "ERROR. Mismatch in numbers of stations"


total num of recs:                         71120
number of recs with 1m coord precision:    59280
number of recs with 100m coord precision:  11810
number of recs with no coords:                30
check sum:                                 71120


In [111]:
# Records with 1 m precision: convert both coordinate strings to float
for coord in ("e32", "n32"):
    df.loc[idx_coords_1m_prec, coord + "num"] = df.loc[idx_coords_1m_prec, coord].astype(float)

In [112]:
# Records with 100 m precision: replace the last two characters by "00" and
# convert the result to float
for coord in ("e32", "n32"):
    obscured = df.loc[idx_coords_100m_prec, coord]
    df.loc[idx_coords_100m_prec, coord + "num"] = (obscured.str[:-2] + "00").astype(float)

In [113]:
# Records without coordinates get the sentinel value -999.9 on both axes
for coord_num in ("e32num", "n32num"):
    df.loc[idx_coords_missing, coord_num] = -999.9

In [114]:
# Write the original and the converted coordinates plus the precision code to check.csv
check_columns = ["e32", "e32num", "n32", "n32num", "genau"]
df[check_columns].to_csv("check.csv")

## Geopandas

In [115]:
import geopandas as gpd
from shapely.geometry import Point

In [116]:
# remove records without coords
# Remove records without coords (precision code -999).
# .copy() makes df2 an independent frame instead of a slice of df, so adding a
# geometry column afterwards cannot raise a SettingWithCopyWarning.
df2 = df[df["genau"] > 0].copy()

In [117]:
%%time
gdf = gpd.GeoDataFrame(df2, geometry=gpd.points_from_xy(df2.e32num, df2.n32num), crs="EPSG:25832")

Wall time: 4.85 s


In [118]:
%%time
# This takes 90 secs on my computer
# gdf.to_file("GW_Stations.gpkg", layer='GW Stations', driver="GPKG")

Wall time: 0 ns


## Inline SQL: `create schema gw`

In [131]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [132]:
%%sql
SELECT * FROM information_schema.schemata

 * postgresql://env_master:***@localhost/env_db
5 rows affected.


catalog_name,schema_name,schema_owner,default_character_set_catalog,default_character_set_schema,default_character_set_name,sql_path
env_db,information_schema,postgres,,,,
env_db,public,postgres,,,,
env_db,pg_catalog,postgres,,,,
env_db,pg_toast,postgres,,,,
env_db,gw,env_master,,,,


In [133]:
%%sql
CREATE SCHEMA IF NOT EXISTS gw AUTHORIZATION env_master

 * postgresql://env_master:***@localhost/env_db
Done.


[]

In [134]:
%%sql
SELECT * FROM information_schema.schemata

 * postgresql://env_master:***@localhost/env_db
5 rows affected.


catalog_name,schema_name,schema_owner,default_character_set_catalog,default_character_set_schema,default_character_set_name,sql_path
env_db,information_schema,postgres,,,,
env_db,public,postgres,,,,
env_db,pg_catalog,postgres,,,,
env_db,pg_toast,postgres,,,,
env_db,gw,env_master,,,,


## PostGIS: Upload GeoDataFrame with `gdf.to_postgis()`

Dependencies:
* psycopg2
* geoalchemy2

In [129]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import sqlalchemy

# Never store the database password in the notebook. It is taken from the
# environment variable ENV_DB_PASSWORD or, if that is not set, asked for interactively.
db_password = os.environ.get("ENV_DB_PASSWORD") or getpass("Password for env_master@localhost/env_db: ")
# quote_plus() keeps special characters in the password from breaking the URL.
engine = sqlalchemy.create_engine(f"postgresql://env_master:{quote_plus(db_password)}@localhost/env_db")

In [130]:
%%time
gdf.to_postgis(con=engine, name="gw_stations", schema="gw", index=True, chunksize=100, if_exists="replace")

Wall time: 7.5 s


## Quality Data

In [128]:
print(gw_quality_pfname)

../data/original/OpenHygrisC_gw-messstellen-messwerte_EPSG25832_CSV/opendata.gw_chemischer_messwert.csv


In [174]:
# Read only the first line of the quality CSV to get the column names.
# data_pfname is used by this and the following cells but is not defined anywhere
# in the visible notebook, so it is bound to the quality file path here.
data_pfname = gw_quality_pfname
# The with-block closes the file handle again (it used to stay open).
with open(data_pfname, "r", encoding="utf-8", newline="") as fh:
    s = fh.readline()
s = s.replace('"', '').strip()
# The first character is dropped before splitting - presumably a BOM or a
# leading delimiter; TODO confirm against the raw file.
header_de = s[1:].split(';')
header_de

['sl_nr',
 'messstelle_id',
 'pna_id',
 'datum_pn',
 'stoff_nr',
 'probengut',
 'messergebnis_c',
 'messergebnis_hinweis',
 'bestimmungsgrenze',
 'masseinheit',
 'trennverfahren',
 'verfahren',
 'vor_ort',
 'herkunft',
 'aktual_dat',
 'erstell_dat']

In [221]:
# Preview: load only the first five quality records; result and remark stay text
qual_text_columns = {"messergebnis_c": str, "messergebnis_hinweis": str}
df_qual_header = pd.read_csv(data_pfname, sep=";", nrows=5, dtype=qual_text_columns)

In [358]:
df_qual_header

Unnamed: 0,sl_nr,messstelle_id,pna_id,datum_pn,stoff_nr,probengut,messergebnis_c,messergebnis_hinweis,bestimmungsgrenze,masseinheit,trennverfahren,verfahren,vor_ort,herkunft,aktual_dat,erstell_dat
0,2903561,59620687,5/2005/4599,20051018,1164,Grundwasser,22.0,,,µg/l,Gesamtgehalt,DIN 38406-E22 MAERZ 1988,,HYGC_BR-AR,20051205,20051205
1,2903564,59620687,5/2005/4599,20051018,1061,Grundwasser,6.8,,,-,Gesamtgehalt,DIN 38404-C5 JANUAR 1984,ja,HYGC_BR-AR,20051205,20051205
2,2903565,59620687,5/2005/4599,20051018,1011,Grundwasser,12.8,,,°C,Gesamtgehalt,DIN 38404-C4 DEZEMBER 1976,ja,HYGC_BR-AR,20051205,20051205
3,2903584,59620389,5/2005/5002,20051114,1011,Grundwasser,12.3,,,°C,Gesamtgehalt,DIN 38404-C4 DEZEMBER 1976,ja,HYGC_BR-AR,20051205,20051205
4,2903585,59620080,5/2005/5001,20051111,1061,Grundwasser,7.4,,,-,Gesamtgehalt,DIN 38404-C5 JANUAR 1984,ja,HYGC_BR-AR,20051205,20051205


In [359]:
# Load all quality records indexed by sl_nr (an earlier variant additionally used
# aktual_dat as second index level). The measurement result and its remark are
# read as text, since results may carry a leading '<' or '>'.
df_qual = pd.read_csv(
    data_pfname,
    sep=";",
    index_col=["sl_nr"],
    dtype={"messergebnis_c": str, "messergebnis_hinweis": str},
)

In [361]:
# Duplicates in the index? Shows all rows whose index value already occurred before.
# NOTE(review): this inspects the station frame `df`, although the cell directly
# follows the load of `df_qual` - confirm which frame was meant to be checked.
df[df.index.duplicated()]

Unnamed: 0_level_0,messstelle_id,name,e32,n32,gw_stockwerk,grundstueck,gemeinde_id,gwhorizont_id,gwhorizont,gwleiter_id,...,eigentuemer,betreiber,filterlaenge_cm,sumpfrohrlaenge_cm,ausbaudurchmesser_mm,historischer_ruhe_wsp,einbaulaenge_cm,oberkante_filter_cm,unterkante_filter_cm,genau
sl_nr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [304]:
def is_float(element: str) -> bool:
    """Check whether ``element`` can be converted with ``float()``.

    Args:
        element: Value to test, normally a string such as ``"12.5"``.

    Returns:
        True if ``float(element)`` succeeds, otherwise False. Values of an
        unsupported type (e.g. ``None``) yield False instead of raising.
    """
    try:
        float(element)
    except (TypeError, ValueError):
        # ValueError: text that is no number (e.g. "<0.5");
        # TypeError: not a string or number at all (e.g. None)
        return False
    return True

In [305]:
def is_less(element: str) -> bool:
    """Check whether ``element`` is a string that starts with ``'<'``.

    Args:
        element: Value to test, normally a raw measurement string.

    Returns:
        True if ``element`` is a string beginning with ``'<'``. Empty strings
        and non-string values (e.g. NaN, None) yield False instead of raising.
    """
    return isinstance(element, str) and element.startswith("<")

In [306]:
def is_greater(element: str) -> bool:
    """Check whether ``element`` is a string that starts with ``'>'``.

    Args:
        element: Value to test, normally a raw measurement string.

    Returns:
        True if ``element`` is a string beginning with ``'>'``. Empty strings
        and non-string values (e.g. NaN, None) yield False instead of raising.
    """
    return isinstance(element, str) and element.startswith(">")

In [362]:
%time idx_mess_is_float   = df_qual["messergebnis_c"].apply(is_float)
#%time idx_mess_not_float  = ~idx_mess_is_float
%time idx_mess_is_less    = df_qual["messergebnis_c"].apply(is_less)
%time idx_mess_is_greater = df_qual["messergebnis_c"].apply(is_greater)

Wall time: 2.64 s
Wall time: 644 ms
Wall time: 656 ms


In [390]:
df_qual[idx_mess_is_less].shape

(1974713, 15)

In [325]:
# Sanity check: the float mask and its complement together cover all records
num_is_float = idx_mess_is_float.sum()
num_not_float = (~idx_mess_is_float).sum()
assert num_is_float + num_not_float == df_qual.shape[0], "mess has either to be float or not float"

In [365]:
# Every result that is not a float must start with '<' or '>' and vice versa
idx_mess_less_or_greater = idx_mess_is_less | idx_mess_is_greater
num_mismatch = (idx_mess_less_or_greater != ~idx_mess_is_float).sum()
assert num_mismatch == 0, "'not float' supposed to come from '<' or '>' but nothing else."

In [366]:
# Records that are neither less nor greater nor float -> must be an empty data frame
idx_mess_unmatched = ~(idx_mess_is_less | idx_mess_is_greater | idx_mess_is_float)
df_qual_unmatched = df_qual[idx_mess_unmatched]
assert df_qual_unmatched.shape[0] == 0
print(df_qual_unmatched)

Empty DataFrame
Columns: [messstelle_id, pna_id, datum_pn, stoff_nr, probengut, messergebnis_c, messergebnis_hinweis, bestimmungsgrenze, masseinheit, trennverfahren, verfahren, vor_ort, herkunft, aktual_dat, erstell_dat]
Index: []


In [372]:
# res = (~idx_mess_is_less & ~idx_mess_is_greater & ~idx_mess_is_float).value_counts()
res = (idx_mess_is_less | idx_mess_is_greater | idx_mess_is_float).value_counts()

In [373]:
res

True    3671913
Name: messergebnis_c, dtype: int64

In [382]:
s = df_qual.loc[17716627,"messergebnis_c"]
is_less(">1.234")

False

In [391]:
df_qual

Unnamed: 0_level_0,messstelle_id,pna_id,datum_pn,stoff_nr,probengut,messergebnis_c,messergebnis_hinweis,bestimmungsgrenze,masseinheit,trennverfahren,verfahren,vor_ort,herkunft,aktual_dat,erstell_dat
sl_nr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2903561,59620687,5/2005/4599,20051018,1164,Grundwasser,22.00000,,,µg/l,Gesamtgehalt,DIN 38406-E22 MAERZ 1988,,HYGC_BR-AR,20051205,20051205
2903564,59620687,5/2005/4599,20051018,1061,Grundwasser,6.80000,,,-,Gesamtgehalt,DIN 38404-C5 JANUAR 1984,ja,HYGC_BR-AR,20051205,20051205
2903565,59620687,5/2005/4599,20051018,1011,Grundwasser,12.80000,,,°C,Gesamtgehalt,DIN 38404-C4 DEZEMBER 1976,ja,HYGC_BR-AR,20051205,20051205
2903584,59620389,5/2005/5002,20051114,1011,Grundwasser,12.30000,,,°C,Gesamtgehalt,DIN 38404-C4 DEZEMBER 1976,ja,HYGC_BR-AR,20051205,20051205
2903585,59620080,5/2005/5001,20051111,1061,Grundwasser,7.40000,,,-,Gesamtgehalt,DIN 38404-C5 JANUAR 1984,ja,HYGC_BR-AR,20051205,20051205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2882791,10407406,1/2004/91121,20041026,1331,Grundwasser,19.20000,,,mg/l,Gesamtgehalt,DIN 38405-D19 FEBRUAR 1988,,HYGC_BR-K,20051107,20051107
2882792,10407406,1/2004/91121,20041026,1472,Grundwasser,5.13000,,,mmol/l,Gesamtgehalt,DIN 38409-H7-1-2 MAI 1979,ja,HYGC_BR-K,20051107,20051107
2882793,10407406,1/2004/91121,20041026,1477,Grundwasser,0.35000,,,mmol/l,Gesamtgehalt,DIN 38409-H7-2-2 MAI 1979,,HYGC_BR-K,20051107,20051107
2882794,10407406,1/2004/91121,20041026,1521,Grundwasser,0.50000,,,mg/l,Membranfilter,DIN 38409-H3 JUNI 1983,,HYGC_BR-K,20051107,20051107


In [333]:
%%time
df_qual.loc[idx_mess_is_less,"messergebnis_num"] = -9
df_qual.loc[idx_mess_is_greater,"messergebnis_num"] = -8


Wall time: 101 ms


In [334]:
%%time
df_qual.loc[idx_mess_is_float,"messergebnis_num"] = df_qual.loc[idx_mess_is_float,"messergebnis_c"].astype(float)

Wall time: 422 ms


In [335]:
# Reason for not being float? XOR
# `~` binds tighter than `^`, so the mask is (not float) XOR (starts with '<'):
# it selects the rows where exactly one of the two conditions holds.
idx = (~idx_mess_is_float ^ idx_mess_is_less)
# Display the selected records
df_qual[idx]

Unnamed: 0_level_0,messstelle_id,pna_id,datum_pn,stoff_nr,probengut,messergebnis_c,messergebnis_hinweis,bestimmungsgrenze,masseinheit,trennverfahren,verfahren,vor_ort,herkunft,aktual_dat,erstell_dat,messergebnis_num
sl_nr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
17552890,60080164,0/2017/90347,20170510,1029,Grundwasser,>1.00000,,,FNU,Gesamtgehalt,"DIN EN ISO 7027, Abschnitt 6 - DO - C02 - 3",ja,LIMS_LANUV,20210619,20210619,-8.0
1263499,10420484,1/2004/90387,20041013,1249,Grundwasser,>0.03875,,,mg/l,Gesamtgehalt,,,HYGC_BR-K,20050127,20050127,-8.0
2016179,24170070,2/2003/90776,20031113,1695,Grundwasser,>1.00000,,,1/100ml,Nach Laborjournal,,,HYGC_BR-DET,20040220,20040220,-8.0
2923020,26502884,2/2005/90298,20050426,1695,Grundwasser,>1.00000,,,1/100ml,Nach Laborjournal,,,HYGC_BR-DET,20051223,20051223,-8.0
2923130,26502902,2/2005/90300,20050426,1695,Grundwasser,>1.00000,,,1/100ml,Nach Laborjournal,,,HYGC_BR-DET,20051223,20051223,-8.0
2792305,10407212,1/2006/91249,20060522,1695,Grundwasser,>1.00000,,,1/100ml,Gesamtgehalt,,,HYGC_BR-K,20070808,20070808,-8.0
6188714,86621567,8/2002/90102,20020612,1695,Grundwasser,>1.00000,,,1/100ml,Gesamtgehalt,,,HYGC_BR-D,20030107,20030107,-8.0
6209810,86452010,8/2000/90447,20001025,1015,Grundwasser,>0.00000,,,°C,Nach Laborjournal,DIN 38404-C4 DEZEMBER 1976,ja,HYGC_BR-D,20030107,20030107,-8.0
6210023,86452010,8/2001/90164,20010424,1015,Grundwasser,>0.00000,,,°C,Nach Laborjournal,DIN 38404-C4 DEZEMBER 1976,ja,HYGC_BR-D,20030107,20030107,-8.0
6210024,86452010,8/2001/90164,20010424,1028,Grundwasser,>0.00000,,,1/m,Gesamtgehalt,,,HYGC_BR-D,20030107,20030107,-8.0


In [207]:
df_qual["messergebnis_num"]

sl_nr    aktual_dat
2903561  20051205      22.00
2903564  20051205       6.80
2903565  20051205      12.80
2903584  20051205      12.30
2903585  20051205       7.40
                       ...  
2882791  20051107      19.20
2882792  20051107       5.13
2882793  20051107       0.35
2882794  20051107       0.50
2882795  20051107       0.00
Name: messergebnis_num, Length: 3671913, dtype: float64

In [158]:
is_float("234.342")

True

In [162]:
%%timeit
df_qual["messergebnis_c"].apply(is_float2)

1.31 s ± 39.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [160]:
def is_float2(element: str) -> bool:
    """Check without ``float()`` whether ``element`` is a plain decimal number.

    Accepted are an optional leading sign, decimal digits and at most one
    decimal point (e.g. ``"12"``, ``"-0.5"``, ``".5"``, ``"5."``). Exponent
    notation and special values such as ``"nan"`` are not recognised.

    Args:
        element: Value to test, normally a raw measurement string.

    Returns:
        True if ``element`` is a string of that form, otherwise False.
    """
    if not isinstance(element, str):
        return False
    # Strip one leading sign so that negative numbers are recognised as well
    unsigned = element[1:] if element[:1] in ("+", "-") else element
    # Remove (instead of replacing) the first decimal point: a lone "." then
    # leaves an empty string and is rejected, as float(".") would fail too.
    digits = unsigned.replace(".", "", 1)
    # isdecimal() rejects digit-like characters such as "²" that float() cannot parse
    return digits.isdecimal()

In [163]:
%%timeit
df_qual["messergebnis_c"].apply(is_float)

2.85 s ± 64.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
