In [1]:
import numpy as np
import pandas as pd

import zipfile
import io
import os
import glob
import time
import re

import folium

# Pfalz Groundwater data

The RLP water portal (https://wasserportal.rlp-umwelt.de/geoexplorer) does not provide a reasonable interface for retrieving groundwater data automatically.

Simple HTML parsing does not work, because the pages are generated dynamically with JavaScript.

River level data go back at most 3 years, so they are not usable.


Example of a direct download link for:
- station id 2378115000
- start date 1953-11-02
- end date 1993-10-25

The start and end dates can be arbitrarily early/late; the data are downloaded anyway.

https://geodaten-wasser.rlp-umwelt.de/api/export/messstellen_grundwasser_grundwasserstaende.xls?w=messst_nr%3D2378115000&w=ana_datum%3E%3Disodate%3A1953-11-01T23%3A00%3A00.000%2B00%3A00&w=ana_datum%3C%3Disodate%3A1993-10-24T23%3A00%3A00.000%2B00%3A00

In [2]:
# Load the station list (a semicolon-separated text file) into a DataFrame.
stations = pd.read_csv('./data/groundwater/pfalz/stations.txt', sep=';')
# Display the table; its `station_id` column drives the download below.
stations

Unnamed: 0,station_id,x,y
0,2377139400,441168,5452408
1,2377148500,448220,5452589
2,2377150000,450424,5452258
3,2377179100,454597,5453021
4,2377194000,458751,5457047
5,2379142700,460131,5464423
6,2378178400,456262,5462484
7,2378115000,449605,5462474
8,2378175100,443017,5465223
9,2378190000,434221,5455017


## 1. Download station data from list of stations

In [3]:
# Station ids to download, de-duplicated in order of first appearance.
ids = stations['station_id'].unique()

# The export endpoint takes URL-encoded `w=` filters. Decoded, the pieces read:
#   url_base: ...?w=messst_nr=<station id>
#   url_tail: &w=ana_datum>=isodate:1900-01-01T23:00:00.000+00:00
#             &w=ana_datum<=isodate:2024-01-01T23:00:00.000+00:00
url_base = 'https://geodaten-wasser.rlp-umwelt.de/api/export/messstellen_grundwasser_grundwasserstaende.xls?w=messst_nr%3D'
url_tail = '&w=ana_datum%3E%3Disodate%3A1900-01-01T23%3A00%3A00.000%2B00%3A00&w=ana_datum%3C%3Disodate%3A2024-01-01T23%3A00%3A00.000%2B00%3A00'

# Folder that receives the raw per-station .xls exports.
down_path = './data/groundwater/pfalz/stations_raw'

# Map every station id to its full export URL. A comprehension is used so the
# loop variable neither shadows the builtin `id` nor lingers in the namespace.
urls = {station_id: f'{url_base}{station_id}{url_tail}' for station_id in ids}


In [4]:
# Prevent downloading if not necessary: set to True to (re-)fetch the raw exports.
download_new = False

if download_new:
    # Imported here because the notebook's import cell does not provide it and
    # it is only needed when a download is actually requested.
    import requests

    # Make sure the target folder exists before writing into it.
    os.makedirs(down_path, exist_ok=True)

    for station_id in ids:
        # NOTE(review): verify=False disables TLS certificate verification.
        # Kept as in the original; confirm it is still required for this host.
        # The timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(urls[station_id], verify=False, timeout=60)

        # Only a 200 response carries the export file.
        if response.status_code == 200:
            # Join with a path separator so the file lands INSIDE down_path,
            # which is where the reading step looks for '*.xls' files.
            target = os.path.join(down_path, f'{station_id}.xls')
            with open(target, 'wb') as f:
                f.write(response.content)
            print(f"File downloaded successfully: {target}")
        else:
            print(f"Failed to download file for station {station_id} (HTTP {response.status_code})")


## 2. Read groundwater station data to df

In [5]:
# Folder holding the raw per-station export files
folder_path = './data/groundwater/pfalz/stations_raw/'

# File name pattern to search for
file_pattern = '*.xls'

# glob returns matches in filesystem-dependent order; sorting makes the order
# of the frames (and of the rows after concatenation) the same on every run.
# A missing folder simply yields an empty list.
files = sorted(glob.glob(os.path.join(folder_path, file_pattern)))

# One DataFrame per export file.
df_list = [pd.read_excel(path) for path in files]

# Say so explicitly instead of failing later with an opaque IndexError.
if not df_list:
    print(f"No files matching '{file_pattern}' found in '{folder_path}'")


In [6]:
# Preview the first rows of the first loaded station file.
df_list[0].head()

Unnamed: 0,Messstellennummer,Messstellenbezeichnung,Datum,Messpunkthöhe [NN+m],Abstich (m unter MPH),Wasserstand (NN+m)
0,2378135400,"1053 Böbingen,",01.11.1954,113.27,1.5,111.77
1,2378135400,"1053 Böbingen,",08.11.1954,113.27,1.47,111.8
2,2378135400,"1053 Böbingen,",15.11.1954,113.27,1.47,111.8
3,2378135400,"1053 Böbingen,",22.11.1954,113.27,1.45,111.82
4,2378135400,"1053 Böbingen,",29.11.1954,113.27,1.42,111.85


In [7]:
# Show column dtypes and non-null counts for the first loaded station file.
df_list[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2658 entries, 0 to 2657
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Messstellennummer       2658 non-null   int64  
 1   Messstellenbezeichnung  2658 non-null   object 
 2   Datum                   2658 non-null   object 
 3   Messpunkthöhe [NN+m]    2658 non-null   float64
 4   Abstich (m unter MPH)   2658 non-null   float64
 5   Wasserstand (NN+m)      2658 non-null   float64
dtypes: float64(3), int64(1), object(2)
memory usage: 124.7+ KB


The data needs some basic cleaning before further use.

In [8]:
# Map the German export headers to short English column names.
new_col_names = {'Messstellennummer': 'station_id', 
                 'Messstellenbezeichnung': 'name', 
                 'Datum': 'date', 
                 'Messpunkthöhe [NN+m]': 'elevation', 
                 'Abstich (m unter MPH)': 'water_depth', 
                 'Wasserstand (NN+m)': 'water_level'}

# Stack all station frames into one table. ignore_index=True gives the result
# a unique 0..n-1 index instead of repeating each file's own 0-based labels.
df = pd.concat(df_list, ignore_index=True).rename(columns=new_col_names)

# Dates arrive as 'DD.MM.YYYY' strings; parse them into datetime values.
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')

In [9]:
# List the distinct station names present in the combined table.
df.name.unique()

array(['1053 Böbingen,', '1044 A Lustadt,Holzmühle', '1319 Dudenhofen,',
       '1307, Germersheim', '1059 Neustadt an der Weinstraße, Geinsheim',
       '1199 Flemlingen,', '1058 Neustadt an der Weinstraße, Speyerdorf',
       '1316 II, Römerberg, Mechtersheim', '1450, Essingen',
       '1303 II Lustadt,', '1132 Speyer, '], dtype=object)

In [10]:
# Summary statistics for the combined table.
df.describe()

Unnamed: 0,station_id,date,elevation,water_depth,water_level
count,21820.0,21820,21820.0,21820.0,21820.0
mean,2377712000.0,1991-07-07 19:03:05.444546304,112.831093,3.965391,108.865701
min,2377139000.0,1953-11-02 00:00:00,95.82,0.2,90.28
25%,2377150000.0,1979-06-21 00:00:00,98.82,2.13,96.83
50%,2377194000.0,1991-02-25 00:00:00,113.27,3.13,107.78
75%,2378175000.0,2004-12-13 00:00:00,116.88,5.49,112.47
max,2379143000.0,2024-02-28 00:00:00,201.32,15.9,185.76
std,605369.6,,15.928979,2.685128,14.969659


In [11]:
# Preview the first rows of the combined table after renaming and date parsing.
df.head()

Unnamed: 0,station_id,name,date,elevation,water_depth,water_level
0,2378135400,"1053 Böbingen,",1954-11-01,113.27,1.5,111.77
1,2378135400,"1053 Böbingen,",1954-11-08,113.27,1.47,111.8
2,2378135400,"1053 Böbingen,",1954-11-15,113.27,1.47,111.8
3,2378135400,"1053 Böbingen,",1954-11-22,113.27,1.45,111.82
4,2378135400,"1053 Böbingen,",1954-11-29,113.27,1.42,111.85


In [12]:
# Write the combined table to CSV; index=False leaves the row index out of the file.
df.to_csv('./data/groundwater/pfalz/gw.csv', index=False)