In [1]:
import numpy as np
import pandas as pd

import requests
import zipfile
import io
import os
import glob
import time

from bs4 import BeautifulSoup
import re

import folium

In [2]:
# coordinate range from Google maps that 
# specifies the area from which weather station data is downloaded

# Pfalz
#xrange = [7.9586, 8.5310]
#yrange = [48.9395, 49.55]

# Bergstraße
xrange = [8.33, 8.7]
yrange = [49.4, 49.92]

# Weather data

A list of all DWD stations with daily historical weather data can be found in their web database: 

https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/daily/kl/historical/

This database also has the weather data from all these stations in subfolders for downloading.

## 1. Station list

In [3]:
# path to weather station list
# downloaded from DWD:
# https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/daily/kl/historical/
# on 12.03.2024
# and prepared for further use as csv via Libreoffice Calc
path = './data/KL_Tageswerte_Beschreibung_Stationen.csv'

station_list = pd.read_csv(path, sep='\s+', engine='python', 
                           encoding='latin1', 
                          dtype={'Stations_id': str,
                                'von_datum': str,
                                'bis_datum': str,
                                'Stationsname': str,
                                'Stationshoehe': int,
                                'geoBreite': float,
                                'geoLaenge': float})


In [4]:
station_list = station_list.rename(columns={'Stations_id': 'station_id',
                                'von_datum': 'start_date',
                                'bis_datum': 'end_date',
                                'Stationsname': 'name',
                                'Stationshoehe': 'elevation',
                                'geoBreite': 'ycoord',
                                'geoLaenge': 'xcoord'})
station_list.head()

Unnamed: 0,station_id,start_date,end_date,elevation,ycoord,xcoord,name
0,1,19370101,19860630,478,47.8413,8.8493,Aach
1,3,18910101,20110331,202,50.7827,6.0941,Aachen
2,11,19800901,20240311,680,47.9736,8.5205,Donaueschingen
3,44,19690101,20240311,44,52.9336,8.237,Großenkneten
4,52,19690101,20011231,46,53.6623,10.199,Ahrensburg-Wulfsdorf


In [5]:
station_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1359 entries, 0 to 1358
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   station_id  1359 non-null   object 
 1   start_date  1359 non-null   object 
 2   end_date    1359 non-null   object 
 3   elevation   1359 non-null   int64  
 4   ycoord      1359 non-null   float64
 5   xcoord      1359 non-null   float64
 6   name        1359 non-null   object 
dtypes: float64(2), int64(1), object(4)
memory usage: 74.4+ KB


## 2. Get stations in research area

In [6]:
stations = station_list.loc[((station_list.xcoord > xrange[0]) 
                    & (station_list.xcoord < xrange[1])
                   ) & ((station_list.ycoord > yrange[0]) 
                    & (station_list.ycoord < yrange[1]))]
stations

Unnamed: 0,station_id,start_date,end_date,elevation,ycoord,xcoord,name
78,355,19470101,19891031,140,49.6789,8.6276,Bensheim
202,917,19950801,20240311,162,49.8809,8.6779,Darmstadt
203,918,19871001,19950731,122,49.8453,8.624,Darmstadt
204,919,19370203,19740731,169,49.8697,8.6796,Darmstadt-Botanischer
205,920,19490101,19870930,108,49.8564,8.5929,Darmstadt
360,1619,19560101,20051231,90,49.7622,8.4887,Gernsheim
429,2080,19350101,20120912,110,49.4206,8.6676,Heidelberg
431,2083,19490101,19581231,110,49.4192,8.6051,Heidelberg-Grenzhof
618,3007,19470101,19551231,96,49.4111,8.398,Limburgerhof
634,3076,19470101,19561231,95,49.4606,8.4246,Ludwigshafen-Mundenheim


In [7]:
stations.to_csv('./data/weather_station_list.csv',
                   index=False)

## 3. Download station data

The station info file is discrepant with the DWD database (dates not matching in all cases). Therefore, download links cannot be generated from the info file but must be scraped from the web database.

In [8]:
ids = stations.station_id.unique()
site = 'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/daily/kl/historical/'
urls = []

# Send a GET request to the URL
response = requests.get(site)

# Prevent downloading if not necessary
download_new = False

# Check if the request was successful (status code 200)
if download_new == True:
    if response.status_code == 200:
        # Parse the HTML content of the response using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all anchor tags (links) in the HTML content
        for link in soup.find_all('a', href=True):
            # Extract the value of the href attribute (URL)
            href = link.get('href')
            
            # Check if the href attribute contains a download URL pattern
            if re.match(r'tageswerte.*\.(zip)$', href):
                # Check for matching ids and append to urls 
                for id in ids:
                    if id == href.split('_')[-4]:
                        urls.append(href)
    else:
        print('Failed to fetch website:', response.status_code)
else:
    print('Downloads are turned off!')

Downloads are turned off!


In [9]:
urls
ids

array(['00355', '00917', '00918', '00919', '00920', '01619', '02080',
       '02083', '03007', '03076', '05692', '05906'], dtype=object)

In [10]:
download_to = './data/weather/'
url_base = 'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/daily/kl/historical/'

if download_new == True:
    for url in urls:
        # URL of the zipped file to download
        id = url.split('_')[-4]
        print(id)
        #print(url)
        response = requests.get(url_base+url)
        
        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Get the content of the response (zip file contents)
            zip_content = response.content
            
            # Create a ZipFile object from the zip content
            with zipfile.ZipFile(io.BytesIO(zip_content)) as zip_file:
                # Extract all files from the zip archive to the current directory
                zip_file.extractall(download_to+id)
                
            print('Files extracted successfully.')
        else:
            print('Failed to download file:', response.status_code)
else:
    print('Downloads are turned off!')

Downloads are turned off!


In [11]:
station_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1359 entries, 0 to 1358
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   station_id  1359 non-null   object 
 1   start_date  1359 non-null   object 
 2   end_date    1359 non-null   object 
 3   elevation   1359 non-null   int64  
 4   ycoord      1359 non-null   float64
 5   xcoord      1359 non-null   float64
 6   name        1359 non-null   object 
dtypes: float64(2), int64(1), object(4)
memory usage: 74.4+ KB


## 4. Show stations on map

In [12]:
# Create a map centered at a specific location
m = folium.Map(location=[np.mean(yrange), np.mean(xrange)], 
               #zoom_start=12
               )

for id in ids:
    s = station_list.loc[station_list.station_id == id]
    
    # Add a marker for a specific location
    #folium.Marker([np.mean(yrange), np.mean(xrange)], popup='Location Name').add_to(m)
    folium.CircleMarker([s.ycoord, s.xcoord], radius=5, 
                        color='blue', fill=True, fill_color='blue').add_to(m)
# Display the map
m


  float(coord)
  if math.isnan(float(coord)):
  return [float(x) for x in coords]


## 5. Read weather station data to df

In [13]:
# Directory containing the folders
directory = './data/weather/'

# File name pattern to search for
file_pattern = 'produkt*.txt' 

df_list = []

# Loop over all folders in the directory
for folder in os.listdir(directory):
    folder_path = os.path.join(directory, folder)
    
    # Check if the item in the directory is a folder
    if os.path.isdir(folder_path):
        # Use glob to search for files matching the pattern inside the folder
        files = glob.glob(os.path.join(folder_path, file_pattern))
        
        # Process the found files
        for file in files:
            #print("Found file '{}' in folder '{}'".format(os.path.basename(file), folder))
            df = pd.read_csv(file, sep=';')
            df_list.append(df)


The data needs some basic cleaning before further use.

In [14]:
# clean column names
df = pd.concat(df_list)
df.columns = df.columns.str.replace(' ', '')

# drop unnecessary columns
cols_to_keep = ['STATIONS_ID', 'MESS_DATUM', 'RSK', 
                'TMK', 'TXK', 'TNK', 'UPM']
df = df[cols_to_keep]

# assign new column names
new_col_names = {'STATIONS_ID': 'station_id', 'MESS_DATUM': 'date', 
                 'RSK': 'precip', 'TMK': 'tmean', 
                 'TXK': 'tmax', 'TNK': 'tmin', 'UPM': 'moisture'}
df = df.rename(columns=new_col_names)

# change date column to datetime type
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 192969 entries, 0 to 23071
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   station_id  192969 non-null  int64         
 1   date        192969 non-null  datetime64[ns]
 2   precip      192969 non-null  float64       
 3   tmean       192969 non-null  float64       
 4   tmax        192969 non-null  float64       
 5   tmin        192969 non-null  float64       
 6   moisture    192969 non-null  float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 11.8 MB


In [16]:
df.describe()

Unnamed: 0,station_id,date,precip,tmean,tmax,tmin,moisture
count,192969.0,192969,192969.0,192969.0,192969.0,192969.0,192969.0
mean,2714.669657,1969-12-27 22:06:11.960263046,1.454202,10.098789,14.316913,5.656918,33.699037
min,174.0,1876-01-01 00:00:00,-999.0,-999.0,-999.0,-999.0,-999.0
25%,1072.0,1954-02-09 00:00:00,0.0,4.4,7.5,1.0,66.0
50%,2522.0,1971-09-27 00:00:00,0.0,10.2,14.6,6.2,77.0
75%,3929.0,1991-12-14 00:00:00,1.8,16.1,21.4,11.3,85.0
max,5426.0,2022-12-31 00:00:00,101.6,31.0,40.2,26.2,100.0
std,1591.563083,,22.408523,8.217862,14.357843,19.356004,209.313513


## 6. Imputation of erroneous values

The value -999.0 denotes errors in the data. Since this value massively affects further applications, it must be dealt with. Possible options are:

- Replace with NaN
- Drop affected rows
- Imputation with value, like mean
- Model-based imputation

For simplicity, we will first replace with NaN.

In [17]:
# Replace -999.0 with NaN
df.replace(-999.0, float('nan'), inplace=True)

df.describe()

Unnamed: 0,station_id,date,precip,tmean,tmax,tmin,moisture
count,192969.0,192969,192876.0,192967.0,192945.0,192906.0,185381.0
mean,2714.669657,1969-12-27 22:06:11.960263046,1.936596,10.109248,14.442957,5.985023,75.96939
min,174.0,1876-01-01 00:00:00,0.0,-21.5,-17.4,-25.4,24.0
25%,1072.0,1954-02-09 00:00:00,0.0,4.4,7.5,1.0,67.0
50%,2522.0,1971-09-27 00:00:00,0.0,10.2,14.6,6.2,77.0
75%,3929.0,1991-12-14 00:00:00,1.8,16.1,21.4,11.3,86.0
max,5426.0,2022-12-31 00:00:00,101.6,31.0,40.2,26.2,100.0
std,1591.563083,,4.419912,7.548501,8.856297,6.710821,12.879864


## 7. Create derived tables and storing

### 7.1 Daily observations, all stations

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 192969 entries, 0 to 23071
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   station_id  192969 non-null  int64         
 1   date        192969 non-null  datetime64[ns]
 2   precip      192876 non-null  float64       
 3   tmean       192967 non-null  float64       
 4   tmax        192945 non-null  float64       
 5   tmin        192906 non-null  float64       
 6   moisture    185381 non-null  float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 11.8 MB


In [19]:
# merged table with daily observations and all stations
#df.to_excel('./data/weather_collections/weather_daily_all.xlsx', 
#         index=True)

In [20]:
# merged table with daily observations and all stations
df.to_csv('./data/weather_collections/weather_daily_all.csv', 
         index=False)

### 7.2 Daily observations, regional average

In [21]:
# daily averages for the whole region
daily_region = df.groupby('date').agg({'station_id': 'count',
                       'precip': ['mean', 'min', 'max', 'std'],
                       'tmean': ['mean', 'min', 'max', 'std'],
                       'tmax': ['mean', 'min', 'max', 'std'],
                       'tmin': ['mean', 'min', 'max', 'std'],
                       'moisture': ['mean', 'min', 'max', 'std']})

# rename columns
daily_region.columns = ['_'.join(col).strip() for col in daily_region.columns.values]
daily_region.rename(columns={('station_id_count'): 'station_count'})
daily_region.reset_index(inplace=True)

In [22]:
daily_region.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53357 entries, 0 to 53356
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              53357 non-null  datetime64[ns]
 1   station_id_count  53357 non-null  int64         
 2   precip_mean       53356 non-null  float64       
 3   precip_min        53356 non-null  float64       
 4   precip_max        53356 non-null  float64       
 5   precip_std        28854 non-null  float64       
 6   tmean_mean        53357 non-null  float64       
 7   tmean_min         53357 non-null  float64       
 8   tmean_max         53357 non-null  float64       
 9   tmean_std         28855 non-null  float64       
 10  tmax_mean         53357 non-null  float64       
 11  tmax_min          53357 non-null  float64       
 12  tmax_max          53357 non-null  float64       
 13  tmax_std          28855 non-null  float64       
 14  tmin_mean         5335

In [23]:
daily_region.head()

Unnamed: 0,date,station_id_count,precip_mean,precip_min,precip_max,precip_std,tmean_mean,tmean_min,tmean_max,tmean_std,...,tmax_max,tmax_std,tmin_mean,tmin_min,tmin_max,tmin_std,moisture_mean,moisture_min,moisture_max,moisture_std
0,1876-01-01,1,1.8,1.8,1.8,,-0.2,-0.2,-0.2,,...,2.0,,-1.0,-1.0,-1.0,,88.0,88.0,88.0,
1,1876-01-02,1,20.6,20.6,20.6,,2.8,2.8,2.8,,...,3.2,,-1.5,-1.5,-1.5,,96.0,96.0,96.0,
2,1876-01-03,1,7.6,7.6,7.6,,6.0,6.0,6.0,,...,8.0,,3.0,3.0,3.0,,95.0,95.0,95.0,
3,1876-01-04,1,1.0,1.0,1.0,,-1.9,-1.9,-1.9,,...,8.0,,-6.0,-6.0,-6.0,,91.0,91.0,91.0,
4,1876-01-05,1,1.0,1.0,1.0,,-6.2,-6.2,-6.2,,...,-3.0,,-9.0,-9.0,-9.0,,82.0,82.0,82.0,


In [24]:
dates = ['1899-12-29', '1899-12-30', '1899-12-31', '1900-01-01']
df.loc[df.date.isin(dates)]

Unnamed: 0,station_id,date,precip,tmean,tmax,tmin,moisture
8763,2522,1899-12-29,0.0,2.9,6.0,-0.2,91.0
8764,2522,1899-12-30,28.8,8.8,10.6,0.9,58.0
8765,2522,1899-12-31,0.0,9.4,13.0,6.6,80.0
8766,2522,1900-01-01,10.1,6.9,10.0,5.2,95.0


In [25]:
daily_region.iloc[8762:8768]

Unnamed: 0,date,station_id_count,precip_mean,precip_min,precip_max,precip_std,tmean_mean,tmean_min,tmean_max,tmean_std,...,tmax_max,tmax_std,tmin_mean,tmin_min,tmin_max,tmin_std,moisture_mean,moisture_min,moisture_max,moisture_std
8762,1899-12-28,1,2.0,2.0,2.0,,2.0,2.0,2.0,,...,5.2,,-0.2,-0.2,-0.2,,92.0,92.0,92.0,
8763,1899-12-29,1,0.0,0.0,0.0,,2.9,2.9,2.9,,...,6.0,,-0.2,-0.2,-0.2,,91.0,91.0,91.0,
8764,1899-12-30,1,28.8,28.8,28.8,,8.8,8.8,8.8,,...,10.6,,0.9,0.9,0.9,,58.0,58.0,58.0,
8765,1899-12-31,1,0.0,0.0,0.0,,9.4,9.4,9.4,,...,13.0,,6.6,6.6,6.6,,80.0,80.0,80.0,
8766,1900-01-01,1,10.1,10.1,10.1,,6.9,6.9,6.9,,...,10.0,,5.2,5.2,5.2,,95.0,95.0,95.0,
8767,1900-01-02,1,8.4,8.4,8.4,,6.0,6.0,6.0,,...,8.0,,4.5,4.5,4.5,,97.0,97.0,97.0,


In [26]:
daily_region.tail()

Unnamed: 0,date,station_id_count,precip_mean,precip_min,precip_max,precip_std,tmean_mean,tmean_min,tmean_max,tmean_std,...,tmax_max,tmax_std,tmin_mean,tmin_min,tmin_max,tmin_std,moisture_mean,moisture_min,moisture_max,moisture_std
53352,2022-12-27,4,0.0,0.0,0.0,0.0,3.475,1.8,4.4,1.152895,...,8.3,1.877054,-0.8,-2.6,0.5,1.512173,85.0225,81.38,93.71,5.834931
53353,2022-12-28,4,0.15,0.0,0.6,0.3,6.75,5.6,8.0,1.034408,...,13.0,1.908533,0.175,-1.3,2.0,1.631717,77.655,69.83,84.33,6.291484
53354,2022-12-29,4,1.3,0.2,2.2,0.959166,10.225,7.3,11.6,1.985573,...,14.0,1.944222,7.4,4.7,8.6,1.81659,76.4775,70.0,88.04,8.035962
53355,2022-12-30,4,3.075,1.6,5.1,1.543535,7.325,6.1,8.0,0.899537,...,14.8,2.055886,2.9,0.9,3.9,1.392839,90.03,87.58,93.71,2.75878
53356,2022-12-31,4,0.0,0.0,0.0,0.0,15.075,12.8,16.0,1.526161,...,19.4,2.040425,11.65,10.2,13.9,1.725302,69.41,66.38,75.38,4.138873


In [27]:
#daily_region.to_excel('./data/weather_collections/weather_daily_region.xlsx', 
#         index=True)

In [28]:
daily_region.to_csv('./data/weather_collections/weather_daily_region.csv', 
         index=False)

### 7.3 Biweekly average observation, all stations

probably not needed...

In [29]:
biweekly = df.groupby('station_id'
                        ).resample('2W', on='date'
                        ).agg({'station_id': 'count',
                       'precip': ['mean', 'min', 'max', 'std'],
                       'tmean': ['mean', 'min', 'max', 'std'],
                       'tmax': ['mean', 'min', 'max', 'std'],
                       'tmin': ['mean', 'min', 'max', 'std'],
                       'moisture': ['mean', 'min', 'max', 'std']})


In [30]:
# rename columns
biweekly.columns = ['_'.join(col).strip() for col in biweekly.columns.values]
biweekly.rename(columns={('station_id_count'): 'days_count'})
biweekly.reset_index(inplace=True)

In [31]:
#biweekly.to_excel('./data/weather_collections/weather_biweekly_all.xlsx', 
#         index=False)

In [32]:
biweekly.to_csv('./data/weather_collections/weather_biweekly_all.csv', 
         index=False)

### 7.4 Biweekly regional averages

In [33]:
# probably not needed...