# Extract Weather Stations
---
This script is only used to extract the latitude and longitude of the weather stations used.
The dataset should then be used with kepler to create a nice map.

---


In [2]:
# Main data packages. 
import numpy as np
import pandas as pd

# Getting data
from dotenv import load_dotenv
import os
from sqlalchemy import create_engine

# More graphics
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# load data

# Load environment variables from .env file
load_dotenv()

# Get database connection parameters from environment variables
db_name = os.getenv('DB_NAME')
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_host = os.getenv('DB_HOST')
db_port = os.getenv('DB_PORT')

# Create the database URL
db_url = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"

# Create an engine
engine = create_engine(db_url)

# Define your query
query1 = 'SELECT stations_id FROM "01_bronze"."raw_open_meteo_weather_forecast"'
query2 = 'SELECT * FROM "01_bronze"."raw_dwd_weather_stations_full"'

# Execute the query and load the data into a pandas DataFrame
df_omf = pd.read_sql(query1, engine)
df_ws = pd.read_sql(query2, engine)

# Display the DataFrame
print('Weather Forecast:')
print(df_omf.head())
print('---'*10)
print('Weather Stations:')
print(df_ws.head())

Weather Forecast:
  stations_id
0         183
1         183
2         183
3         183
4         183
------------------------------
Weather Stations:
  Stations_ID         Stationsname Kennung Stationskennung   Breite   Länge   
0           1                 Aach      KL           02783  47.8410  8.8490  \
1           1                 Aach      RR           70191  47.8410  8.8490   
2           2  Aachen (Kläranlage)      RR           80313  50.8070  6.1000   
3           3               Aachen      TU           02205  50.7827  6.0941   
4           3               Aachen      SY           10501  50.7827  6.0941   

   Stationshöhe Flussgebiet Bundesland     Beginn       Ende  
0           478        None         BW 1937-01-01 1986-06-30  
1           478        None         BW 1912-01-01 1986-06-30  
2           138    803130.0         NW 1951-01-01 2006-12-31  
3           202    803100.0         NW 1950-04-01 2011-03-31  
4           202    803100.0         NW 1950-04-01 2011-04-0

In [6]:
# drop unused Weather stations
df_ws_used = df_ws[df_ws['Stations_ID'].isin(set(df_omf['stations_id']))]
df_ws_used

Unnamed: 0,Stations_ID,Stationsname,Kennung,Stationskennung,Breite,Länge,Stationshöhe,Flussgebiet,Bundesland,Beginn,Ende
349,183,Arkona,SF,03005,54.6791,13.4344,42,220400.0,MV,1967-01-01,1990-12-31
350,183,Arkona,MN,10091,54.6791,13.4344,42,220400.0,MV,2006-08-09,2024-06-25
351,183,Arkona,SO,03005,54.6791,13.4344,42,220400.0,MV,1951-01-01,2024-06-24
352,183,Arkona,MI,03005,54.6791,13.4344,42,220400.0,MV,1991-11-01,2006-08-08
353,183,Arkona,KL,03005,54.6791,13.4344,42,220400.0,MV,1947-01-01,2024-06-24
...,...,...,...,...,...,...,...,...,...,...,...
10592,5856,Fürstenzell,KL,04890,48.5451,13.3532,476,926260.0,BY,1997-01-03,2024-06-24
10593,5856,Fürstenzell,RR,92260,48.5451,13.3532,476,926260.0,BY,1997-01-03,2024-06-24
10594,5856,Fürstenzell,SY,10895,48.5451,13.3532,476,926260.0,BY,1997-01-07,2024-06-24
10595,5856,Fürstenzell,MN,10895,48.5451,13.3532,476,926260.0,BY,2006-09-26,2024-06-25


In [7]:
df_ws_used.Stations_ID.nunique()

24

In [8]:
df_ws_used.Stations_ID.value_counts()

Stations_ID
853     12
1684    12
3015    12
183     11
1048    11
1358    11
4271    11
3987    11
3631    10
4928    10
4466    10
3668    10
2290    10
662      9
2712     9
4336     9
4393     9
1975     9
691      9
5100     9
5404     9
5705     9
5856     9
5792     8
Name: count, dtype: int64

In [9]:
df_ws_used = df_ws_used[['Stations_ID', 'Stationsname', 'Breite', 'Länge', 'Stationshöhe']]
df_ws_used.columns = ['station_id', 'station_name', 'lat', 'lon', 'station_altitude']
df_ws_used.drop_duplicates(inplace=True)
print(df_ws_used)

      station_id               station_name      lat      lon   
349          183                     Arkona  54.6791  13.4344  \
1276         662               Braunschweig  52.2915  10.4464   
1321         691                     Bremen  53.0451   8.7981   
1618         853                   Chemnitz  50.7913  12.8720   
1998        1048          Dresden-Klotzsche  51.1278  13.7543   
2586        1358                Fichtelberg  50.4283  12.9536   
3158        1684                    Görlitz  51.1620  14.9510   
3645        1975        Hamburg-Fuhlsbüttel  53.6330   9.9880   
4172        2290            Hohenpeißenberg  47.8010  11.0110   
4982        2712                   Konstanz  47.6952   9.1307   
5518        3015                 Lindenberg  52.2090  14.1180   
5523        3015                 Lindenberg  52.2090  14.1180   
6721        3631                  Norderney  53.7123   7.1519   
6723        3631                  Norderney  53.7123   7.1519   
6798        3668         

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ws_used.drop_duplicates(inplace=True)


In [10]:
df_ws_used.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28 entries, 349 to 10588
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   station_id        28 non-null     object 
 1   station_name      28 non-null     object 
 2   lat               28 non-null     float64
 3   lon               28 non-null     float64
 4   station_altitude  28 non-null     int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 1.3+ KB


In [11]:
df_ws_used.station_id.value_counts()

station_id
4928    2
4466    2
3015    2
3631    2
183     1
3987    1
5792    1
5705    1
5404    1
5100    1
4393    1
4336    1
4271    1
3668    1
662     1
2712    1
2290    1
1975    1
1684    1
1358    1
1048    1
853     1
691     1
5856    1
Name: count, dtype: int64

In [13]:
df_ws_used[df_ws_used['station_id'].isin(['4928','4466','3015','3631'])]

Unnamed: 0,station_id,station_name,lat,lon,station_altitude
5518,3015,Lindenberg,52.209,14.118,98
5523,3015,Lindenberg,52.209,14.118,112
6721,3631,Norderney,53.7123,7.1519,12
6723,3631,Norderney,53.7123,7.1519,11
8189,4466,Schleswig,54.528,9.549,43
8195,4466,Schleswig,54.528,9.549,47
8967,4928,Stuttgart (Schnarrenberg),48.8281,9.2,314
8968,4928,Stuttgart (Schnarrenberg),48.8281,9.2,321


In [14]:
df_ws_used = df_ws_used.groupby(['station_id', 'station_name'], as_index=False).mean()
df_ws_used.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   station_id        24 non-null     object 
 1   station_name      24 non-null     object 
 2   lat               24 non-null     float64
 3   lon               24 non-null     float64
 4   station_altitude  24 non-null     float64
dtypes: float64(3), object(2)
memory usage: 1.1+ KB


In [15]:
df_ws_used

Unnamed: 0,station_id,station_name,lat,lon,station_altitude
0,1048,Dresden-Klotzsche,51.1278,13.7543,228.0
1,1358,Fichtelberg,50.4283,12.9536,1213.0
2,1684,Görlitz,51.162,14.951,239.0
3,183,Arkona,54.6791,13.4344,42.0
4,1975,Hamburg-Fuhlsbüttel,53.633,9.988,11.0
5,2290,Hohenpeißenberg,47.801,11.011,977.0
6,2712,Konstanz,47.6952,9.1307,428.0
7,3015,Lindenberg,52.209,14.118,105.0
8,3631,Norderney,53.7123,7.1519,11.5
9,3668,Nürnberg,49.503,11.0549,314.0


In [17]:
df_ws_used['station_altitude'] = df_ws_used.station_altitude.astype(int)
df_ws_used

Unnamed: 0,station_id,station_name,lat,lon,station_altitude
0,1048,Dresden-Klotzsche,51.1278,13.7543,228
1,1358,Fichtelberg,50.4283,12.9536,1213
2,1684,Görlitz,51.162,14.951,239
3,183,Arkona,54.6791,13.4344,42
4,1975,Hamburg-Fuhlsbüttel,53.633,9.988,11
5,2290,Hohenpeißenberg,47.801,11.011,977
6,2712,Konstanz,47.6952,9.1307,428
7,3015,Lindenberg,52.209,14.118,105
8,3631,Norderney,53.7123,7.1519,11
9,3668,Nürnberg,49.503,11.0549,314


In [19]:
df_ws_used.to_csv('../data/weather_stations.csv', index=False
                  )