Imports

In [None]:
import pandas as pd
import sqlite3
import joblib

# Geographical data plotting
import folium
from folium.map import Popup
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
from geopy.distance import distance
import time

Loading the dataset

In [None]:
# Open the SQLite database that holds the raw tables
conn = sqlite3.connect('../data/external/data.db')
cursor = conn.cursor()

# Ask the schema catalogue for the name of every table
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

# Each result row is a one-element tuple holding the table name
table_names = [name for (name,) in tables]
print("Tables in the database:", table_names)


In [None]:
# Read every table in full into its own DataFrame, keyed by table name
dataframes = {}
for table in table_names:
    dataframes[table] = pd.read_sql_query(f"SELECT * FROM {table}", conn)

# Pick out the four tables used in the rest of the notebook.
# NOTE(review): the tables are selected by their position in table_names, so
# this relies on the order in which SQLite lists them - confirm against the
# database.
watercourse_stations = dataframes[table_names[1]]
watercourse_measurements = dataframes[table_names[2]]
weather_locations = dataframes[table_names[6]]
weather = dataframes[table_names[7]]

#### Watercourse data

Keep only the data from 2010 to 2017

In [None]:
# Parse the 'date' values into proper datetimes
watercourse_measurements['date'] = pd.to_datetime(watercourse_measurements['date'])

# Keep only the rows whose year lies in 2010-2017 (both bounds inclusive)
measurement_years = watercourse_measurements['date'].dt.year
in_study_period = (measurement_years >= 2010) & (measurement_years <= 2017)
watercourse_measurements = watercourse_measurements[in_study_period]

In [None]:
# Display the measurements that remain after the 2010-2017 filter
watercourse_measurements

Split the data based on stations

In [None]:
# Map each station_id to the DataFrame holding that station's measurements
watercourse_measurements_stations = dict(iter(watercourse_measurements.groupby('station_id')))

In [None]:
# Display the per-station dictionary (station_id -> DataFrame)
watercourse_measurements_stations

Filter data

In [None]:
# Collect the stations that do not have rows in 8 distinct years
# (the measurements were already restricted to 2010-2017, i.e. 8 calendar years)
keys_to_delete = [
    station
    for station, frame in watercourse_measurements_stations.items()
    if frame['date'].dt.year.nunique() < 8
]

In [None]:
# Remove every station listed in keys_to_delete from the dictionary
for station in keys_to_delete:
    watercourse_measurements_stations.pop(station)

In [None]:
# Ensure every station has exactly one row per calendar day of the study period.
# The date range is identical for every station, so it is built once up front.
start_date = '2010-01-01'
end_date = '2017-12-31'
all_dates = pd.date_range(start=start_date, end=end_date)

for key, data in watercourse_measurements_stations.items():
    # Index by date and expand to the full range; days without a measurement
    # become rows in which every column is missing
    data = data.set_index('date').reindex(all_dates, fill_value=pd.NA).reset_index()
    data = data.rename(columns={'index': 'date'})

    # The inserted rows have no station_id either, and only 'level' is filled
    # in later. The dictionary key is the station id (the frames come from a
    # groupby on 'station_id'), so restore the column from it.
    data['station_id'] = key

    watercourse_measurements_stations[key] = data

In [None]:
# Collect the stations whose 'level' column has more than 100 missing values
keys_to_delete = [
    station
    for station, frame in watercourse_measurements_stations.items()
    if frame['level'].isna().sum() > 100
]

In [None]:
# Remove every station listed in keys_to_delete from the dictionary
for station in keys_to_delete:
    watercourse_measurements_stations.pop(station)

In [None]:
# Print the number of missing 'level' values left for each remaining station
for station, frame in watercourse_measurements_stations.items():
    missing_levels = frame['level'].isna().sum()
    print(f"{station}: {missing_levels}")

Missing values

In [None]:
# Fill gaps in the water level by linear interpolation
for frame in watercourse_measurements_stations.values():
    frame['level'] = frame['level'].interpolate(method='linear')

In [None]:
# Fill the values that are still missing at the beginning and at the end of a
# series with a forward fill followed by a backward fill
for frame in watercourse_measurements_stations.values():
    if frame['level'].isna().any():
        frame['level'] = frame['level'].ffill().bfill()

Addition of a column with the differences in water level between consecutive days

In [None]:
# Add 'level_diff': the change in level relative to the previous row. The
# first row has no predecessor, so its value is backfilled from the next row.
for frame in watercourse_measurements_stations.values():
    level_change = frame['level'].diff()
    frame['level_diff'] = level_change.bfill()


Save the dictionary

In [None]:
# Persist the per-station dictionary to a joblib file
joblib.dump(
    value=watercourse_measurements_stations,
    filename='../data/interim/watercourse_by_stations.joblib',
)

#### Weather data

In [None]:
# Convert 'time' column from Unix timestamp (seconds) to datetime
weather['time'] = pd.to_datetime(weather['time'], unit='s')

# Restrict the data to the study period before counting years. Without this, a
# location with data outside 2010-2017 has a different number of distinct
# years and is wrongly rejected (or accepted) by the `== 8` check below.
weather = weather[weather['time'].dt.year.between(2010, 2017)]

# Group by location_id and count the number of unique years
location_years_counts = weather.groupby('location_id')['time'].apply(lambda x: x.dt.year.nunique())

# Keep the location_ids that have instances in all 8 years from 2010 to 2017
location_ids_all_years = location_years_counts[location_years_counts == 8].index.tolist()

# Filter weather to include only locations with instances in all of those years
weather = weather[weather['location_id'].isin(location_ids_all_years)]

# Fetch all of the remaining location_id's
location_ids = weather['location_id'].unique()

# Keep only the metadata of the locations that remain
weather_locations = weather_locations[weather_locations['id'].isin(location_ids_all_years)]

In [None]:
# Rename 'time' column to 'date'
weather = weather.rename(columns={'time': 'date'})

# Drop the time of day but keep the column as datetime64 (midnight timestamps).
# `.dt.date` would turn it into an object column of Python `date` values, which
# only lines up with the DatetimeIndex used for reindexing through an implicit
# conversion.
weather['date'] = weather['date'].dt.normalize()

Missing values

In [None]:
# Drop the columns with the most missing values
weather = weather.drop(columns=['sun_duration', 'snow_depth'])

Split the dataframe based on stations

In [None]:
# Map each location_id to the DataFrame holding that location's weather rows
weather_by_locations = dict(iter(weather.groupby('location_id')))

Missing values

In [None]:
# Give every weather location one row per day of the study period
for location_id in list(weather_by_locations):
    # First and last day of the study period, and every day in between
    start_date, end_date = '2010-01-01', '2017-12-31'
    all_dates = pd.date_range(start=start_date, end=end_date)

    # Index by date, expand to the full range (absent days become rows of
    # missing values), then turn the index back into a 'date' column
    daily = weather_by_locations[location_id].set_index('date')
    daily = daily.reindex(all_dates, fill_value=pd.NA)
    daily = daily.reset_index().rename(columns={'index': 'date'})
    weather_by_locations[location_id] = daily

In [None]:
# NOTE: an earlier variant of the filtering step dropped every *column* that
# had more than 100 missing values at any location (on top of the dew point
# and pressure columns). The notebook drops whole *locations* instead. The
# disabled variant used to be kept here inside a string literal; it was removed
# as dead code - recover it from version control if it is needed again.

In [None]:
# Discard weather locations with poor coverage and drop the unused columns.

# Columns removed from every location: pressure and dew point. In the article
# they found out that these are either too correlated to other features or
# uncorrelated to the target class.
columns_to_delete = {
    'dew_point_min', 'dew_point_max', 'dew_point_avg',
    'pressure_min', 'pressure_max', 'pressure_avg',
}

# Locations that have more than 100 missing values in at least one column.
# NOTE(review): this check runs before the columns above are dropped, so gaps
# in the pressure / dew point columns also count - confirm this is intended.
stations_to_delete = {
    location
    for location, frame in weather_by_locations.items()
    if (frame.isna().sum() > 100).any()
}

# Drop the unwanted columns from every location
for location, frame in weather_by_locations.items():
    weather_by_locations[location] = frame.drop(columns=list(columns_to_delete))

# Remove the flagged locations
for location in stations_to_delete:
    del weather_by_locations[location]

In [None]:
# In the remaining columns interpolate, then forward fill and backward fill
# whatever is still missing

for key, data in weather_by_locations.items():
    for column in data.columns:
        # The keyword used to be misspelled as `methon`, so the interpolation
        # method was never actually set by this call
        data[column] = data[column].interpolate(method='linear')
        data[column] = data[column].ffill().bfill()

In [None]:
# Keep only the weather locations that are still present in
# weather_by_locations after all of the filtering
locations_to_keep = weather_by_locations.keys()
is_remaining = weather_locations['id'].isin(locations_to_keep)
weather_locations = weather_locations[is_remaining]

#### Join surface water and weather data

In [None]:
# Display the watercourse station metadata
watercourse_stations

In [None]:
# Strip standalone Roman numerals I-IV from the station names
station_names = watercourse_stations['name']
watercourse_stations['name'] = station_names.str.replace(
    pat=r'\b(I|II|III|IV)\b', repl='', regex=True
)


In [None]:
# Strip the number 11 (together with the space that follows it) from the
# station names
station_names = watercourse_stations['name']
watercourse_stations['name'] = station_names.str.replace(
    pat=r'\b(11 )\b', repl='', regex=True
)

In [None]:
# Trim the whitespace at the beginning and at the end of the station names
trimmed_names = watercourse_stations['name'].str.strip()
watercourse_stations['name'] = trimmed_names

In [None]:
# Keep only the stations that still have measurements after the filtering
ids_to_keep = watercourse_measurements_stations.keys()

# Take an explicit copy: new columns are assigned to this frame later on, and
# assigning to a filtered slice triggers pandas' SettingWithCopyWarning
watercourse_stations = watercourse_stations[watercourse_stations['id'].isin(ids_to_keep)].copy()


In [None]:
# Initialize the Nominatim geocoder that geocode_place uses to look up
# station coordinates
geolocator = Nominatim(user_agent="geocoder_for_slovenia")

# Function to geocode a place name with retries
def geocode_place(place, retries=3, delay=1):
    """Look up the coordinates of a place name with the module-level geocoder.

    Args:
        place: Name of the place to geocode.
        retries: Maximum number of attempts.
        delay: Seconds to sleep after each attempt, successful or not.

    Returns:
        A ``(latitude, longitude)`` tuple, or ``(None, None)`` when the
        geocoder finds nothing or every attempt fails with a timeout or a
        service error.
    """
    for _ in range(retries):
        try:
            location = geolocator.geocode(place)
            time.sleep(delay)  # nominatim supports only 1 query/second
        except (GeocoderTimedOut, GeocoderServiceError) as e:
            print(f"Error geocoding {place}: {e}, retrying in {delay} seconds...")
            time.sleep(delay)
            continue

        # The query went through; an empty result means the place is unknown
        if not location:
            return None, None
        return location.latitude, location.longitude

    # Every attempt raised a timeout or service error
    return None, None
# Geocode each station name.
# The coordinates are collected in plain lists rather than unpacked from
# zip(*...): that unpacking raises a ValueError when there are no stations.
# assign() returns a new frame instead of writing into a possibly filtered
# slice of the original table.
latitudes, longitudes = [], []
for place_name in watercourse_stations['name']:
    latitude, longitude = geocode_place(place_name)
    latitudes.append(latitude)
    longitudes.append(longitude)
watercourse_stations = watercourse_stations.assign(latitude=latitudes, longitude=longitudes)

Plot the watercourse stations and weather stations on a map

In [None]:
# Initialize a map centered on Slovenia
slovenia_map = folium.Map(location=[46.151241, 14.995463], zoom_start=8)

# Add weather locations to the map (red colour)
for _, row in weather_locations.iterrows():
    folium.Marker(
        location=[row['lat'], row['lng']],
        popup=row['id'],
        icon=folium.Icon(color='red')
    ).add_to(slovenia_map)

# Add watercourse stations to the map (blue colour).
# geocode_place returns (None, None) for stations it cannot locate; a marker
# cannot be placed without coordinates, so those stations are left off the map.
located_stations = watercourse_stations.dropna(subset=['latitude', 'longitude'])
for _, row in located_stations.iterrows():
    folium.Marker(
        location=[row['latitude'], row['longitude']],
        popup=row['id'],
        icon=folium.Icon(color='blue')
    ).add_to(slovenia_map)


# Save the map to an HTML file
slovenia_map.save('../data/interim/slovenia_map_surface_water.html')

Join the weather and surface water data

In [None]:
# Find the closest weather location for every watercourse station

# Maps watercourse station id -> id of the nearest weather location
closest_locations = {}

# geocode_place returns (None, None) for stations it cannot locate. No
# distance can be computed without coordinates, so those stations are skipped
# and get no entry in closest_locations.
located_stations = watercourse_stations.dropna(subset=['latitude', 'longitude'])

# Iterate through each located watercourse station
for idx1, row1 in located_stations.iterrows():
    station_point = (row1['latitude'], row1['longitude'])
    closest_location = None
    min_distance = float('inf')

    # Iterate through each location in weather_locations
    for idx2, row2 in weather_locations.iterrows():
        # Calculate the distance in kilometres using geopy
        dist = distance(station_point, (row2['lat'], row2['lng'])).km

        if dist < min_distance:
            min_distance = dist
            closest_location = row2['id']

    # Store the closest location id (None if there are no weather locations)
    closest_locations[row1['id']] = closest_location

In [None]:
# Display the mapping from watercourse station id to closest weather location id
closest_locations

In [None]:
# Merge each watercourse station with its closest weather location on 'date'

# Maps watercourse station id -> merged DataFrame
combined_data = {}

for id1, loc_key in closest_locations.items():
    # Skip pairs for which either side is no longer available
    if id1 not in watercourse_measurements_stations or loc_key not in weather_by_locations:
        continue

    # From the watercourse data keep only the useful columns. The explicit
    # copy lets the date conversion below write to an independent frame
    # instead of a column slice (SettingWithCopyWarning).
    df1 = watercourse_measurements_stations[id1][['date', 'station_id', 'level', 'level_diff']].copy()
    df1['date'] = pd.to_datetime(df1['date'])

    # One weather frame can be the closest location of several stations, so it
    # is converted through assign(), which leaves the shared frame untouched
    df2 = weather_by_locations[loc_key]
    df2 = df2.assign(date=pd.to_datetime(df2['date']))

    # Merge dataframes based on 'date'
    merged_df = pd.merge(df1, df2, on='date', how='outer')

    # Store merged dataframe in combined_data
    combined_data[id1] = merged_df
        

In [None]:
# Persist the merged data before any generated features are added
joblib.dump(
    value=combined_data,
    filename='../data/interim/surface-water-and-weather-no-new-features.joblib',
)

#### New feature generation

In [None]:
# List the weather columns. Every location's frame has the same columns (they
# all come from the same table and had the same columns dropped), so any entry
# will do; this avoids a KeyError from hard-coding a location id (639) that
# may have been filtered out.
next(iter(weather_by_locations.values())).columns.tolist()

In [None]:
# Create lagged copies of the selected columns: `<column>_shift<n>` holds the
# value from n rows earlier, for n = 1 .. days_to_shift
days_to_shift = 10


# Specify the columns to shift
columns_to_shift = ['level_diff', 'precipitation', 'snow_accumulation', 'temperature_avg',
       'temperature_min', 'temperature_max', 'cloud_cover_avg',
       'cloud_cover_min', 'cloud_cover_max', 'humidity_avg', 'humidity_min', 'humidity_max', 
       'precipitation_probability_avg', 'precipitation_probability_min', 'precipitation_probability_max',
        'precipitation_intensity_avg', 'precipitation_intensity_min', 'precipitation_intensity_max']

# Iterate over all of the dataframes in the dictionary
for key, data in combined_data.items():
    shifted_columns = {}
    for column in columns_to_shift:
        # The rows shifted in at the top have no earlier value; they are
        # filled with the first value of the original column
        first_value = data[column].iloc[0]
        for shift in range(1, days_to_shift + 1):
            shifted_columns[f'{column}_shift{shift}'] = data[column].shift(shift).fillna(first_value)

    # Attach all new columns with a single concat: inserting them one at a
    # time fragments the frame and triggers pandas' PerformanceWarning.
    # Same-named columns are dropped first so the cell stays safe to re-run.
    data = data.drop(columns=list(shifted_columns), errors='ignore')
    combined_data[key] = pd.concat(
        [data, pd.DataFrame(shifted_columns, index=data.index)], axis=1
    )

In [None]:
# Names of the shifted features. They are rebuilt from the naming pattern
# `<column>_shift<n>` instead of slicing the columns of one hard-coded station
# (id 1060) from a fixed position (21), which breaks as soon as that station is
# filtered out or the number of leading columns changes.
new_columns = [
    f'{column}_shift{shift}'
    for column in columns_to_shift
    for shift in range(1, days_to_shift + 1)
]

# Combine the new columns with columns to shift
# This is done so that original features and shifted features are averaged
columns_to_average = columns_to_shift + new_columns

In [None]:
# Calculate rolling averages of all of the features over windows of
# 2 to days_to_average rows
days_to_average = 10

# Iterate over all of the dataframes in the dictionary
for key, data in combined_data.items():
    averaged_columns = {}
    for column in columns_to_average:
        for average in range(2, days_to_average + 1):
            # min_periods=1: the first rows average over however many rows
            # are available so far instead of producing missing values
            averaged_columns[f'{column}_average{average}'] = (
                data[column].rolling(window=average, min_periods=1).mean()
            )

    # Attach all new columns with a single concat: inserting them one at a
    # time fragments the frame and triggers pandas' PerformanceWarning.
    # Same-named columns are dropped first so the cell stays safe to re-run.
    data = data.drop(columns=list(averaged_columns), errors='ignore')
    combined_data[key] = pd.concat(
        [data, pd.DataFrame(averaged_columns, index=data.index)], axis=1
    )

In [None]:
# Drop the first (days_to_average + days_to_shift) rows of every frame.
# When shifting and averaging, these rows don't have the appropriate values.
rows_to_skip = days_to_average + days_to_shift

for station in combined_data:
    combined_data[station] = combined_data[station].iloc[rows_to_skip:]

In [None]:
# Persist the final dictionary, including the generated features
joblib.dump(
    value=combined_data,
    filename='../data/interim/surface-water-and-weather.joblib',
)

In [None]:
# List the columns of the final frames. All merged frames are built the same
# way, so any entry will do; this avoids a KeyError from hard-coding a station
# id (1060) that may not be present.
next(iter(combined_data.values())).columns.tolist()