### **Testing various statistical models** 

##### Imports

In [None]:
#%pip install sklearn-genetic-opt
#%pip install seaborn

In [None]:
import joblib
import pandas as pd
import numpy as np

from sklearn.metrics import r2_score

import matplotlib.pyplot as plt

import optuna

# RandomForest
from sklearn.ensemble import RandomForestRegressor

# GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor

# GAFeatureSelectionCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn_genetic import GAFeatureSelectionCV
from sklearn_genetic.plots import plot_fitness_evolution

# Decision Tree
from sklearn.tree import DecisionTreeRegressor

# Linear Regression
from sklearn.linear_model import LinearRegression


##### Standard scaling

In [None]:
def standard_scaling(x):
    """Standardize ``x`` to zero mean and unit standard deviation (z-score).

    Parameters
    ----------
    x : array-like
        Values to scale; anything ``np.mean`` and ``np.std`` accept.

    Returns
    -------
    ``(x - mean(x)) / std(x)`` with the same shape as ``x``.
    """
    # Centre on the plain mean. The mean of |x| is only equal to it for
    # non-negative data and would leave signed data off-centre.
    mean = np.mean(x)
    s = np.std(x)

    return (x - mean)/s

def standard_unscaling(original, scaled):
    """Invert ``standard_scaling``.

    Parameters
    ----------
    original : array-like
        The unscaled data whose mean and standard deviation define the scale.
    scaled : array-like
        Values previously scaled with the statistics of ``original``.

    Returns
    -------
    ``scaled * std(original) + mean(original)``.
    """
    mean = np.mean(original)
    s = np.std(original)

    return (scaled * s) + mean

# Scaler that scales data according to the statistics of other data
def standard_scaling_transform(original, to_scale):
    """Scale ``to_scale`` with the mean and standard deviation of ``original``.

    Parameters
    ----------
    original : array-like
        Reference data providing the mean and standard deviation.
    to_scale : array-like
        Values to transform.

    Returns
    -------
    ``(to_scale - mean(original)) / std(original)``.
    """
    mean = np.mean(original)
    s = np.std(original)

    return (to_scale - mean)/s

### **1** Random Forest

#### **1.1** Ground water data

In [None]:
# Load the interim ground-water + weather data. The object is used below as a
# mapping from station id to a DataFrame that has a 'date' column.
# NOTE(review): joblib.load unpickles the file, so only load files from a trusted source.
aquifer_by_stations = joblib.load('../data/interim/ground-water-and-weather.joblib')

In [None]:
# Columns that we want to keep in the data: the target and the model inputs.
# The order of features_train is kept exactly, since it fixes the column order fed to the models.
features_target = ['altitude_diff']
features_train = [
    # calendar columns
    'year', 'month', 'day',
    # altitude_diff shift columns 1..5
    *[f'altitude_diff_shift{lag}' for lag in range(1, 6)],
    # precipitation-derived columns
    'precipitation_average4',
    'precipitation_average3',
    'precipitation_average5',
    'precipitation_average6',
    'precipitation_average7',
    'precipitation_shift1_average2',
    'precipitation_shift1_average3',
    'precipitation_average2',
    'precipitation_shift1_average4',
    'precipitation_average8',
    'precipitation_shift1_average5',
    'precipitation_average9',
    'precipitation_shift1',
    'precipitation_average10',
    'precipitation_shift1_average6',
    'precipitation_shift1_average7',
    'precipitation_shift1_average8',
]

In [None]:
# Derive year / month / day columns from every station's 'date' column (in place)
for station_frame in aquifer_by_stations.values():
    date_parts = station_frame['date'].dt
    station_frame['year'] = date_parts.year
    station_frame['month'] = date_parts.month
    station_frame['day'] = date_parts.day

##### Feature selection

In [None]:
# Reload the data from disk for feature selection. This replaces the in-memory
# object, so the year/month/day columns added earlier are not present from here on.
aquifer_by_stations = joblib.load('../data/interim/ground-water-and-weather.joblib')

In [None]:
# Settings for the feature-selection run (lengths are in rows of the station frame)
aquifer = 85065 # Key of the station used for feature selection
day_len = 365 # Trailing rows excluded from the training slice (together with val_len)
val_len = 100 # Length of validation set
horizon = 3 # Offset, in rows, between an input row and the target row it is paired with


In [None]:
# Inputs: all rows except the trailing day_len + horizon + val_len, without the target,
# the raw altitude and the date column.
X_train = aquifer_by_stations[aquifer][:-(day_len + horizon + val_len)].drop(columns=['altitude_diff', 'altitude', 'date'])
# Target: the same number of rows, taken `horizon` rows later, so row i of X_train
# is paired with altitude_diff at row i + horizon.
y_train = aquifer_by_stations[aquifer][features_target][horizon:-(day_len + val_len)]

In [None]:
# Add the "predictions": for each precipitation probability/intensity statistic,
# append the values 1..5 rows ahead as '<column>_shift-<n>' columns.
postfixes = ['avg', 'min', 'max']
for lead in range(1, 6):
    for postfix in postfixes:
        for quantity in ('probability', 'intensity'):
            source = f'precipitation_{quantity}_{postfix}'
            X_train[f"{source}_shift-{lead}"] = X_train[source].shift(-lead)
    

In [None]:
# Remove the last 5 rows: the negative shifts above (up to -5) have no future
# value there. X and y are trimmed together to keep them the same length.
# NOTE(review): not idempotent — re-running this cell removes another 5 rows.
X_train = X_train[:-5]
y_train = y_train[:-5]

In [None]:
from sklearn.model_selection import BaseCrossValidator

class Last365TimeSeriesSplit(BaseCrossValidator):
    """Single hold-out splitter: train on everything before the last
    ``test_size`` samples and test on those last ``test_size`` samples.

    Parameters
    ----------
    n_splits : int, default=1
        Value reported by ``get_n_splits``. ``split`` always yields exactly
        one (train, test) pair regardless of this value.
    test_size : int, default=365
        Number of trailing samples placed in the test fold.
    """

    def __init__(self, n_splits=1, test_size=365):
        self.n_splits = n_splits
        self.test_size = test_size

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured ``n_splits``; the arguments are ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield one ``(train_indices, test_indices)`` pair of positional indices.

        Raises
        ------
        ValueError
            If ``test_size`` is not strictly between 0 and ``len(X)``. Without
            this check a too-large ``test_size`` makes the slice start negative
            and silently produces folds of the wrong size, and the boundary
            values produce an empty train or test fold.
        """
        n_samples = len(X)
        if not 0 < self.test_size < n_samples:
            raise ValueError(
                f"test_size={self.test_size} must be greater than 0 and "
                f"smaller than the number of samples ({n_samples})."
            )
        indices = np.arange(n_samples)
        test_start = n_samples - self.test_size
        train_indices = indices[:test_start]
        test_indices = indices[test_start:]
        yield train_indices, test_indices

In [None]:
# Genetic-algorithm feature selection on X_train / y_train.
# The cross-validation scheme is the single hold-out split defined above
# (the TimeSeriesSplit alternative is kept commented out).
#tscv = TimeSeriesSplit(n_splits=2)

# Initialize the estimator whose r2 score drives the search
# NOTE(review): no random_state is set on the estimator, so results are presumably not
# reproducible from run to run — confirm whether that is intended.
model = GradientBoostingRegressor(n_estimators=50)

# Initialize the genetic-algorithm feature selector, capped at max_features columns
gafs = GAFeatureSelectionCV(
    estimator=model,
    cv=Last365TimeSeriesSplit(),
    scoring="r2",
    population_size=200,
    generations=25,
    n_jobs=-1,
    verbose=True,
    keep_top_k=5,
    elitism=True,
    max_features=40,  # Set the maximum number of features to select
    mutation_probability=0.2,
    crossover_probability=0.8
)

# Fit the feature selector (long-running: 200 individuals x 25 generations)
gafs.fit(X_train, y_train)

# Plot fitness evolution
plot_fitness_evolution(gafs)
# Output best features found
print("Best Features Found:", gafs.best_features_)

In [None]:
# Show the selector's per-column keep flags (zipped with X_train.columns below)
gafs.best_features_

In [None]:
# Collect the names of the columns the selector flagged as kept
best_features = []
for column, keep in zip(X_train.columns, gafs.best_features_):
    if keep:
        best_features.append(column)

In [None]:
# Show the selected column names
best_features

In [None]:
# Best features found with a 365-day validation window, recorded per estimator
'''
Linear regression
['cloud_cover_min_shift5',
 'precipitation_intensity_avg_shift3',
 'precipitation_intensity_max_shift2',
 'temperature_avg_average6',
 'cloud_cover_max_average7',
 'humidity_min_average4',
 'precipitation_probability_avg_average4',
 'precipitation_probability_max_average2',
 'precipitation_intensity_avg_average2',
 'altitude_diff_shift2_average3',
 'altitude_diff_shift5_average10',
 'altitude_diff_shift7_average2',
 'altitude_diff_shift9_average6',
 'altitude_diff_shift10_average4',
 'precipitation_shift10_average4',
 'temperature_min_shift8_average3',
 'temperature_min_shift9_average4',
 'temperature_max_shift2_average7',
 'temperature_max_shift3_average2',
 'cloud_cover_min_shift10_average2',
 'cloud_cover_max_shift3_average7',
 'humidity_avg_shift1_average3',
 'humidity_avg_shift2_average6',
 'humidity_avg_shift6_average10',
 'humidity_min_shift9_average3',
 'humidity_max_shift6_average7',
 'humidity_max_shift8_average5',
 'precipitation_probability_avg_shift8_average7',
 'precipitation_probability_min_shift1_average7',
 'precipitation_probability_min_shift4_average9',
 'precipitation_probability_max_shift1_average3',
 'precipitation_probability_max_shift2_average7',
 'precipitation_probability_max_shift7_average5',
 'precipitation_probability_max_shift10_average2',
 'precipitation_intensity_min_shift9_average6',
 'precipitation_intensity_avg_shift-1',
 'precipitation_probability_max_shift-1',
 'precipitation_intensity_max_shift-1',
 'precipitation_intensity_avg_shift-2']
 
 random forest
 ['precipitation_intensity_avg',
 'temperature_max_shift5',
 'cloud_cover_max_shift2',
 'humidity_avg_shift1',
 'precipitation_probability_min_shift7',
 'altitude_diff_average3',
 'temperature_max_average7',
 'cloud_cover_min_average2',
 'altitude_diff_shift6_average10',
 'precipitation_shift2_average5',
 'temperature_min_shift8_average2',
 'temperature_min_shift9_average3',
 'temperature_min_shift10_average9',
 'temperature_max_shift2_average3',
 'temperature_max_shift4_average9',
 'cloud_cover_avg_shift2_average6',
 'cloud_cover_avg_shift9_average6',
 'cloud_cover_avg_shift10_average10',
 'humidity_avg_shift3_average5',
 'humidity_avg_shift5_average9',
 'humidity_avg_shift8_average3',
 'humidity_max_shift4_average2',
 'humidity_max_shift6_average7',
 'precipitation_intensity_avg_shift6_average3',
 'precipitation_intensity_avg_shift8_average2',
 'precipitation_intensity_avg_shift-1',
 'precipitation_intensity_avg_shift-2']
 
 gradient boosting
 ['temperature_min',
 'snow_accumulation_average6',
 'temperature_avg_average2',
 'cloud_cover_max_average3',
 'humidity_avg_average4',
 'precipitation_probability_avg_average2',
 'precipitation_probability_max_average5',
 'precipitation_intensity_avg_average6',
 'snow_accumulation_shift1_average4',
 'temperature_min_shift1_average3',
 'temperature_min_shift3_average6',
 'temperature_min_shift9_average5',
 'temperature_max_shift4_average2',
 'temperature_max_shift6_average6',
 'humidity_avg_shift10_average2',
 'humidity_min_shift8_average2',
 'humidity_min_shift9_average8',
 'humidity_min_shift10_average5',
 'humidity_max_shift3_average10',
 'humidity_max_shift8_average8',
 'humidity_max_shift8_average9',
 'precipitation_probability_min_shift7_average4',
 'precipitation_probability_min_shift9_average10',
 'precipitation_probability_max_shift5_average10',
 'precipitation_probability_max_shift10_average3',
 'precipitation_intensity_avg_shift2_average2',
 'precipitation_intensity_min_shift5_average4',
 'precipitation_intensity_max_shift2_average10',
 'precipitation_intensity_max_shift4_average5',
 'precipitation_probability_avg_shift-1',
 'precipitation_probability_avg_shift-2',
 'precipitation_probability_max_shift-2',
 'precipitation_intensity_max_shift-3']
 
 gradient boosting n_estimators=50
 ['precipitation_average6',
 'temperature_max_average3',
 'humidity_min_average4',
 'humidity_min_average7',
 'precipitation_probability_avg_average3',
 'precipitation_probability_avg_average7',
 'precipitation_intensity_max_average2',
 'altitude_diff_shift4_average8',
 'temperature_max_shift7_average2',
 'cloud_cover_avg_shift1_average10',
 'cloud_cover_min_shift1_average4',
 'cloud_cover_min_shift1_average8',
 'cloud_cover_min_shift6_average4',
 'cloud_cover_max_shift5_average10',
 'cloud_cover_max_shift8_average5',
 'humidity_avg_shift3_average2',
 'humidity_avg_shift3_average9',
 'humidity_avg_shift8_average10',
 'humidity_avg_shift9_average7',
 'humidity_min_shift1_average4',
 'humidity_min_shift1_average6',
 'humidity_min_shift6_average4',
 'humidity_min_shift8_average5',
 'humidity_max_shift2_average8',
 'humidity_max_shift5_average4',
 'humidity_max_shift6_average4',
 'precipitation_probability_avg_shift9_average9',
 'precipitation_probability_min_shift10_average4',
 'precipitation_probability_max_shift10_average5',
 'precipitation_intensity_avg_shift10_average8',
 'precipitation_intensity_min_shift10_average8',
 'precipitation_intensity_max_shift6_average6',
 'precipitation_probability_avg_shift-1',
 'precipitation_intensity_avg_shift-2']
 '''

In [None]:
# Best features found for linear regression only
'''
 ['temperature_avg_shift7',
 'humidity_max_shift6',
 'altitude_diff_average2',
 'precipitation_average2',
 'temperature_min_average2',
 'temperature_min_average4',
 'cloud_cover_avg_average9',
 'precipitation_intensity_avg_average2',
 'precipitation_intensity_min_average5',
 'altitude_diff_shift6_average8',
 'precipitation_shift10_average9',
 'temperature_min_shift3_average6',
 'temperature_max_shift1_average10',
 'temperature_max_shift5_average10',
 'temperature_max_shift9_average5',
 'cloud_cover_avg_shift5_average7',
 'cloud_cover_avg_shift8_average4',
 'humidity_min_shift3_average10',
 'humidity_min_shift4_average3',
 'humidity_min_shift5_average2',
 'humidity_min_shift7_average6',
 'humidity_min_shift9_average3',
 'humidity_min_shift10_average5',
 'humidity_max_shift5_average5',
 'precipitation_intensity_avg_shift1_average6',
 'precipitation_intensity_avg_shift2_average3',
 'precipitation_intensity_avg_shift2_average5',
 'precipitation_intensity_avg_shift3_average9',
 'precipitation_intensity_max_shift1_average8',
 'precipitation_intensity_avg_shift-1',
 'precipitation_intensity_max_shift-1',
 'precipitation_probability_avg_shift-2',
 'precipitation_probability_max_shift-2',
 'precipitation_intensity_max_shift-2']
'''

In [None]:
# Selected feature names as reported by the selector itself
gafs.get_feature_names_out(X_train.columns)

##### Hyperparameter tuning

In [None]:
# Feature set recorded from a feature-selection run, plus manually added forecast columns.
# NOTE(review): when the notebook is run top to bottom, this assignment is immediately
# overwritten by the `best_features` list in the following cell.
best_features = ['precipitation_intensity_avg',
 'altitude_diff_shift3',
 'altitude_diff_average2',
 'precipitation_average2',
 'precipitation_intensity_avg_average2',
 'precipitation_shift1_average3',
 'precipitation_shift3_average5',
 'snow_accumulation_shift3_average6',
 'temperature_avg_shift6_average2',
 'temperature_avg_shift9_average9',
 'temperature_min_shift3_average3',
 'temperature_min_shift7_average6',
 'temperature_min_shift10_average3',
 'temperature_max_shift7_average2',
 'cloud_cover_avg_shift3_average5',
 'cloud_cover_avg_shift5_average9',
 'cloud_cover_max_shift10_average8',
 'humidity_avg_shift7_average2',
 'humidity_min_shift1_average10',
 'humidity_min_shift2_average10',
 'humidity_min_shift6_average3',
 'humidity_max_shift5_average9',
 'precipitation_intensity_min_shift6_average10',
 'precipitation_intensity_min_shift9_average4',
 'precipitation_intensity_min_shift10_average10',
 'precipitation_intensity_max_shift3_average5',
 'precipitation_intensity_avg_shift-1',
 'precipitation_intensity_avg_shift-2',
 # manually added features (forecast shifts -3 to -5)
 'precipitation_intensity_avg_shift-3',
 'precipitation_intensity_avg_shift-4',
 'precipitation_intensity_avg_shift-5']

In [None]:
# Best features with the larger validation window (the random forest list recorded above, plus manually added forecast shifts)
best_features = ['precipitation_intensity_avg',
 'temperature_max_shift5',
 'cloud_cover_max_shift2',
 'humidity_avg_shift1',
 'precipitation_probability_min_shift7',
 'altitude_diff_average3',
 'temperature_max_average7',
 'cloud_cover_min_average2',
 'altitude_diff_shift6_average10',
 'precipitation_shift2_average5',
 'temperature_min_shift8_average2',
 'temperature_min_shift9_average3',
 'temperature_min_shift10_average9',
 'temperature_max_shift2_average3',
 'temperature_max_shift4_average9',
 'cloud_cover_avg_shift2_average6',
 'cloud_cover_avg_shift9_average6',
 'cloud_cover_avg_shift10_average10',
 'humidity_avg_shift3_average5',
 'humidity_avg_shift5_average9',
 'humidity_avg_shift8_average3',
 'humidity_max_shift4_average2',
 'humidity_max_shift6_average7',
 'precipitation_intensity_avg_shift6_average3',
 'precipitation_intensity_avg_shift8_average2',
 'precipitation_intensity_avg_shift-1',
 'precipitation_intensity_avg_shift-2',
 #manually added
 'precipitation_intensity_avg_shift-3',
 'precipitation_intensity_avg_shift-4',
 'precipitation_intensity_avg_shift-5',
 ]

In [None]:
# Keys of the stations used for hyperparameter tuning
aquifers_list = [85065, 85064]

In [None]:
# Add the predictions
def weather_forecast(X_train, max_shift=5):
    """Return a copy of ``X_train`` with future precipitation values added as columns.

    For every statistic in ``avg``/``min``/``max`` and every lead ``i`` in
    ``1..max_shift``, two columns are appended:
    ``precipitation_probability_<stat>_shift-<i>`` and
    ``precipitation_intensity_<stat>_shift-<i>``, each holding the source
    column shifted ``i`` rows towards the past (``Series.shift(-i)``).

    Parameters
    ----------
    X_train : pandas.DataFrame
        Must contain the ``precipitation_probability_<stat>`` and
        ``precipitation_intensity_<stat>`` columns. It is not modified.
    max_shift : int, default=5
        Largest lead to generate; the default reproduces the original
        hard-coded range of 1..5.

    Returns
    -------
    pandas.DataFrame
        New frame with the shifted columns appended. The last ``max_shift``
        rows contain NaN in the new columns, so callers usually trim them.
    """
    postfixes = ['avg', 'min', 'max']
    future_columns = {}
    for i in range(1, max_shift + 1):
        for postfix in postfixes:
            # Probability first, then intensity: keeps the original column order
            for quantity in ('probability', 'intensity'):
                source = f'precipitation_{quantity}_{postfix}'
                future_columns[f"{source}_shift-{i}"] = X_train[source].shift(-i)
    # assign() builds a new frame, so the caller's frame is left untouched
    return X_train.assign(**future_columns)

In [None]:
# Add the shifted forecast columns to each listed station, then drop the last 5 rows
# (the negative shifts, up to -5, have no future value there).
# NOTE(review): not idempotent — re-running this cell trims another 5 rows per station.
for aquifer in aquifers_list:
    aquifer_by_stations[aquifer] = weather_forecast(aquifer_by_stations[aquifer])
    aquifer_by_stations[aquifer] = aquifer_by_stations[aquifer][:-5]

In [None]:
# Window sizes read by the Optuna objective (all in rows)
horizon_max = 5 # Horizons 1..horizon_max are evaluated
day_len = 100 # Number of predicted rows scored in each trial
test_len = 365 # Trailing rows kept out of tuning, reserved for final testing

In [None]:
# Define the function which contains parameters to tune and the model

def objective(trial):
    """Optuna objective: mean R² of a random forest over stations and horizons.

    Reads the notebook-level names ``aquifers_list``, ``aquifer_by_stations``,
    ``best_features``, ``features_target``, ``horizon_max``, ``day_len`` and
    ``test_len``. For every station and every horizon in ``1..horizon_max`` the
    model is trained on all rows before the last ``day_len + test_len`` rows and
    scored on the ``day_len`` rows that precede the final ``test_len`` rows.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies ``n_estimators``, ``max_depth`` and ``max_features``.

    Returns
    -------
    float
        Mean over horizons of the per-horizon mean R² (to be maximised).
    """
    n_estimators = trial.suggest_int('n_estimators', 10, 500)
    max_depth = trial.suggest_categorical('max_depth', [None, 10, 20, 30, 50])
    max_features = trial.suggest_categorical('max_features', ["sqrt", "log2", 0.5, 1.0])

    # Initialize the RandomForestRegressor
    model = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  max_features=max_features,
                                  n_jobs=6,
                                  random_state=42)

    # One list of r2 results per prediction horizon
    r2_scores = [[] for _ in range(horizon_max)]

    for aquifer in aquifers_list:
        station = aquifer_by_stations[aquifer]

        for horizon in range(1, horizon_max + 1):
            # Inputs end `horizon` rows before their targets, so row i of X
            # is paired with the target at row i + horizon.
            X_train = station[best_features][:-(day_len + horizon + test_len)]
            y_train = station[features_target][horizon:-(day_len + test_len)]

            X_test = station[best_features][-(day_len + horizon + test_len):-(horizon + test_len)]
            y_test = station[features_target][-(day_len + test_len):-test_len]

            # Flatten the single-column target to 1-D; passing the DataFrame
            # makes scikit-learn emit a DataConversionWarning on every fit.
            model.fit(X_train, y_train.to_numpy().ravel())

            # Make predictions
            forecast = model.predict(X_test).tolist()

            # Calculate and save the r2 score
            r2_scores[horizon-1].append(r2_score(y_test, forecast))

    # Average per horizon; iterating the lists follows horizon_max instead of a fixed 5
    r2_average = [np.mean(scores) for scores in r2_scores]

    # The study maximises the average of the per-horizon averages
    return np.mean(r2_average)

In [None]:
# Run the Optuna study: maximise the objective (mean r2) over 30 trials
# NOTE(review): no sampler seed is passed, so the search is presumably not reproducible — confirm.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

In [None]:
# Hyperparameters of the best trial
study.best_params

In [None]:
# Objective value (mean r2) of the best trial
study.best_value

##### Testing on multiple stations

In [None]:
# Reload the data from disk for the final evaluation. This discards the forecast
# columns and the 5-row trim applied to the in-memory frames during tuning.
aquifer_by_stations = joblib.load('../data/interim/ground-water-and-weather.joblib')

In [None]:
# Target column, kept as a one-element list (selecting with it yields a one-column DataFrame)
features_target = ['altitude_diff']

In [None]:
# Derive year / month / day columns from every station's 'date' column (in place)
for station_frame in aquifer_by_stations.values():
    date_parts = station_frame['date'].dt
    station_frame['year'] = date_parts.year
    station_frame['month'] = date_parts.month
    station_frame['day'] = date_parts.day

In [None]:
# Best features with the larger validation window (same list as in the hyperparameter-tuning section)
best_features = ['precipitation_intensity_avg',
 'temperature_max_shift5',
 'cloud_cover_max_shift2',
 'humidity_avg_shift1',
 'precipitation_probability_min_shift7',
 'altitude_diff_average3',
 'temperature_max_average7',
 'cloud_cover_min_average2',
 'altitude_diff_shift6_average10',
 'precipitation_shift2_average5',
 'temperature_min_shift8_average2',
 'temperature_min_shift9_average3',
 'temperature_min_shift10_average9',
 'temperature_max_shift2_average3',
 'temperature_max_shift4_average9',
 'cloud_cover_avg_shift2_average6',
 'cloud_cover_avg_shift9_average6',
 'cloud_cover_avg_shift10_average10',
 'humidity_avg_shift3_average5',
 'humidity_avg_shift5_average9',
 'humidity_avg_shift8_average3',
 'humidity_max_shift4_average2',
 'humidity_max_shift6_average7',
 'precipitation_intensity_avg_shift6_average3',
 'precipitation_intensity_avg_shift8_average2',
 'precipitation_intensity_avg_shift-1',
 'precipitation_intensity_avg_shift-2',
 #manually added
 'precipitation_intensity_avg_shift-3',
 'precipitation_intensity_avg_shift-4',
 'precipitation_intensity_avg_shift-5',
 ]

In [None]:
# Keys of the stations evaluated in this section
aquifers_list = [85065, 85064]

In [None]:
# Add the predictions
def weather_forecast(X_train, max_shift=5):
    """Return a copy of ``X_train`` with future precipitation values added as columns.

    For every statistic in ``avg``/``min``/``max`` and every lead ``i`` in
    ``1..max_shift``, two columns are appended:
    ``precipitation_probability_<stat>_shift-<i>`` and
    ``precipitation_intensity_<stat>_shift-<i>``, each holding the source
    column shifted ``i`` rows towards the past (``Series.shift(-i)``).

    Parameters
    ----------
    X_train : pandas.DataFrame
        Must contain the ``precipitation_probability_<stat>`` and
        ``precipitation_intensity_<stat>`` columns. It is not modified.
    max_shift : int, default=5
        Largest lead to generate; the default reproduces the original
        hard-coded range of 1..5.

    Returns
    -------
    pandas.DataFrame
        New frame with the shifted columns appended. The last ``max_shift``
        rows contain NaN in the new columns, so callers usually trim them.
    """
    postfixes = ['avg', 'min', 'max']
    future_columns = {}
    for i in range(1, max_shift + 1):
        for postfix in postfixes:
            # Probability first, then intensity: keeps the original column order
            for quantity in ('probability', 'intensity'):
                source = f'precipitation_{quantity}_{postfix}'
                future_columns[f"{source}_shift-{i}"] = X_train[source].shift(-i)
    # assign() builds a new frame, so the caller's frame is left untouched
    return X_train.assign(**future_columns)

In [None]:
# Add the shifted forecast columns to each listed station, then drop the last 5 rows
# (the negative shifts, up to -5, have no future value there).
# NOTE(review): not idempotent — re-running this cell trims another 5 rows per station.
for aquifer in aquifers_list:
    aquifer_by_stations[aquifer] = weather_forecast(aquifer_by_stations[aquifer])
    aquifer_by_stations[aquifer] = aquifer_by_stations[aquifer][:-5]

In [None]:
# Hand-copied hyperparameters; the same values are passed to the model in the next cell
'''{'n_estimators': 164, 'max_depth': 20, 'max_features': 0.5}'''

In [None]:
# Initialize the RandomForestRegressor with the tuned hyperparameters
model = RandomForestRegressor(n_estimators= 164,
                             max_depth=20,
                             max_features=0.5,
                             n_jobs=-1,
                             random_state=42)

horizon_max = 5  # horizons 1..horizon_max are evaluated
day_len = 365    # number of trailing rows used as the test window

# List for r2 results for different prediction horizons (one inner list per horizon)
r2_scores = [[] for _ in range(horizon_max)]

# Dictionary for storing the predictions for all of the stations
predictions_by_stations = {key: [] for key in aquifers_list}

for aquifer in aquifers_list:
    # Forecasts of this station, one list per horizon
    predictions = []

    for horizon in range (1, horizon_max+1, 1):
        # Define the train and test set. Inputs end `horizon` rows before their
        # targets, so row i of X is paired with the target at row i + horizon.
        X_train = aquifer_by_stations[aquifer][best_features][:-(day_len + horizon)]
        y_train = aquifer_by_stations[aquifer][features_target][horizon:-day_len]

        X_test = aquifer_by_stations[aquifer][best_features][-(day_len + horizon):-horizon]
        y_test = aquifer_by_stations[aquifer][features_target][-day_len:]

        # Train the model. The single-column target is flattened to 1-D because
        # passing the DataFrame makes scikit-learn emit a DataConversionWarning.
        model.fit(X_train, y_train.to_numpy().ravel())

        # Make predictions
        forecast = model.predict(X_test).tolist()

        # Store to the predictions
        predictions.append(forecast)
        
        # Calculate and save the r2 score
        r2_scores[horizon-1].append(r2_score(y_test, forecast))

    # Store the predictions to the dictionary
    predictions_by_stations[aquifer] = predictions

In [None]:
# Plot the last 200 rows of the true altitude_diff against the horizon-1 forecast.
# NOTE(review): `aquifer` and `predictions` are left over from the evaluation loop, so they
# refer to the last station in aquifers_list. With aquifers_list = [85065, 85064] the tomato
# line (predictions[0]) and the grey line (predictions_by_stations[85064][0]) are the same
# series — confirm which stations were meant to be compared.
plt.figure(figsize=(8, 4))
plt.plot(aquifer_by_stations[aquifer]['date'][-200:], aquifer_by_stations[aquifer]['altitude_diff'][-200:], color="royalblue", label="true data")
plt.plot(aquifer_by_stations[aquifer]['date'][-200:], predictions[0][-200:], color="tomato", label="forecast")
#plt.plot(aquifer_by_stations[aquifer]['date'][-200:], predictions[2][-200:], color="green", label="forecast")
#plt.plot(aquifer_by_stations[aquifer]['date'][-200:], predictions[4][-200:], color="grey", label="forecast")
#plt.plot(aquifer_by_stations[aquifer]['date'][-200:], aquifer_by_stations[aquifer]['precipitation_probability_max'][-200:].apply(lambda x: x/20), color="brown", label="forecast")
#plt.plot(aquifer_by_stations[aquifer]['date'][-200:], aquifer_by_stations[aquifer]['precipitation'][-200:].apply(lambda x: x/130), color="olive", label="forecast")
plt.plot(aquifer_by_stations[aquifer]['date'][-200:], predictions_by_stations[85064][0][-200:], color="grey", label="forecast")
#plt.savefig('../data/interim/plot.svg', format='svg')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Mean and standard deviation of the r2 scores across stations, per prediction horizon.
# Iterating over r2_scores itself keeps this in step with horizon_max instead of a fixed 5.
r2_average = [np.mean(horizon_scores) for horizon_scores in r2_scores]
std_dev = [np.std(horizon_scores) for horizon_scores in r2_scores]

In [None]:
# Mean r2 per prediction horizon
r2_average

In [None]:
# Write the per-horizon mean r2 scores to a text file, one value per line
with open('../reports/randomforest/randomforest-ground-water-r2.txt', 'w') as file:
    file.writelines(f"{item}\n" for item in r2_average)

In [None]:
# Write the per-horizon standard deviations to a text file, one value per line
with open('../reports/randomforest/randomforest-ground-water-std-dev.txt', 'w') as file:
    file.writelines(f"{item}\n" for item in std_dev)

In [None]:
# r2_scores is indexed [horizon][station]; flip it to [station][horizon]
r2_scores_transposed = list(map(list, zip(*r2_scores)))
# Map each station to its list of per-horizon r2 scores
scores = {
    station: station_scores
    for station, station_scores in zip(aquifers_list, r2_scores_transposed)
}
scores

In [None]:
# Order the stations by their first-horizon r2 score, lowest first
scores_sorted = dict(sorted(scores.items(), key=lambda pair: pair[1][0]))
scores_sorted

In [None]:
# Save the sorted per-station r2 scores (station -> list of per-horizon scores)
joblib.dump(scores_sorted, '../reports/randomforest/randomforest-ground-water-r2-stations.joblib')

In [None]:
# Save the dictionary with predictions (station -> list of per-horizon forecast lists)
joblib.dump(predictions_by_stations, '../reports/randomforest/randomforest-ground-water-predictions.joblib')

##### Troubleshooting

###### <span style="font-size:1.5em;">Testing shifts and averages</span>

In [None]:
# Pick station 85064's frame for the troubleshooting plots below.
# NOTE(review): `aquifer` held a station id in earlier cells; from here it holds a DataFrame.
aquifer = aquifer_by_stations[85064]

In [None]:
# Compare the target with its lagged 2-row averages over rows -60..-30.
# Each lagged series is labelled with its own column name; the original legend
# called all three of them "forecast", which made them indistinguishable.
plt.figure(figsize=(8, 4))
window = slice(-60, -30)
plt.plot(aquifer['date'][window], aquifer['altitude_diff'][window], color="royalblue", label="true data")
for column, color in [
    ('altitude_diff_shift1_average2', "tomato"),
    ('altitude_diff_shift2_average2', "green"),
    ('altitude_diff_shift3_average2', "orange"),
]:
    plt.plot(aquifer['date'][window], aquifer[column][window], color=color, label=column)
plt.legend()
plt.grid()
plt.show()

###### <span style="font-size:1.5em;">Testing the weather data</span>

In [None]:
# Overlay the target with precipitation over rows 200..300 to sanity-check the weather data
plt.figure(figsize=(8, 4))
plt.plot(aquifer['date'][200:300], aquifer['altitude_diff'][200:300], color="royalblue", label="true data")
#plt.plot(aquifer['date'][:60], aquifer['altitude_diff_shift1_average2'][:60], color="tomato", label="forecast")
#plt.plot(aquifer['date'][:60], aquifer['altitude_diff_shift2_average2'][:60], color="green", label="forecast")
#plt.plot(aquifer['date'][:60], aquifer['altitude_diff_shift3_average2'][:60], color="orange", label="forecast")
# Precipitation is divided by 30 (presumably so both series fit on one axis) and is
# labelled as such; the original legend called this series "true data" as well.
plt.plot(aquifer['date'][200:300], aquifer['precipitation'][200:300].apply(lambda x: x/30), color="tomato", label="precipitation / 30")
plt.legend()
plt.grid()
plt.show()

###### <span style="font-size:1.5em;">Testing index shifts</span>

In [None]:
# Columns that we want to keep in the data: the target and the model inputs.
# The order of features_train is kept exactly, since it fixes the column order fed to the models.
features_target = ['altitude_diff']
features_train = [
    # calendar columns
    'year', 'month', 'day',
    # altitude_diff shift columns 1..5
    *[f'altitude_diff_shift{lag}' for lag in range(1, 6)],
    # precipitation-derived columns
    'precipitation_average4',
    'precipitation_average3',
    'precipitation_average5',
    'precipitation_average6',
    'precipitation_average7',
    'precipitation_shift1_average2',
    'precipitation_shift1_average3',
    'precipitation_average2',
    'precipitation_shift1_average4',
    'precipitation_average8',
    'precipitation_shift1_average5',
    'precipitation_average9',
    'precipitation_shift1',
    'precipitation_average10',
    'precipitation_shift1_average6',
    'precipitation_shift1_average7',
    'precipitation_shift1_average8',
]

In [None]:
# Only one station is used for the index-shift check
aquifers_list = [85065]

In [None]:
# Add a 'day_shift' column that is a plain copy of 'day' for each listed station.
# NOTE(review): no shift is applied here despite the name — confirm whether one was intended.
for aquifer in aquifers_list:
    aquifer_by_stations[aquifer]['day_shift'] = aquifer_by_stations[aquifer]['day']

In [None]:
# Initialize the RandomForestRegressor
model = RandomForestRegressor(n_estimators= 51,
                             criterion= 'squared_error',
                             max_depth= 20,
                             min_samples_split= 9,
                             min_samples_leaf= 4,
                             max_features= 0.36336466788790966,
                             #bootstrap= True,
                             oob_score= True,
                             max_leaf_nodes= 10,
                             min_impurity_decrease= 0.0,
                             ccp_alpha= 0.0,
                             #max_samples= None,
                             n_jobs=6,
                             random_state=42)

horizon_max = 5  # horizons 1..horizon_max are evaluated
day_len = 365    # number of trailing rows used as the test window

# List for r2 results for different prediction horizons (one inner list per horizon)
r2_scores = [[] for _ in range(horizon_max)]

# List for storing the predictions (useful for visualization)
predictions = []

for aquifer in aquifers_list:
    # Reset per station, so after the loop this holds the last station's forecasts
    predictions = []

    for horizon in range (1, horizon_max+1, 1):
        # Define the train and test set. Inputs end `horizon` rows before their
        # targets, so row i of X is paired with the target at row i + horizon.
        X_train = aquifer_by_stations[aquifer][features_train][:-(day_len + horizon)]
        y_train = aquifer_by_stations[aquifer][features_target][horizon:-day_len]

        X_test = aquifer_by_stations[aquifer][features_train][-(day_len + horizon):-horizon]
        y_test = aquifer_by_stations[aquifer][features_target][-day_len:]

        # Train the model. The single-column target is flattened to 1-D because
        # passing the DataFrame makes scikit-learn emit a DataConversionWarning.
        model.fit(X_train, y_train.to_numpy().ravel())

        # Make predictions
        forecast = model.predict(X_test).tolist()

        # Store to the predictions
        predictions.append(forecast)
        
        # Calculate and save the r2 score
        r2_scores[horizon-1].append(r2_score(y_test, forecast))

In [None]:
# First 10 test targets from the last loop iteration (horizon = horizon_max)
y_test[:10]

In [None]:
# First 10 test inputs from the last loop iteration, to compare row alignment with y_test
X_test[:10]

In [None]:
# Last 10 training targets from the last loop iteration
y_train[-10:]

In [None]:
# Last 10 training inputs from the last loop iteration
X_train[-10:]

In [None]:
# Plot the last 200 rows of the true altitude_diff against the horizon-1 forecast.
# `aquifer` and `predictions` are the values left by the loop above (its last station).
plt.figure(figsize=(8, 4))
plt.plot(aquifer_by_stations[aquifer]['date'][-200:], aquifer_by_stations[aquifer]['altitude_diff'][-200:], color="royalblue", label="true data")
plt.plot(aquifer_by_stations[aquifer]['date'][-200:], predictions[0][-200:], color="tomato", label="forecast")
#plt.plot(aquifer_by_stations[aquifer]['date'][-200:], predictions[1][-200:], color="green", label="forecast")
#plt.plot(aquifer_by_stations[aquifer]['date'][-200:], predictions[2][-200:], color="orange", label="forecast")
plt.legend()
plt.grid()
plt.show()

###### <span style="font-size:1.5em;">Testing index shifts on sinusoid data</span>

In [None]:
# Synthetic test signal: a small sine wave oscillating around 1
num_points = 1000
frequency = 1
amplitude = 0.01  # Amplitude of the sine wave

# 1000 evenly spaced points covering 0 to 20*pi
time_a = np.linspace(0 , 20 * np.pi, num_points)

# Sine wave, then the same wave lifted by 1 as the 'absolute' column
sinusoid = amplitude * np.sin(frequency * time_a)
shifted_sinusoid = pd.DataFrame({'absolute': sinusoid + 1})

# First difference and the values from 1 to 4 rows earlier
shifted_sinusoid['diff'] = shifted_sinusoid['absolute'].diff()
for lag in range(1, 5):
    shifted_sinusoid[f'absolute_shift{lag}'] = shifted_sinusoid['absolute'].shift(lag)

In [None]:
# Drop the leading rows that contain NaN from diff()/shift() (the first 4 rows).
# Unlike slicing with [4:], this is safe to re-run: a second run removes nothing more.
shifted_sinusoid = shifted_sinusoid.dropna()

In [None]:
# Inspect the resulting frame
shifted_sinusoid

In [None]:
# Plot the last 50 rows of the sinusoid together with its 1-row and 2-row lagged copies.
# NOTE(review): all three curves share the legend label 'Shifted Sinusoid', and the red lagged
# curve uses the same colour as the dashed y=1 reference line.
plt.plot(shifted_sinusoid['absolute'][-50:], label='Shifted Sinusoid')
plt.plot(shifted_sinusoid['absolute_shift1'][-50:], color='red', label='Shifted Sinusoid')
plt.plot(shifted_sinusoid['absolute_shift2'][-50:], color='orange', label='Shifted Sinusoid')
plt.title('Shifted Sinusoidal Curve Around 1')
plt.xlabel('Time')
plt.ylabel('Value')
plt.axhline(y=1, color='r', linestyle='--', label='y=1')
plt.legend()
plt.show()

In [None]:
# Random forest regressor; only the forest size, parallelism and seed are set
model = RandomForestRegressor(n_estimators=1000, n_jobs=6, random_state=42)

horizon_max = 5
day_len = 100

# One list of r2 results per prediction horizon
r2_scores = [[] for _ in range(horizon_max)]

# Forecasts per horizon (useful for visualization)
predictions = []

# Model inputs: the first difference and the four lagged values
sinusoid_features = ['diff', 'absolute_shift1', 'absolute_shift2', 'absolute_shift3', 'absolute_shift4']

for horizon in range(1, horizon_max + 1):
    # Inputs end `horizon` rows before their targets
    X_train = shifted_sinusoid[sinusoid_features][:-(day_len + horizon)]
    y_train = shifted_sinusoid['absolute'][horizon:-day_len]
    X_test = shifted_sinusoid[sinusoid_features][-(day_len + horizon):-horizon]
    y_test = shifted_sinusoid['absolute'][-day_len:]

    # Fit, forecast the test window and keep the forecast
    model.fit(X_train, y_train)
    forecast = model.predict(X_test).tolist()
    predictions.append(forecast)

    # Score this horizon
    r2_scores[horizon - 1].append(r2_score(y_test, forecast))

In [None]:
# Plot the last 100 true values against the horizon-5 forecast (predictions[4]).
# The forecast is plotted without explicit x values, so it uses positions 0..99 like the true data.
plt.figure(figsize=(8, 4))
plt.plot(list(range(100)), shifted_sinusoid['absolute'][-100:], color="royalblue", label="true data")
#plt.plot(predictions[0][-25:], color="red", label="forecast")
#plt.plot(predictions[1][-25:], color="green", label="forecast")
#plt.plot(predictions[2][-25:], color="purple", label="forecast")
#plt.plot(predictions[3][-25:], color="orange", label="forecast")
plt.plot(predictions[4][-100:], color="brown", label="forecast")
plt.legend()
plt.grid()
plt.show()

###### <span style="font-size:1.5em;">Testing with only one prediction horizon</span>

In [None]:
# Initialize the RandomForestRegressor (the commented-out hyperparameters keep their defaults)
model = RandomForestRegressor(n_estimators= 2000,
                             #criterion= 'squared_error',
                             #max_depth= 20,
                             #min_samples_split= 9,
                             #min_samples_leaf= 4,
                             #max_features= 0.36336466788790966,
                             #bootstrap= True,
                             #oob_score= True,
                             #max_leaf_nodes= 10,
                             #min_impurity_decrease= 0.0,
                             #ccp_alpha= 0.0,
                             #max_samples= None,
                             n_jobs=6,
                             random_state=42)

horizon = 5
day_len = 200

# List for storing the predictions (useful for visualization)
predictions = []
# Define the train and test set: inputs end `horizon` rows before the targets
# they are paired with; the last day_len rows are the test targets.
# NOTE(review): inputs and targets are both row slices of the whole
# shifted_sinusoid object, with no column selection - confirm this is intended.
X_train = shifted_sinusoid[:-(day_len + horizon)]
y_train = shifted_sinusoid[horizon:-day_len]
X_test = shifted_sinusoid[-(day_len + horizon):-horizon]
y_test = shifted_sinusoid[-day_len:]
# Train the model
model.fit(X_train, y_train)
# Make predictions
forecast = model.predict(X_test).tolist()
# Store to the predictions
predictions.append(forecast)

# Calculate and save the r2 score
# NOTE(review): r2_scores is not created in this cell; the score is appended to
# whatever list an earlier cell left behind.
r2_scores[horizon-1].append(r2_score(y_test, forecast))

In [None]:
predictions[0][-5:]

In [None]:
X_test[-5:]

In [None]:
y_test[-5:]

In [None]:
X_train[-5:]

In [None]:
y_train[-5:]

In [None]:
# Last 200 true values against the only stored forecast
fig, ax = plt.subplots(figsize=(8, 4))
true_tail = shifted_sinusoid[-200:]
ax.plot(list(range(200)), true_tail, color="royalblue", label="true data")
ax.plot(predictions[0][-200:], color="brown", label="forecast")
ax.legend()
ax.grid()
plt.show()

###### <span style="font-size:1.5em;">Testing on simpler data</span>

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/AileenNielsen/TimeSeriesAnalysisWithPython/master/data/AirPassengers.csv")

# We create shifts (lags) of the #Passengers.
# shift(k) with k > 0 puts the value observed k rows earlier into row t. The
# previous shift(-k) put the value of row t+k there instead, so for horizons
# 1-4 one of the features was the target itself (leakage) and the last rows of
# the test inputs were NaN.
for lag in range(1, 5):
    df[f'#Passengers_shift{lag}'] = df['#Passengers'].shift(lag)

# Keep the last two characters of Month as an integer feature
df['Month'] = df['Month'].str[-2:].astype(int)

# The first four rows have no complete lag history; drop them and restore a
# 0..n-1 index so positional plotting against the frame still lines up.
df = df.dropna().reset_index(drop=True)

# Initialize the RandomForestRegressor
model = RandomForestRegressor(n_estimators=100,
                              n_jobs=6,
                              random_state=42)

horizon_max = 5
day_len = 30

# List for r2 results for different prediction horizons
r2_scores = [[] for _ in range(horizon_max)]

# List for storing the predictions (useful for visualization)
predictions = []

passenger_features = ['Month', '#Passengers_shift1', '#Passengers_shift2',
                      '#Passengers_shift3', '#Passengers_shift4']

for horizon in range(1, horizon_max + 1):
    # Define the train and test set: inputs of row t are paired with the
    # target of row t + horizon; the last day_len targets are the test set.
    X_train = df[passenger_features][:-(day_len + horizon)]
    y_train = df['#Passengers'][horizon:-day_len]
    X_test = df[passenger_features][-(day_len + horizon):-horizon]
    y_test = df['#Passengers'][-day_len:]
    # Train the model
    model.fit(X_train, y_train)
    # Make predictions
    forecast = model.predict(X_test).tolist()
    # Store to the predictions
    predictions.append(forecast)

    # Calculate and save the r2 score
    r2_scores[horizon - 1].append(r2_score(y_test, forecast))




In [None]:
df

In [None]:
# x positions of the test period: the last day_len rows of df
n_rows = len(df['#Passengers'])
forecast_index = list(range(n_rows - day_len, n_rows))

plt.figure(figsize=(8, 4))
plt.plot(df["#Passengers"][-day_len:], color="royalblue", label="historical data")
# predictions[k] holds the forecast for horizon k + 1; give every curve its own
# legend entry so the horizons can be told apart.
horizon_colors = ["tomato", "green", "orange", "grey", "brown"]
for step, (forecast_values, color) in enumerate(zip(predictions, horizon_colors), start=1):
    plt.plot(forecast_index, forecast_values[-day_len:], color=color, label=f"forecast (horizon {step})")
plt.legend()
plt.grid()
plt.show()

In [None]:
r2_scores

###### <span style="font-size:1.5em;">Plotting precipitation</span>

In [None]:
# Last 200 rows of three precipitation series, each divided by a fixed constant
# so that they share one axis.
# NOTE(review): `aquifer` is not set in this cell; it must come from an earlier cell.
recent = aquifer_by_stations[aquifer][-200:]

plt.figure(figsize=(8, 4))
plt.plot(recent['date'], recent['precipitation_probability_max'] / 20, color="brown", label="precipitation_probability_max / 20")
plt.plot(recent['date'], recent['precipitation'] / 130, color="olive", label="precipitation / 130")
plt.plot(recent['date'], recent['precipitation_intensity_max'] / 130, color="salmon", label="precipitation_intensity_max / 130")
plt.legend()
plt.grid()
# Save after the legend and grid are drawn so the SVG contains them too
plt.savefig('../data/interim/plot.svg', format='svg')
plt.show()

##### Final prediction pipeline

In [None]:
# Function to add the predictions
def weather_forecast(X_train):
    """Return a copy of ``X_train`` with look-ahead weather columns added.

    For every look-ahead ``i`` in 1..5 and every postfix in avg/min/max, two
    columns are added:

    * ``precipitation_probability_{postfix}_shift-{i}``
    * ``precipitation_intensity_{postfix}_shift-{i}``

    Each is the source column moved ``i`` rows up (``shift(-i)``), so row ``t``
    carries the value of row ``t + i`` and the last ``i`` rows are NaN.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Must contain ``precipitation_probability_{avg,min,max}`` and
        ``precipitation_intensity_{avg,min,max}``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unchanged.
    """
    postfixes = ['avg', 'min', 'max']
    # Work on a copy so the caller's frame (possibly a slice of another frame)
    # is never modified behind its back.
    extended = X_train.copy()
    for i in range(1, 6):
        for postfix in postfixes:
            extended[f"precipitation_probability_{postfix}_shift-{i}"] = X_train[f'precipitation_probability_{postfix}'].shift(-i)
            extended[f"precipitation_intensity_{postfix}_shift-{i}"] = X_train[f'precipitation_intensity_{postfix}'].shift(-i)
    return extended

In [None]:
aquifers_list = [85065, 85064]

In [None]:
for aquifer in aquifers_list:
    # Add the look-ahead columns, then cut the 5 trailing rows
    extended_frame = weather_forecast(aquifer_by_stations[aquifer])
    aquifer_by_stations[aquifer] = extended_frame[:-5]

In [None]:
'''{'n_estimators': 51,
 'criterion': 'squared_error',
 'max_depth': 20,
 'min_samples_split': 9,
 'min_samples_leaf': 4,
 'max_features': 0.36336466788790966,
 'oob_score': True,
 'max_leaf_nodes': 10,
 'min_impurity_decrease': 0.0,
 'ccp_alpha': 0.0}'''

In [None]:
# Initialize the RandomForestRegressor (all other hyperparameters keep their defaults)
model = RandomForestRegressor(n_estimators=200,
                              max_features='sqrt',
                              n_jobs=6,
                              random_state=42)

horizon_max = 5
day_len = 365

# List for r2 results for different prediction horizons
r2_scores = [[] for _ in range(horizon_max)]

# List for storing the predictions (useful for visualization)
predictions = []

for aquifer in aquifers_list:
    predictions = []  # emptied for every aquifer, so only the last aquifer's forecasts remain

    aquifer_inputs = aquifer_by_stations[aquifer][best_features]
    # A one-column frame becomes a Series here: passing an (n, 1) target makes
    # every fit() emit a DataConversionWarning.
    aquifer_target = aquifer_by_stations[aquifer][features_target].squeeze(axis=1)

    for horizon in range(1, horizon_max + 1):
        # Define the train and test set: inputs of row t are paired with the
        # target of row t + horizon; the last day_len targets are the test set.
        X_train = aquifer_inputs.iloc[:-(day_len + horizon)]
        y_train = aquifer_target.iloc[horizon:-day_len]

        X_test = aquifer_inputs.iloc[-(day_len + horizon):-horizon]
        y_test = aquifer_target.iloc[-day_len:]

        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        forecast = model.predict(X_test).tolist()

        # Store to the predictions
        predictions.append(forecast)

        # Calculate and save the r2 score
        r2_scores[horizon - 1].append(r2_score(y_test, forecast))

In [None]:
# Last 200 days of true values and of the stored forecasts for the aquifer
# that `aquifer` currently refers to.
recent_dates = aquifer_by_stations[aquifer]['date'][-200:]

plt.figure(figsize=(8, 4))
plt.plot(recent_dates, aquifer_by_stations[aquifer]['altitude_diff'][-200:], color="royalblue", label="true data")
# predictions[k] holds the forecast for horizon k + 1
for shown_horizon, color in ((1, "tomato"), (3, "green"), (5, "grey")):
    plt.plot(recent_dates, predictions[shown_horizon - 1][-200:], color=color, label=f"forecast (horizon {shown_horizon})")
plt.legend()
plt.grid()
# Save after the legend and grid are drawn so the SVG contains them too
plt.savefig('../data/interim/plot.svg', format='svg')
plt.show()

In [None]:
# Calculate the average r2 score and its standard deviation, one entry per
# prediction horizon. Iterating over r2_scores itself (instead of a fixed 5)
# keeps this correct when horizon_max changes.
r2_average = [np.mean(horizon_scores) for horizon_scores in r2_scores]
std_dev = [np.std(horizon_scores) for horizon_scores in r2_scores]

In [None]:
r2_average

In [None]:
# Write the average r2 score of every horizon, one value per line
with open('../reports/randomforest/n-hits-ground-water-r2.txt', 'w') as file:
    file.writelines(f"{item}\n" for item in r2_average)

In [None]:
# Write the standard deviation of every horizon, one value per line
with open('../reports/randomforest/n-hits-ground-water-std-dev.txt', 'w') as file:
    file.writelines(f"{item}\n" for item in std_dev)

In [None]:
# r2_scores is indexed [horizon][aquifer]; flip it to [aquifer][horizon]
r2_scores_transposed = list(map(list, zip(*r2_scores)))
# Map every aquifer to its own list of r2 scores
scores = dict(zip(aquifers_list, r2_scores_transposed))
scores

In [None]:
# Order the entries by their first r2 score (first horizon), ascending
scores_sorted = dict(sorted(scores.items(), key=lambda pair: pair[1][0]))
scores_sorted

In [None]:
# Save the r2_scores
joblib.dump(scores_sorted, '../reports/randomforest/n-hits-ground-water-r2-stations.joblib')

#### **1.2** Surface water data

In [None]:
# Read the dataset
watercourse_by_stations = joblib.load('../data/interim/surface-water-and-weather.joblib')

In [None]:
# Parameters, that we want to keep in the data
features_target = ['level_diff']
features_train = ['year', 'month', 'day','level_diff_shift1', 'level_diff_shift2', 'level_diff_shift3', 
                  'level_diff_shift4', 'level_diff_shift5', 'precipitation_intensity_avg_average2', 
                  'precipitation_average2', 'precipitation_intensity_max_average2', 
                  'precipitation_probability_avg_average2', 'precipitation_intensity_avg', 
                  'precipitation_intensity_avg_shift1', 'precipitation', 'precipitation_shift1', 
                  'precipitation_intensity_avg_average3', 'precipitation_probability_avg_shift1', 
                  'precipitation_probability_avg', 'precipitation_average3', 'precipitation_intensity_max', 
                  'precipitation_intensity_max_shift1', 'precipitation_probability_max_average2', 
                  'precipitation_probability_avg_average3', 'precipitation_intensity_max_average3', 
                  'precipitation_probability_max', 'precipitation_probability_max_shift1', 'precipitation_probability_max_average3', 
                  'precipitation_intensity_avg_average4', 'precipitation_average4', 'precipitation_probability_avg_average4', 
                  'precipitation_probability_min_average2', 'precipitation_intensity_max_average4', 
                  'precipitation_probability_avg_shift1_average2', 'precipitation_intensity_avg_shift1_average2']

In [None]:
# Split the date column of every station frame into year, month and day columns
for station_frame in watercourse_by_stations.values():
    date_parts = station_frame['date'].dt
    station_frame['year'] = date_parts.year
    station_frame['month'] = date_parts.month
    station_frame['day'] = date_parts.day

##### Feature selection

In [None]:
# Get the data
watercourse_by_stations = joblib.load('../data/interim/surface-water-and-weather.joblib')

# Reloading throws away the year/month/day columns derived from 'date' in the
# earlier cell, but features_train (used by the hyperparameter tuning below)
# still selects them, so derive them again on the fresh frames.
for station_frame in watercourse_by_stations.values():
    station_frame['year'] = station_frame['date'].dt.year
    station_frame['month'] = station_frame['date'].dt.month
    station_frame['day'] = station_frame['date'].dt.day

In [None]:
station = 2530
day_len = 365
horizon = 3


In [None]:
X_train = watercourse_by_stations[station][:-(day_len + horizon)].drop(columns=['level_diff', 'level', 'date'])
y_train = watercourse_by_stations[station][features_target][horizon:-day_len]

In [None]:
# Add the predictions: for every look-ahead i and postfix, the source column
# moved i rows up (row t receives the value of row t + i)
postfixes = ['avg', 'min', 'max']
for i in range(1, 6):
    for postfix in postfixes:
        probability = X_train[f'precipitation_probability_{postfix}']
        X_train[f"precipitation_probability_{postfix}_shift-{i}"] = probability.shift(-i)
        intensity = X_train[f'precipitation_intensity_{postfix}']
        X_train[f"precipitation_intensity_{postfix}_shift-{i}"] = intensity.shift(-i)
    

In [None]:
# Remove the last 5 values
X_train = X_train[:-5]
y_train = y_train[:-5]

In [None]:
# Cross-validation with two time-ordered splits
tscv = TimeSeriesSplit(n_splits=2)

# Estimator scored inside the feature search
model = LinearRegression(n_jobs=4)

# Settings of the genetic-algorithm feature selector
ga_settings = dict(
    estimator=model,
    cv=tscv,
    scoring="r2",
    population_size=200,
    generations=35,
    n_jobs=-1,
    verbose=True,
    keep_top_k=5,
    elitism=True,
    max_features=40,  # upper bound on the number of selected features
    mutation_probability=0.2,
    crossover_probability=0.8,
)
gafs = GAFeatureSelectionCV(**ga_settings)

# Run the search
gafs.fit(X_train, y_train)

# Show how the fitness evolved and which features won
plot_fitness_evolution(gafs)
print("Best Features Found:", gafs.best_features_)

In [None]:
gafs.best_features_

In [None]:
# Collect the names of the columns whose entry in gafs.best_features_ is truthy
best_features = []
for column, keep in zip(X_train.columns, gafs.best_features_):
    if keep:
        best_features.append(column)

In [None]:
best_features

##### Hyperparameter tuning

In [None]:
# Define the horizon, day_len (number of predicted days), test_len (number of days used for final testing)
horizon_max = 5
day_len = 100
test_len = 365

In [None]:
# Stations to test
station_list = [4270, 4570, 4515, 6068]

In [None]:
# Define the function which contains parameters to tune and the model

def objective(trial):
    """Optuna objective: mean r2 of a random forest over stations and horizons.

    Samples n_estimators, max_depth and max_features from ``trial``. For every
    station in ``station_list`` and every horizon 1..``horizon_max`` it fits
    the forest on the rows before the validation window and scores the
    ``day_len`` rows that precede the final ``test_len`` rows, which are never
    touched here.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Average over horizons of the per-horizon mean r2 (to be maximized).
    """
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    max_depth = trial.suggest_categorical('max_depth', [None, 10, 20, 30, 50])
    # The previous choice list contained "auto" twice. For the regressor "auto"
    # meant "all features" (same as None) and current scikit-learn rejects it,
    # so the choices are now distinct and valid.
    max_features = trial.suggest_categorical('max_features', ["sqrt", "log2", None])

    # Initialize the RandomForestRegressor
    model = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  max_features=max_features,
                                  n_jobs=6,
                                  random_state=42)

    # List for r2 results for different prediction horizons
    r2_scores = [[] for _ in range(horizon_max)]

    for station in station_list:
        station_inputs = watercourse_by_stations[station][features_train]
        # One-column frame -> Series, so fit() gets a 1-D target and does not
        # emit a DataConversionWarning on every call.
        station_target = watercourse_by_stations[station][features_target].squeeze(axis=1)

        for horizon in range(1, horizon_max + 1):
            # Define the train and test set: inputs of row t are paired with
            # the target of row t + horizon.
            X_train = station_inputs.iloc[:-(day_len + horizon + test_len)]
            y_train = station_target.iloc[horizon:-(day_len + test_len)]

            X_test = station_inputs.iloc[-(day_len + horizon + test_len):-(horizon + test_len)]
            y_test = station_target.iloc[-(day_len + test_len):-test_len]

            # Train the model
            model.fit(X_train, y_train)

            # Make predictions
            forecast = model.predict(X_test).tolist()

            # Calculate and save the r2 score
            r2_scores[horizon - 1].append(r2_score(y_test, forecast))

    # Average r2 per horizon (follows horizon_max instead of a fixed 5)
    r2_average = [np.mean(horizon_scores) for horizon_scores in r2_scores]

    # The value optuna maximizes: average of the per-horizon averages
    score = np.mean(r2_average)

    return score

In [None]:
# Run the optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

In [None]:
study.best_params

In [None]:
study.best_value

##### Testing on multiple stations

In [None]:
# Best features
'''['precipitation_shift1',
 'snow_accumulation_shift10',
 'temperature_avg_shift1',
 'precipitation_probability_max_shift7',
 'level_diff_average3',
 'precipitation_average2',
 'precipitation_probability_min_average10',
 'precipitation_shift8_average4',
 'snow_accumulation_shift4_average2',
 'temperature_min_shift2_average4',
 'temperature_max_shift1_average4',
 'temperature_max_shift4_average4',
 'cloud_cover_min_shift2_average2',
 'cloud_cover_max_shift4_average7',
 'cloud_cover_max_shift9_average3',
 'humidity_avg_shift2_average6',
 'humidity_avg_shift7_average9',
 'humidity_avg_shift10_average5',
 'humidity_min_shift2_average10',
 'humidity_min_shift6_average5',
 'humidity_min_shift6_average9',
 'humidity_max_shift8_average6',
 'precipitation_probability_avg_shift5_average7',
 'precipitation_probability_avg_shift9_average6',
 'precipitation_probability_avg_shift10_average2',
 'precipitation_probability_avg_shift10_average4',
 'precipitation_intensity_avg_shift3_average5',
 'precipitation_intensity_max_shift2_average6',
 'precipitation_intensity_max_shift7_average10',
 'precipitation_intensity_max_shift10_average10',
 'day',
 'precipitation_intensity_max_shift-5',
 'precipitation_intensity_max_shift-4',
 'precipitation_intensity_max_shift-3',
 'precipitation_intensity_max_shift-2',
 'precipitation_intensity_max_shift-1']'''

In [None]:
best_features = ['precipitation_shift1',
 'snow_accumulation_shift10',
 'temperature_avg_shift1',
 'precipitation_probability_max_shift7',
 'level_diff_average3',
 'precipitation_average2',
 'precipitation_probability_min_average10',
 'precipitation_shift8_average4',
 'snow_accumulation_shift4_average2',
 'temperature_min_shift2_average4',
 'temperature_max_shift1_average4',
 'temperature_max_shift4_average4',
 'cloud_cover_min_shift2_average2',
 'cloud_cover_max_shift4_average7',
 'cloud_cover_max_shift9_average3',
 'humidity_avg_shift2_average6',
 'humidity_avg_shift7_average9',
 'humidity_avg_shift10_average5',
 'humidity_min_shift2_average10',
 'humidity_min_shift6_average5',
 'humidity_min_shift6_average9',
 'humidity_max_shift8_average6',
 'precipitation_probability_avg_shift5_average7',
 'precipitation_probability_avg_shift9_average6',
 'precipitation_probability_avg_shift10_average2',
 'precipitation_probability_avg_shift10_average4',
 'precipitation_intensity_avg_shift3_average5',
 'precipitation_intensity_max_shift2_average6',
 'precipitation_intensity_max_shift7_average10',
 'precipitation_intensity_max_shift10_average10',
 'day',
 'precipitation_intensity_max_shift-5',
 'precipitation_intensity_max_shift-4',
 'precipitation_intensity_max_shift-3',
 'precipitation_intensity_max_shift-2',
 'precipitation_intensity_max_shift-1']

In [None]:
# List of station used for testing
station_list = [2530, 2620, 4200, 4230, 4270, 4515, 4520, 4570, 4575, 5040, 5078, 5330, 5425, 5500, 6060, 6068, 6200, 6220, 6300, 6340, 8454, 8565]

In [None]:
# Add the weather forecast features to every station frame
for station in station_list:
    extended_frame = weather_forecast(watercourse_by_stations[station])
    # Cut the last 5 rows
    watercourse_by_stations[station] = extended_frame[:-5]

In [None]:
station_list = [2530]

In [None]:
# Initialize the RandomForestRegressor (all other hyperparameters keep their defaults)
model = RandomForestRegressor(n_estimators=250,
                              n_jobs=6,
                              random_state=42)

horizon_max = 5
day_len = 365

# List for r2 results for different prediction horizons
r2_scores = [[] for _ in range(horizon_max)]

# List for storing the predictions (useful for visualization)
predictions = []

for station in station_list:
    predictions = []  # emptied for every station, so only the last station's forecasts remain

    station_inputs = watercourse_by_stations[station][best_features]
    # A one-column frame becomes a Series here: passing an (n, 1) target makes
    # every fit() emit a DataConversionWarning.
    station_target = watercourse_by_stations[station][features_target].squeeze(axis=1)

    for horizon in range(1, horizon_max + 1):
        # Define the train and test set: inputs of row t are paired with the
        # target of row t + horizon; the last day_len targets are the test set.
        X_train = station_inputs.iloc[:-(day_len + horizon)]
        y_train = station_target.iloc[horizon:-day_len]

        X_test = station_inputs.iloc[-(day_len + horizon):-horizon]
        y_test = station_target.iloc[-day_len:]

        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        forecast = model.predict(X_test).tolist()

        # Store to the predictions
        predictions.append(forecast)

        # Calculate and save the r2 score
        r2_scores[horizon - 1].append(r2_score(y_test, forecast))

In [None]:
# Window of the test period to display, counted from the end of the series
shown = slice(-150, -80)
shown_dates = watercourse_by_stations[station]['date'][shown]

plt.figure(figsize=(8, 4))
plt.plot(shown_dates, watercourse_by_stations[station]['level_diff'][shown], color="royalblue", label="true data")
# predictions[k] holds the forecast for horizon k + 1; label every curve with
# its horizon so the legend entries can be told apart.
for shown_horizon, color in ((1, "tomato"), (2, "green"), (3, "orange")):
    plt.plot(shown_dates, predictions[shown_horizon - 1][shown], color=color, label=f"forecast (horizon {shown_horizon})")
plt.legend()
plt.grid()
plt.show()

In [None]:
# Calculate the average r2 score and its standard deviation, one entry per
# prediction horizon. Iterating over r2_scores itself (instead of a fixed 5)
# keeps this correct when horizon_max changes.
r2_average = [np.mean(horizon_scores) for horizon_scores in r2_scores]
std_dev = [np.std(horizon_scores) for horizon_scores in r2_scores]

In [None]:
r2_average

In [None]:
# Save the average r2_scores of the surface water stations.
# This used to write to the ground-water report file and overwrote the
# ground-water results saved earlier in the notebook.
with open('../reports/randomforest/n-hits-surface-water-r2.txt', 'w') as file:
    for item in r2_average:
        file.write(f"{item}\n")

In [None]:
# Save the standard deviations of the surface water stations.
# This used to write to the ground-water report file and overwrote the
# ground-water results saved earlier in the notebook.
with open('../reports/randomforest/n-hits-surface-water-std-dev.txt', 'w') as file:
    for item in std_dev:
        file.write(f"{item}\n")

In [None]:
# Transpose the r2_scores list from [horizon][station] to [station][horizon]
r2_scores_transposed = [list(x) for x in zip(*r2_scores)]
# Pair up the stations with their r2_scores and store them in a dictionary.
# The scores were computed over station_list, so that list (not the aquifer
# ids of the ground-water section) provides the keys.
scores = dict(zip(station_list, r2_scores_transposed))
scores

In [None]:
# Order the entries by their first r2 score (first horizon), ascending
scores_sorted = dict(sorted(scores.items(), key=lambda pair: pair[1][0]))
scores_sorted

In [None]:
# Save the r2_scores of the surface water stations.
# This used to write to the ground-water file and overwrote the ground-water
# scores dumped earlier in the notebook.
joblib.dump(scores_sorted, '../reports/randomforest/n-hits-surface-water-r2-stations.joblib')

### **2** Gradient Boosting

#### **2.1** Ground water data

In [None]:
# Get the data
aquifer_by_stations = joblib.load('../data/interim/ground-water-and-weather.joblib')

In [None]:
# Parameters, that we want to keep in the data
features_target = ['altitude_diff']
features_train = ['year', 'month', 'day','altitude_diff_shift1', 'altitude_diff_shift2', 'altitude_diff_shift3', 'altitude_diff_shift4', 'altitude_diff_shift5', 'precipitation_average4', 'precipitation_average3', 'precipitation_average5', 'precipitation_average6', 'precipitation_average7', 'precipitation_shift1_average2', 'precipitation_shift1_average3', 'precipitation_average2', 'precipitation_shift1_average4', 'precipitation_average8', 'precipitation_shift1_average5', 'precipitation_average9', 'precipitation_shift1', 'precipitation_average10', 'precipitation_shift1_average6', 'precipitation_shift1_average7', 'precipitation_shift1_average8']

In [None]:
# Split the date column of every aquifer frame into year, month and day columns
for aquifer_frame in aquifer_by_stations.values():
    date_parts = aquifer_frame['date'].dt
    aquifer_frame['year'] = date_parts.year
    aquifer_frame['month'] = date_parts.month
    aquifer_frame['day'] = date_parts.day

##### Hyperparameter tuning

In [None]:
# best features longer validation
best_features =  ['temperature_min',
 'snow_accumulation_average6',
 'temperature_avg_average2',
 'cloud_cover_max_average3',
 'humidity_avg_average4',
 'precipitation_probability_avg_average2',
 'precipitation_probability_max_average5',
 'precipitation_intensity_avg_average6',
 'snow_accumulation_shift1_average4',
 'temperature_min_shift1_average3',
 'temperature_min_shift3_average6',
 'temperature_min_shift9_average5',
 'temperature_max_shift4_average2',
 'temperature_max_shift6_average6',
 'humidity_avg_shift10_average2',
 'humidity_min_shift8_average2',
 'humidity_min_shift9_average8',
 'humidity_min_shift10_average5',
 'humidity_max_shift3_average10',
 'humidity_max_shift8_average8',
 'humidity_max_shift8_average9',
 'precipitation_probability_min_shift7_average4',
 'precipitation_probability_min_shift9_average10',
 'precipitation_probability_max_shift5_average10',
 'precipitation_probability_max_shift10_average3',
 'precipitation_intensity_avg_shift2_average2',
 'precipitation_intensity_min_shift5_average4',
 'precipitation_intensity_max_shift2_average10',
 'precipitation_intensity_max_shift4_average5',
 'precipitation_probability_avg_shift-1',
 'precipitation_probability_avg_shift-2',
 'precipitation_probability_max_shift-2',
 'precipitation_intensity_max_shift-3',
 # manually added
 'precipitation_probability_avg_shift-5',
 'precipitation_probability_avg_shift-4',
 'precipitation_probability_avg_shift-3'
 ]

In [None]:
aquifers_list = [85065, 85064]

In [None]:
# Add the predictions
def weather_forecast(X_train):
    """Return a copy of ``X_train`` with look-ahead weather columns added.

    For every look-ahead ``i`` in 1..5 and every postfix in avg/min/max, two
    columns are added:

    * ``precipitation_probability_{postfix}_shift-{i}``
    * ``precipitation_intensity_{postfix}_shift-{i}``

    Each is the source column moved ``i`` rows up (``shift(-i)``), so row ``t``
    carries the value of row ``t + i`` and the last ``i`` rows are NaN.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Must contain ``precipitation_probability_{avg,min,max}`` and
        ``precipitation_intensity_{avg,min,max}``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unchanged.
    """
    postfixes = ['avg', 'min', 'max']
    # Work on a copy so the caller's frame (possibly a slice of another frame)
    # is never modified behind its back.
    extended = X_train.copy()
    for i in range(1, 6):
        for postfix in postfixes:
            extended[f"precipitation_probability_{postfix}_shift-{i}"] = X_train[f'precipitation_probability_{postfix}'].shift(-i)
            extended[f"precipitation_intensity_{postfix}_shift-{i}"] = X_train[f'precipitation_intensity_{postfix}'].shift(-i)
    return extended

In [None]:
for aquifer in aquifers_list:
    # Add the look-ahead columns, then cut the 5 trailing rows
    extended_frame = weather_forecast(aquifer_by_stations[aquifer])
    aquifer_by_stations[aquifer] = extended_frame[:-5]

In [None]:
# Define the horizon, day_len (number of predicted days), test_len (number of days used for final testing)
horizon_max = 5
day_len = 100
test_len = 365

In [None]:
# Define the function which contains parameters to tune and the model

def objective(trial):
    """Optuna objective: mean r2 of gradient boosting over aquifers and horizons.

    Samples n_estimators, max_depth and max_features from ``trial``. For every
    aquifer in ``aquifers_list`` and every horizon 1..``horizon_max`` it fits
    the model on the rows before the validation window and scores the
    ``day_len`` rows that precede the final ``test_len`` rows, which are never
    touched here.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Average over horizons of the per-horizon mean r2 (to be maximized).
    """
    n_estimators = trial.suggest_int('n_estimators', 10, 500)
    max_depth = trial.suggest_categorical('max_depth', [None, 10, 20, 30, 50])
    max_features = trial.suggest_categorical('max_features', ["sqrt", "log2", 0.5, 1.0])

    # Initialize the GradientBoostingRegressor
    model = GradientBoostingRegressor(n_estimators=n_estimators,
                                      max_depth=max_depth,
                                      max_features=max_features,
                                      random_state=42)

    # List for r2 results for different prediction horizons
    r2_scores = [[] for _ in range(horizon_max)]

    for aquifer in aquifers_list:
        aquifer_inputs = aquifer_by_stations[aquifer][best_features]
        # One-column frame -> Series, so fit() gets a 1-D target and does not
        # emit a DataConversionWarning on every call.
        aquifer_target = aquifer_by_stations[aquifer][features_target].squeeze(axis=1)

        for horizon in range(1, horizon_max + 1):
            # Define the train and test set: inputs of row t are paired with
            # the target of row t + horizon.
            X_train = aquifer_inputs.iloc[:-(day_len + horizon + test_len)]
            y_train = aquifer_target.iloc[horizon:-(day_len + test_len)]

            X_test = aquifer_inputs.iloc[-(day_len + horizon + test_len):-(horizon + test_len)]
            y_test = aquifer_target.iloc[-(day_len + test_len):-test_len]

            # Train the model
            model.fit(X_train, y_train)

            # Make predictions
            forecast = model.predict(X_test).tolist()

            # Calculate and save the r2 score
            r2_scores[horizon - 1].append(r2_score(y_test, forecast))

    # Average r2 per horizon (follows horizon_max instead of a fixed 5)
    r2_average = [np.mean(horizon_scores) for horizon_scores in r2_scores]

    # The value optuna maximizes: average of the per-horizon averages
    score = np.mean(r2_average)

    return score

In [None]:
# Run the optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

In [None]:
study.best_params

In [None]:
study.best_value

##### Testing on multiple stations

In [None]:
# Get the data
aquifer_by_stations = joblib.load('../data/interim/ground-water-and-weather.joblib')

In [None]:
# Parameters, that we want to keep in the data
features_target = ['altitude_diff']
features_train = ['year', 'month', 'day','altitude_diff_shift1', 'altitude_diff_shift2', 'altitude_diff_shift3', 'altitude_diff_shift4', 'altitude_diff_shift5', 'precipitation_average4', 'precipitation_average3', 'precipitation_average5', 'precipitation_average6', 'precipitation_average7', 'precipitation_shift1_average2', 'precipitation_shift1_average3', 'precipitation_average2', 'precipitation_shift1_average4', 'precipitation_average8', 'precipitation_shift1_average5', 'precipitation_average9', 'precipitation_shift1', 'precipitation_average10', 'precipitation_shift1_average6', 'precipitation_shift1_average7', 'precipitation_shift1_average8']

In [None]:
# Split the date column of every aquifer frame into year, month and day columns
for aquifer_frame in aquifer_by_stations.values():
    date_parts = aquifer_frame['date'].dt
    aquifer_frame['year'] = date_parts.year
    aquifer_frame['month'] = date_parts.month
    aquifer_frame['day'] = date_parts.day

In [None]:
# best features longer validation
best_features =  ['temperature_min',
 'snow_accumulation_average6',
 'temperature_avg_average2',
 'cloud_cover_max_average3',
 'humidity_avg_average4',
 'precipitation_probability_avg_average2',
 'precipitation_probability_max_average5',
 'precipitation_intensity_avg_average6',
 'snow_accumulation_shift1_average4',
 'temperature_min_shift1_average3',
 'temperature_min_shift3_average6',
 'temperature_min_shift9_average5',
 'temperature_max_shift4_average2',
 'temperature_max_shift6_average6',
 'humidity_avg_shift10_average2',
 'humidity_min_shift8_average2',
 'humidity_min_shift9_average8',
 'humidity_min_shift10_average5',
 'humidity_max_shift3_average10',
 'humidity_max_shift8_average8',
 'humidity_max_shift8_average9',
 'precipitation_probability_min_shift7_average4',
 'precipitation_probability_min_shift9_average10',
 'precipitation_probability_max_shift5_average10',
 'precipitation_probability_max_shift10_average3',
 'precipitation_intensity_avg_shift2_average2',
 'precipitation_intensity_min_shift5_average4',
 'precipitation_intensity_max_shift2_average10',
 'precipitation_intensity_max_shift4_average5',
 'precipitation_probability_avg_shift-1',
 'precipitation_probability_avg_shift-2',
 'precipitation_probability_max_shift-2',
 'precipitation_intensity_max_shift-3',
 # manually added
 'precipitation_probability_avg_shift-5',
 'precipitation_probability_avg_shift-4',
 'precipitation_probability_avg_shift-3'
 ]

In [None]:
aquifers_list = [85065, 85064]

In [None]:
# Add the predictions
def weather_forecast(X_train):
    """Return a copy of ``X_train`` with look-ahead weather columns added.

    For every look-ahead ``i`` in 1..5 and every postfix in avg/min/max, two
    columns are added:

    * ``precipitation_probability_{postfix}_shift-{i}``
    * ``precipitation_intensity_{postfix}_shift-{i}``

    Each is the source column moved ``i`` rows up (``shift(-i)``), so row ``t``
    carries the value of row ``t + i`` and the last ``i`` rows are NaN.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Must contain ``precipitation_probability_{avg,min,max}`` and
        ``precipitation_intensity_{avg,min,max}``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unchanged.
    """
    postfixes = ['avg', 'min', 'max']
    # Work on a copy so the caller's frame (possibly a slice of another frame)
    # is never modified behind its back.
    extended = X_train.copy()
    for i in range(1, 6):
        for postfix in postfixes:
            extended[f"precipitation_probability_{postfix}_shift-{i}"] = X_train[f'precipitation_probability_{postfix}'].shift(-i)
            extended[f"precipitation_intensity_{postfix}_shift-{i}"] = X_train[f'precipitation_intensity_{postfix}'].shift(-i)
    return extended

In [None]:
# Add the look-ahead columns, then drop the last 5 rows (their shift(-5) values are NaN).
# Re-running this cell trims another 5 rows each time.
for aquifer in aquifers_list:
    with_forecast = weather_forecast(aquifer_by_stations[aquifer])
    aquifer_by_stations[aquifer] = with_forecast[:-5]

In [None]:
# best parameters for n_estimators=10
'''{'n_estimators': 28, 'max_depth': 10, 'max_features': 'log2'}'''

In [None]:
# best parameters for n_estimators=50
'''{'n_estimators': 140, 'max_depth': 30, 'max_features': 'sqrt'}'''

In [None]:
# Initialize the GradientBoostingRegressor; this one estimator is re-fitted for every
# (station, horizon) pair below
model = GradientBoostingRegressor(n_estimators=28, max_features='log2', max_depth=10, random_state=42)

horizon_max = 5
day_len = 365

# r2_scores[h - 1] collects one r2 value per station for prediction horizon h
r2_scores = [[] for _ in range(horizon_max)]

# predictions_by_stations[station][h - 1] is the forecast list for horizon h
predictions_by_stations = {key: [] for key in aquifers_list}

for aquifer in aquifers_list:
    predictions = []

    for horizon in range(1, horizon_max + 1):
        # Features of row t are paired with the target of row t + horizon.
        # The last day_len target rows form the test set and are never trained on.
        X_train = aquifer_by_stations[aquifer][best_features][:-(day_len + horizon)]
        # Pass a 1-D target: fit() emits a DataConversionWarning for a one-column frame
        y_train = aquifer_by_stations[aquifer][features_target][horizon:-day_len].values.ravel()

        X_test = aquifer_by_stations[aquifer][best_features][-(day_len + horizon):-horizon]
        y_test = aquifer_by_stations[aquifer][features_target][-day_len:]

        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        forecast = model.predict(X_test).tolist()

        # Store to the predictions
        predictions.append(forecast)

        # Calculate and save the r2 score
        r2_scores[horizon-1].append(r2_score(y_test, forecast))

    predictions_by_stations[aquifer] = predictions

In [None]:
# Compare the true series with the 1- and 2-day-ahead forecasts of one station.
# The forecasts cover the last `day_len` rows of the frame, so the same negative
# slice selects matching rows in the frame and in the forecast lists.
# The station is named explicitly instead of relying on the loop variable left over
# from the training cell. The former fourth line re-plotted the horizon-1 forecast
# of the same station and was dropped.
plot_station = 85064
plot_window = slice(-100, -60)
plot_frame = aquifer_by_stations[plot_station]
plot_dates = plot_frame['date'][plot_window]

plt.figure(figsize=(8, 4))
plt.plot(plot_dates, plot_frame['altitude_diff'][plot_window], color="royalblue", label="true data")
for plot_horizon, plot_color in zip((1, 2), ("tomato", "green")):
    plt.plot(plot_dates, predictions_by_stations[plot_station][plot_horizon - 1][plot_window],
             color=plot_color, label=f"forecast, horizon {plot_horizon}")
plt.title(f"Station {plot_station}: altitude_diff, true vs. forecast")
plt.legend()
plt.grid()
plt.show()

In [None]:
# Mean and standard deviation of the r2 score across stations, one entry per horizon.
# Iterating r2_scores itself keeps this in step with horizon_max (no hard-coded 5).
r2_average = [np.mean(horizon_scores) for horizon_scores in r2_scores]
std_dev = [np.std(horizon_scores) for horizon_scores in r2_scores]

In [None]:
r2_average

In [None]:
# Write the per-horizon mean r2 scores to a text file, one value per line
with open('../reports/gradient-boosting/gradient-boosting-ground-water-r2.txt', 'w') as file:
    file.writelines(f"{item}\n" for item in r2_average)

In [None]:
# Write the per-horizon r2 standard deviations to a text file, one value per line
with open('../reports/gradient-boosting/gradient-boosting-ground-water-std-dev.txt', 'w') as file:
    file.writelines(f"{item}\n" for item in std_dev)

In [None]:
# r2_scores is indexed [horizon][station]; flip it to [station][horizon]
r2_scores_transposed = list(map(list, zip(*r2_scores)))
# Map every station id to its list of per-horizon r2 scores
scores = {
    station_id: station_r2
    for station_id, station_r2 in zip(aquifers_list, r2_scores_transposed)
}
scores

In [None]:
# Order the stations by their first-horizon r2 score, ascending
scores_sorted = dict(sorted(scores.items(), key=lambda pair: pair[1][0]))
scores_sorted

In [None]:
# Save the r2_scores
# (station id -> list of per-horizon r2 values, ordered by the first-horizon score)
joblib.dump(scores_sorted, '../reports/gradient-boosting/gradient-boosting-ground-water-r2-stations.joblib')

In [None]:
# Save the forecasts (station id -> one forecast list per horizon)
joblib.dump(predictions_by_stations, '../reports/gradient-boosting/gradient-boosting-ground-water-predictions.joblib')

#### **2.2** Surface water data

In [None]:
# Read the dataset; the cells below index it by station id and expect each entry
# to be a DataFrame with a `date` column.
# NOTE(review): joblib.load unpickles the file — only load files from a trusted source.
watercourse_by_stations = joblib.load('../data/interim/surface-water-and-weather.joblib')

In [None]:
# Columns used by the surface-water models:
# features_target is the column to predict, features_train the input columns.
# 'year', 'month' and 'day' are derived from the `date` column in the next cell.
features_target = ['level_diff']
features_train = ['year', 'month', 'day','level_diff_shift1', 'level_diff_shift2', 'level_diff_shift3', 
                  'level_diff_shift4', 'level_diff_shift5', 'precipitation_intensity_avg_average2', 
                  'precipitation_average2', 'precipitation_intensity_max_average2', 
                  'precipitation_probability_avg_average2', 'precipitation_intensity_avg', 
                  'precipitation_intensity_avg_shift1', 'precipitation', 'precipitation_shift1', 
                  'precipitation_intensity_avg_average3', 'precipitation_probability_avg_shift1', 
                  'precipitation_probability_avg', 'precipitation_average3', 'precipitation_intensity_max', 
                  'precipitation_intensity_max_shift1', 'precipitation_probability_max_average2', 
                  'precipitation_probability_avg_average3', 'precipitation_intensity_max_average3', 
                  'precipitation_probability_max', 'precipitation_probability_max_shift1', 'precipitation_probability_max_average3', 
                  'precipitation_intensity_avg_average4', 'precipitation_average4', 'precipitation_probability_avg_average4', 
                  'precipitation_probability_min_average2', 'precipitation_intensity_max_average4', 
                  'precipitation_probability_avg_shift1_average2', 'precipitation_intensity_avg_shift1_average2']

In [None]:
# Split every station's `date` column into separate year, month and day columns
for station_frame in watercourse_by_stations.values():
    date_parts = station_frame['date'].dt
    station_frame['year'] = date_parts.year
    station_frame['month'] = date_parts.month
    station_frame['day'] = date_parts.day

##### Hyperparameter tuning

In [None]:
# Define the horizon, day_len (number of predicted days), test_len (number of days used for final testing)
horizon_max = 5   # models are fitted for horizons 1..horizon_max
day_len = 100     # length of the validation window scored during tuning
test_len = 365    # trailing rows excluded from both training and validation during tuning

In [None]:
# Stations whose r2 scores are averaged by the tuning objective
station_list = [4270, 4570, 4515, 6068]

In [None]:
# Define the function which contains parameters to tune and the model

def objective(trial):
    """Score one Optuna trial of GradientBoostingRegressor hyper-parameters.

    For every station in ``station_list`` and every horizon from 1 to
    ``horizon_max`` the model is fitted on the rows before the validation
    window and scored with r2 on a ``day_len``-row window that ends
    ``test_len`` rows before the end of the frame. The last ``test_len`` rows
    are therefore not used during tuning.

    Parameters
    ----------
    trial
        Object providing ``suggest_int``, ``suggest_float`` and
        ``suggest_categorical`` (an Optuna trial).

    Returns
    -------
    float
        Mean over horizons of the per-horizon mean r2 across stations.
    """
    # Every sampled value is forwarded to the estimator. Previously `loss` was
    # sampled but never passed on, so the reported best `loss` had no effect.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.0, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'max_features': trial.suggest_float('max_features', 0.5, 1.0),
        'loss': trial.suggest_categorical('loss', ['squared_error', 'absolute_error', 'huber']),
        'tol': trial.suggest_float('tol', 1e-4, 1e-2),
        'n_iter_no_change': trial.suggest_categorical('n_iter_no_change', [None, 5, 10, 20]),
    }

    # Initialize the GradientBoostingRegressor (re-fitted for every station and horizon)
    model = GradientBoostingRegressor(random_state=42, **params)

    # r2_scores[h - 1] collects one r2 value per station for horizon h
    r2_scores = [[] for _ in range(horizon_max)]

    # Number of trailing rows that are kept out of training
    holdout_len = day_len + test_len

    for station in station_list:
        station_frame = watercourse_by_stations[station]
        features = station_frame[features_train]
        target = station_frame[features_target]

        for horizon in range(1, horizon_max + 1):
            # Features of row t are paired with the target of row t + horizon
            X_train = features[:-(holdout_len + horizon)]
            # 1-D target: fit() warns when given a one-column frame
            y_train = target[horizon:-holdout_len].values.ravel()

            X_test = features[-(holdout_len + horizon):-(horizon + test_len)]
            y_test = target[-holdout_len:-test_len]

            # Train the model
            model.fit(X_train, y_train)

            # Make predictions
            forecast = model.predict(X_test).tolist()

            # Calculate and save the r2 score
            r2_scores[horizon-1].append(r2_score(y_test, forecast))

    # Average r2 per horizon, then average over the horizons
    r2_average = [np.mean(horizon_scores) for horizon_scores in r2_scores]
    mean_r2 = np.mean(r2_average)

    return mean_r2

In [None]:
# Run the optuna study; the objective returns a mean r2 score, hence direction='maximize'.
# NOTE(review): n_trials=1 evaluates a single sampled configuration — raise it for a real search.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1)

In [None]:
study.best_params

In [None]:
study.best_value

##### Testing on multiple stations

In [None]:
# Candidate input columns for the surface-water model.
# NOTE(review): the station loop in this section indexes the frames with
# `features_train`, not `best_features`, and no visible code adds the `_shift-K`
# columns to `watercourse_by_stations` — confirm whether this list is meant to be used.
best_features = ['precipitation_shift1',
 'snow_accumulation_shift10',
 'temperature_avg_shift1',
 'precipitation_probability_max_shift7',
 'level_diff_average3',
 'precipitation_average2',
 'precipitation_probability_min_average10',
 'precipitation_shift8_average4',
 'snow_accumulation_shift4_average2',
 'temperature_min_shift2_average4',
 'temperature_max_shift1_average4',
 'temperature_max_shift4_average4',
 'cloud_cover_min_shift2_average2',
 'cloud_cover_max_shift4_average7',
 'cloud_cover_max_shift9_average3',
 'humidity_avg_shift2_average6',
 'humidity_avg_shift7_average9',
 'humidity_avg_shift10_average5',
 'humidity_min_shift2_average10',
 'humidity_min_shift6_average5',
 'humidity_min_shift6_average9',
 'humidity_max_shift8_average6',
 'precipitation_probability_avg_shift5_average7',
 'precipitation_probability_avg_shift9_average6',
 'precipitation_probability_avg_shift10_average2',
 'precipitation_probability_avg_shift10_average4',
 'precipitation_intensity_avg_shift3_average5',
 'precipitation_intensity_max_shift2_average6',
 'precipitation_intensity_max_shift7_average10',
 'precipitation_intensity_max_shift10_average10',
 'day',
 'precipitation_intensity_max_shift-5',
 'precipitation_intensity_max_shift-4',
 'precipitation_intensity_max_shift-3',
 'precipitation_intensity_max_shift-2',
 'precipitation_intensity_max_shift-1']

In [None]:
# List of stations used for testing (replaces the shorter list used for tuning)
station_list = [2530, 2620, 4200, 4230, 4270, 4515, 4520, 4570, 4575, 5040, 5078, 5330, 5425, 5500, 6060, 6068, 6200, 6220, 6300, 6340, 8454, 8565]

In [None]:
# Initialize the GradientBoostingRegressor; this one estimator is re-fitted for every
# (station, horizon) pair below
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

horizon_max = 5
day_len = 365

# r2_scores[h - 1] collects one r2 value per station for prediction horizon h
r2_scores = [[] for _ in range(horizon_max)]

# List for storing the predictions (useful for visualization).
# It is reset for every station, so after the loop it holds the last station only.
predictions = []

for station in station_list:
    predictions = []

    for horizon in range(1, horizon_max + 1):
        # Features of row t are paired with the target of row t + horizon.
        # The last day_len target rows form the test set and are never trained on.
        X_train = watercourse_by_stations[station][features_train][:-(day_len + horizon)]
        # Pass a 1-D target: fit() emits a DataConversionWarning for a one-column frame
        y_train = watercourse_by_stations[station][features_target][horizon:-day_len].values.ravel()

        X_test = watercourse_by_stations[station][features_train][-(day_len + horizon):-horizon]
        y_test = watercourse_by_stations[station][features_target][-day_len:]

        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        forecast = model.predict(X_test).tolist()

        # Store to the predictions
        predictions.append(forecast)

        # Calculate and save the r2 score
        r2_scores[horizon-1].append(r2_score(y_test, forecast))

In [None]:
# Compare the true series with the 1-, 2- and 3-day-ahead forecasts.
# `predictions` is reset for every station in the training loop, so after the loop
# it holds the forecasts of the last station in station_list. That station is
# plotted, using the surface-water target column 'level_diff'.
plot_station = station_list[-1]
plot_window = slice(-100, -60)
plot_frame = watercourse_by_stations[plot_station]
plot_dates = plot_frame['date'][plot_window]

plt.figure(figsize=(8, 4))
plt.plot(plot_dates, plot_frame['level_diff'][plot_window], color="royalblue", label="true data")
for plot_horizon, plot_color in zip((1, 2, 3), ("tomato", "green", "orange")):
    plt.plot(plot_dates, predictions[plot_horizon - 1][plot_window],
             color=plot_color, label=f"forecast, horizon {plot_horizon}")
plt.title(f"Station {plot_station}: level_diff, true vs. forecast")
plt.legend()
plt.grid()
plt.show()

In [None]:
# Mean and standard deviation of the r2 score across stations, one entry per horizon.
# Iterating r2_scores itself keeps this in step with horizon_max (no hard-coded 5).
r2_average = [np.mean(horizon_scores) for horizon_scores in r2_scores]
std_dev = [np.std(horizon_scores) for horizon_scores in r2_scores]

In [None]:
r2_average

In [None]:
# Save the average r2_scores of the gradient-boosting surface-water run.
# The previous path ('randomforest/n-hits-ground-water-r2.txt') was a copy-paste
# leftover that would overwrite another model's report.
with open('../reports/gradient-boosting/gradient-boosting-surface-water-r2.txt', 'w') as file:
    for item in r2_average:
        file.write(f"{item}\n")

In [None]:
# Save the r2 standard deviations of the gradient-boosting surface-water run.
# The previous path ('randomforest/n-hits-ground-water-std-dev.txt') was a copy-paste
# leftover that would overwrite another model's report.
with open('../reports/gradient-boosting/gradient-boosting-surface-water-std-dev.txt', 'w') as file:
    for item in std_dev:
        file.write(f"{item}\n")

In [None]:
# r2_scores is indexed [horizon][station]; transpose it to [station][horizon]
r2_scores_transposed = [list(x) for x in zip(*r2_scores)]
# Pair up the stations with their r2_scores and store them in a dictionary.
# The scores were computed for station_list. Zipping with aquifers_list (two
# ground-water ids) kept only two entries under the wrong keys.
scores = dict(zip(station_list, r2_scores_transposed))
scores

In [None]:
# Order the stations by their first-horizon r2 score, ascending
scores_sorted = dict(sorted(scores.items(), key=lambda pair: pair[1][0]))
scores_sorted

In [None]:
# Save the per-station r2_scores of the gradient-boosting surface-water run.
# The previous path ('randomforest/n-hits-ground-water-r2-stations.joblib') was a
# copy-paste leftover that would overwrite another model's report.
joblib.dump(scores_sorted, '../reports/gradient-boosting/gradient-boosting-surface-water-r2-stations.joblib')

### **3** Linear Regression

#### **3.1** Ground water data

In [None]:
# Get the data: reload the ground-water file so this section starts from the stored
# frames, without the columns and row trimming applied in earlier sections.
# NOTE(review): joblib.load unpickles the file — only load files from a trusted source.
aquifer_by_stations = joblib.load('../data/interim/ground-water-and-weather.joblib')

In [None]:
# Target feature (switches back from the surface-water target 'level_diff')
features_target = ['altitude_diff']

In [None]:
# Split every station's `date` column into separate year, month and day columns
for station_frame in aquifer_by_stations.values():
    date_parts = station_frame['date'].dt
    station_frame['year'] = date_parts.year
    station_frame['month'] = date_parts.month
    station_frame['day'] = date_parts.day

##### Testing on multiple stations

In [None]:
# best features for validation 365 days
# Input columns used by the linear-regression training loop in this section.
# The `_shift-K` names are the look-ahead columns that weather_forecast() adds with
# `.shift(-K)`; every other name is read from the loaded frames as-is.
# NOTE(review): presumably the result of a feature-selection run — confirm the provenance.
best_features = ['cloud_cover_min_shift5',
 'precipitation_intensity_avg_shift3',
 'precipitation_intensity_max_shift2',
 'temperature_avg_average6',
 'cloud_cover_max_average7',
 'humidity_min_average4',
 'precipitation_probability_avg_average4',
 'precipitation_probability_max_average2',
 'precipitation_intensity_avg_average2',
 'altitude_diff_shift2_average3',
 'altitude_diff_shift5_average10',
 'altitude_diff_shift7_average2',
 'altitude_diff_shift9_average6',
 'altitude_diff_shift10_average4',
 'precipitation_shift10_average4',
 'temperature_min_shift8_average3',
 'temperature_min_shift9_average4',
 'temperature_max_shift2_average7',
 'temperature_max_shift3_average2',
 'cloud_cover_min_shift10_average2',
 'cloud_cover_max_shift3_average7',
 'humidity_avg_shift1_average3',
 'humidity_avg_shift2_average6',
 'humidity_avg_shift6_average10',
 'humidity_min_shift9_average3',
 'humidity_max_shift6_average7',
 'humidity_max_shift8_average5',
 'precipitation_probability_avg_shift8_average7',
 'precipitation_probability_min_shift1_average7',
 'precipitation_probability_min_shift4_average9',
 'precipitation_probability_max_shift1_average3',
 'precipitation_probability_max_shift2_average7',
 'precipitation_probability_max_shift7_average5',
 'precipitation_probability_max_shift10_average2',
 'precipitation_intensity_min_shift9_average6',
 'precipitation_intensity_avg_shift-1',
 'precipitation_probability_max_shift-1',
 'precipitation_intensity_max_shift-1',
 'precipitation_intensity_avg_shift-2',
 # manually added (look-ahead columns for the remaining horizons)
 'precipitation_intensity_avg_shift-3',
 'precipitation_intensity_avg_shift-4',
 'precipitation_intensity_avg_shift-5',
 ]

In [None]:
# Ground-water station ids processed in this section (used as keys of aquifer_by_stations)
aquifers_list = [85065, 85064]

In [None]:
# Add look-ahead ("forecast") precipitation columns
def weather_forecast(X_train):
    """Add 1- to 5-row look-ahead copies of the precipitation columns.

    For every statistic in ('avg', 'min', 'max') and every lead K in 1..5, the
    columns ``precipitation_probability_<stat>`` and
    ``precipitation_intensity_<stat>`` are shifted up by K rows and stored as
    ``..._shift-K``. The last K rows of each new column are therefore NaN.

    The frame is modified in place and the same object is returned.
    """
    max_lead = 5
    for lead in range(1, max_lead + 1):
        for stat in ('avg', 'min', 'max'):
            probability = X_train[f'precipitation_probability_{stat}']
            intensity = X_train[f'precipitation_intensity_{stat}']
            X_train[f"precipitation_probability_{stat}_shift-{lead}"] = probability.shift(-lead)
            X_train[f"precipitation_intensity_{stat}_shift-{lead}"] = intensity.shift(-lead)
    return X_train

In [None]:
# Add the look-ahead columns, then drop the last 5 rows (their shift(-5) values are NaN).
# Re-running this cell trims another 5 rows each time.
for aquifer in aquifers_list:
    with_forecast = weather_forecast(aquifer_by_stations[aquifer])
    aquifer_by_stations[aquifer] = with_forecast[:-5]

In [None]:
# Initialize the LinearRegression model; it is re-fitted for every (station, horizon) pair
model = LinearRegression(n_jobs=-1)

horizon_max = 5
day_len = 365

# r2_scores[h - 1] collects one r2 value per station for prediction horizon h
r2_scores = [[] for _ in range(horizon_max)]

# Dictionary for storing the predictions for all of the stations
predictions_by_stations = {key: [] for key in aquifers_list}

for aquifer in aquifers_list:
    predictions = []

    for horizon in range(1, horizon_max + 1):
        # Define the train and test set. Features of row t are paired with the target
        # of row t + horizon; the last day_len target rows form the test set.
        # Explicit copies: the scaling loop below assigns columns, which on a bare
        # slice raises SettingWithCopyWarning.
        X_train = aquifer_by_stations[aquifer][best_features][:-(day_len + horizon)].copy()
        y_train = aquifer_by_stations[aquifer][features_target][horizon:-day_len]

        X_test = aquifer_by_stations[aquifer][best_features][-(day_len + horizon):-horizon].copy()
        y_test = aquifer_by_stations[aquifer][features_target][-day_len:]

        # Scale the features. The test column is scaled first, with the statistics of
        # the still-unscaled training column, before the training column is overwritten.
        for column in X_test.columns:
            X_test[column] = standard_scaling_transform(X_train[column], X_test[column])
            X_train[column] = standard_scaling(X_train[column])

        y_train = standard_scaling(y_train)

        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        forecast = model.predict(X_test).tolist()
        # Flatten
        forecast = np.ravel(forecast)

        # Unscale the predictions with the statistics of the unscaled training target
        forecast = standard_unscaling(aquifer_by_stations[aquifer]['altitude_diff'][horizon:-day_len], forecast)

        # Store to the predictions
        predictions.append(forecast)

        # Calculate and save the r2 score
        r2_scores[horizon-1].append(r2_score(y_test, forecast))

    # Store the predictions to the dictionary
    predictions_by_stations[aquifer] = predictions

In [None]:
# Compare the true series with the 5- and 1-day-ahead forecasts of one station.
# The forecasts cover the last `day_len` rows of the frame, so the same negative
# slice selects matching rows in the frame and in the forecast arrays.
# The station is named explicitly instead of relying on the loop variable left over
# from the training cell.
plot_station = 85064
plot_window = slice(-200, None)
plot_frame = aquifer_by_stations[plot_station]
plot_dates = plot_frame['date'][plot_window]

plt.figure(figsize=(8, 4))
plt.plot(plot_dates, plot_frame['altitude_diff'][plot_window], color="royalblue", label="true data")
for plot_horizon, plot_color in zip((5, 1), ("tomato", "grey")):
    plt.plot(plot_dates, predictions_by_stations[plot_station][plot_horizon - 1][plot_window],
             color=plot_color, label=f"forecast, horizon {plot_horizon}")
plt.title(f"Station {plot_station}: altitude_diff, true vs. forecast")
#plt.savefig('../data/interim/plot.svg', format='svg')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Mean and standard deviation of the r2 score across stations, one entry per horizon.
# Iterating r2_scores itself keeps this in step with horizon_max (no hard-coded 5).
r2_average = [np.mean(horizon_scores) for horizon_scores in r2_scores]
std_dev = [np.std(horizon_scores) for horizon_scores in r2_scores]

In [None]:
r2_average

In [None]:
# Write the per-horizon mean r2 scores to a text file, one value per line
with open('../reports/linear-regression/linear-regression-ground-water-r2.txt', 'w') as file:
    file.writelines(f"{item}\n" for item in r2_average)

In [None]:
# Write the per-horizon r2 standard deviations to a text file, one value per line
with open('../reports/linear-regression/linear-regression-ground-water-std-dev.txt', 'w') as file:
    file.writelines(f"{item}\n" for item in std_dev)

In [None]:
# r2_scores is indexed [horizon][station]; flip it to [station][horizon]
r2_scores_transposed = list(map(list, zip(*r2_scores)))
# Map every station id to its list of per-horizon r2 scores
scores = {
    station_id: station_r2
    for station_id, station_r2 in zip(aquifers_list, r2_scores_transposed)
}
scores

In [None]:
# Order the stations by their first-horizon r2 score, ascending
scores_sorted = dict(sorted(scores.items(), key=lambda pair: pair[1][0]))
scores_sorted

In [None]:
# Save the r2_scores
# (station id -> list of per-horizon r2 values, ordered by the first-horizon score)
joblib.dump(scores_sorted, '../reports/linear-regression/linear-regression-ground-water-r2-stations.joblib')

In [None]:
# Save the dictionary with predictions (station id -> one forecast array per horizon)
joblib.dump(predictions_by_stations, '../reports/linear-regression/linear-regression-ground-water-predictions.joblib')