# Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import datetime

from sklearn.model_selection import train_test_split, cross_validate, learning_curve, GridSearchCV, cross_val_predict
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, ElasticNet
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, VotingRegressor
import joblib
from sklearn.preprocessing import StandardScaler, RobustScaler

In [17]:
#Clean DataFrames
DePanne = pd.read_csv('raw_data/combi/DePanne_combined.csv', index_col = 'Timestamp')
#Ostende = pd.read_csv('/content/drive/MyDrive/SurfWaves/oostend_clean.csv', index_col = 'Timestamp')
#Knokke = pd.read_csv('/content/drive/MyDrive/SurfWaves/Knokke/knokke_clean.csv', index_col = 'Timestamp')

In [18]:
DePanne

Unnamed: 0_level_0,wave_height,wave_period,wind_direction,wind_speed,tide
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-01 00:00:00+00:00,22.0,2.39,,,
2000-01-01 00:15:00+00:00,22.0,2.49,,,
2000-01-01 00:30:00+00:00,23.0,2.57,,,
2000-01-01 00:45:00+00:00,23.0,2.48,,,
2000-01-01 01:00:00+00:00,23.0,2.59,,,
...,...,...,...,...,...
2021-12-30 22:00:00+00:00,62.0,3.25,6.94,217.0,434.3
2021-12-30 22:30:00+00:00,53.0,3.17,6.41,212.0,416.0
2021-12-30 23:00:00+00:00,51.0,3.18,6.32,209.0,393.0
2021-12-30 23:30:00+00:00,51.0,3.12,6.60,211.0,363.8


# Pipeline

In [19]:
from SurfWaves.utils import trans_func, cos_list,sin_list
from sklearn.compose import make_column_transformer
from datetime import timedelta
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, FunctionTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import math

num_transformer = RobustScaler()
custom_tr = FunctionTransformer(trans_func)
cycle_tr_sin = FunctionTransformer(sin_list)
cycle_tr_cos = FunctionTransformer(cos_list)

preprocessor_tr = make_column_transformer(
    (num_transformer, ['wave_period', 'wind_speed']),
    (custom_tr, ['tide']),
    (cycle_tr_sin, ['wind_direction']),
    (cycle_tr_cos, ['wind_direction']),
    remainder='passthrough'
    )   

pipe_preproc = Pipeline([
    ('preproc', preprocessor_tr),
    ('imputer', IterativeImputer())
])
pipe_preproc

In [95]:
import pickle
with open("raw_data/pipeline_4.pkl", "wb") as file:
    pickle.dump(pipe_preproc, file)

# Baseline

Average wave height
DePanne = 58 cm
Ostende = 65 cm
Knokke = 48 cm
Baseline = 50%

#De Panne

In [22]:
X_DePanne = DePanne.drop(columns=['wave_height'])
y_DePanne = DePanne['wave_height']

In [72]:
y_DePanne.to_csv('raw_data/DePanne_yraw.csv')

In [23]:
preproc_model_dp = pipe_preproc.fit(X_DePanne)
X_DePanne_trans = pd.DataFrame(preproc_model_dp.transform(X_DePanne))



In [80]:
X_DePanne

Unnamed: 0_level_0,wave_period,wind_direction,wind_speed,tide
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-01 00:00:00+00:00,2.39,,,
2000-01-01 00:15:00+00:00,2.49,,,
2000-01-01 00:30:00+00:00,2.57,,,
2000-01-01 00:45:00+00:00,2.48,,,
2000-01-01 01:00:00+00:00,2.59,,,
...,...,...,...,...
2021-12-30 22:00:00+00:00,3.25,6.94,217.0,434.3
2021-12-30 22:30:00+00:00,3.17,6.41,212.0,416.0
2021-12-30 23:00:00+00:00,3.18,6.32,209.0,393.0
2021-12-30 23:30:00+00:00,3.12,6.60,211.0,363.8


In [73]:
X_DePanne_trans.to_csv('raw_data/DePanne_x_trans.csv')

In [68]:
X_DePanne_trans

Unnamed: 0,0,1,2,3,4
0,-1.056818,-0.060614,3.0,0.106852,0.993865
1,-0.943182,-0.062796,3.0,0.107666,0.993647
2,-0.852273,-0.064542,3.0,0.108318,0.993472
3,-0.954545,-0.062578,3.0,0.107585,0.993669
4,-0.829545,-0.064979,3.0,0.108481,0.993429
...,...,...,...,...,...
675787,-0.079545,0.083333,3.0,0.120830,0.992673
675788,-0.170455,0.051282,3.0,0.111642,0.993748
675789,-0.159091,0.032051,3.0,0.110081,0.993923
675790,-0.227273,0.044872,3.0,0.114937,0.993373


In [24]:
X_train_DePanne , X_test_DePanne, y_train_DePanne, y_test_DePanne = train_test_split(X_DePanne_trans, y_DePanne, test_size = 0.3)

In [78]:
#Test if Linear Regression is a good fit for our Data > Does not output a model
model = LinearRegression()
cv_results = cross_validate(model, X_train_DePanne, y_train_DePanne, cv=5)
DePanne_lin_reg_score = cv_results['test_score'].mean()
DePanne_lin_reg_score

0.5172218378057039

In [33]:
model.fit(X_train_DePanne,y_train_DePanne)

In [75]:
#Real time data
wind_speed = 15
wind_direction= 300
wave_period= 4.04
tide= 141.8

df = pd.DataFrame([[wind_speed, wind_direction,wave_period,
                           tide]],
                          columns=['wind_speed', 'wind_direction','wave_period',
                           'tide'])

In [79]:
#Preprocess real-time input data
x = preproc_model_dp.fit_transform(df)
x

array([[ 0.       ,  0.       ,  1.       , -0.8660254,  0.5      ]])

In [34]:
#Prediction based on real-time data
res = model.predict(x)
res

array([200.82096035])

In [35]:
#KNN
DePanne_knn = KNeighborsRegressor()

grid = {'n_neighbors': [5, 10, 15, 20, 25, 50],
        'p':[1,2], 
        'weights':['uniform', 'distance']}

DePanne_search_knn = GridSearchCV(DePanne_knn, 
                      grid,
                      scoring='r2',
                      cv=5, 
                      n_jobs=-1
                     )

KNN_fitted_DePanne = DePanne_search_knn.fit(X_train_DePanne, y_train_DePanne)
DePanne_knn_score = DePanne_search_knn.best_score_
print('Best score is:', DePanne_knn_score)
print('Best parameters are:', DePanne_search_knn.best_params_)
print('Best estimator is:', DePanne_search_knn.best_estimator_)

Best score is: 0.538236398116896
Best parameters are: {'n_neighbors': 50, 'p': 1, 'weights': 'distance'}
Best estimator is: KNeighborsRegressor(n_neighbors=50, p=1, weights='distance')


In [40]:
y_predict = DePanne_search_knn.predict(x)
y_predict

array([38.38456244])

In [66]:
#SGD
DePanne_sgd = SGDRegressor()

grid = {'loss': ['squared_error', 'huber', 'epsilon_insensitive'],
        'penalty': ['l2', 'l1', 'elasticnet'],
        'alpha': [0.01, 0.1, 1],
       }

DePanne_search_sgd = GridSearchCV(DePanne_sgd, 
                      grid,
                      scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_root_mean_squared_error'],
                      cv=5, 
                      n_jobs=-1,
                      refit='neg_mean_squared_error'
                     )

SGD_fitted_DePanne = DePanne_search_sgd.fit(X_train_DePanne, y_train_DePanne)

In [49]:
DePanne_sgd_score = DePanne_search_sgd.cv_results_['mean_test_r2'].max()
print('Best MEA is:', DePanne_search_sgd.cv_results_['mean_test_neg_mean_absolute_error'].max())
print('Best MSE is:', DePanne_search_sgd.cv_results_['mean_test_neg_mean_squared_error'].max())
print('Best RMSE is:', DePanne_search_sgd.cv_results_['mean_test_neg_root_mean_squared_error'].max())
print('Best R² is:', DePanne_search_sgd.cv_results_['mean_test_r2'].max())

Best MEA is: -20.401775980110155
Best MSE is: -759.2230427626868
Best RMSE is: -27.55385661530979
Best R² is: 0.5168555680316345


In [67]:
SGD_fitted_DePanne.predict(x)

array([-306.3449453])

In [81]:
DePanne_xgb = XGBRegressor()

grid = {'max_depth': [5, 10, 15],
        'n_estimators':[15, 20, 25, 50], 
        'learning_rate':[0.01, 0.1]}

DePanne_search_xgb = GridSearchCV(DePanne_xgb, 
                      grid,
                      scoring='r2',
                      cv=5, 
                      n_jobs=-1
                     )

XGB_fitted_DePanne = DePanne_search_xgb.fit(X_train_DePanne, y_train_DePanne)

In [82]:
XGB_fitted_DePanne.score(X_test_DePanne,y_test_DePanne)

0.5565814834957037

In [83]:
XGB_fitted_DePanne.predict(x)

array([60.77579], dtype=float32)

In [86]:
#Decision Tree
DePanne_tree = DecisionTreeRegressor()

cv_results = cross_validate(DePanne_tree, X_train_DePanne, y_train_DePanne, scoring="r2", cv=5)
DePanne_tree_score = cv_results['test_score'].mean()
DePanne_tree.fit(X_train_DePanne, y_train_DePanne)
DePanne_tree_score

0.5302290701920904

In [87]:
DePanne_tree.predict(x)

array([82.19512195])

In [92]:
#Stacking
DePanne_ensemble = VotingRegressor(estimators=[('knn', DePanne_search_knn), ('xgb', DePanne_search_xgb), ('dt', DePanne_tree)], weights=[.2,.6,.2])
DePanne_ensemble_stacked = DePanne_ensemble.fit(X_train_DePanne, y_train_DePanne)
DePanne_ensemble_score = DePanne_ensemble.score(X_test_DePanne, y_test_DePanne)
DePanne_ensemble_score



0.5547259467502557

In [93]:
DePanne_ensemble_stacked.predict(x)

array([59.37928234])

In [94]:
filename = 'DePanne_stacked_3.joblib'
joblib.dump(DePanne_ensemble_stacked, filename)

['DePanne_stacked_3.joblib']

# Ostende

In [None]:
model = LinearRegression()
cv_results = cross_validate(model, X_train_Ostende, y_train_Ostende, cv=5)
Ostende_lin_reg_score = cv_results['test_score'].mean()
Ostende_lin_reg_score

0.6019087292548797

In [None]:
#Linear Regression
Ostende_linear_regression = LinearRegression()
Ostende_linear_regression.fit(X_train_Ostende, y_train_Ostende)

LinearRegression()

In [None]:
#KNN
Ostende_knn = KNeighborsRegressor()

grid = {'n_neighbors': [15, 20, 25, 50],
        'p':[1,2], 
        'weights':['uniform', 'distance']}

Ostende_search_knn = GridSearchCV(Ostende_knn, 
                      grid,
                      scoring='r2',
                      cv=5, 
                      n_jobs=-1
                     )

KNN_fitted_Ostende = Ostende_search_knn.fit(X_train_Ostende, y_train_Ostende)
Ostende_knn_score = Ostende_search_knn.best_score_
print('Best score is:', Ostende_knn_score)
print('Best parameters are:', Ostende_search_knn.best_params_)
print('Best estimator is:', Ostende_search_knn.best_estimator_)

Best score is: 0.6597294554191816
Best parameters are: {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
Best estimator is: KNeighborsRegressor(n_neighbors=15, weights='distance')


In [None]:
#SGD
Ostende_sgd = SGDRegressor()

grid = {'loss': ['squared_error', 'huber', 'epsilon_insensitive'],
        'penalty': ['l2', 'l1', 'elasticnet'],
        'alpha': [0.01, 0.1, 1],
       }

Ostende_search_sgd = GridSearchCV(Ostende_sgd, 
                      grid,
                      scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_root_mean_squared_error'],
                      cv=5, 
                      n_jobs=-1,
                      refit=False
                     )

SGD_fitted_Ostende = Ostende_search_sgd.fit(X_train_Ostende, y_train_Ostende)

In [None]:
Ostende_sgd_score = Ostende_search_sgd.cv_results_['mean_test_r2'].max()
print('Best MEA is:', Ostende_search_sgd.cv_results_['mean_test_neg_mean_absolute_error'].max())
print('Best MSE is:', Ostende_search_sgd.cv_results_['mean_test_neg_mean_squared_error'].max())
print('Best RMSE is:', Ostende_search_sgd.cv_results_['mean_test_neg_root_mean_squared_error'].max())
print('Best R² is:', Ostende_search_sgd.cv_results_['mean_test_r2'].max())

Best MEA is: -0.42283742941562813
Best MSE is: -0.33181666769588813
Best RMSE is: -0.5759619573270667
Best R² is: 0.5830685774385433


In [None]:
#XGB
Ostende_xgb = XGBRegressor()

grid = {'max_depth': [5, 10, 15],
        'n_estimators':[15, 20, 25, 50], 
        'learning_rate':[0.01, 0.1]}

Ostende_search_xgb = GridSearchCV(Ostende_xgb, 
                      grid,
                      scoring='r2',
                      cv=5, 
                      n_jobs=-1
                     )

XGB_fitted_Ostende = Ostende_search_xgb.fit(X_train_Ostende, y_train_Ostende)

In [None]:
Ostende_xgb_score = Ostende_xgb.score(X_test_Ostende, y_test_Ostende)

In [None]:
#Decision Tree
Ostende_tree = DecisionTreeRegressor()

cv_results = cross_validate(Ostende_tree, X_train_Ostende, y_train_Ostende, scoring="r2", cv=5)
Ostende_tree_score = cv_results['test_score'].mean()
Ostende_tree_score

0.48725766974187346

In [None]:
#Stacking
Ostende_ensemble = VotingRegressor(estimators=[('lr', Ostende_linear_regression), ('knn', Ostende_search_knn), ('xgb', Ostende_search_xgb)], weights=[1,1,1])
Ostende_ensemble_stacked = Ostende_ensemble.fit(X_train_Ostende, y_train_Ostende)
Ostende_ensemble_score = Ostende_ensemble.score(X_test_Ostende, y_test_Ostende)
Ostende_ensemble_score

0.710446461185847

In [None]:
filename = 'Ostende_stacked_2.joblib'
joblib.dump(Ostende_ensemble_stacked, filename)

['Ostende_stacked_2.joblib']

#Knokke

In [None]:
#Linear Regression
Knokke_linear_regression = LinearRegression()
cv_results = cross_validate(Knokke_linear_regression, X_train_Knokke, y_train_Knokke, cv=5)
Knokke_lin_reg_score = cv_results['test_score'].mean()
Knokke_lin_reg_score

0.6496839068574192

In [None]:
#Linear Regression
Knokke_linear_regression = LinearRegression()
Knokke_linear_regression.fit(X_train_Knokke, y_train_Knokke)

LinearRegression()

In [None]:
#KNN
Knokke_knn = KNeighborsRegressor()

grid = {'n_neighbors': [15, 20, 25, 50],
        'p':[1,2], 
        'weights':['uniform', 'distance']}

Knokke_search_knn = GridSearchCV(Knokke_knn, 
                      grid,
                      scoring='r2',
                      cv=5, 
                      n_jobs=-1
                     )

KNN_fitted_Knokke = Knokke_search_knn.fit(X_train_Knokke, y_train_Knokke)
Knokke_knn_score = Knokke_search_knn.best_score_
print('Best score is:', Knokke_knn_score)
print('Best parameters are:', Knokke_search_knn.best_params_)
print('Best estimator is:', Knokke_search_knn.best_estimator_)

Best score is: 0.7197418973799665
Best parameters are: {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
Best estimator is: KNeighborsRegressor(n_neighbors=15, weights='distance')


In [None]:
#SGD
Knokke_sgd = SGDRegressor()

grid = {'loss': ['squared_error', 'huber', 'epsilon_insensitive'],
        'penalty': ['l2', 'l1', 'elasticnet'],
        'alpha': [0.01, 0.1, 1],
       }

Knokke_search_sgd = GridSearchCV(Knokke_sgd, 
                      grid,
                      scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_root_mean_squared_error'],
                      cv=5, 
                      n_jobs=-1,
                      refit=False
                     )

SGD_fitted_Knokke = Knokke_search_sgd.fit(X_train_Knokke, y_train_Knokke)

In [None]:
Knokke_sgd_score = search.cv_results_['mean_test_r2'].max()
print('Best MEA is:', search.cv_results_['mean_test_neg_mean_absolute_error'].max())
print('Best MSE is:', search.cv_results_['mean_test_neg_mean_squared_error'].max())
print('Best RMSE is:', search.cv_results_['mean_test_neg_root_mean_squared_error'].max())
print('Best R² is:', search.cv_results_['mean_test_r2'].max())

Best MEA is: -0.4210729607931417
Best MSE is: -0.34366947441633144
Best RMSE is: -0.5861466557076526
Best R² is: 0.6187404043304496


In [None]:
#XGB
Knokke_xgb = XGBRegressor()

grid = {'max_depth': [5, 10, 15],
        'n_estimators':[15, 20, 25, 50], 
        'learning_rate':[0.01, 0.1]}

Knokke_search_xgb = GridSearchCV(Knokke_xgb, 
                      grid,
                      scoring='r2',
                      cv=5, 
                      n_jobs=-1
                     )

XGB_fitted_Knokke = Knokke_search_xgb.fit(X_train_Knokke, y_train_Knokke)

In [None]:
Knokke_xgb_score = Knokke_xgb.score(X_test_Knokke, y_test_Knokke)

In [None]:
#Decision Tree
Knokke_tree = DecisionTreeRegressor()

cv_results = cross_validate(Knokke_tree, X_train_Knokke, y_train_Knokke, scoring="r2", cv=5)
Knokke_tree_score = cv_results['test_score'].mean()
Knokke_tree_score

0.5900120665440847

In [None]:
#Stacking
Knokke_ensemble = VotingRegressor(estimators=[('lr', Knokke_linear_regression), ('knn', Knokke_search_knn), ('xgb', Knokke_search_xgb)], weights=[1,1,1])
Knokke_ensemble_stacked = Knokke_ensemble.fit(X_train_Knokke, y_train_Knokke)
Knokke_ensemble_score = Knokke_ensemble.score(X_test_Knokke, y_test_Knokke)
Knokke_ensemble_score

0.7643329753437005

In [None]:
filename = 'Knokke_stacked_2.joblib'
joblib.dump(Knokke_ensemble_stacked, filename)

['Knokke_stacked_2.joblib']

In [None]:
Knokke

Unnamed: 0_level_0,wave_height,wave_period,wind_speed,wind_direction,tide,wind_direction_sin,wind_direction_cos
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-02-22 14:00:00+00:00,-0.500000,-0.736264,-0.833333,132.0,3,0.743145,-0.669131
2019-02-22 15:00:00+00:00,-0.526316,-0.428571,-0.833333,125.0,3,0.819152,-0.573576
2019-02-22 16:00:00+00:00,-0.552632,0.010989,-0.750000,116.0,3,0.898794,-0.438371
2019-02-22 17:00:00+00:00,-0.631579,0.076923,-0.750000,106.0,3,0.961262,-0.275637
2019-02-22 18:00:00+00:00,-0.710526,-0.153846,-0.666667,96.0,2,0.994522,-0.104528
...,...,...,...,...,...,...,...
2022-05-28 17:00:00+00:00,0.447368,-0.186813,0.666667,353.0,1,-0.121869,0.992546
2022-05-28 18:00:00+00:00,0.526316,-0.263736,0.666667,356.0,1,-0.069756,0.997564
2022-05-28 19:00:00+00:00,0.763158,-0.340659,0.416667,357.0,1,-0.052336,0.998630
2022-05-28 20:00:00+00:00,1.026316,-0.307692,0.250000,357.0,1,-0.052336,0.998630


#Overview

In [None]:
data = {'DePanne Score': [DePanne_lin_reg_score, DePanne_knn_score, DePanne_sgd_score, DePanne_xgb_score, DePanne_tree_score, DePanne_ensemble_score],
        'Ostende Score': [Ostende_lin_reg_score, Ostende_knn_score, Ostende_sgd_score, Ostende_xgb_score, Ostende_tree_score, Ostende_ensemble_score],
        'Knokke Score': [Knokke_lin_reg_score, Knokke_knn_score, Knokke_sgd_score, Knokke_xgb_score, Knokke_tree_score, Knokke_ensemble_score]
        }

df = pd.DataFrame(data, index=['Linear Regression', 'KNN', 'SGD', 'XGB', 'Decision Tree', 'Stacking'])

In [None]:
df.style.background_gradient(cmap="Greens")

Unnamed: 0,DePanne Score,Ostende Score,Knokke Score
Linear Regression,0.798026,0.60403,0.649684
KNN,0.894669,0.658912,0.725089
SGD,0.795221,0.590996,0.61874
XGB,0.900532,0.72438,0.78814
Decision Tree,0.490306,0.487258,0.590012
Stacking,0.888834,0.706307,0.767204


In [None]:
filename = 'DePanne_stacked.joblib'
joblib.dump(DePanne_ensemble, filename)
 
filename = 'Ostende_stacked.joblib'
joblib.dump(Ostende_ensemble, filename)

filename = 'Knokke_stacked.joblib'
joblib.dump(Knokke_ensemble, filename)

['Knokke_stacked.joblib']