### Content

This dataset contains measured and calculated wave parameters, derived from data collected by oceanographic wave-measuring buoys anchored at Mooloolaba. Coverage period: 30 months.

### Acknowledgements

This data comes from Queensland Government Data - https://data.qld.gov.au/dataset.

* Date/Time: Date
* Hs: Significant wave height, an average of the highest third of the waves in a record
* Hmax: The maximum wave height in the record
* Tz: The zero upcrossing wave period
* Tp: The peak energy wave period
* Peak Direction: Direction (relative to true north) from which the peak period waves are coming
* SST: Approximation of sea surface temperature

In [None]:
# import warnings
# warnings.filterwarnings('ignore')
import numpy as np
import tensorflow as tf
import tensorflow.keras as K
import pandas as pd
import matplotlib.pyplot as plt
from scipy import optimize, signal, interpolate
print(tf.__version__)

In [None]:
# List the contents of the Kaggle input directory; as the last expression in
# the cell, the returned list of names is displayed by the notebook.
import os
os.listdir('/kaggle/input/')

In [None]:

# Read the Mooloolaba wave CSV into a DataFrame and preview its first rows.
df = pd.read_csv('/kaggle/input/Coastal Data System - Waves (Mooloolaba) 01-2017 to 06 - 2019.csv')
df.head()
# df.info

In [None]:
# Pull the SST column as a float array (NaN cannot be stored in an integer
# array) and build a matching sample-index axis.
var = np.array(df['SST'], dtype=float)
time = np.arange(len(var))

# Readings below -90 are treated as missing-value sentinels.
var[var < -90] = np.nan

# Fill every gap by linear interpolation between the surrounding valid
# samples.  np.interp holds the first/last valid value for gaps at either end
# and raises ValueError if there are no valid samples at all.
missing = np.isnan(var)
var[missing] = np.interp(time[missing], time[~missing], var[~missing])

# `sst` aliases `var`, so the gap-filled values are visible under both names.
sst = var




In [None]:
# Chronological 80/20 split; the split index is rounded to a multiple of 1000.
split_time = 1000 * round(0.8 * len(var) / 1000)
print(split_time)

# Re-apply the sentinel mask (readings below -90 become NaN) and use the
# resulting array as the SST series.
var[var < -90] = np.nan
sst = var

time_train, time_valid = time[:split_time], time[split_time:]
x_train, x_valid = sst[:split_time], sst[split_time:]

# One figure for the training segment, then one for the validation segment.
for segment_time, segment_values in ((time_train, x_train), (time_valid, x_valid)):
    plt.figure(figsize=(10, 6))
    plt.plot(segment_time, segment_values)
    plt.show()

In [None]:
def windowed_dataset(series, window_size, batch_size, shuffle_buffer):
    """Turn a 1-D series into a shuffled, batched tf.data pipeline.

    Each example pairs ``window_size`` consecutive values (the features) with
    the value that immediately follows them (the label).

    Args:
        series: Sequence of values to slice into windows.
        window_size: Number of consecutive values used as features.
        batch_size: Number of (features, label) pairs per batch.
        shuffle_buffer: Buffer size passed to ``Dataset.shuffle``.

    Returns:
        A ``tf.data.Dataset`` yielding batches of ``(features, label)``.
    """
    span = window_size + 1  # features plus the trailing label
    ds = tf.data.Dataset.from_tensor_slices(series)
    # Slide one step at a time and drop the incomplete windows at the end.
    ds = ds.window(span, shift=1, drop_remainder=True)
    # Each window is itself a dataset; collapse it into a single tensor.
    ds = ds.flat_map(lambda w: w.batch(span))
    ds = ds.shuffle(shuffle_buffer)
    ds = ds.map(lambda w: (w[:-1], w[-1]))
    return ds.batch(batch_size).prefetch(1)

In [None]:
# Windowing / batching hyper-parameters for the training pipeline.
window_size = 20
batch_size = 32
shuffle_buffer_size = 1000

dataset = windowed_dataset(x_train, window_size, batch_size, shuffle_buffer_size)

# Small fully connected regressor: `window_size` inputs -> one forecast value.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(100, input_shape=[window_size], activation="relu"),
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(1),
])

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by current Keras releases.
# The 'mean_squared_error' metric is kept because its history entry is read
# when the training curve is plotted.
model.compile(
    loss="mse",
    optimizer=tf.keras.optimizers.SGD(learning_rate=1e-6, momentum=0.9),
    metrics=['mean_squared_error'],
)
history = model.fit(dataset, epochs=20, verbose=1)

In [None]:
# Forecast the validation range with a single batched predict call.
# The window starting at index i covers sst[i : i + window_size] and forecasts
# sst[i + window_size], so the first validation target (index split_time) uses
# the window starting at split_time - window_size.
# Assumes split_time >= window_size.
window_starts = np.arange(split_time - window_size, len(sst) - window_size)
window_index = window_starts[:, np.newaxis] + np.arange(window_size)
forecast = model.predict(sst[window_index])
results = forecast[:, 0]

# Overlay the forecast (pink) on the observed validation series (cyan).
plt.figure(figsize=(10, 6))
plt.plot(time_valid, x_valid, color='#00FFFF')
plt.plot(time_valid, results, color='#FF69B4')

In [None]:
from scipy.stats import pearsonr

# Pearson correlation between the observed validation series and the
# forecast, printed together with the second value pearsonr returns.
corr, p_value = pearsonr(x_valid, results)
print(corr, p_value)

In [None]:

# Per-epoch values recorded by model.fit.
acc = history.history['mean_squared_error']  # holds the MSE metric, despite the name
loss = history.history['loss']
epochs = range(len(loss))

# The legend text must go through `label=`; passed positionally, plt.plot
# parses the string as another data argument instead of a label.
plt.plot(epochs, loss, 'r', label="Training loss")
plt.title('Training loss')
plt.ylim([0.03, 0.06])
plt.legend()
plt.show()

### define seasonality


In [None]:
def linf(x, a, b):
    """Evaluate the straight line ``a * x + b`` at ``x``.

    Args:
        x: Point(s) at which to evaluate the line.
        a: Slope.
        b: Intercept.

    Returns:
        ``a * x + b``.
    """
    scaled = a * x
    return scaled + b
# Least-squares fit of the straight line `linf` to the series, then evaluate
# the fitted line at every sample index to obtain the linear trend.
paramsl, params_covariance = optimize.curve_fit(
    f=linf, xdata=time, ydata=sst, maxfev=10000
)
baseline = linf(np.arange(len(var)), *paramsl)

def func(x, a, b, c, d):
    """Evaluate a sinusoid with amplitude ``a``, phase ``c`` and offset ``d``.

    The sine argument is ``b * x * pi / (360 * 24 * 2) + c``, so ``b`` scales
    the frequency relative to a fixed divisor of 17280.
    NOTE(review): the divisor presumably encodes samples per day times days
    per year -- confirm against the data's sampling interval.

    Args:
        x: Point(s) at which to evaluate the curve.
        a: Amplitude.
        b: Frequency scale factor.
        c: Phase shift in radians.
        d: Vertical offset.

    Returns:
        ``a * sin(b * x * pi / 17280 + c) + d``.
    """
    phase = b * x * np.pi / (360 * 24 * 2) + c
    return a * np.sin(phase) + d

# Fit the sinusoid `func` to the series after subtracting the linear trend.
detrended = sst - baseline
params, params_covariance = optimize.curve_fit(
    func, time, detrended, maxfev=10000
)

# De-trended series (cyan) with the fitted sinusoid (pink) drawn over it.
plt.figure(figsize=(10, 4))
plt.plot(np.arange(len(var)), detrended, color='#00FFFF')
plt.plot(func(time, *params), color='#FF69B4')

In [None]:

# Residual series: subtract the fitted linear trend, then the fitted sinusoid.
sample_index = np.arange(len(var))
remove_ssn = sst - linf(sample_index, *paramsl) - func(sample_index, *params)

# Same chronological split as before, now applied to the residual series.
time_train, time_valid = time[:split_time], time[split_time:]
x_train, x_valid = remove_ssn[:split_time], remove_ssn[split_time:]

# One figure for the training segment, then one for the validation segment.
for segment_time, segment_values in ((time_train, x_train), (time_valid, x_valid)):
    plt.figure(figsize=(10, 6))
    plt.plot(segment_time, segment_values)
    plt.show()


In [None]:
# Windowing / batching hyper-parameters for the training pipeline.
window_size = 20
batch_size = 32
shuffle_buffer_size = 1000

dataset = windowed_dataset(x_train, window_size, batch_size, shuffle_buffer_size)

# Small fully connected regressor: `window_size` inputs -> one forecast value.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(100, input_shape=[window_size], activation="relu"),
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(1),
])

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by current Keras releases.
# The 'mean_squared_error' metric is kept because its history entry is read
# when the training curve is plotted.
model.compile(
    loss="mse",
    optimizer=tf.keras.optimizers.SGD(learning_rate=1e-6, momentum=0.9),
    metrics=['mean_squared_error'],
)
history = model.fit(dataset, epochs=20, verbose=1)

In [None]:
# Forecast the validation range with a single batched predict call.
# The window starting at index i covers remove_ssn[i : i + window_size] and
# forecasts remove_ssn[i + window_size], so the first validation target
# (index split_time) uses the window starting at split_time - window_size.
# Assumes split_time >= window_size.
window_starts = np.arange(split_time - window_size, len(remove_ssn) - window_size)
window_index = window_starts[:, np.newaxis] + np.arange(window_size)
forecast = model.predict(remove_ssn[window_index])
results = forecast[:, 0]

# Overlay the forecast (pink) on the observed validation series (cyan).
plt.figure(figsize=(10, 6))
plt.plot(time_valid, x_valid, color='#00FFFF')
plt.plot(time_valid, results, color='#FF69B4')

In [None]:
# Pearson correlation between the observed validation series and the
# forecast, printed together with the second value pearsonr returns.
corr, p_value = pearsonr(x_valid, results)
print(corr, p_value)

In [None]:
# Per-epoch values recorded by model.fit.
acc = history.history['mean_squared_error']  # holds the MSE metric, despite the name
loss = history.history['loss']
epochs = range(len(loss))

# The legend text must go through `label=`; passed positionally, plt.plot
# parses the string as another data argument instead of a label.
plt.plot(epochs, loss, 'r', label="Training loss")
plt.title('Training loss')
plt.ylim([0.02, 0.2])
plt.legend()
plt.show()


## Conclusion

It seems that even the simplest model can predict the SST, but we cannot identify the physical reasons behind this.

Furthermore, the results appear to be almost the same whether or not the trend and seasonality are removed.