# Dataset
Usamos o nível do Rio Negro, de novembro de 2014 até dezembro de 2020, como dataset. Os dados foram obtidos através do portal do Porto de Manaus (https://www.portodemanaus.com.br/).

In [None]:
import pandas as pd

# Load the river-level CSV and index it by its parsed 'Date' column.
# The column is parsed directly as the index and the index is then named 'dt'.
# This avoids the dict form of `parse_dates` and the `infer_datetime_format`
# flag, both of which are deprecated in pandas 2.x.
df = pd.read_csv(
    "../input/rio-negro-level/rio_negro_level.csv",
    sep=',',
    parse_dates=['Date'],
    index_col='Date',
    na_values=['nan', '?'],  # tokens to read as missing values
    low_memory=False,
)
df.index.name = 'dt'
df.head()

### Colunas

In [None]:
# Print a summary of the DataFrame (index, columns, non-null counts, dtypes).
df.info()

In [None]:
# Per the original author's note, the dataset holds one capture per day.
# Bucket the rows by month of the datetime index and keep each bucket's mean.
monthly_bins = pd.Grouper(freq='M')
df = df.groupby(monthly_bins).mean()
df.head()

In [None]:
import matplotlib.pyplot as plt

# Report how many rows the 'Level' column has, then plot it against the index.
n_measurements = len(df['Level'])
print(f"Number of measurements: {n_measurements}")
df.plot(y='Level', rot=25, figsize=(16,8));

In [None]:
# Keep only the 'Level' column as a Series; it is the single variable modeled below.
level = df['Level']
level.head()

In [None]:
import numpy as np

# Use the first 75% of the series for training and the remainder for testing.
train_length = int(len(level) * .75)

train = level[:train_length].values
test = level[train_length:].values

# Draw both segments on one positional x axis: train first, test right after it.
plt.plot(np.arange(train.size), train)
plt.plot(np.arange(train.size, train.size + test.size), test)

# Reshape each segment to a column vector of shape (n_samples, 1).
train = train.reshape(-1, 1)
test = test.reshape(-1, 1)

In [None]:
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

# Work with input sequences of 12 months, served one sample per batch.
length = 12
window_options = dict(length=length, batch_size=1)

# In both generators the same array is passed as the data and as the targets.
generator = TimeseriesGenerator(train, train, **window_options)
validation_generator = TimeseriesGenerator(test, test, **window_options)

# Criando modelo

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stacked GRU regressor: three recurrent layers followed by a single-unit Dense output.
model = Sequential()
# Only the first layer declares the input shape: `length` time steps, 1 feature.
model.add(GRU(40, activation='relu', return_sequences=True, input_shape=(length, 1)))
# No input_shape here: this layer consumes the 40-unit sequence emitted by the
# previous GRU, so (length, 1) would not describe its actual input.
model.add(GRU(20, activation='relu', return_sequences=True))
# The last recurrent layer does not set return_sequences, so it feeds Dense
# a single vector per sample rather than a sequence.
model.add(GRU(10, activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
model.summary()

In [None]:
# Train the model and print the elapsed wall-clock time.
from datetime import datetime

start = datetime.now()
epochs = 30
# Stop once val_loss has not improved for 10 epochs, and keep only the
# best-scoring model (by val_loss) in model.hdf5.
early_stop = EarlyStopping(monitor='val_loss', patience=10)
ckpt = ModelCheckpoint('model.hdf5', save_best_only=True, monitor='val_loss', verbose=1)
# Model.fit accepts the generators directly; Model.fit_generator is deprecated.
history = model.fit(
    generator,
    steps_per_epoch=len(generator),
    epochs=epochs,
    validation_data=validation_generator,
    callbacks=[early_stop, ckpt])
end = datetime.now()
print(end - start)

In [None]:
# Plot the per-epoch training and validation loss stored in the fit history.
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']

epochs_x = range(1, len(loss_values) + 1)  # 1-based epoch numbers for the x axis
plt.figure(figsize=(16,8))
plt.plot(epochs_x, loss_values, 'bo', label='Training loss')
plt.plot(epochs_x, val_loss_values, 'b', label='Validation loss')
# Only loss curves are drawn here, so the title no longer mentions accuracy.
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

# Evaluating

In [None]:
# Restore the weights stored in model.hdf5.
model.load_weights("model.hdf5")

# Forecast len(test) steps ahead, feeding each prediction back in as input.
test_predictions = []
# Seed the rolling window with the last `length` training values,
# shaped (1, length, 1) as a single-sample batch.
first_eval_batch = train[-length:]
current_batch = first_eval_batch.reshape((1, length, 1))
for _ in range(len(test)):
    # Predict one step ahead; [0] takes the first (only) row of the output.
    current_pred = model.predict(current_batch)[0]
    test_predictions.append(current_pred)
    # Slide the window: drop the oldest step and append the new prediction.
    remaining_steps = current_batch[:, 1:, :]
    newest_step = [[current_pred]]
    current_batch = np.append(remaining_steps, newest_step, axis=1)

In [None]:
# Overlay the training data, the test data and the predictions.
# The test data and the predictions share the x positions that follow the train segment.
train_x = np.arange(len(train))
test_x = np.arange(len(train), len(train) + len(test))

plt.plot(train_x, train)
plt.plot(test_x, test)
plt.plot(test_x, test_predictions)

In [None]:
# Mean squared error between the test values and the predictions.
actual = test[:, 0]
predicted = np.array(test_predictions)[:, 0]
squared_errors = np.square(actual - predicted)
loss = np.mean(squared_errors, axis=-1)
print(f"mse: {loss}")