# Load Pre-Processed Data & Define Train-, Test- and Validation Sets

In [None]:
from datetime import datetime
from os import listdir
from os.path import isfile, join

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Destination of the joined measurement table.
output_file = "OUTPUT PATH"

# Folder that holds the .jpg frames; the index CSVs are written into it too.
frame_folder = "FRAME FOLDER"

# CSV files with the water level and velocity measurements.
input_file_velocity_measurements = "PATH TO VELOCITY MEASUREMENTS"
input_file_water_level_measurements = "PATH TO WATER LEVEL MEASUREMENTS"

In [None]:
# The index files live inside the frame folder. Build their paths with
# os.path.join so the separator matches the host OS; a hard-coded "\" only
# works on Windows and would become part of the file name elsewhere.
frame_index = join(frame_folder, "frame_index.csv")
train_set_index = join(frame_folder, "train_index.csv")
test_set_index = join(frame_folder, "test_index.csv")
validation_set_index = join(frame_folder, "validation_index.csv")

In [None]:
# Show the frame index path as the cell's output.
frame_index

In [None]:
# Read the two measurement tables from their CSV files.
water_level = pd.read_csv(input_file_water_level_measurements)
velocity = pd.read_csv(input_file_velocity_measurements)

# Keep a copy of every water-level timestamp with its last seven characters
# cut off (going by the column name, the sub-second part).
water_level["Timestamp_no_millis"] = water_level["Timestamp"].map(
    lambda stamp: stamp[:-7]
)

# Index both tables by the full timestamp, then keep only the timestamps
# that occur in both of them.
water_level = water_level.set_index("Timestamp")
velocity = velocity.set_index("Timestamp")
data = water_level.join(other=velocity, how="inner")

print(data)

Write to file:

In [None]:
# Persist the joined measurement table as CSV.
data.to_csv(path_or_buf=output_file)

## Correlation

In [None]:
# Names of the two columns that are compared below.
# NOTE: this rebinds `velocity`; from here on it is the column name, not the
# DataFrame that was read from the velocity CSV.
level, velocity = "Percentage Full [%%]", "velocity"

In [None]:
# Scatter plot of the velocity column against the water level column.
fig = plt.figure()

plt.scatter(x=data[level], y=data[velocity])
plt.ylabel("Velocity")
plt.xlabel("Water Level")

plt.show()
plt.close()

# Correlation between the two columns, shown as the cell's output.
data[level].corr(other=data[velocity])

## Train-, Test- & Validation Sets

In [None]:
# Collect the names of all regular files directly inside the frame folder
# whose name ends in ".jpg", as a one-column table.
frames = pd.DataFrame(
    {
        "Frame": [
            name
            for name in listdir(frame_folder)
            if name.endswith(".jpg") and isfile(join(frame_folder, name))
        ]
    }
)

def _date_extractor(frame_name): 
    
    extractor = r"2021_\d\d_\d\d_\d\d_\d\d_\d\d_\d+"
    #print(frame_name)
    match = re.findall(extractor, frame_name)
    date_str = match[0]
    
    return date_str

# Derive each frame's timestamp from its file name, plus a copy of it with
# the last seven characters cut off.
frames["Timestamp"] = frames["Frame"].apply(_date_extractor)
frames["Timestamp_no_millis"] = frames["Timestamp"].map(lambda stamp: stamp[:-7])

# Index the frames by their full timestamp and save the table as the frame index.
frames = frames.set_index("Timestamp")
frames.to_csv(path_or_buf=frame_index)

frames

### Water Level Split

In [None]:
def _get_label(i):
    return "L%d" % i

# Number of bins the water level column is cut into, and one label per bin.
NUMBER_OF_BINS = 10
labels = list(map(_get_label, range(NUMBER_OF_BINS)))

water_level_label = "Water Level Label"

# Tag every row with the label of the bin its water level falls into and
# store that label as a plain string.
data[water_level_label] = pd.cut(
    data[level], bins=NUMBER_OF_BINS, labels=labels
).astype(str)

In [None]:
# Show the table, now including the bin label column, as the cell's output.
data

In [None]:
# Values written into the set column.
train_set_label = "Train"
test_set_label = "Test"
validation_set_label = "Validation"

# Share of each bin that goes to each set. train_ratio is not read by the
# code visible in this notebook: the training set receives whatever the
# test and validation shares leave over.
train_ratio, test_ratio, validation_ratio = 0.6, 0.2, 0.2

# Column holding the set assignment; every row starts out unassigned.
set_label = "Set"
data[set_label] = ""

# Fix NumPy's global seed before the sets are drawn.
np.random.seed(1989)


def _get_set_assigner(n):
    """Yield ``n`` set labels in shuffled order.

    ``int(n * test_ratio)`` of the labels are the test label,
    ``int(n * validation_ratio)`` are the validation label and every
    remaining one is the training label. The shuffle is done with
    ``Series.sample(frac=1)`` and passes no explicit ``random_state``.

    Args:
        n: Number of labels to produce.

    Yields:
        One set label per step, ``n`` in total.
    """
    n_test = int(n * test_ratio)
    n_validation = int(n * validation_ratio)

    # Start with every slot assigned to training ...
    labels = pd.Series([train_set_label]).repeat(n).reset_index(drop=True)

    # ... then hand the leading slots to test and the slots right after
    # them to validation.
    labels.iloc[:n_test] = test_set_label
    labels.iloc[n_test:n_test + n_validation] = validation_set_label

    # Shuffle so the position of a label carries no information.
    shuffled = labels.sample(frac=1).reset_index(drop=True)

    yield from shuffled

            
def _set_assigner_wrapper(value, assigner):
    return next(assigner)
    

# Draw the set labels bin by bin, so every water-level bin is split with
# the same ratios.
for bin_number in range(NUMBER_OF_BINS):
    # Rows whose water level falls into the current bin.
    in_bin = data[water_level_label] == _get_label(bin_number)
    label_source = _get_set_assigner(in_bin.sum())

    # Replace the placeholder in each of those rows with the next drawn label.
    data.loc[in_bin, set_label] = data.loc[in_bin, set_label].apply(
        _set_assigner_wrapper, assigner=label_source
    )

In [None]:
# Rows assigned to the training set, saved as the training index.
training_set = data.loc[data[set_label] == train_set_label]
training_set.to_csv(path_or_buf=train_set_index)

training_set

In [None]:
# Rows assigned to the test set, saved as the test index.
test_set = data.loc[data[set_label] == test_set_label]
test_set.to_csv(path_or_buf=test_set_index)

test_set

In [None]:
# Rows assigned to the validation set, saved as the validation index.
validation_set = data.loc[data[set_label] == validation_set_label]
validation_set.to_csv(path_or_buf=validation_set_index)

validation_set

In [None]:
# One stacked panel per set, each showing that set's "Waterlevel [mm]" column.
figure, axes = plt.subplots(nrows=3, figsize=(15, 15 / 1.62))

training_set["Waterlevel [mm]"].plot(ax=axes[0], title="Training Data")
validation_set["Waterlevel [mm]"].plot(ax=axes[1], title="Validation Data")
test_set["Waterlevel [mm]"].plot(ax=axes[2], title="Test Data")

# tight_layout can only make room for titles and tick labels that already
# exist, so it has to run after the panels are drawn.
figure.tight_layout()

plt.show()
plt.close()

In [None]:
# One stacked panel per set, each showing that set's "Percentage Full [%%]"
# column. All three panels are filled, matching the water-level figure;
# previously only the training panel was drawn and two axes stayed empty.
figure, axes = plt.subplots(nrows=3, figsize=(15, 15 / 1.62))

training_set["Percentage Full [%%]"].plot(ax=axes[0], title="Training Data")
validation_set["Percentage Full [%%]"].plot(ax=axes[1], title="Validation Data")
test_set["Percentage Full [%%]"].plot(ax=axes[2], title="Test Data")

# tight_layout can only make room for titles and tick labels that already
# exist, so it has to run after the panels are drawn.
figure.tight_layout()

plt.show()
plt.close()