In [1]:
# Dependencies for the helper functions defined in the cells below.
import holidays
import pandas as pd
import numpy as np
import os

import sys
# NOTE(review): this is a machine-specific absolute Windows path and has to be
# adjusted on any other machine. It is appended right before
# `import preprocessing`, presumably so that project module can be found there
# -- confirm and consider making the location configurable.
sys.path.append('C:\\Users\\20182960\\Documents\\3.1 DataChallenge3\\jbg060_2019-2020-documented_implementations\\group6_DC3_documented_implementation_ORIGINALFILE\\Documented Implementation\\Code')

# Project-local helpers, referenced as `pre.` in create_dataset.
import preprocessing as pre
from sklearn.model_selection import train_test_split

In [None]:
def get_measurements(path, convert_time=True):
    """
    Read every measurement file in ``path`` and split the rows into flow and
    level measurements.

    ~~~ EXAMPLE CALL ~~~
    flow_data, level_data = get_measurements("C:/mypath/RG8150")
    ~~~~~~~~~~~~~~~~~~~~

    Parameters
    ----------
    path : str
        Directory holding ';'-separated CSV files with at least the columns
        'Tagname', 'TimeStamp', 'Value' and 'DataQuality'.
    convert_time : bool, default True
        When truthy, 'TimeStamp' is parsed with the format
        "%d-%m-%Y %H:%M:%S"; otherwise it is left as read from the files.

    Returns
    -------
    (flow_data, level_data) : tuple of pandas.DataFrame
        Rows whose 'Tagname' contains "Debietmeting" / "Niveaumeting"
        respectively, each with the columns 'RG_ID', 'TimeStamp', 'Value'
        and 'DataQuality' and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``path`` contains no files.
    """
    # Sort for a reproducible row order (os.listdir order is arbitrary) and
    # skip sub-directories, which pd.read_csv cannot open.
    file_paths = [
        os.path.join(path, name)
        for name in sorted(os.listdir(path))
        if os.path.isfile(os.path.join(path, name))
    ]
    if not file_paths:
        raise ValueError("No measurement files found in " + repr(path))

    frames = [pd.read_csv(file_path, sep=";") for file_path in file_paths]
    data = pd.concat(frames, sort=False, ignore_index=True)

    # Characters 9..12 of the tag name are used as the integer RG_ID.
    data["RG_ID"] = data["Tagname"].str.slice(9, 13).astype(int)

    # A file without any decimal comma is parsed as numeric by read_csv, so the
    # concatenated column can hold a mix of strings and numbers. Cast to str
    # first: the .str accessor would otherwise turn the numeric entries into
    # NaN (or fail outright when the whole column is numeric).
    data["Value"] = (
        data["Value"].astype(str).str.replace(",", ".", regex=False).astype(float)
    )

    # 1 for rows flagged "Good", 0 for anything else.
    data["DataQuality"] = (data["DataQuality"] == "Good").astype(int)

    if convert_time:
        data["TimeStamp"] = pd.to_datetime(data["TimeStamp"], format="%d-%m-%Y %H:%M:%S")

    data = data[["Tagname", "RG_ID", "TimeStamp", "Value", "DataQuality"]]

    def _select(keyword):
        # Rows whose tag name contains the keyword, without the tag column.
        subset = data[data["Tagname"].str.contains(keyword)]
        return subset.drop(columns="Tagname").reset_index(drop=True)

    flow_data = _select("Debietmeting")
    level_data = _select("Niveaumeting")

    return flow_data, level_data

In [2]:
def add_predictor_columns(data, years=(2018, 2019, 2020)):
    """
    Will return predictive variables given a data-set with the 'TimeHour' column.
    'TimeHour' can be created by applying the .replace() method on the 'TimeStamp'
    column. The following variables will be added:

    NAME ~~~~~~~~~~~~~~~ COLUMN ~~~~~~~ FORMAT
    Hour of the day      hour_XX        Dummy, binary
    Month of the year    month_XX       Dummy, binary
    Holiday              is_holiday     Binary
    Intercept            Constant       Always 1

    Dummy columns are only created for the hours / months that actually occur
    in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'TimeHour' column that can be cast to datetime64[ns].
    years : iterable of int, default (2018, 2019, 2020)
        Years for which the holidays of the Netherlands are looked up. Dates
        outside these years are never flagged as a holiday.

    Returns
    -------
    pandas.DataFrame
        Predictor columns in the order hour dummies, month dummies,
        'is_holiday', 'Constant', indexed like ``data``.
    """
    # Parse the timestamps once instead of once per derived feature.
    timestamps = data["TimeHour"].astype('datetime64[ns]')

    # Fetch holidays of given period; a set gives constant-time membership tests.
    holiday_dates = {day for day, _ in holidays.Netherlands(years=list(years)).items()}

    # Check each date whether in holidays. The explicit rename makes the column
    # carry the documented name 'is_holiday' instead of inheriting 'TimeHour'.
    is_holiday = timestamps.dt.date.isin(holiday_dates).astype(int).rename("is_holiday")

    # Create dummies for hour of day and month of year. The integer dtype is
    # fixed explicitly: newer pandas versions default to bool, which would make
    # `.values` on the combined frame an object array.
    hour_dummies = pd.get_dummies(timestamps.dt.hour, prefix="hour", dtype=np.uint8)
    month_dummies = pd.get_dummies(timestamps.dt.month, prefix="month", dtype=np.uint8)

    # Concatenate and add constant/intercept
    X = pd.concat([hour_dummies, month_dummies, is_holiday], axis=1)
    X["Constant"] = 1

    return X

In [4]:
def create_rain_predictions_file(hirlam_filelocation):
    """
    Build one rain-prediction table from the yearly HIRLAM prediction files.

    The files '2018_hirlam_predictions.csv' (';'-separated),
    '2019_hirlam_predictions.csv' and '2020_hirlam_predictions.csv'
    (both ','-separated) are read from ``hirlam_filelocation``. From each file,
    rows 12-17, 61-66, 110-115, ... are kept, i.e. 6 consecutive rows out of
    every 49, starting at row 12.
    NOTE(review): presumably each 49-row group is one forecast run and the
    6 rows are the lead times of interest -- confirm against the file layout.

    Parameters
    ----------
    hirlam_filelocation : str
        Prefix that is put directly in front of the file names, so a directory
        must end with a path separator.

    Returns
    -------
    pandas.DataFrame
        Columns 'index' (row position inside the originating file),
        'TimeHour' (the files' 'Time' column) and 'Prediction'.
    """
    first_row = 12      # first kept row inside every file
    rows_per_block = 6  # consecutive rows kept per block
    block_stride = 49   # distance between the starts of two kept blocks

    yearly_files = (
        ('2018_hirlam_predictions.csv', ';'),
        ('2019_hirlam_predictions.csv', ','),
        ('2020_hirlam_predictions.csv', ','),
    )

    # Collect all slices first and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    selected = []
    for filename, separator in yearly_files:
        hirlam = pd.read_csv(hirlam_filelocation + filename, sep=separator)
        for start in range(first_row, len(hirlam), block_stride):
            selected.append(hirlam[start:start + rows_per_block][['Time', 'Prediction']])

    if selected:
        LZS_rain_pred = pd.concat(selected)
    else:
        # Every file had 12 rows or fewer: return an empty table with the same layout.
        LZS_rain_pred = pd.DataFrame(columns=['Time', 'Prediction'])

    LZS_rain_pred = LZS_rain_pred.rename(columns={"Time": "TimeHour"})
    # reset_index() without drop=True keeps the per-file row position as an
    # 'index' column, which is part of the returned layout.
    LZS_rain_pred = LZS_rain_pred.reset_index()

    return LZS_rain_pred

In [1]:
def create_dataset(rain_prediction, flow_data, level_data, imputation = "simple"):
    """
    Build the design matrix and target vector for the flow model.

    Flow and level measurements are cleaned, merged and (optionally) imputed
    through the helpers in the project module ``pre``, the flow is grouped by
    hour, and the result is inner-joined with the rain predictions on
    'TimeHour'.

    Parameters
    ----------
    rain_prediction : pandas.DataFrame
        Needs the columns 'TimeHour' and 'Prediction'. The frame passed in is
        left unmodified.
    flow_data, level_data : pandas.DataFrame
        Measurement frames handed to ``pre.clean_mes_data``.
    imputation : str, default "simple"
        "simple" uses ``pre.fill_flow``; "complex" uses
        ``data_imputation.fill_flow``; any other value skips imputation.

    Returns
    -------
    (X, y, TimeHour)
        X : numpy.ndarray with 'Prediction' as first column followed by the
            columns produced by ``add_predictor_columns``.
        y : numpy.ndarray with the 'Flow' values.
        TimeHour : pandas.Series with the join key, as strings.
    """
    # LZS: no rain grid needed, since the rain predictions are the same for the whole area.

    # Omit minor data deficiencies
    flow_data = pre.clean_mes_data(flow_data, convert_timestamp=False)
    level_data = pre.clean_mes_data(level_data, convert_timestamp=False)

    # Merges flow and level on timestamps, as normal flow data is biased
    # given no measurements are made when there is no flow.
    flow_data, level_data = pre.merge_flow_level(flow_data, level_data)

    # Can perform simple imputation or LM-imputation
    if imputation == "simple":
        flow_data = pre.fill_flow(flow_data)
    elif imputation == "complex":
        # NOTE(review): `data_imputation` is not imported in the cells visible
        # here; unless another cell imports it, this branch raises NameError.
        flow_data = data_imputation.fill_flow(flow_data, level_data)
    # Any other value: deliberately continue without imputation.

    # Groups flow by hour
    flow_data_by_hour = pre.flow_by_hour(flow_data)

    # Join on the string form of the timestamp. `assign` returns new frames, so
    # the caller's `rain_prediction` keeps its original 'TimeHour' column.
    flow_keyed = flow_data_by_hour.assign(TimeHour=flow_data_by_hour['TimeHour'].astype(str))
    rain_keyed = rain_prediction.assign(TimeHour=rain_prediction['TimeHour'].astype(str))

    result = pd.merge(flow_keyed, rain_keyed, on='TimeHour', how='inner')

    # Rain prediction first, then the calendar predictors and the constant
    X = add_predictor_columns(result).values
    X = np.concatenate((result[['Prediction']], X), axis=1)

    # Select dependent variable
    y = result["Flow"].values
    TimeHour = result['TimeHour']

    return X, y, TimeHour
