In [3]:
import pandas as pd
import numpy as np
import glob
import os
import datetime
import matplotlib.pyplot as plt
from functools import reduce


%matplotlib inline

In [4]:
def load_data(files):
    """Read CSV files into DataFrames, keyed by the file's base name.

    Parameters
    ----------
    files : iterable of str
        Paths of the CSV files to read.

    Returns
    -------
    dict
        Maps each file name (directory and '.csv' removed) to its DataFrame.
        Rows in which every value is NaN are removed, and a 'Date' column,
        if present, is parsed as day/month/year.
    """
    # initiate dict to store data
    dfs = {}

    for f in files:

        # get name of the water body from the filename; backslashes are
        # normalised first so that Windows-style and POSIX-style paths both
        # reduce to the bare file name
        database = os.path.basename(f.replace('\\', '/')).replace('.csv', '')

        # load the data
        df = pd.read_csv(f)

        # drop rows for which all values are NaN (in place on the freshly read
        # frame, so the column assignment below does not act on a slice copy)
        df.dropna(how='all', axis=0, inplace=True)

        # parse the date strings (day/month/year) into timestamps
        if 'Date' in df.columns:
            df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

        # store data in dictionary by water body name
        dfs[database] = df

    return dfs

In [24]:
def clean_data(dfclean, date_min, date_max, drop_cols=None, replace_cols=None, rename_cols=None, offsets=None):
    """Put a data frame on a gap-free daily index and clean its columns.

    The steps run in this order: reindex to one row per day, rename columns,
    drop columns, cast to float, apply offsets, replace values, interpolate,
    take absolute values of the flow-rate columns.

    Parameters
    ----------
    dfclean : pandas.DataFrame
        Input data containing a 'Date' column. The input frame itself is not
        modified.
    date_min, date_max
        First and last day of the output; passed to ``pd.date_range``.
    drop_cols : list of str, optional
        Columns to remove. Names refer to the columns after renaming.
    replace_cols : list of (column, (to_replace, value)), optional
        Each entry is applied as ``df[column].replace(to_replace, value)``.
        Column names refer to the columns after renaming.
    rename_cols : dict, optional
        Mapping of old column name to new column name.
    offsets : list of (column, offset, loc_min, loc_max), optional
        ``offset`` is added to ``column`` for all index labels between
        ``loc_min`` and ``loc_max``. Offsets are applied before
        ``replace_cols``.

    Returns
    -------
    pandas.DataFrame
        One row per day between ``date_min`` and ``date_max``, with the dates
        in a 'Date' column and all other columns as float.
    """
    # None instead of mutable default arguments
    drop_cols = [] if drop_cols is None else drop_cols
    replace_cols = [] if replace_cols is None else replace_cols
    rename_cols = {} if rename_cols is None else rename_cols
    offsets = [] if offsets is None else offsets

    # select time range and make sure there is a record for each day
    calendar = pd.DataFrame(index=pd.date_range(date_min, date_max))
    dfclean = pd.merge(calendar,
                       dfclean.set_index('Date'),
                       left_index=True,
                       right_index=True,
                       how='left')

    # rename columns, drop columns, set dtype
    dfclean = (dfclean
               .rename(columns=rename_cols)
               .drop(columns=drop_cols)
               .astype(float))

    # fix data offsets
    for col, offset, loc_min, loc_max in offsets:
        dfclean.loc[loc_min:loc_max, col] = dfclean.loc[loc_min:loc_max, col] + offset

    # replace invalid values by NaN
    for c, values in replace_cols:
        dfclean[c] = dfclean[c].replace(*values)

    # interpolate data: at most one consecutive missing value is filled per
    # gap (limit=1), so in a longer gap only its first value is filled
    for c in dfclean.columns:
        dfclean[c] = dfclean[c].interpolate(method='time', limit=1)

    # all flow columns must have positive values
    for c in dfclean.filter(regex='^Flow_Rate').columns:
        dfclean[c] = dfclean[c].abs()

    # move the daily index back into a 'Date' column
    dfclean = dfclean.reset_index().rename(columns={'index': 'Date'})

    print('shape', dfclean.shape)

    return dfclean



# Load data

In [25]:
# relative path of the data folder, and its sub-folder holding the raw CSV files
datadir = '../data/'
rawdir = os.path.join(datadir, 'raw')

In [26]:
# glob returns paths in a filesystem-dependent order; sort them so that the
# positional indices used to pick a data set further down are deterministic
files = sorted(glob.glob(os.path.join(rawdir, '*.csv')))

In [27]:
# read every raw file; the dict keys (in the order of `files`) name the data sets
dfs = load_data(files)
names = [key for key in dfs]

# Clean data

## Auser

In [28]:
# settings
# position of this data set in `names` and `files` (depends on the order of `files`)
nr=0

name = names[nr]
# output path: the raw file path with every 'raw' substring replaced by 'clean'
filename = files[nr].replace('raw','clean')

# time selection
date_min = pd.to_datetime('2015-01-01')
date_max = pd.to_datetime('2020-06-30')

# feature selection based on availability of data
drop_cols = ['Temperature_Ponte_a_Moriano']

# replace invalid values with NaN
# each entry is (column, (value to replace, replacement)); column names are the
# names after renaming, because clean_data renames before it replaces
replace_cols = [('Depth_to_Groundwater_COS', (0, np.nan)), ('Depth_to_Groundwater_SAL', (0, np.nan))]

# rename columns
rename_cols = {'Depth_to_Groundwater_CoS': 'Depth_to_Groundwater_COS'}

# offset
# each entry is (column, value added, first date, last date); this date range
# is wider than the time selection above, so every selected row is shifted
offsets = [('Hydrometry_Piaggione', 1.19, '2010-01-01', '2021-01-01')]


In [31]:
# clean data
dfclean = clean_data(dfs[name].copy(),
                     date_min,
                     date_max,
                     drop_cols=drop_cols,
                     replace_cols=replace_cols,
                     rename_cols=rename_cols,
                     offsets=offsets)

# save data; to_csv does not create missing folders, so make sure the target
# folder exists first
os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
dfclean.to_csv(filename, index=False)

shape (2008, 26)


In [32]:
# Note: the Depth_to_Groundwater_* columns still contain missing values.
#       They are handled later, after the lagged features have been created.
dfclean.isna().sum()

Date                                         0
Rainfall_Gallicano                           0
Rainfall_Pontetetto                          0
Rainfall_Monte_Serra                         0
Rainfall_Orentano                            0
Rainfall_Borgo_a_Mozzano                     0
Rainfall_Piaggione                           0
Rainfall_Calavorno                           0
Rainfall_Croce_Arcana                        0
Rainfall_Tereglio_Coreglia_Antelminelli      0
Rainfall_Fabbriche_di_Vallico                0
Depth_to_Groundwater_LT2                   184
Depth_to_Groundwater_SAL                   114
Depth_to_Groundwater_PAG                    12
Depth_to_Groundwater_COS                    80
Depth_to_Groundwater_DIEC                  135
Temperature_Orentano                         0
Temperature_Monte_Serra                      0
Temperature_Lucca_Orto_Botanico              0
Volume_POL                                   0
Volume_CC1                                   0
Volume_CC2   

## Petrignano

In [82]:
# settings
# position of this data set in `names` and `files` (depends on the order of `files`)
nr=3

name = names[nr]
# output path: the raw file path with every 'raw' substring replaced by 'clean'
filename = files[nr].replace('raw','clean')

# time selection
date_min = pd.to_datetime('2016-01-01')
date_max = pd.to_datetime('2020-06-30')





In [83]:
# clean data (only the time selection is applied; no columns are dropped,
# renamed, replaced or shifted for this data set)
dfclean = clean_data(dfs[name].copy(), date_min, date_max)

# save data; to_csv does not create missing folders, so make sure the target
# folder exists first
os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
dfclean.to_csv(filename, index=False)

shape (1643, 8)


In [84]:
# Note: the Depth_to_Groundwater_* columns still contain missing values.
#       They are handled later, after the lagged features have been created.
dfclean.isna().sum()

Date                                    0
Rainfall_Bastia_Umbra                   0
Depth_to_Groundwater_P24                9
Depth_to_Groundwater_P25                9
Temperature_Bastia_Umbra                0
Temperature_Petrignano                  0
Volume_C10_Petrignano                   0
Hydrometry_Fiume_Chiascio_Petrignano    0
dtype: int64

## River Arno

In [85]:
# settings
# position of this data set in `names` and `files` (depends on the order of `files`)
nr=5

name = names[nr]
# output path: the raw file path with every 'raw' substring replaced by 'clean'
filename = files[nr].replace('raw','clean')

# time selection
date_min = pd.to_datetime('2004-01-01')
date_max = pd.to_datetime('2020-06-30')

# feature selection based on availability of data
drop_cols = ['Rainfall_Vernio','Rainfall_Stia', 'Rainfall_Consuma', 'Rainfall_Incisa',
             'Rainfall_Montevarchi', 'Rainfall_S_Savino', 'Rainfall_Laterina',
             'Rainfall_Bibbiena', 'Rainfall_Camaldoli', 'Temperature_Firenze']

# replace invalid value by NaN
# each entry is (column, (value to replace, replacement))
replace_cols = [('Hydrometry_Nave_di_Rosano', (0, np.nan))]


In [86]:
# clean data
dfclean = clean_data(dfs[name].copy(),
                     date_min,
                     date_max,
                     drop_cols=drop_cols,
                     replace_cols=replace_cols)

# save data; to_csv does not create missing folders, so make sure the target
# folder exists first
os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
dfclean.to_csv(filename, index=False)

shape (6026, 7)


In [87]:
# Note: Hydrometry_Nave_di_Rosano still contains missing values.
#       They are handled later, after the lagged features have been created.
dfclean.isna().sum()

Date                           0
Rainfall_Le_Croci              0
Rainfall_Cavallina             0
Rainfall_S_Agata               0
Rainfall_Mangona               0
Rainfall_S_Piero               0
Hydrometry_Nave_di_Rosano    183
dtype: int64

## Lake Bilancino

In [88]:
# settings
# position of this data set in `names` and `files` (depends on the order of `files`)
nr=4

name = names[nr]
# output path: the raw file path with every 'raw' substring replaced by 'clean'
filename = files[nr].replace('raw','clean')

# time selection
date_min = pd.to_datetime('2004-01-01')
date_max = pd.to_datetime('2020-06-30')



In [89]:
# clean data (only the time selection is applied; no columns are dropped,
# renamed, replaced or shifted for this data set)
dfclean = clean_data(dfs[name].copy(), date_min, date_max)

# save data; to_csv does not create missing folders, so make sure the target
# folder exists first
os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
dfclean.to_csv(filename, index=False)

shape (6026, 9)
