In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the read-only Kaggle input tree and print the full path of every
# file found, in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The Acea Group is one of the leading Italian multiutility operators. Listed on the Italian Stock Exchange since 1999, the company manages and develops water and electricity networks and environmental services. Acea is the foremost Italian operator in the water services sector supplying 9 million inhabitants in Lazio, Tuscany, Umbria, Molise, Campania.

In this competition we will focus only on the water sector to help Acea Group preserve precious waterbodies. As it is easy to imagine, a water supply company struggles with the need to forecast the water level in a waterbody (water spring, lake, river, or aquifer) to handle daily consumption. During fall and winter waterbodies are refilled, but during spring and summer they start to drain. To help preserve the health of these waterbodies it is important to predict the most efficient water availability, in terms of level and water flow for each day of the year.

**Executive Summary**
* 

**Data**

**Data Preprocessing**
* Many values are missing, especially in the distant past. We delete the first one or two years of data, which are largely missing; for the other missing points we use interpolation to fill in the data.
* There are a few implausible zero values - we convert them to NaN and interpolate.
* We take the absolute value of the flow rate, as advised in the discussion section.
* We convert float to int, for easier computation 
* Date: we parse the date string into a datetime and extract the month. We downsample the data to monthly frequency using averages. We use one-hot encoding for the month.

**Feature Engineering**
* We are dealing with a time series with seasonal influences, in which past values affect the next one. We therefore try to use the past value as a feature. For some datasets, however, this value does not seem to be relevant.
* Rainfall and temperature clearly have a lagged effect on the water level. We augment the dataset with features from the past month and the past three months: the sum of rainfall, the weather averages, and the (averaged) target value over each of these windows.

**Model Selection**
* We use a simple baseline - the value at t is used to infer the value at t+1
* The baseline learning models are an SVM with an RBF kernel and a linear model with basic features
* We experiment with different and combined versions of the augmented features and select the relevant factors
* We use LSTM and GRU (seq2seq) models to predict the next value given past data

In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np 
import scipy  # pandas delegates method="pchip" interpolation to scipy
from datetime import datetime

# Load the daily Auser aquifer measurements (River_Arno.csv is an alternative input).
df = pd.read_csv("/kaggle/input/acea-water-prediction/Aquifer_Auser.csv") #River_Arno.csv
# Skip the first 3000 rows before any further processing.
df = df[3000:]

# Set the raw date strings aside so that only measurement columns go through
# interpolation and monthly averaging. pandas 1.x silently dropped the
# non-numeric 'Date' column in resample(...).mean(); pandas 2.x raises a
# TypeError instead, so the column must not reach that step.
date_strings = df['Date']
df = df.drop('Date', axis=1).interpolate(method="pchip")

# Parse the day/month/year strings and derive the calendar month (1-12).
df['date'] = date_strings.apply(lambda x: datetime.strptime(x, "%d/%m/%Y"))
df['month'] = df['date'].apply(lambda x: int(x.month))

# One-hot encode the month; the indicator columns are named by the integer month.
one_hot = pd.get_dummies(df['month'])
# Drop the 'month' column as it is now encoded
df = df.drop('month',axis = 1)
# Join the encoded columns back onto the measurements (aligned on the row index)
df = df.join(one_hot)

# Downsample to one row per month by averaging. Every day in a month shares the
# same indicator values, so the averaged month flags remain 0 or 1.
df = df.set_index('date')
df = df.resample('M').mean()
# Discard months that still contain missing values and restore 'date' as a column.
df = df.dropna().reset_index()
print(list(df.columns))
print(df)

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns
# Correlate the numeric columns only. The datetime 'date' column cannot be
# converted to float, and recent pandas versions no longer drop it
# automatically inside DataFrame.corr(). errors='ignore' keeps the cell working
# if 'date' is not a column.
corrMatrix = df.drop(columns='date', errors='ignore').corr()

# Use an explicitly sized figure: with a few dozen columns the annotated cells
# are unreadable at matplotlib's default size.
fig, ax = plt.subplots(figsize=(24, 20))
sns.heatmap(corrMatrix, annot=True, ax=ax)
ax.set_title('Pairwise correlation of monthly features')
plt.show()

In [None]:
# Positional hold-out split: the first 102 rows are used for fitting, the rest
# for evaluation.
train, test = df.iloc[:102], df.iloc[102:]

# Predictor columns, defined once and shared by both splits: rainfall,
# temperature, volume and hydrometry columns plus the twelve month indicator
# columns (named by the integers 1-12).
feature_columns = [
    'Rainfall_Gallicano', 'Rainfall_Pontetetto', 'Rainfall_Monte_Serra',
    'Rainfall_Orentano', 'Rainfall_Borgo_a_Mozzano', 'Rainfall_Piaggione',
    'Rainfall_Calavorno', 'Rainfall_Croce_Arcana',
    'Rainfall_Tereglio_Coreglia_Antelminelli', 'Rainfall_Fabbriche_di_Vallico',
    'Temperature_Orentano', 'Temperature_Monte_Serra',
    'Temperature_Ponte_a_Moriano', 'Temperature_Lucca_Orto_Botanico',
    'Volume_POL', 'Volume_CC1', 'Volume_CC2', 'Volume_CSA', 'Volume_CSAL',
    'Hydrometry_Monte_S_Quirico', 'Hydrometry_Piaggione',
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
]

# Column being predicted. Other depth columns previously noted as candidate
# targets: Depth_to_Groundwater_SAL, Depth_to_Groundwater_PAG,
# Depth_to_Groundwater_CoS, Depth_to_Groundwater_DIEC.
target_column = 'Depth_to_Groundwater_LT2'

# Features and target are cast to int, which truncates any fractional part.
x_train = train[feature_columns].to_numpy().astype(int)
y_train = train[target_column].astype(int)
x_test = test[feature_columns].to_numpy().astype(int)
y_test = test[target_column].astype(int)

In [None]:
#from sklearn.model_selection import train_test_split
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# RBF-kernel SVR is not scale invariant: the kernel works on Euclidean distances,
# so columns with large magnitudes would dominate columns such as the 0/1 month
# indicators. Standardising inside a pipeline means the scaler is fitted on the
# training rows only and re-applied automatically at predict time.
# The name `rf` is kept because later cells call rf.predict(...).
rf = make_pipeline(StandardScaler(), SVR(kernel = 'rbf'))
rf.fit(x_train, y_train)

In [None]:
from sklearn import metrics

# Predict the held-out rows with the fitted model.
predictions = rf.predict(x_test)

# Both metrics are in the units of the target column. The target is a
# groundwater depth, so the former 'degrees.' suffix was misleading and has
# been removed. The metric functions already return scalars, so no extra
# np.mean is needed.
errors = metrics.mean_absolute_error(y_test, predictions)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(errors, 2))
errors2 = np.sqrt(metrics.mean_squared_error(y_test, predictions))
# Print out the root mean squared error
print('rMSE:', round(errors2, 2))