In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import matplotlib.pyplot as plt
import seaborn as sns
import calendar
from datetime import datetime as dt

**Objective**

Preserve the waterbodies (lake, aquifer, water spring, river) for efficient water supply management. To achieve this, it is important to predict the water availability, in terms of level and water flow, for each day of the year.

In [None]:
# Read the Lake Bilancino CSV from the Kaggle input directory into `lb`
lb= pd.read_csv('/kaggle/input/acea-water-prediction/Lake_Bilancino.csv')
# Print column names, non-null counts and dtypes
lb.info()

In [None]:
# Summary statistics for the columns of `lb`
lb.describe()

In [None]:
# List the column names of the Lake Bilancino frame
lb.columns

In [None]:
# Show the last rows of the Lake Bilancino data
lb.tail()

**Dataset features provided**

The features provided are rainfall, temperature, lake level and flow rate.

**Output expected**
Predicted Lake Level and Flow rate.

**Tentative Analysis approach**

1) Plot the unfiltered data to see the trend -> check the correlation -> use ML models to predict the flow and level of the lake -> fine-tune the model to improve the accuracy.

2) Plot the unfiltered data to see the trend -> check the correlation -> use the features to build a mathematical model to predict the flow and level of the lake -> fine-tune the model to improve the accuracy.

In [None]:
# Parse the Date strings and derive calendar columns used for grouping.
# dayfirst=True: the ACEA CSVs appear to store dates as dd/mm/yyyy; without the
# hint an ambiguous value such as 03/06/2002 is read month-first, which silently
# corrupts Year/Month. NOTE(review): confirm the raw format with lb['Date'].head().
lb['Date'] = pd.to_datetime(lb['Date'], dayfirst=True)
lb['Year'] = lb['Date'].dt.year
lb['Month'] = lb['Date'].dt.month
# Confirm that Date is now datetime64 and that Year/Month were added
lb.info()


In [None]:
# Preview the first rows, now including the derived Year and Month columns
lb.head()

In [None]:
# Mean lake level per (Year, Month) pair, flattened back to ordinary columns
Lake_level = (
    lb.groupby(["Year", "Month"])["Lake_Level"]
    .mean()
    .reset_index()
)
Lake_level.head()

In [None]:
# Plot the monthly-mean lake level against Year to see the year-to-year variation.
# Each year contributes several monthly rows, so seaborn aggregates them per x value.
sns.lineplot(data=Lake_level, x="Year", y="Lake_Level")

In [None]:
# Row(s) of the monthly table holding the lowest mean lake level
lowest_level = Lake_level["Lake_Level"].min()
print(Lake_level[Lake_level["Lake_Level"] == lowest_level])

In [None]:
# Row(s) of the monthly table holding the highest mean lake level
highest_level = Lake_level["Lake_Level"].max()
print(Lake_level[Lake_level["Lake_Level"] == highest_level])

In [None]:
# Plot the monthly-mean lake level against Month to see the seasonal variation.
# Each month appears once per year, so seaborn aggregates the years per x value.
sns.lineplot(data=Lake_level, x="Month", y="Lake_Level")

The extreme monthly-mean lake levels were a maximum of 251.58 in March 2010 and a minimum of 244.520645 in October 2012.

**Checking the flow rate over Months and Years**

In [None]:
# Mean flow rate per (Year, Month) pair, flattened back to ordinary columns
flow_rate = (
    lb.groupby(["Year", "Month"])["Flow_Rate"]
    .mean()
    .reset_index()
)
flow_rate.head()

In [None]:
# Plot the monthly-mean flow rate against Year (original note: data spans 2002-2020)
sns.lineplot(data=flow_rate,x="Year",y= "Flow_Rate")

In [None]:
# Plot the monthly-mean flow rate against Month (seasonal pattern, all years pooled)
sns.lineplot(data=flow_rate,x="Month",y= "Flow_Rate")

**Finding the Month and year with extreme flow rate data**

In [None]:
# Row(s) of the monthly table holding the lowest mean flow rate
lowest_flow = flow_rate["Flow_Rate"].min()
print(flow_rate[flow_rate["Flow_Rate"] == lowest_flow])

In [None]:
# Row(s) of the monthly table holding the highest mean flow rate
highest_flow = flow_rate["Flow_Rate"].max()
print(flow_rate[flow_rate["Flow_Rate"] == highest_flow])

The monthly-mean flow rate was at its minimum in May 2012 and at its maximum in March 2013.

# **Rainfall and Temperature Variation over Month & Year**

In [None]:
# Rain distribution over the years: mean of each station's rainfall per Year
rain_cols = [
    'Rainfall_S_Piero',
    'Rainfall_Mangona',
    'Rainfall_S_Agata',
    'Rainfall_Cavallina',
    'Rainfall_Le_Croci',
]
rain_yearly = lb.groupby(["Year"])[rain_cols].mean().reset_index()
rain_yearly.head()

# **Plotting the rain pattern over the years**

In [None]:
# One line per rainfall station against Year; the dict order fixes the draw and
# legend order.
station_colours = {
    'Rainfall_S_Piero': 'blue',
    'Rainfall_Mangona': 'red',
    'Rainfall_S_Agata': 'yellow',
    'Rainfall_Cavallina': 'black',
    'Rainfall_Le_Croci': 'olive',
}
for station, colour in station_colours.items():
    plt.plot('Year', station, data=rain_yearly, marker='', color=colour, linewidth=2)
# Park the legend outside the axes so it does not cover the lines
plt.legend(bbox_to_anchor=(1.04,1), loc="upper left")
plt.xticks(rotation = 90)
plt.grid()

In [None]:
# Rain distribution over the months: mean of each station's rainfall per Month
rain_cols = [
    'Rainfall_S_Piero',
    'Rainfall_Mangona',
    'Rainfall_S_Agata',
    'Rainfall_Cavallina',
    'Rainfall_Le_Croci',
]
rain_monthly = lb.groupby(["Month"])[rain_cols].mean().reset_index()
rain_monthly.head()

In [None]:
# One line per rainfall station against Month; the dict order fixes the draw and
# legend order.
station_colours = {
    'Rainfall_S_Piero': 'blue',
    'Rainfall_Mangona': 'red',
    'Rainfall_S_Agata': 'yellow',
    'Rainfall_Cavallina': 'black',
    'Rainfall_Le_Croci': 'olive',
}
for station, colour in station_colours.items():
    plt.plot('Month', station, data=rain_monthly, marker='', color=colour, linewidth=2)
# Park the legend outside the axes so it does not cover the lines
plt.legend(bbox_to_anchor=(1.04,1), loc="upper left")
plt.xticks(rotation = 90)
plt.grid()

**Observation**
1. The above plot shows that Mangona receives the most rain and S_Piero the least over the months of a year.
2. July to August is the relatively dry period, with consistently minimal rain.
3. September to February is the period when it rains at a decent level.


In [None]:
# Mean Le Croci temperature per (Year, Month) pair
temperature = (
    lb.groupby(["Year", "Month"])["Temperature_Le_Croci"]
    .mean()
    .reset_index()
)
temperature.head()


In [None]:
# Temperature against Month with one hue level per year.
# Year is cast to str first, presumably so seaborn treats it as a discrete
# category rather than a numeric hue scale.
temperature = temperature.assign(Year=temperature["Year"].astype(str))
sns.lineplot(x="Month", y="Temperature_Le_Croci", hue="Year", data=temperature)

Temperature gradually increases from January, reaches its peak during June to August, and then gradually decreases to its minimum in January.

In [None]:
# Number of missing values in each rainfall station column
rain_cols = [
    'Rainfall_S_Piero',
    'Rainfall_Mangona',
    'Rainfall_S_Agata',
    'Rainfall_Cavallina',
    'Rainfall_Le_Croci',
]
lb[rain_cols].isna().sum()

In [None]:
# Drop the rows where the Le Croci rainfall or the Le Croci temperature is missing.
# .copy() makes lb1 an independent frame: later cells add columns to it, which
# would otherwise raise SettingWithCopyWarning on a filtered view of `lb`.
lb1 = lb.dropna(subset=["Rainfall_Le_Croci", "Temperature_Le_Croci"]).copy()
lb1.info()

In [None]:
# Preview the first rows that survived the missing-value filter
lb1.head()

In general, the flow rate $Q$ is a function of the lake level $H$:

$$Q = f(H)$$


**Preparing the data for modeling**

In [None]:
# Row-wise aggregates across the five rainfall stations.
# The station list is defined once instead of being repeated per statistic, and
# the columns are added with assign(), which returns a new frame instead of
# writing into a filtered one (avoids SettingWithCopyWarning).
rain_cols = [
    'Rainfall_S_Piero',
    'Rainfall_Mangona',
    'Rainfall_S_Agata',
    'Rainfall_Cavallina',
    'Rainfall_Le_Croci',
]
station_rain = lb1[rain_cols]
lb1 = lb1.assign(
    Rainfall_mean=station_rain.mean(axis=1),
    Rainfall_median=station_rain.median(axis=1),
    Rainfall_min=station_rain.min(axis=1),
    Rainfall_max=station_rain.max(axis=1),
    Rainfall_net=station_rain.sum(axis=1),  # total over all stations
)
lb1.head()

In [None]:
# Ordinal day within the year, taken from Date. assign() returns a new frame,
# so nothing is written into a filtered slice (avoids SettingWithCopyWarning).
lb1 = lb1.assign(day_of_the_year=lb1['Date'].dt.dayofyear)

# **Heat map for checking the correlation between original & derived variables**

In [None]:
# Column names, non-null counts and dtypes of lb1 at this point
lb1.info()

In [None]:
# Pairwise correlation matrix of every column except Date.
# The original only dropped Date and never called .corr(), so `corr` held the
# raw rows rather than correlations.
corr = lb1.drop(columns=['Date']).corr()

In [None]:
# Open a large square figure so the annotated cells stay readable
plt.figure(figsize=(15, 15))

# Draw `corr` as a heatmap with each cell's value printed on it
sns.heatmap(corr, annot = True)

In [None]:
# Lake level against Rainfall_net, restricted to the rows of 2018
comp = lb1[lb1["Year"] == 2018]
comp1 = comp[["Lake_Level", "Rainfall_net", "Month"]]
sns.lineplot(x="Rainfall_net", y="Lake_Level", data=comp1)

In [None]:
# Lake level against Month for the 2018 subset held in comp1
plt.plot( 'Month','Lake_Level', data=comp1, marker='', color='blue', linewidth=2)
# Disabled: comp1 keeps only Lake_Level, Rainfall_net and Month, so it has no Rainfall_mean column
#plt.plot( 'Month','Rainfall_mean', data=comp1, marker='', color='red', linewidth=2)
# Park the legend outside the axes
plt.legend(bbox_to_anchor=(1.04,1), loc="upper left")
plt.xticks(rotation = 90)
plt.grid()

In [None]:
# Candidate feature matrix: calendar fields, temperature and rainfall aggregates
candidate_features = [
    "Year",
    "Month",
    "Temperature_Le_Croci",
    "Rainfall_mean",
    "day_of_the_year",
    "Rainfall_median",
    "Rainfall_net",
]
X = lb1[candidate_features]

X.head()

In [None]:
# Check dtypes and non-null counts of the feature matrix
X.info()

In [None]:
# Target: Lake_Level, kept as a single-column DataFrame (double brackets)
y=lb1[["Lake_Level"]]
y.head()

In [None]:
# Check dtype and non-null count of the target
y.info()

# **Linear regression for Modeling**

In [None]:
import sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

**Create X_train, y_train, X_test and y_test**

In [None]:
# Year-based hold-out: train on every year outside 2017-2020, evaluate on 2018.
# NOTE(review): 2017, 2019 and 2020 end up in neither set - confirm this gap is
# intentional.
held_out_years = [2017, 2018, 2019, 2020]
train = lb1[~lb1['Year'].isin(held_out_years)]
test = lb1[lb1['Year'] == 2018]

# Same feature columns, in the same order, for both splits
model_features = [
    "Year",
    "Month",
    "Temperature_Le_Croci",
    "Rainfall_mean",
    "Rainfall_net",
    "day_of_the_year",
]
X_train, y_train = train[model_features], train[["Lake_Level"]]
X_test, y_test = test[model_features], test[["Lake_Level"]]

In [None]:
# Ordinary least squares baseline.
# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises TypeError. Unregularised least-squares predictions do not
# depend on feature scaling (up to floating-point precision), so the argument is
# dropped rather than replaced by an explicit scaler.
lr = LinearRegression()

In [None]:
# (rows, columns) of the training target
y_train.shape

In [None]:
# (rows, columns) of the test target
y_test.shape

In [None]:
# (rows, columns) of the training features
X_train.shape

In [None]:
# (rows, columns) of the test features
X_test.shape

In [None]:
# Fit the linear regression on the training features and target
lr.fit(X_train,y_train)

In [None]:
# Predict Lake_Level for the test rows with the fitted linear model
y_pred= lr.predict(X_test)

**Evaluate the model by comparing the predictions with the actual labels**

In [None]:
# R² (coefficient of determination) of the predictions in y_pred against y_test
r_squared = r2_score(y_test, y_pred)

In [None]:
# Display the R² score
r_squared

# **Random forest for Modeling**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the same train/test split; random_state pins the result.
regr = RandomForestRegressor(max_depth=100, random_state=0)
# y_train is a single-column DataFrame; flatten it to 1-D because the forest
# expects a 1-D target and otherwise emits a DataConversionWarning.
regr.fit(X_train, y_train.to_numpy().ravel())
# Overwrites the y_pred produced by the linear model
y_pred = regr.predict(X_test)

In [None]:
# R² against y_test using the current y_pred (the random-forest predictions
# assigned in the preceding cell), then display it
r_squared = r2_score(y_test, y_pred)
r_squared