# Time Series Part 1 (data visualisation) is explained in detail in the notebook linked below; please read it first and then continue here

#### https://www.kaggle.com/nandha13/time-series-part-1-data-visualization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime,date
# Read the Petrignano aquifer CSV, keep only the rows that have a rainfall
# reading, renumber the index, and discard the two source columns that this
# notebook does not use.
df = (
    pd.read_csv("../input/acea-water-prediction/Aquifer_Petrignano.csv")
    .loc[lambda frame: frame["Rainfall_Bastia_Umbra"].notna()]
    .reset_index(drop=True)
    .drop(columns=['Depth_to_Groundwater_P24', 'Temperature_Petrignano'])
)

# Short names for the six remaining columns, assigned by position.
df.columns = [
    'date',
    'rainfall',
    'depth_to_groundwater',
    'temperature',
    'drainage_volume',
    'river_hydrometry',
]

# The target column; every other column (including 'date') is listed as a feature.
targets = ['depth_to_groundwater']
features = [name for name in df.columns if name not in targets]

# Dates in the file are day/month/year strings.
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y')

In [None]:
# One panel per non-date column, each plotted against time.
f, ax = plt.subplots(nrows=5, ncols=1, figsize=(15, 25))

for i, column in enumerate(df.drop('date', axis=1).columns):
    # Forward-fill for display only; df itself is not modified.
    # `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
    sns.lineplot(x=df['date'], y=df[column].ffill(), ax=ax[i], color='dodgerblue')
    ax[i].set_title('Feature: {}'.format(column), fontsize=14)
    ax[i].set_ylabel(ylabel=column, fontsize=14)
    ax[i].set_xlim([date(2009, 1, 1), date(2020, 6, 30)])

###### The above steps are already explained in detail in my data visualisation notebook

# Time Series - Part 2 (Data Preprocessing)

#### Chronological Order and Equidistant Timestamps

The chronological order of time series data must be checked and, if necessary, fixed during preprocessing. Similarly, we need to check that the timestamps are equidistant.

To put the data in chronological order we sort it by date. To check for equidistant timestamps, we look at the difference between each date and the previous one; alternatively, you can compare the dates against a constant time step.

In [None]:
# Order the rows chronologically, then store the gap between each timestamp
# and the one on the row before it (the first row has no predecessor).
df = df.sort_values('date').assign(
    delta=lambda frame: frame['date'] - frame['date'].shift(1)
)

The `shift` function moves the values down by the number of rows given as its argument, so `shift(1)` places the previous record's date next to each record.

 

In [None]:
# Total of all gaps next to the number of non-missing gaps.
(df['delta'].sum(), df['delta'].count())

We can see that the timestamps are equidistant: the interval between consecutive records is constant.

#### Handling Missing Values

We can see there are still some null values in the features.

We can also see that there are some zero values in river_hydrometry and drainage_volume, which we can replace with NaN values and fill in afterwards.

In [None]:
# Two stacked panels: river hydrometry on top, drainage volume below.
f, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 15))

# Snapshot both columns before touching them, then turn readings of exactly
# zero into NaN so they are treated as missing.
old_hydrometry = df['river_hydrometry'].copy()
old_drainage = df['drainage_volume'].copy()
df['river_hydrometry'] = df['river_hydrometry'].replace(0, np.nan)
df['drainage_volume'] = df['drainage_volume'].replace(0, np.nan)

# Each panel shows the snapshot (orange) with the cleaned column (blue) drawn
# on top. NaNs are swapped for np.inf for plotting only -- presumably so the
# blue line is interrupted at the missing points; verify against the figure.
for panel, before, column, name in (
    (ax[0], old_hydrometry, 'river_hydrometry', 'Hydrometry'),
    (ax[1], old_drainage, 'drainage_volume', 'Drainage'),
):
    sns.lineplot(x=df['date'], y=before, ax=panel, color='darkorange', label='original')
    sns.lineplot(x=df['date'], y=df[column].fillna(np.inf), ax=panel, color='dodgerblue', label='modified')
    panel.set_title('Feature: ' + name, fontsize=14)
    panel.set_ylabel(ylabel=name, fontsize=14)
    panel.set_xlim([date(2009, 1, 1), date(2020, 6, 30)])

From the above we can see that the zeros have been replaced by null values: the orange line (original) is visible only where the blue line (modified) has gaps.

In [None]:
# Heatmap of missing values: one row per column of df, one cell per record.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 5))
sns.heatmap(df.T.isna(), cmap='Blues', ax=ax)
ax.set_title('Missing Values', fontsize=16)

# Enlarge the y tick labels through tick_params; the per-tick `tick.label`
# attribute used previously is deprecated/removed in current Matplotlib.
ax.tick_params(axis='y', labelsize=14)
plt.show()
    

The heatmap above gives insight into the null values: we can see that river_hydrometry has more null values than the other features.

###### Missing values handling

1. filling NaN values with zeros or Outliers.
2. filling NaN values with last value
3. filling NaN values with mean value
4. filling NaN values with  .interpolate()

In [None]:
# Compare four ways of filling the gaps in drainage_volume, one per panel.
f, ax = plt.subplots(nrows=4, ncols=1, figsize=(15, 12))

mean_drain = df['drainage_volume'].mean()
last_value = df['drainage_volume'].ffill()
interpolate = df['drainage_volume'].interpolate()

# (panel title, filled series) in the order the panels are drawn.
fill_options = [
    ('filling Nan values with zeros', df['drainage_volume'].fillna(0)),
    ('filling Nan values with meanvalue', df['drainage_volume'].fillna(mean_drain)),
    ('filling Nan values with last value', df['drainage_volume'].fillna(last_value)),
    ('filling Nan values with interpolation', df['drainage_volume'].fillna(interpolate)),
]

for panel, (title, filled) in zip(ax, fill_options):
    sns.lineplot(x=df['date'], y=filled, ax=panel, color='darkorange', label='modified')
    # The unfilled series is drawn with NaN replaced by np.inf so that it can be
    # told apart from the filled (orange) line at the missing points.
    sns.lineplot(x=df['date'], y=df['drainage_volume'].fillna(np.inf), ax=panel, color='dodgerblue', label='original')
    # Each title goes on its own panel (previously all four were set on ax[0]).
    panel.set_title(title)
    panel.set_xlim([date(2019, 5, 1), date(2019, 10, 1)])

plt.tight_layout()
plt.show()

From the above we can see that the best option is to interpolate.

In [None]:
# Fill the gaps in each of these columns with `.interpolate()` (default settings).
for gap_column in ('drainage_volume', 'river_hydrometry', 'depth_to_groundwater'):
    df[gap_column] = df[gap_column].interpolate()

##### Smoothing data / Resampling

Resampling the data can provide additional information. There are two types of resampling:

-  Upsampling: increasing the frequency of the data, e.g. turning weekly data into daily data
-  Downsampling: decreasing the frequency of the data, e.g. turning daily data into weekly or monthly data

Here we will use the `.resample()` method to resample.

In [None]:
# 3 x 2 grid: left column is drainage volume, right column is temperature;
# rows are the current resolution, 7-day bins and monthly ('M') bins.
f, ax = plt.subplots(nrows=3, ncols=2, figsize=(16, 12))

# (column, aggregation, grid column): drainage volume is summed per bin,
# temperature is averaged per bin.
for feature_name, how, col_idx in (
    ('drainage_volume', 'sum', 0),
    ('temperature', 'mean', 1),
):
    sns.lineplot(x=df['date'], y=df[feature_name], ax=ax[0, col_idx], color='dodgerblue')
    ax[0, col_idx].set_title(feature_name, fontsize=14)

    for row_idx, (rule, prefix) in enumerate((('7D', 'weekly_'), ('M', 'Monthly_')), start=1):
        binned = df[['date', feature_name]].resample(rule, on='date')
        resampling = getattr(binned, how)().reset_index(drop=False)
        sns.lineplot(x=resampling['date'], y=resampling[feature_name], ax=ax[row_idx, col_idx], color='dodgerblue')
        ax[row_idx, col_idx].set_title(prefix + feature_name, fontsize=14)

    for i in range(3):
        ax[i, col_idx].set_xlim([date(2009, 1, 1), date(2020, 6, 30)])

plt.show()

From the above we can see that downsampling to weekly data makes the analysis easier while losing little information.

In [None]:
# Collapse the data to one row per 7-day window, taking the mean of every
# listed variable inside the window; 'date' comes back as a regular column.
downsammple = (
    df.loc[:, [
        'date',
        'depth_to_groundwater',
        'temperature',
        'drainage_volume',
        'river_hydrometry',
        'rainfall',
    ]]
    .resample('7D', on='date')
    .mean()
    .reset_index()
)

# From here on, df refers to the 7-day-binned data.
df = downsammple.copy()

## Stationarity

A time series is stationary when its statistical properties do not depend on time. Data with trends or seasonality are said to be non-stationary, since the trend and seasonality affect the values differently at different points in time.

Stationarity defines the time series as

* constant mean and mean is not time-dependent
* constant variance and variance is not time dependent 
* constant covariance and covariance is not time dependent



This stationarity check can be done with three methods:

- Visually: by plotting the series and checking for trends and seasonality
- Statistically: by splitting the time series and comparing the mean and variance of the parts
- Statistical test: the Augmented Dickey-Fuller test

In [None]:
# Rolling window = number of weeks per year (df holds 7-day bins at this point).
rolling_window = 52
f, ax = plt.subplots(nrows=2, ncols=1, figsize=(16, 16))

# (column, y-axis label, panel title). The first title now names the series
# that is actually plotted (drainage volume) instead of "Depth to Groundwater".
stationarity_panels = [
    ('drainage_volume', 'Drainage Volume',
     'Drainage Volume: Non-stationary \n non-constant mean & non-constant variance'),
    ('temperature', 'Temperature',
     'Temperature: Non-stationary \nvariance is time-dependent (seasonality)'),
]

for panel, (column, ylabel, title) in zip(ax, stationarity_panels):
    rolling = df[column].rolling(rolling_window)
    # Raw series with its rolling mean and rolling standard deviation on top;
    # both would stay roughly flat for a stationary series.
    sns.lineplot(x=df['date'], y=df[column], ax=panel, color='dodgerblue')
    sns.lineplot(x=df['date'], y=rolling.mean(), ax=panel, color='black', label='rolling mean')
    sns.lineplot(x=df['date'], y=rolling.std(), ax=panel, color='orange', label='rolling std')
    panel.set_title(title, fontsize=14)
    panel.set_ylabel(ylabel=ylabel, fontsize=14)
    panel.set_xlim([date(2009, 1, 1), date(2020, 6, 30)])

plt.tight_layout()
plt.show()

From the above we can see that features don't have constant mean and standard deviation

### Unit Root Test

A unit root is a characteristic of a time series that makes it non-stationary; the ADF test is a unit root test. A unit root is said to exist in a time series if $\alpha = 1$ in the following equation:

$$Y_t = \alpha Y_{t-1} + \beta X_e + \epsilon$$

### Augmented Dickey-Fuller (ADF)

The Augmented Dickey-Fuller test is a type of statistical test called a unit root test. Its hypotheses are:

**Null Hypothesis (H0)**: The time series has a unit root (non-stationary)
**Alternate Hypothesis (H1)**: The time series does not have a unit root (stationary)

If the null hypothesis can be rejected, we can conclude that the time series is stationary.

There are two ways to reject the null hypothesis:

On the one hand, the null hypothesis can be rejected if the p-value is below a set significance level. The default significance level is 5%.

**p-value > significance level (default: 0.05)**: Fail to reject the null hypothesis (H0), the data has a unit root and is non-stationary.
**p-value <= significance level (default: 0.05)**: Reject the null hypothesis (H0), the data does not have a unit root and is stationary.
On the other hand, the null hypothesis can be rejected if the test statistic is less than the critical value.

**ADF statistic > critical value**: Fail to reject the null hypothesis (H0), the data has a unit root and is non-stationary.
**ADF statistic < critical value**: Reject the null hypothesis (H0), the data does not have a unit root and is stationary.

In [None]:
from statsmodels.tsa.stattools import adfuller

# Run the Augmented Dickey-Fuller test on the groundwater depth values and
# show the raw result as the cell output.
result = adfuller(df['depth_to_groundwater'].to_numpy())
result

Now, for each variable, we are going to check:

- whether the p-value is less than 0.05
- where the ADF statistic lies relative to the critical values

In [None]:
f, ax = plt.subplots(nrows=3, ncols=2, figsize=(15, 9))


def _adf_line_color(p_val, adf_stat, crit_values, sig_lvl=0.05):
    """Return the line colour that encodes how strongly the ADF test rejects a unit root."""
    if p_val < sig_lvl:
        # Strictest critical value first: green (1%), orange (5%), red (10%).
        for level, color in (('1%', 'forestgreen'), ('5%', 'orange'), ('10%', 'red')):
            if adf_stat < crit_values[level]:
                return color
    # p-value not significant, or statistic not below any critical value.
    return 'purple'


def visualisation(series, title, ax, dates=None):
    """Run the ADF test on `series` and plot it, coloured by the test outcome.

    Parameters
    ----------
    series : array-like
        Values to test and plot; passed unchanged to `adfuller`.
    title : str
        Text used as the y-axis label of the panel.
    ax : matplotlib Axes
        Axes to draw on.
    dates : array-like, optional
        x values for the plot. Defaults to the notebook-level `df['date']`,
        which is what the function always used before this argument existed.

    The panel title reports the ADF statistic, the p-value and the 1%, 5% and
    10% critical values. Line colour: green / orange / red when the p-value is
    below 0.05 and the statistic is below the 1% / 5% / 10% critical value
    respectively; purple otherwise.
    """
    result = adfuller(series)
    adf_stat = result[0]
    p_val = result[1]
    crit_val_1 = result[4]['1%']
    crit_val_5 = result[4]['5%']
    crit_val_10 = result[4]['10%']
    linecolor = _adf_line_color(p_val, adf_stat, result[4], sig_lvl=0.05)

    if dates is None:
        dates = df['date']
    sns.lineplot(x=dates, y=series, ax=ax, color=linecolor)
    ax.set_title(f'ADF Statistic {adf_stat:0.3f}, p-value: {p_val:0.3f}\nCritical Values 1%: {crit_val_1:0.3f}, 5%: {crit_val_5:0.3f}, 10%: {crit_val_10:0.3f}', fontsize=14)
    ax.set_ylabel(ylabel=title, fontsize=14)


visualisation(df['rainfall'].values, 'Rainfall', ax[0, 0])
visualisation(df['temperature'].values, 'Temperature', ax[1, 0])
visualisation(df['river_hydrometry'].values, 'River_Hydrometry', ax[0, 1])
visualisation(df['drainage_volume'].values, 'Drainage_Volume', ax[1, 1])
visualisation(df['depth_to_groundwater'].values, 'Depth_to_Groundwater', ax[2, 0])

# Only five series are plotted on the 3 x 2 grid: hide the unused sixth panel
# and let matplotlib space the two-line titles so they do not overlap.
ax[2, 1].set_visible(False)
plt.tight_layout()
plt.show()



If the data is not stationary but we want to use ARIMA models, we can transform the data first. The two most common methods to transform a series into a stationary one are:

Transformation: e.g. log or square root to stabilize non-constant variance

Differencing: subtracts the previous value from the current value


### Transformation

In [None]:
# Log-transform the magnitude of the groundwater depth. The absolute value is
# taken first because the log is only defined for positive numbers
# (presumably the depths are stored as negative values -- verify).
df['depth_to_groundwater_log'] = np.log(df['depth_to_groundwater'].abs())

f, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
# Left panel: transformed series with its ADF result.
visualisation(df['depth_to_groundwater_log'], 'Transformed \n Depth to Groundwater', ax[0])

# Right panel: distribution of the transformed values. histplot with a KDE
# overlay and density scaling is seaborn's replacement for the deprecated
# distplot.
sns.histplot(df['depth_to_groundwater_log'], kde=True, stat='density', kde_kws=dict(cut=3), ax=ax[1])

#### Differencing

Differencing can be done in different orders:

- First-order differencing: linear trends with $z_i = y_i - y_{i-1}$
- Second-order differencing: quadratic trends with $z_i = (y_i - y_{i-1}) - (y_{i-1} - y_{i-2})$

and so on...

In [None]:
# First-order differencing: each value minus the one before it. np.diff
# returns one element fewer than its input, so a 0 is put in front to keep
# the new column the same length as df.
ts_diff = np.diff(df['depth_to_groundwater'])
df['depth_to_groundwater_diff_1'] = np.concatenate(([0], ts_diff))

f, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 6))
visualisation(
    df['depth_to_groundwater_diff_1'],
    'Differenced (1. Order) \n Depth to Groundwater',
    ax,
)