# <a>Import required libraries</a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import matplotlib.pyplot as plt #visualization
import seaborn as sns #visualization
%matplotlib inline
import itertools
import plotly.offline as py#visualization
py.init_notebook_mode(connected=True)#visualization
import plotly.graph_objs as go#visualization
import plotly.tools as tls#visualization
import plotly.figure_factory as ff#visualization
import warnings
import seaborn as sns # for plot visualization
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
warnings.filterwarnings("ignore")
# Any results you write to the current directory are saved as output.

*Load the dataset.*

There is a column named **CET** in this dataset; we are going to read it as the index.

In [None]:
data = pd.read_csv('/kaggle/input/weather_madrid_lemd_1997_2015.csv/weather_madrid_LEMD_1997_2015.csv', parse_dates=['CET'],index_col='CET')

In [None]:
data.head()

In [None]:
data.index

# <a>Feature Engineering</a>

Here we are going to keep only a few of the columns, namely those that seem important from some basic EDA and from a time-series prediction point of view.

In [None]:
data.isnull().sum(axis=0)

In [None]:
# Remove the columns that no later cell in this notebook refers to.
# The leading spaces are part of the column names as they appear in the CSV header.
unused_columns = [
    ' Events',
    ' Max VisibilityKm',
    ' Mean VisibilityKm',
    ' Min VisibilitykM',
    ' Max Gust SpeedKm/h',
    ' CloudCover',
]
data = data.drop(columns=unused_columns)

In [None]:
data.isnull().sum(axis=0)

Not many values are missing, but it is still better to fill the missing ones than to remove entire rows.

In [None]:
data[data.isna().any(axis=1)]

In [None]:
data.ffill(axis=0, inplace = True)

In [None]:
data.isnull().sum()

In [None]:
data.head()

# <a>Exploratory Data Analysis & Visualizations</a>

Let's see how the plots of temperature, dew point and humidity look across all years.

In [None]:
data['Mean TemperatureC'].plot(subplots=True, figsize=(20,12))
plt.ylabel('Temperature')

In [None]:
data['MeanDew PointC'].plot(subplots=True, figsize=(20,12))
plt.ylabel('Dew Point')

In [None]:
data[' Mean Humidity'].plot(subplots=True, figsize=(20,12))
plt.ylabel('Humidity')

Split the dataset into train and test.

In [None]:
# Aggregate the daily readings to monthly means and split by calendar year:
# 2000-2013 is the training period, 2014-2015 the hold-out period.
# `.ffill()` is the non-deprecated spelling of `fillna(method='pad')`: a month
# without a value inherits the value of the preceding month.
train = data.loc['2000':'2013'].resample('M').mean().ffill()
test = data.loc['2014':'2015'].resample('M').mean().ffill()

In [None]:
train.index

In [None]:
test.index

In [None]:
ts=train['Max TemperatureC']
test_ts=test['Max TemperatureC']
ts.head()

In [None]:
def ts_plot(timeseries):
    """Draw `timeseries` as a single line on a new 20x10 inch figure and show it."""
    _, axis = plt.subplots(figsize=(20, 10))
    axis.plot(timeseries)
    plt.show()

ts_plot(ts);

It seems overplotted, so let's plot only the two held-out years, 2014 and 2015; this gives a clearer picture of seasonality and trend.

In [None]:
ts_plot(test_ts)

# <a>Check Stationarity</a>

The plots above show that we do have seasonality but no trend. Let's check the necessary conditions for stationarity:

* Constant mean
* Constant variance
* An auto-covariance that does not depend on time

In [None]:
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries, wind_size):
    """Plot rolling statistics of a series and print its Dickey-Fuller test.

    Parameters
    ----------
    timeseries
        Object exposing ``.rolling(window=...)``; it is also passed unchanged
        to ``adfuller``.
    wind_size
        Window length used for the rolling mean and rolling standard deviation.

    The function shows one figure, prints the test report and returns None.
    """
    # Rolling statistics over the requested window.
    windowed = timeseries.rolling(window=wind_size)
    rolling_mean = windowed.mean()
    rolling_std = windowed.std()

    # Overlay the series with its rolling mean and rolling standard deviation.
    plt.figure(figsize=(20, 10))
    curves = (
        (timeseries, 'blue', 'Original'),
        (rolling_mean, 'red', 'Rolling Mean'),
        (rolling_std, 'black', 'Rolling Std'),
    )
    for curve, colour, caption in curves:
        plt.plot(curve, color=colour, label=caption)
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Dickey-Fuller test: the first four entries of the result are labelled
    # below; the fifth is a mapping of critical values keyed by level.
    print ('Results of Dickey-Fuller Test:')
    adf_result = adfuller(timeseries, autolag='AIC')
    row_labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
    report = pd.Series(adf_result[0:4], index=row_labels)
    for level, threshold in adf_result[4].items():
        report['Critical Value (%s)' % level] = threshold
    print (report)

In the **Dickey-Fuller test**, we only need to compare the test statistic with the critical values to know whether the series is stationary.

In [None]:
test_stationarity(ts,14)

We have a constant mean and variance, and our **Test Statistic** is less than the **Critical Values**, so the time series is already stationary. Our 'd' value in the ARIMA model will therefore be 0.

Consider the case where the series was non-stationary; we would then use the techniques below to make it stationary.

**Making a series stationary:** to convert a non-stationary series into a stationary one, we can use either of the techniques below:
* Decomposing
* Differencing


In [None]:
def diff_plot(timeseries):
    """Plot a series and its 1st and 2nd differences beside their ACF plots.

    The figure is a 3x2 grid: the left column holds the raw values of the
    original, once-differenced and twice-differenced series; the right column
    holds the matching ``plot_acf`` output (NaNs introduced by differencing
    are dropped before the ACF is computed).

    Parameters
    ----------
    timeseries
        Object exposing ``.diff()`` and ``.values`` (a pandas Series in this
        notebook).

    Side effect: updates the global ``plt.rcParams`` figure size and dpi, so
    figures created afterwards without an explicit size inherit them.
    """
    plt.rcParams.update({'figure.figsize':(9,7), 'figure.dpi':120})

    # Difference once and reuse, instead of recomputing for every panel.
    first_diff = timeseries.diff()
    second_diff = first_diff.diff()
    panels = [
        ('Original Series', timeseries, timeseries),
        ('1st Order Differencing', first_diff, first_diff.dropna()),
        ('2nd Order Differencing', second_diff, second_diff.dropna()),
    ]

    # Share the x-axis per column only: the left column is indexed by
    # observation number and the right column by lag, so sharing one x-axis
    # across both columns would squeeze the ACF stems into the far left.
    fig, axes = plt.subplots(3, 2, sharex='col')
    for row, (title, series, acf_input) in enumerate(panels):
        axes[row, 0].plot(series.values)
        axes[row, 0].set_title(title)
        plot_acf(acf_input.values, ax=axes[row, 1])

    plt.xticks(rotation='vertical')
    plt.show()

    
diff_plot(ts)

As you can see, the original series is already stationary, so we don't need any differencing here.

In [None]:
def pacf_plot(timeseries):
    """Plot the first-differenced series next to its partial autocorrelation.

    Parameters
    ----------
    timeseries
        Object exposing ``.diff()`` and ``.values`` (a pandas Series in this
        notebook).

    Side effect: updates the global ``plt.rcParams`` figure size and dpi, so
    figures created afterwards without an explicit size inherit them.
    """
    plt.rcParams.update({'figure.figsize':(9,3), 'figure.dpi':120})

    differenced = timeseries.diff()

    # The two panels use different x units (observation number vs. lag), so
    # they must not share an x-axis.
    fig, axes = plt.subplots(1, 2)
    axes[0].plot(differenced.values)
    axes[0].set_title('1st Differencing')

    # No fixed y-limits: the former (0, 5) window cut off every negative
    # partial autocorrelation and left most of the panel empty.
    plot_pacf(differenced.dropna().values, ax=axes[1])

    plt.show()

pacf_plot(ts)

In [None]:
def acf_plot(timeseries):
    """Plot the first-differenced series next to its autocorrelation function.

    Parameters
    ----------
    timeseries
        Object exposing ``.diff()`` and ``.values`` (a pandas Series in this
        notebook).

    The figure size is not set here; it comes from the current
    ``plt.rcParams``.
    """
    differenced = timeseries.diff()

    # The two panels use different x units (observation number vs. lag), so
    # they must not share an x-axis.
    fig, axes = plt.subplots(1, 2)
    axes[0].plot(differenced.values)
    axes[0].set_title('1st Differencing')

    # Symmetric window: the former (0, 1.2) limits hid negative
    # autocorrelations.
    axes[1].set(ylim=(-1.2, 1.2))
    plot_acf(differenced.dropna().values, ax=axes[1])

    plt.show()

acf_plot(ts);

## <a>Timeseries Analysis (ARIMA Model)</a>

For prediction we are going to use one of the most popular models for time series, **Autoregressive Integrated Moving Average (ARIMA)**, which is a standard statistical model for time series forecasting and analysis.
An ARIMA model can be understood by outlining each of its components as follows:
* **Autoregression (AR) -** refers to a model that shows a changing variable that regresses on its own lagged, or prior, values.<br/>
The notation **AR(p)** indicates an autoregressive model of order p.

    *Example* — If p is 3 the predictor for X(t) will be 
        X(t) = µ + φ1.X(t-1) + φ2.X(t-2) + φ3.X(t-3) + εt

    Where ε is error term.
* **Integrated (I) -** represents the differencing of raw observations to allow for the time series to become stationary, i.e., data values are replaced by the difference between the data values and the previous values.
* **Moving average (MA) -** incorporates the dependency between an observation and a residual error from a moving average model applied to lagged observations.

    The notation **MA(q)** refers to the moving average model of order q.<br/>
 

    Example — If q is 3 the predictor for X(t) will be 
        X(t) = µ + εt + θ1.ε(t-1) + θ2.ε(t-2) + θ3.ε(t-3)
    Here, instead of the lagged values of the series itself, we use the error terms (ε) from the previous time steps.

Now we need to figure out the values of p and q, which are parameters of the ARIMA model. We use the two functions below to figure out these values:

**Autocorrelation Function (ACF):** It measures the correlation between the series and a lagged version of itself. For example, at lag 4, the ACF compares the series at time instants t1…t2 with the series at instants t1–4…t2–4.

**Partial Autocorrelation Function (PACF):** It measures the degree of association between X(t) and X(t-p) after removing the effect of the intermediate lags X(t-1) … X(t-p+1).

In [None]:
def acf_pacf_plot(timeseries):
    """Plot ACF and PACF (lags 0-20) of the first difference of a series.

    The input is always differenced once inside this function before the
    correlations are computed, so passing an already-differenced series
    produces the ACF/PACF of its next-higher difference.

    Parameters
    ----------
    timeseries
        Object exposing ``.diff()`` (a pandas Series in this notebook).

    Each panel also shows dashed lines at 0 and at +/- 1.96 / sqrt(n), where
    n is the number of differenced observations actually used.
    """
    # Difference once and drop the NaN it introduces; everything below,
    # including the sample size in the confidence bound, uses this array.
    differenced = timeseries.diff().dropna().values
    acf_lag = acf(differenced, nlags=20)
    pacf_lag = pacf(differenced, nlags=20, method='ols')

    # The bound previously used len(timeseries.diff().values), which still
    # counted the leading NaN and therefore overstated n by one.
    bound = 1.96 / np.sqrt(len(differenced))

    plt.figure(figsize=(22,10))

    panels = (
        (121, acf_lag, "Autocorrelation Function"),
        (122, pacf_lag, "Partial Autocorrelation Function"),
    )
    for position, values, title in panels:
        plt.subplot(position)
        plt.plot(values)
        for level in (0, -bound, bound):
            plt.axhline(y=level, linestyle='--', color='silver')
        plt.title(title)
    plt.tight_layout()

acf_pacf_plot(ts)

These grey dotted lines are the confidence bounds, which we are going to use to find the values of p and q.

__p__ - the lag at which the PACF first crosses the upper confidence level. In our case it seems to be 2, so we will take p = 2.

__q__ - the lag at which the ACF first crosses the upper confidence level. In our case it seems to be 2, so we will take q = 2.

__d__ - number of nonseasonal differences needed for stationarity. In this case we are going to take it as 0, since this series is already stationary.

Now we are going to fit ARIMA models to the time series. We will compare their performance on the basis of the RSS score and, in the end, prefer the best one.

In [None]:
# Fit an ARIMA model with order (p, d, q) = (2, 0, 2) to the monthly
# 'Max TemperatureC' training series. `.values` hands the model a plain array,
# so the fitted model carries no date index.
# NOTE(review): `fit(disp=0)` and the `statsmodels.tsa.arima_model` import used
# by this notebook look like the legacy statsmodels ARIMA interface - confirm
# the installed statsmodels version still provides it.
model = ARIMA(ts.values, order=(2,0,2))
model_fit = model.fit(disp=0)
print(model_fit.summary())

In [None]:
# Plot residual errors
residuals = pd.DataFrame(model_fit.resid)
fig, ax = plt.subplots(1,2)
residuals.plot(title="Residuals", ax=ax[0])
residuals.plot(kind='kde', title='Density', ax=ax[1])
plt.show()

In [None]:
model_fit.plot_predict(dynamic=False)
plt.show()

In [None]:
# Forecast exactly one step per held-out month. Deriving the horizon from
# `test_ts` (instead of a hard-coded 24) guarantees that the forecast arrays
# have the same length as `test_ts.index`, which the Series below require.
horizon = len(test_ts)
fc, se, conf = model_fit.forecast(horizon, alpha=0.05)  # 95% conf

# Re-attach the hold-out dates so the forecast and its confidence band can be
# drawn on the same time axis as the training and test series.
fc_series = pd.Series(fc, index=test_ts.index)
lower_series = pd.Series(conf[:, 0], index=test_ts.index)
upper_series = pd.Series(conf[:, 1], index=test_ts.index)

# Plot training data, actual hold-out values, forecast and confidence band.
plt.figure(figsize=(12,5), dpi=100)
plt.plot(ts, label='training')
plt.plot(test_ts, label='actual')
plt.plot(fc_series, label='forecast')
plt.fill_between(lower_series.index, lower_series, upper_series,
                 color='k', alpha=.15)
plt.title('Forecast vs Actuals')
plt.legend(loc='upper left', fontsize=8)
plt.show()

In [None]:
train.head()

In [None]:
ts_dew = train['Dew PointC']
test_ts_dew=test['Dew PointC']
ts_hum = train[' Mean Humidity']
test_ts_hum = test[' Mean Humidity']

In [None]:
ts_plot(ts_hum)

In [None]:
test_stationarity(ts_hum,12)

In [None]:
diff_plot(ts_hum)

In [None]:
acf_pacf_plot(ts_hum)

In [None]:
# Fit an ARIMA model with order (p, d, q) = (2, 0, 2) to the monthly
# ' Mean Humidity' training series. `.values` hands the model a plain array,
# so the fitted model carries no date index.
# NOTE(review): `fit(disp=0)` and the `statsmodels.tsa.arima_model` import used
# by this notebook look like the legacy statsmodels ARIMA interface - confirm
# the installed statsmodels version still provides it.
model1 = ARIMA(ts_hum.values, order=(2,0,2))
model_fit1 = model1.fit(disp=0)
print(model_fit1.summary())

In [None]:
residuals = pd.DataFrame(model_fit1.resid)
fig, ax = plt.subplots(1,2)
residuals.plot(title="Residuals", ax=ax[0])
residuals.plot(kind='kde', title='Density', ax=ax[1])
plt.show()

In [None]:
model_fit1.plot_predict(dynamic=False)
plt.show()

In [None]:
# Forecast exactly one step per held-out month. Deriving the horizon from
# `test_ts_hum` (instead of a hard-coded 24) guarantees that the forecast
# arrays have the same length as `test_ts_hum.index`, which the Series below
# require.
horizon = len(test_ts_hum)
fc, se, conf = model_fit1.forecast(horizon, alpha=0.05)  # 95% conf

# Re-attach the hold-out dates so the forecast and its confidence band can be
# drawn on the same time axis as the training and test series.
fc_series = pd.Series(fc, index=test_ts_hum.index)
lower_series = pd.Series(conf[:, 0], index=test_ts_hum.index)
upper_series = pd.Series(conf[:, 1], index=test_ts_hum.index)

# Plot training data, actual hold-out values, forecast and confidence band.
plt.figure(figsize=(12,5), dpi=100)
plt.plot(ts_hum, label='training')
plt.plot(test_ts_hum, label='actual')
plt.plot(fc_series, label='forecast')
plt.fill_between(lower_series.index, lower_series, upper_series,
                 color='k', alpha=.15)
plt.title('Forecast vs Actuals')
plt.legend(loc='upper left', fontsize=8)
plt.show()

In [None]:
test_stationarity(ts_dew,12)

Here we can see that Dew Point is not stationary because the __Test Statistic__ value is greater than the __Critical Values__.

**For non-stationary to stationary conversion**, we can use any of the below technique :
* **Decomposing**
* **Differencing**


In [None]:
diff_plot(ts_dew)

In [None]:
# Natural log of the monthly dew-point series.
# NOTE(review): dew point is in degrees Celsius; np.log returns NaN for negative
# inputs and -inf for zero - confirm every monthly value is strictly positive.
# NOTE(review): `log_ts_dew` is not referenced by any later cell visible here;
# the log series is recomputed as `ts_log` further down.
log_ts_dew = np.log(ts_dew)


In [None]:
log_ts_dew.dropna(inplace=True)

**Decompose**

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Split the dew-point series into trend, seasonal and residual components.
decomposition = seasonal_decompose(ts_dew)
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

# One stacked panel per series: subplot codes 411..414, top to bottom.
panels = [
    (ts_dew, 'Original'),
    (trend, 'Trend'),
    (seasonal, 'Seasonality'),
    (residual, 'Residuals'),
]
for position, (component, label) in enumerate(panels, start=411):
    plt.subplot(position)
    plt.plot(component, label=label)
    plt.legend(loc='best')
plt.tight_layout()

In [None]:
# Number of missing values in the residual component. `.sum()` adds up the
# True flags; `.count()` on the boolean mask would just return the series length.
residual.isnull().sum()

In [None]:
# Residual component with its missing values removed. `dropna()` returns a new
# series, so `residual` itself is left untouched (the previous alias plus
# in-place drop silently modified it).
# Note: despite the name, this comes from decomposing `ts_dew` directly, not a
# log-transformed series.
ts_log_decompose = residual.dropna()
test_stationarity(ts_log_decompose,14)

In [None]:
acf_pacf_plot(ts_log_decompose)

**Differencing**

In [None]:
# Natural log of the monthly dew-point series, then the number of missing
# values it contains. `.sum()` adds up the True flags; `.count()` on the boolean
# mask would just return the series length.
# NOTE(review): np.log returns NaN for negative inputs and -inf for zero -
# confirm the dew-point values (degrees Celsius) are strictly positive.
ts_log = np.log(ts_dew)
ts_log.isnull().sum()

In [None]:
# Remove missing values from the log series and confirm none remain. `.sum()`
# counts the True flags; `.count()` would return the series length regardless
# of how many values are missing.
ts_log = ts_log.dropna()
ts_log.isnull().sum()

In [None]:
# First difference of the log series: each value minus the one before it.
# The first element has no predecessor and is therefore NaN.
ts_log_diff = ts_log.diff()
plt.plot(ts_log_diff)

In [None]:
ts_log_diff.dropna(inplace=True)
test_stationarity(ts_log_diff,14)

In [None]:
acf_pacf_plot(ts_log_diff)

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Split the differenced log series into trend, seasonal and residual components.
decomposition = seasonal_decompose(ts_log_diff)

trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

# The top panel shows the series that was actually decomposed (`ts_log_diff`),
# so all four panels describe the same data; it previously showed `ts_dew`.
plt.subplot(411)
plt.plot(ts_log_diff, label='Original')
plt.legend(loc='best')
plt.subplot(412)
plt.plot(trend, label='Trend')
plt.legend(loc='best')
plt.subplot(413)
plt.plot(seasonal,label='Seasonality')
plt.legend(loc='best')
plt.subplot(414)
plt.plot(residual, label='Residuals')
plt.legend(loc='best')
plt.tight_layout()

In [None]:
# NOTE(review): acf_pacf_plot() applies `.diff().dropna()` to its argument
# itself, so passing an already-differenced series here plots the ACF/PACF of
# the second difference of `ts_dew` - confirm that is the intended order.
acf_pacf_plot(ts_dew.diff())