In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns


import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.ar_model import AR
from statsmodels.tsa.arima_model import ARMA, ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from fbprophet import Prophet

from sklearn.metrics import mean_squared_error

In [None]:
# Read the daily maximum temperature CSV and preview its first rows.
csv_path = "../input/bigdataset/Datasets-master/daily-max-temperatures.csv"
file = pd.read_csv(csv_path)
file.head()

In [None]:
# Convert the Date column to datetime values so it can later serve as a time index.
file = file.assign(Date=pd.to_datetime(file["Date"]))

In [None]:
# Index the frame by date (the Date column is moved into the index);
# the trailing expression displays the resulting index.
file = file.set_index("Date", drop=True)
file.index


In [None]:
# Count the missing values in each column.
file.isna().sum()

In [None]:
# Descriptive statistics for the loaded frame (displayed as the cell output).
file.describe()

In [None]:
# Line plot of the frame against its date index on a 16x10 inch canvas.
overview_ax = file.plot(figsize=(16, 10))
plt.show()

In [None]:
# Temperature distribution: histogram on the top panel, kernel density
# estimate on the bottom panel, both on figure number 1.
dist_fig, (hist_ax, kde_ax) = plt.subplots(2, 1, num=1)
file["Temperature"].hist(ax=hist_ax)
file["Temperature"].plot.kde(ax=kde_ax)
plt.show()


In [None]:
# Box plot of temperature grouped by calendar month, to expose the seasonal cycle.
fig, ax = plt.subplots(figsize=(15, 6))
# seaborn deprecated positional x/y in 0.11 and accepts only `data` positionally
# from 0.12 on, so x and y are passed by keyword. `ax` is passed explicitly so the
# plot is drawn on the Axes created above rather than on whichever is current.
sns.boxplot(x=file.index.month, y=file["Temperature"], ax=ax)
# Label the x axis by what it shows (months), not by the index name.
ax.set_xlabel("Month");

# Decomposing using statsmodel:

- We can use statsmodels to perform a decomposition of this time series.
- The decomposition of time series is a statistical task that deconstructs a time series into several components, each representing one of the underlying categories of patterns.
- With statsmodels we will be able to see the trend, seasonal, and residual components of our data.

In [None]:
# Decompose the temperature series into trend, seasonal and residual parts
# using a multiplicative model with a 365-observation period.
plt.rcParams['figure.figsize'] = (16, 8)
temperature = file["Temperature"]
decomposition = seasonal_decompose(temperature, model='multiplicative', period=365)
fig = decomposition.plot()
plt.show()

# Stationarity
- A Time Series is said to be stationary if its statistical properties such as mean, variance remain constant over time.
- Most of the Time Series models work on the assumption that the TS is stationary. Major reason for this is that there are many ways in which a series can be non-stationary, but only one way for stationarity.
- Intuitively, we can say that if a Time Series has a particular behaviour over time, there is a very high probability that it will follow the same in the future.
- Also, the theories related to stationary series are more mature and easier to implement as compared to non-stationary series.

In [None]:
# Plot the raw series for a visual check before the stationarity tests below.
plt.plot(file)

# We can check stationarity using the following:

- **ACF and PACF plots:** If the time series is stationary, the ACF/PACF plots will show a quick drop-off in correlation after a small amount of lag between points.
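- **Plotting Rolling Statistics:** We can plot the moving average or moving variance and see whether it varies with time. At any instant ‘t’, the moving average/variance is computed over a trailing window of the most recent observations; the code below uses a window of 12 observations, which for this daily series is 12 days (not 12 months).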
- **Augmented Dickey-Fuller Test:** This is one of the statistical tests for checking stationarity. Here the null hypothesis is that the TS is non-stationary. The test results comprise a Test Statistic and some Critical Values for different confidence levels. If the ‘Test Statistic’ is less than the ‘Critical Value’, we can reject the null hypothesis and say that the series is stationary. Refer to the statsmodels `adfuller` documentation for details.

# **ACF and PACF plots**

- Let's review the Autocorrelation Function (ACF) and Partial Autocorrelation Function (PACF) plots
- If the time series is stationary, the ACF/PACF plots will show a quick drop-off in correlation after a small amount of lag between points.
- This data is non-stationary as a high number of previous observations are correlated with future values.
- Confidence intervals are drawn as a cone.
- By default, this is set to a 95% confidence interval, suggesting that correlation values outside of this cone are very likely a real correlation and not a statistical fluke.
- The partial autocorrelation at lag k is the correlation that results after removing the effect of any correlations due to the terms at shorter lags

In [None]:
# Autocorrelation (top panel) and partial autocorrelation (bottom panel) of the
# temperature series for the first 30 lags. plot_acf / plot_pacf are already
# imported in the notebook's first cell.
acf_fig, (acf_ax, pacf_ax) = plt.subplots(2, 1)
plot_acf(file["Temperature"], ax=acf_ax, lags=30)
plot_pacf(file["Temperature"], ax=pacf_ax, lags=30)
plt.show()

# Plotting Rolling Statistics
- We observe that the rolling mean and Standard deviation are not constant with respect to time (increasing trend)
- The time series is hence not stationary

In [None]:
# Rolling mean and standard deviation over a trailing window of 12 observations.
temperature_roll = file["Temperature"].rolling(window=12)
rolmean = temperature_roll.mean()
rolstd = temperature_roll.std()

# Overlay both rolling statistics on the original series.
orig = plt.plot(file, color='blue', label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
std = plt.plot(rolstd, color='black', label='Rolling Std')
plt.title('Rolling Mean & Standard Deviation')
plt.legend(loc='best')
plt.show()

# Augmented Dickey-Fuller 
- The intuition behind the test is that if the series is integrated then the lagged level of the series y(t-1) will provide no relevant information in predicting the change in y(t).
- Null hypothesis: The time series is not stationary
- Rejecting the null hypothesis (i.e. a very low p-value) indicates stationarity

In [None]:

# Run the Augmented Dickey-Fuller test (autolag='AIC') on the temperature
# series and print the result as one labelled Series.
print('Results of Dickey-Fuller Test:')
dftest = adfuller(file["Temperature"], autolag='AIC')
headline_labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
headline = pd.Series(dftest[0:4], index=headline_labels)
# dftest[4] maps each significance level to its critical value.
critical_values = pd.Series(
    {'Critical Value (%s)' % level: threshold for level, threshold in dftest[4].items()}
)
dfoutput = pd.concat([headline, critical_values])
print(dfoutput)

Here we can clearly see that the p-value is well below 5%, so we can reject the null hypothesis and treat the series as stationary.

In [None]:

def test_stationarity(timeseries, window=12):
    """Plot rolling statistics and print an Augmented Dickey-Fuller report.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to examine. It must support ``.rolling(...)`` and be accepted
        by ``adfuller``.
    window : int, default 12
        Number of trailing observations used for the rolling mean and the
        rolling standard deviation. The default keeps the previously
        hard-coded value; pass a different size to match the sampling
        frequency of the series.

    Returns
    -------
    None
        The figure is shown and the test report is printed.
    """
    # Rolling statistics over the trailing `window` observations.
    rolling = timeseries.rolling(window=window)
    rolmean = rolling.mean()
    rolstd = rolling.std()

    # Overlay the rolling statistics on the original series.
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Augmented Dickey-Fuller test; dftest[0:4] holds the headline figures and
    # dftest[4] maps each significance level to its critical value.
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(
        dftest[0:4],
        index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'],
    )
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)

# Time Series Forecasting



In [None]:
from statsmodels.tsa.ar_model import AR
from random import random

In [None]:

# Fit an autoregressive model to the temperature series with the default fit settings.
# NOTE(review): newer statsmodels releases replace `AR` with `AutoReg` - confirm
# the pinned statsmodels version before upgrading.
temperature_series = file["Temperature"]
model = AR(temperature_series)
model_fit = model.fit()

In [None]:
# Compare the AR fitted values (red) with the observations; the title reports
# the residual sum of squares, ignoring NaNs.
observed = file["Temperature"]
residuals = model_fit.fittedvalues - observed
plt.plot(observed)
plt.plot(model_fit.fittedvalues, color='red')
plt.title('RSS: %.4f' % np.nansum(residuals ** 2))
plt.show()

In [None]:
# Chronological split for the forecasting models: the first 75% of the rows
# form the training set and the remaining rows the hold-out set.
split_at = int(0.75 * len(file))
train = file.iloc[:split_at]
test = file.iloc[split_at:]

train.shape, test.shape

In [None]:
# Draw the training and hold-out temperatures on the same axes.
split_ax = train["Temperature"].plot()
test["Temperature"].plot(ax=split_ax)

In [None]:
# sqrt is used by the RMSE cell further down; ARIMA and mean_squared_error are
# already imported in the notebook's first cell.
from math import sqrt

# Fit ARIMA with order (p, d, q) = (1, 0, 1) on the training split;
# disp=1 prints the optimiser's convergence output.
arima_order = (1, 0, 1)
model = ARIMA(train, order=arima_order)
model_fit = model.fit(disp=1)

In [None]:
# Display the fitted ARIMA model's summary.
model_fit.summary()

In [None]:
# Display the hold-out frame produced by the train/test split.
test

In [None]:
# Out-of-sample ARIMA forecast over the hold-out period.
# The forecast starts at the first row after the training split and ends at the
# last row of the full series. The start is derived from `train` instead of the
# previously hard-coded 2737, so it stays correct if the data or the split
# ratio changes.
start_index = len(train)
end_index = len(file)
predictions = model_fit.predict(start=start_index, end=end_index - 1)

In [None]:
# Display the ARIMA forecast computed in the previous cell.
predictions

In [None]:
# Score the ARIMA forecast against the rows of `file` that follow the training split.
holdout_actual = file[train.shape[0]:]
mse = mean_squared_error(holdout_actual, predictions)
rmse = sqrt(mse)
print('RMSE: {}, MSE:{}'.format(rmse, mse))

In [None]:
# Overlay the ARIMA forecast on the full observed temperature series.
observed_lines = plt.plot(file["Temperature"])
plt.plot(predictions)

In [None]:
# Rebind the forecast as a pandas Series.
# NOTE(review): presumably `predict` already returns a Series here - confirm whether this is needed.
predictions = pd.Series(predictions)

# Prophet Model 

In [None]:
# Preview the training split before reshaping it into Prophet's ds/y layout.
train.head()

In [None]:
# Prophet expects a frame with a 'ds' (timestamp) and a 'y' (value) column;
# build it from the training split's index and temperature values.
train_prophet = pd.DataFrame(
    {
        'ds': train.index,
        'y': train["Temperature"].values,
    }
)

In [None]:
# Prophet is already imported in the notebook's first cell.
# Configure Prophet with yearly seasonality in multiplicative mode
# (the series is sampled daily), then fit it on the training frame.
prophet_options = dict(yearly_seasonality=True, seasonality_mode='multiplicative')
model = Prophet(**prophet_options)
model.fit(train_prophet)  # returns the fitted model, shown as the cell output

In [None]:
# Extend the training dates by one daily step per hold-out row so the forecast
# horizon covers the hold-out period. The horizon is derived from `test` instead
# of the previously hard-coded 913, so it follows the data and split ratio.
future = model.make_future_dataframe(periods=len(test), freq='D')
future.tail()

In [None]:
# Predict over the training dates plus the future horizon, then show the last
# rows of the point forecast with its lower and upper bounds.
forecast = model.predict(future)
forecast_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
forecast[forecast_columns].tail()

In [None]:

# Prophet's forecast plot, with the hold-out observations drawn on top in red.
fig = model.plot(forecast)
forecast_ax = fig.gca()
forecast_ax.plot(test, label='Valid', color='red', linewidth=2)
plt.show()

In [None]:
# Plot the components of the Prophet forecast; the trailing ';' suppresses the cell's return-value output.
model.plot_components(forecast);

In [None]:

# Hold-out actuals in Prophet's ds/y layout, for comparison with the forecast.
y_prophet = pd.DataFrame(
    {
        'ds': test.index,
        'y': test["Temperature"].values,
    }
)

In [None]:
# Index the hold-out actuals and the Prophet forecast by their 'ds' timestamps.
y_prophet, forecast_prophet = (
    frame.set_index('ds') for frame in (y_prophet, forecast)
)