In [None]:
import pandas as pd

# Read the temperature-change CSV (Latin-1 encoded) and discard every row that
# has a missing value in any column. Original row labels are kept as they are.
df = pd.read_csv(
    '../input/temperature-change/Environment_Temperature_change_E_All_Data_NOFLAG.csv',
    encoding='latin-1',
).dropna()
df

First, I looked at one randomly chosen time series from the data set: the temperature change in Argentina in May. Judging by the graph, it looks like a stationary time series.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Bar chart of a single row of the table across the columns from position 7
# onwards (presumably the per-year columns — they are used as tick labels).
year_columns = df.columns[7:]

# NOTE(review): row 246 is selected by position; the axis label below says it is
# Argentina / May — confirm that this still holds after the dropna() above.
argentina_may = df.iloc[246, 7:]

fig, ax = plt.subplots(figsize=(12, 5))
x_pos = np.arange(len(year_columns))

# Draw through the Axes object so that `ax` keeps pointing at the Axes instead of
# being overwritten by the BarContainer that bar() returns.
# NOTE(review): yerr is one scalar taken from the first plotted column of the
# *same* row, so every bar gets an identical error bar. If a standard-deviation
# row was intended here, confirm which row that is.
ax.bar(x_pos, argentina_may, yerr=df.iloc[246, 7])
ax.set_xticks(x_pos)
ax.set_xticklabels(year_columns, rotation=45)
ax.set_ylabel('Temperature changes in Argentina in May')
ax.set_title('Temperature changes over past 60 years');


Next, I looked at all the unique values of Area and Months.

In [None]:
# Distinct values found in the 'Area' column, in order of first appearance.
df['Area'].unique()

In [None]:
# Distinct values found in the 'Months' column, in order of first appearance.
df['Months'].unique()

Since the data are reported per month, per season, and per year, I was curious to check whether the mean results agree across these three levels.

In [None]:
# First 34 rows of the cleaned table (presumably all records of the first
# area, Afghanistan — confirm against the data).
afg = df.iloc[:34, :]

# Keep only the rows with element code 7271, remove the row labelled 32 and
# renumber the remaining rows from zero without keeping the old labels.
has_code_7271 = afg['Element Code'] == 7271
afgtemp = afg.loc[has_code_7271].drop(index=32).reset_index(drop=True)
afgtemp

In [None]:
# Yearly series: among the element-code-7271 rows, the one at position 16 after
# renumbering. reset_index() inserts an extra 'index' column at position 0, so
# the plotted columns start at 8 here instead of 7.
totafg = afg[afg['Element Code'] == 7271].reset_index().iloc[16, 8:].astype(float)

# Column-wise averages over the monthly rows and over the seasonal rows.
# The seasonal slice 12:16 implies that the monthly rows occupy positions 0..11,
# so the monthly slice has to stop at 12 to include all twelve of them.
# Selecting columns 7: before averaging keeps text columns out of mean(),
# which newer pandas versions refuse to average.
monthly_mean = afgtemp.iloc[0:12, 7:].mean()
seasonal_mean = afgtemp.iloc[12:16, 7:].mean()

plt.rcParams["figure.figsize"] = (12,7)

plt.plot(monthly_mean)
plt.plot(seasonal_mean, linewidth=3)
plt.plot(totafg)
# The year columns run along the x axis; the y axis carries the plotted values.
plt.xlabel('Years')
plt.ylabel('Temperature change')
plt.title('Afganistan')

plt.xticks(afgtemp.columns[7:], rotation=45)
plt.legend(('by month', 'by season', 'yearly'));



The seasonal and yearly curves coincide completely, while the monthly results differ slightly, which can probably be explained with the help of the standard deviations. Now we can move straight to the "World" area to check the global tendencies.

In [None]:
# All rows whose Area is 'World', renumbered from zero so that later cells can
# address them by position.
World = df.loc[df['Area'] == 'World'].reset_index(drop=True)
World

In [None]:
# Bar chart of one row of the World table across the columns from position 7
# onwards (presumably the per-year columns — they are used as tick labels).
year_columns = World.columns[7:]

fig, ax = plt.subplots(figsize=(12, 5))

# x_pos is kept as a notebook-level name: the rolling-statistics cell reuses it
# for its tick positions.
x_pos = np.arange(len(year_columns))

# Draw through the Axes object so that `ax` keeps pointing at the Axes instead of
# being overwritten by the BarContainer that bar() returns.
# NOTE(review): row 32 is plotted and the error bar is a single scalar read from
# row 33, first plotted column — so every bar gets the same error bar. Confirm
# that row 33 is the matching standard-deviation row and that a constant is intended.
ax.bar(x_pos, World.iloc[32, 7:], yerr=World.iloc[33, 7])
ax.set_xticks(x_pos)
ax.set_xticklabels(year_columns, rotation=45)
ax.set_ylabel('Temperature changes of the World')
ax.set_title('Temperature changes over past 60 years');


This time series appears to be increasing and therefore does not look stationary. The next logical step is to check the rolling mean and the rolling standard deviation of the series.

In [None]:
# Rolling mean and standard deviation (12-observation window) of World row 32,
# drawn on top of the original series.
world_row = World.iloc[32, 7:]
twelve_step_window = world_row.rolling(window=12)
rolling_mean = twelve_step_window.mean()
rolling_std = twelve_step_window.std()

curves = (
    (world_row, 'blue', 'Original'),
    (rolling_mean, 'red', 'Rolling Mean'),
    (rolling_std, 'black', 'Rolling Std'),
)
for curve, colour, caption in curves:
    plt.plot(curve, color=colour, label=caption)

# x_pos comes from the World bar-chart cell above.
plt.xticks(x_pos, World.columns[7:], rotation=45)
plt.legend(loc='best')
plt.title('Rolling Mean & Rolling Standard Deviation')
plt.show()

In [None]:

from matplotlib import pyplot
from statsmodels.tsa.seasonal import seasonal_decompose
from matplotlib.pylab import rcParams

# Wider default figure for the four-panel decomposition plot.
rcParams['figure.figsize'] = 15, 6

# Additive decomposition of World row 32.
# The cycle length is passed as `period`; statsmodels replaced the older `freq`
# keyword with it, and current releases no longer accept `freq`.
# TODO: period=2 is a placeholder — the proper period still needs to be chosen.
result = seasonal_decompose(World.iloc[32, 7:], model='additive', period=2)

result.plot()
pyplot.show()

While the rolling mean clearly increases with time, the standard deviation seems to stay unchanged; together this shows that the time series is not stationary. Still, let's perform the augmented Dickey-Fuller test for more evidence.

In [None]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on World row 32; the lag length is selected with
# the Akaike Information Criterion (AIC).
result = adfuller(World.iloc[32, 7:], autolag='AIC')

# Positions 0, 1 and 4 of the result are read as statistic, p-value and the
# mapping of critical values.
adf_statistic, p_value, critical_values = result[0], result[1], result[4]
print('ADF Statistic: {}'.format(adf_statistic))
print('p-value: {}'.format(p_value))
print('Critical Values:')
for level in critical_values:
    print('\t{}: {}'.format(level, critical_values[level]))

With a p-value of 0.99 we fail to reject the null hypothesis that a unit root exists,
which supports the conclusion that the series is not stationary.

And now we can proceed to ARIMA (AutoRegressive Integrated Moving Average) modelling.
AR (Autoregression) is a model that uses the dependency between an observation and some number of lagged observations. I (Integrated) means using the differences between raw observations — the way the temperature-change values themselves could be obtained. MA (Moving Average) uses the dependency between an observation and the residual error from a moving average model applied to lagged observations.
p, d and q are the three parameters that concisely describe the model: p is the number of lagged observations used in the model, d is the number of times the raw observations are differenced, and q is the size of the moving average window.

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# Autocorrelation (upper panel) and partial autocorrelation (lower panel) of
# World row 32, stacked in one figure.
world_row = World.iloc[32, 7:]
fig, ax = plt.subplots(nrows=2, figsize=(12, 6))
plot_acf(world_row, ax=ax[0])
# The trailing semicolon keeps the returned Figure from being echoed as cell output.
plot_pacf(world_row, ax=ax[1]);


The ACF decays slowly, gradually losing its significance. From the PACF let's take p=3, and set q=5.

In [None]:
from pandas.plotting import autocorrelation_plot

# pandas' built-in autocorrelation plot for World row 32.
world_row = World.iloc[32, 7:]
autocorrelation_plot(world_row)

In [None]:
# Turn World row 32 into a one-column frame whose column is called 'tempch'.
world = World.iloc[32, 7:].to_frame(name='tempch')

In [None]:
# 3x2 grid: one row per differencing order (none, first, second); the left
# panel shows the series, the right panel its autocorrelation function.
tempch = world['tempch']
first_diff = tempch.diff()
second_diff = first_diff.diff()

# (panel title, series to draw, input handed to plot_acf)
stages = (
    ('Original Series', tempch, world.values),
    ('1st Order Differencing', first_diff, first_diff.dropna()),
    ('2nd Order Differencing', second_diff, second_diff.dropna()),
)

fig, axes = plt.subplots(3, 2, figsize=(20,10))
for (series_ax, acf_ax), (panel_title, series, acf_input) in zip(axes, stages):
    series_ax.plot(series)
    series_ax.set_title(panel_title)
    plot_acf(acf_input, ax=acf_ax)

plt.show()

In [None]:
def report_adf(series, question):
    """Run the augmented Dickey-Fuller test on ``series`` and print the outcome.

    Parameters
    ----------
    series : array-like
        Observations handed straight to ``adfuller`` (lag chosen with
        ``autolag='AIC'``).
    question : str
        Heading printed above the results.

    Returns
    -------
    tuple
        The unmodified ``adfuller`` result, so callers can inspect it further.
    """
    print("\n > {}".format(question))
    adf_result = adfuller(series, autolag='AIC')
    statistic, p_value, critical_values = adf_result[0], adf_result[1], adf_result[4]

    print("Test statistic = {:.3f}".format(statistic))
    print("P-value = {:.3f}".format(p_value))
    print("Critical values :")
    for level, threshold in critical_values.items():
        # The series is reported as stationary at a level only when the test
        # statistic is not above that level's critical value.
        verdict = "not stationary" if threshold < statistic else "stationary"
        # `level` ends with a percent sign (its last character is stripped before
        # the int conversion); 100 minus that number is shown as the confidence.
        print("\t{}: {} - The data is {} with {}% confidence".format(
            level, threshold, verdict, 100 - int(level[:-1])))
    return adf_result


dftest = report_adf(world.diff().dropna(),
                    "Is the de-trended data stationary ?")
dftest = report_adf(world.diff().diff().dropna(),
                    "Is the de-trended data stationary after the second differencing ?")
    

This illustrates that a single differencing is enough to make the series stationary. Let's test our model.

In [None]:
from statsmodels.tsa.arima.model import ARIMA
# Numeric version of World row 32; anything that cannot be parsed becomes NaN.
# The series is built directly from `World`, so the one-column `world` frame
# (with its 'tempch' column) from the earlier cells is left untouched.
final = pd.to_numeric(World.iloc[32, 7:], errors='coerce')

# Every candidate is fitted on the undifferenced series: the middle term of
# `order` already tells ARIMA how many times to difference, so differencing the
# input beforehand as well would fit a d=2 model under a d=1 label.
model1 = ARIMA(final, order=(3,1,5))
model_fit1 = model1.fit()
print(model_fit1.summary())

model2 = ARIMA(final, order=(2,1,5))
model_fit2 = model2.fit()
print(model_fit2.summary())

model3 = ARIMA(final, order=(2,0,5))
model_fit3 = model3.fit()
print(model_fit3.summary())

# Residuals of the three fits side by side on a shared y axis.
# (panel title, fitted model, colour of the zero line)
candidates = (
    ("ARIMA(3,1,5)", model_fit1, 'b'),
    ("ARIMA(2,1,5)", model_fit2, 'r'),
    ("ARIMA(2,0,5)", model_fit3, 'g'),
)
fig, ax = plt.subplots(1,3,sharey=True, figsize=(12, 6))
for panel, (panel_title, fitted, zero_colour) in zip(ax, candidates):
    resid = fitted.resid.values
    # The legend reports the standard deviation of the residuals (np.std).
    panel.plot(resid, alpha=1, label='std={:.3f}'.format(np.std(resid)))
    # The zero line spans exactly the plotted residuals, whatever their number.
    panel.hlines(0, xmin=0, xmax=len(resid) - 1, color=zero_colour)
    panel.set_title(panel_title)
    panel.legend()

In [None]:
from math import sqrt
from sklearn.metrics import mean_squared_error

# Chronological hold-out: the first 66% of the observations are used for
# training, the remainder is predicted one step at a time.
X = final.values
size = int(len(X) * 0.66)
train, test = X[:size], X[size:]

# Walk-forward validation: refit on everything observed so far, forecast one
# step ahead, then reveal the true value and append it to the history.
history = list(train)
predictions = []
for obs in test:
    model_fit = ARIMA(history, order=(2,1,5)).fit()
    yhat = model_fit.forecast()[0]
    predictions.append(yhat)
    history.append(obs)

# Root-mean-squared error of the one-step forecasts over the hold-out part.
rmse = sqrt(mean_squared_error(test, predictions))
print('Test RMSE: %.3f' % rmse)

# Forecasts against the actual outcomes.
plt.plot(test, 'black', label='test')
plt.plot(predictions, color='red', label='predictions')
plt.legend()
plt.show()



In [None]:
# Residuals of the second fitted model (model_fit2): a line plot on top and a
# kernel density estimate underneath.
residuals = pd.DataFrame(model_fit2.resid)
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(7,11))
panels = (('line', "Residuals"), ('kde', 'Density'))
for panel_ax, (plot_kind, panel_title) in zip(ax, panels):
    residuals.plot(kind=plot_kind, title=panel_title, ax=panel_ax)
plt.show()

Our ARIMA model seems to fit and explain the increasing change in the temperature of the world well enough.