In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Any results you write to the current directory are saved as output.

# Packages/Libraries to load 

In [None]:
import matplotlib.pyplot as plt
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# Define the task
The task is to predict the monthly Population. 

The data ranges from January 1952 to December 2019, with **816 observations**.

In [None]:
# Load the population series from the Kaggle input folder into a DataFrame.
POP_CSV_PATH = '/kaggle/input/population-time-series-data/POP.csv'
df_POP = pd.read_csv(POP_CSV_PATH, delimiter=',')
# Record the source file name on the frame object.
df_POP.dataframeName = 'POP.csv'
# Print (rows, columns) as a quick sanity check on the load.
print(df_POP.shape)

In [None]:
# Show the first five rows of the frame.
df_POP.iloc[:5]

In [None]:
# Show the last five rows of the frame.
df_POP.iloc[-5:]

## Plot the time series

In [None]:
# Line plot of the 'value' column against the frame's row index.
fig, ax = plt.subplots()
ax.plot(df_POP['value'])
ax.set_title('Monthly Population TS')
plt.show()

Converting Date to pandas DateTime for easier plotting of the Population TS data.

In [None]:
# Parse the 'date' column into pandas datetimes and store the result back on the frame.
parsed_dates = pd.to_datetime(df_POP['date'])
df_POP['date'] = parsed_dates

In [None]:
# Same series as before, now drawn against the parsed 'date' column on the x-axis.
fig, ax = plt.subplots()
ax.plot(df_POP['date'], df_POP['value'])
ax.set_title('Monthly Population TS')
plt.show()

# Long Short-Term Memory Network for Regression

Task: Given the Population number this month, what is the Population next month?

A function is used to convert the single column TS into a two-column dataset: 
* the first column containing this month’s (t) population count
* the second column containing next month’s (t+1) population count, to be predicted.

> This notebook follows the Deep Learning for Time Series tutorial from Jason Brownlee (July 21, 2016). Link: https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/


In [None]:
# Fix NumPy's global random seed so NumPy-driven randomness repeats across runs.
# NOTE(review): only NumPy's generator is seeded here — confirm whether the
# Keras backend in use draws its initial weights from it.
SEED = 5
np.random.seed(SEED)

In [None]:
# Copy the 'value' column into a NumPy array and shape it as a single column,
# i.e. (n_rows, 1).
dataset = np.array(df_POP['value']).reshape(-1, 1)

## Step 1 :  normalize the dataset

In [None]:
# Rescale the series into the [0, 1] range; the fitted scaler is kept so that
# values can be mapped back to the original units later.
# NOTE(review): the scaler is fitted on the full series, so the min/max it learns
# also cover the rows that are later held out for testing.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(dataset)
dataset = scaler.transform(dataset)

## Step 2 : Split the dataset into train and test sets

In [None]:
# Chronological split: the first 67% of rows form the training set and the
# remaining rows form the test set (no shuffling).
n_observations = len(dataset)
train_size = int(n_observations * 0.67)
test_size = n_observations - train_size
train = dataset[:train_size, :]
test = dataset[train_size:, :]
print(len(train), len(test))

## Step 3 : reshape the train and test sets to 2 columns with t & t+1


In [None]:
# turn a single-column series into (input window, next value) training pairs
def create_dataset(dataset, look_back=1):
    """Build supervised-learning pairs from column 0 of a 2-D array.

    For each start index ``i`` the input is ``dataset[i:i + look_back, 0]`` and
    the target is ``dataset[i + look_back, 0]``.

    Parameters:
        dataset: 2-D array; only column 0 is read.
        look_back: number of consecutive rows in each input window.

    Returns:
        A tuple ``(X, Y)`` of NumPy arrays with ``len(dataset) - look_back - 1``
        samples: ``X`` holds the windows (one per row), ``Y`` the matching targets.
        Both are empty when that count is zero or negative.

    Note:
        The sample count stops one short of the end, so the final row of
        ``dataset`` is never used as a target.
    """
    sample_count = len(dataset) - look_back - 1
    starts = range(sample_count)
    windows = [dataset[start:start + look_back, 0] for start in starts]
    targets = [dataset[start + look_back, 0] for start in starts]
    return np.array(windows), np.array(targets)

In [None]:
# Build the supervised pairs for each split: with a one-step window,
# X is the value at t and Y is the value at t+1.
look_back = 1
trainX, trainY = create_dataset(dataset=train, look_back=look_back)
testX, testY = create_dataset(dataset=test, look_back=look_back)

In [None]:
# Lay the inputs out as [samples, time steps, features]: a single time step
# whose feature width is the existing second dimension.
n_train_samples, train_width = trainX.shape[0], trainX.shape[1]
trainX = trainX.reshape((n_train_samples, 1, train_width))
n_test_samples, test_width = testX.shape[0], testX.shape[1]
testX = testX.reshape((n_test_samples, 1, test_width))

## Step 4 : create and fit the Long Short-Term Memory network

In [None]:
# Network: an LSTM layer with 4 units feeding a single-output Dense layer.
# Inputs have one time step and `look_back` features.
model = Sequential([
    LSTM(4, input_shape=(1, look_back)),
    Dense(1),
])
# Mean squared error loss, optimised with Adam.
model.compile(optimizer='adam', loss='mean_squared_error')
# Train for 100 epochs with one sample per batch; verbose=2 logs one line per epoch.
model.fit(trainX, trainY, batch_size=1, epochs=100, verbose=2)

## Step 5 : Predictions & RMSE calculation

In [None]:
# make predictions (still in the scaler's normalised units)
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)
# invert predictions back to the original units
trainPredict = scaler.inverse_transform(trainPredict)
testPredict = scaler.inverse_transform(testPredict)
# Invert the targets into NEW names instead of overwriting trainY/testY.
# Overwriting them made this cell fail when run a second time and would feed
# unscaled targets to model.fit if the training cell were re-run afterwards.
# The targets are passed as (n_samples, 1) columns, the layout the scaler was
# fitted on, and flattened back to 1-D for scoring.
trainY_orig = scaler.inverse_transform(trainY.reshape(-1, 1))[:, 0]
testY_orig = scaler.inverse_transform(testY.reshape(-1, 1))[:, 0]
# calculate root mean squared error in the original units
trainScore = math.sqrt(mean_squared_error(trainY_orig, trainPredict[:, 0]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = math.sqrt(mean_squared_error(testY_orig, testPredict[:, 0]))
print('Test Score: %.2f RMSE' % (testScore))

## Step 6 : Plot of the population predictions

Original dataset in blue

Predictions for the training dataset in green

Predictions on the unseen test dataset in red


In [None]:
# Place the train predictions on a NaN-filled canvas the same shape as the
# dataset, offset by the look-back window so they line up with their targets.
trainPredictPlot = np.full_like(dataset, np.nan)
train_start = look_back
train_stop = len(trainPredict) + look_back
trainPredictPlot[train_start:train_stop, :] = trainPredict
# Same for the test predictions, positioned after the training span.
testPredictPlot = np.full_like(dataset, np.nan)
test_start = len(trainPredict) + (look_back * 2) + 1
test_stop = len(dataset) - 1
testPredictPlot[test_start:test_stop, :] = testPredict
# Draw the actual series (blue), train predictions (green) and test
# predictions (red) against the date axis.
actual_values = scaler.inverse_transform(dataset)
for series, colour in (
    (actual_values, 'blue'),
    (trainPredictPlot, 'green'),
    (testPredictPlot, 'red'),
):
    plt.plot(df_POP['date'], series, color=colour)
plt.show()

In [None]:
# Zoom in on rows 500-699 of the same three series
# (described in the original note as between 1994 & 2010).
zoom = slice(500, 700)
zoom_dates = df_POP['date'][zoom]
plt.plot(zoom_dates, scaler.inverse_transform(dataset)[zoom], color='blue')
plt.plot(zoom_dates, trainPredictPlot[zoom], color='green')
plt.plot(zoom_dates, testPredictPlot[zoom], color='red')
plt.show()