# Time Series LSTM - FORECAST NEW DATA

In [None]:
#import libraries
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

from keras.models import Sequential, load_model
from keras.layers import Activation, Dense, LSTM, Dropout, GlobalMaxPooling1D
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping

from keras.preprocessing.sequence import TimeseriesGenerator



Function to access a specific set of information, which will be used in the predictive model

In [None]:
def GlobalTemperature_values(chart_time,inicial_date = '1995-01-01',final_date = '2019-12-31',Region=None,Country=None,City=None):
    """
    Return the mean of AvgTemperature (in Celsius) for a location and period.

    Parameters
    ----------
    chart_time : str or None
        Column used to group the averages. Accepted values are 'Day',
        'Month', 'Year' and 'Date'. None behaves like 'Year'.
    inicial_date, final_date : str
        Inclusive bounds of the period, in Year-Month-Day format. Both must
        lie between 1995-01-01 and 2019-12-31. Defaults: '1995-01-01' and
        '2019-12-31'.
    Region, Country, City : str or None
        At most one of them may be given. If no location is specified, the
        data of every location in the file is used.

    Returns
    -------
    pandas.Series or None
        The mean AvgTemperature indexed by the chosen grouping column.
        When an argument is invalid, a message is printed and None is
        returned.
    """
    csv_path = '/kaggle/input/daily-temperature-of-major-cities/city_temperature.csv'
    min_date = pd.Timestamp('1995-01-01')
    max_date = pd.Timestamp('2019-12-31')

    # --- Validate the cheap arguments before loading the CSV ---------------

    # chart period choice
    group_columns = {'Day': 'Day', 'Month': 'Month', 'Year': 'Year', 'Date': 'Date', None: 'Year'}
    try:
        group_column = group_columns[chart_time]
    except (KeyError, TypeError):  # TypeError: unhashable chart_time
        print('Please type one of the following: Day,Month,Year,Date or None ')
        return None

    # Location choice: at most one kind of location may be selected.
    candidates = (('Region', Region), ('Country', Country), ('City', City))
    selected = [(column, value) for column, value in candidates if value is not None]
    if len(selected) > 1:
        print('Please select just one of types of location: Region, Country, City or let None in all for World data')
        return None

    # Date choice: compare real timestamps. Comparing the raw strings is
    # lexicographic and wrongly rejects valid dates such as '2019-9-1'.
    try:
        start = pd.Timestamp(inicial_date)
        end = pd.Timestamp(final_date)
        start_in_range = min_date <= start <= max_date
        end_in_range = min_date <= end <= max_date
    except (ValueError, TypeError):
        print('Please provide inicial_date and final_date in the format Year-Month-Day.')
        return None
    if not start_in_range:
        print('Please choose a initial_date greater than 1995-01-01 and lesser than 2019-12-31.')
        return None
    if not end_in_range:
        print('Please choose a final_date greater than 1995-01-01 and lesser than 2019-12-31.')
        return None

    # --- Load and clean the data -------------------------------------------
    data = pd.read_csv(csv_path, low_memory=False)
    data = data.drop(columns='State')
    data['AvgTemperature'] = (data['AvgTemperature'] - 32) * (5 / 9)  # Fahrenheit -> Celsius
    # Drop outliers below -50 C; "~(x < -50)" keeps missing values, exactly
    # as dropping only the matching rows would.
    data = data[~(data['AvgTemperature'] < -50)]
    data = data[data['Year'] != 2020]  # incomplete year
    data = data.assign(Date=pd.to_datetime(data[['Month', 'Day', 'Year']], errors='coerce'))

    # --- Apply the location filter -----------------------------------------
    if selected:
        column, value = selected[0]
        matches = data[column] == value
        if not matches.any():
            print('Please check the list of locations and the spelling accepted using the funtion: GlobalTemperature_dataInfo ')
            return None
        data = data[matches]

    # --- Apply the date filter and aggregate -------------------------------
    data = data[(data['Date'] >= start) & (data['Date'] <= end)]
    return data.groupby(group_column)['AvgTemperature'].mean()

In [None]:
# Daily mean temperature for Sao Paulo over the full 1995-2019 range.
data = GlobalTemperature_values(
    chart_time='Date',
    inicial_date='1995-01-01',
    final_date='2019-12-31',
    Region=None,
    Country=None,
    City='Sao Paulo',
)

In [None]:
# Display the series returned by GlobalTemperature_values.
data

# Forecast Time Series


After obtaining the data you want to forecast, it is necessary to split it into training and test sets.

Test = a percentage of TOTAL_LENGTH (usually ~30%)

Train = TOTAL_LENGTH - Test

In [None]:
# shuffle=False keeps the original order, so the last 30% of the series
# becomes the test set.
train, test = train_test_split(
    data.values,
    shuffle=False,
    test_size=0.3,
)

In [None]:
# Total number of observations in the series.
len(data.values)

In [None]:
# Number of observations assigned to the training set.
len(train)

Scaling the values.
The scaler expects a 2-dimensional array, so the values are first reshaped into a single column.

In [None]:
# Fit the scaler on the training set only, then apply that same scaling to
# both sets (the test set is never used to fit the scaler).
scaler1 = MinMaxScaler()
train_column = train.reshape(-1, 1)
test_column = test.reshape(-1, 1)
scaler1.fit(train_column)
train = scaler1.transform(train_column)
test = scaler1.transform(test_column)

In [None]:
# Display the scaled training values.
train

For data visualization purposes, we will keep the dates in separate variables.

In [None]:
# Take the split point from the actual training set instead of recomputing
# it with float arithmetic: int((1 - 0.3) * n) is not guaranteed to round
# the same way train_test_split does, and an off-by-one here would misalign
# the dates with the train/test arrays.
split = len(train)

date_train = data.index[:split]
date_test = data.index[split:]

In [None]:
# Display the dates that belong to the training set.
date_train

The most difficult part of working with time series is splitting the data into batches with the proper shape.

Keras provides an API that helps us do this.

The parameter `length` is how many previous data points we want to use to make each prediction.

In [None]:
# Each sample is a window of `look_back` consecutive values; the series is
# passed as both data and targets.
look_back = 20
train_gen = TimeseriesGenerator(
    data=train,
    targets=train,
    length=look_back,
    batch_size=20,
)
test_gen = TimeseriesGenerator(
    data=test,
    targets=test,
    length=look_back,
    batch_size=1,
)

In [None]:
# Display the training generator object.
train_gen

Model structure

In [None]:
# Two stacked LSTM layers that both return full sequences, reduced over the
# time axis by a global max pooling layer, followed by dropout and a single
# output unit. Only the first layer declares input_shape: a Sequential model
# infers the input of every later layer from the previous layer's output, so
# repeating input_shape on the second LSTM had no effect.
model = Sequential([
    LSTM(500, activation='relu', return_sequences=True, input_shape=(look_back, 1)),
    LSTM(200, activation='relu', return_sequences=True),
    GlobalMaxPooling1D(),
    Dropout(0.25),
    Dense(1),
])
model.compile(loss='mae', optimizer='adam')
model.summary()

Training the model

If it is taking too long, use fewer epochs or change the look back value

In [None]:
# Model.fit accepts generators / Sequence objects directly; fit_generator is
# deprecated and has been removed from recent Keras releases.
model.fit(train_gen, epochs=20, verbose=1)
# NOTE(review): the '.pb' suffix is kept so the output path does not change,
# but recent Keras releases only accept '.keras' or '.h5' here — confirm the
# target Keras version before relying on this file.
model.save('model.pb')

In [None]:
# Bring both sets back to degrees Celsius for plotting.
# NOTE: this rebinds `train` and `test`, so running the cell a second time
# would apply the inverse transform twice.
train = scaler1.inverse_transform(train)
test = scaler1.inverse_transform(test)

Performing predictions with the test data


In [None]:
# Model.predict accepts the generator directly; predict_generator is
# deprecated and has been removed from recent Keras releases.
# The model outputs scaled values, so they are mapped back to Celsius.
scaled_pred = model.predict(test_gen)
pred = scaler1.inverse_transform(scaled_pred)

Plotting the whole data

In [None]:
# Plot the complete series; the axis label is taken from the index name.
axis_name = data.index.name
plt.figure(figsize=(20, 8))
plt.plot(data.index, data.values, label="Data")
plt.title(f'Avg Temperature in C° per {axis_name}')
plt.xlabel(f'{axis_name}', fontsize=15)
plt.ylabel('Avg Temperature in C°', fontsize=15)
plt.legend()
plt.show()

Plotting the train and the predicted test data

In [None]:
plt.figure(figsize=(20,8))
plt.plot(date_train,train, label = "Train data")
# Each prediction targets the step right after its look_back-long input
# window, so the first prediction belongs to date_test[look_back], not to
# date_test[0]. Plotting against date_test[:-look_back] would shift the
# curve look_back days into the past.
plt.plot(date_test[look_back:],pred, label = "Prediction based in the Test data")
plt.title('Avg Temperature in C° per {}'.format(data.index.name))
plt.xlabel('{}'.format(data.index.name),fontsize=15)
plt.ylabel('Avg Temperature in C°',fontsize=15)
plt.legend()
plt.show()

Predicted test data and the original test data

In [None]:
plt.figure(figsize=(20,8))
# The first look_back test values are only ever used as model input; the
# predictions start at test[look_back]. Both curves are therefore drawn
# against date_test[look_back:], and the test values are sliced the same
# way, so that each prediction sits on top of the value it tries to predict.
plt.plot(date_test[look_back:],pred, label = "Prediction based in the Test data")
plt.plot(date_test[look_back:],test.reshape(-1)[look_back:],label = "Test Data")
plt.title('Avg Temperature in C° per {}'.format(data.index.name))
plt.xlabel('{}'.format(data.index.name),fontsize=15)
plt.ylabel('Avg Temperature in C°',fontsize=15)
plt.legend()
plt.show()

Metrics of the model

In [None]:
  from sklearn import metrics
  # Observed values that line up with the predictions: the test part of the
  # series, minus the first look_back points that only serve as model input.
  actual = data.values[split+look_back:]
  mse = metrics.mean_squared_error(actual, pred)
  print('MAE:', metrics.mean_absolute_error(actual, pred))
  print('MSE:', mse)
  print('RMSE:', np.sqrt(mse))

After you train and test your model with the data that you already had, you want to predict future data, which is, I think, the truly interesting thing about recurrent networks.

To do this, you start by predicting the value for one day after the final date in your original dataset, using the model (which was trained with this past data). Once you have predicted this value, you do the same thing again, but now including the last predicted values in the input, and so on.

The fact that you are using predictions to make other predictions implies that it is much more difficult to get good results, so it is common to try to predict only short ranges of time.

In [None]:
def predict(forecast_num, model,data,look_back):
    """
    Forecast `forecast_num` future steps by feeding predictions back as input.

    Parameters
    ----------
    forecast_num : int
        Number of future steps to predict.
    model : object
        Anything exposing `predict(x)` for an input of shape
        (1, look_back, 1) whose result is indexable as `[0][0]`.
    data : array-like
        Observed values. They must be on the same scale as the data the
        model was trained with. Only the last `look_back` values are used.
    look_back : int
        Length of the input window given to the model at each step.

    Returns
    -------
    numpy.ndarray
        1-D array of length forecast_num + 1: the last observed value
        followed by the forecasts.

    Raises
    ------
    ValueError
        If `look_back` is smaller than 1 or `data` has fewer than
        `look_back` values.
    """
    history = np.asarray(data, dtype=float).reshape(-1)
    if look_back < 1:
        raise ValueError('look_back must be at least 1')
    if history.size < look_back:
        raise ValueError(
            'data must contain at least look_back ({}) values, got {}'.format(look_back, history.size)
        )

    # Pre-allocate the whole series instead of growing it with np.append on
    # every step.
    series = np.empty(look_back + forecast_num, dtype=float)
    series[:look_back] = history[-look_back:]

    for step in range(forecast_num):
        # The window always ends at the most recent (observed or predicted) value.
        window = series[step:step + look_back].reshape((1, look_back, 1))
        series[look_back + step] = model.predict(window)[0][0]

    # Keep the last observed value so the forecast connects to the history.
    return series[look_back - 1:]

def predict_dates(forecast_num, last_date=None):
    """
    Return the dates that go with a forecast of `forecast_num` steps.

    Parameters
    ----------
    forecast_num : int
        Number of forecast steps.
    last_date : date-like, optional
        Date of the last observed value. When omitted, the last entry of
        the index of the global `data` is used.

    Returns
    -------
    list
        forecast_num + 1 consecutive daily dates, starting at `last_date`
        itself.
    """
    if last_date is None:
        last_date = data.index[-1]
    return pd.date_range(last_date, periods=forecast_num + 1).tolist()

In [None]:
forecast_num = 2 #number of day to predict after the last date in data
# The model was trained on MinMax-scaled values, so it must also be fed
# scaled values, and its output must be mapped back to Celsius. Passing the
# raw temperatures puts the input far outside the range seen in training.
scaled_history = scaler1.transform(data.values.reshape(-1, 1)).reshape(-1)
scaled_forecast = predict(forecast_num, model=model, data=scaled_history, look_back=look_back)
forecast = scaler1.inverse_transform(np.asarray(scaled_forecast).reshape(-1, 1)).reshape(-1)
forecast_date = predict_dates(forecast_num)

In [None]:
  # Show the forecast values next to the dates they refer to.
  print(f'forecast {forecast}')
  print(f'forecast dates {forecast_date}')

Let's see the data in 2020, that was not used in the train or test for the model

In [None]:
# Load the raw file again, this time keeping 2020, to look up the observed
# values for the forecast days. A separate name is used on purpose:
# rebinding `data` here would replace the Sao Paulo series that the forecast
# cell and predict_dates read, so re-running those cells afterwards would
# silently work on the wrong object.
raw_2020 = pd.read_csv('/kaggle/input/daily-temperature-of-major-cities/city_temperature.csv',low_memory=False) #import data
raw_2020 = raw_2020.drop(columns='State') #drop column state
raw_2020['AvgTemperature'] = (raw_2020['AvgTemperature']-32)*(5/9) #transforming in Celsius
raw_2020 = raw_2020[~(raw_2020['AvgTemperature'] < -50)].copy() #removing outliers
raw_2020['Date'] = pd.to_datetime(raw_2020[['Month','Day','Year']],errors='coerce') #date format
is_forecast_day = raw_2020['Date'].isin(pd.to_datetime(['2020-01-01', '2020-01-02']))
d = raw_2020[is_forecast_day & (raw_2020['City'] == 'Sao Paulo')]

In [None]:
# Display the Sao Paulo rows for 2020-01-01 and 2020-01-02.
d

The model predicts that the average temperature would be 48 ° C and the correct answer is 26 ° C. We need to improve the model by using more layers and testing other parameters


This kernel was a simple example to show only the basic steps, I hope this can help someone.