**IMPORT THE BASIC LIBRARIES YOU THINK YOU WILL USE**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error

## Data

Info about this data set: https://fred.stlouisfed.org/series/IPN31152N


Units:  Index 2012=100, Not Seasonally Adjusted

Frequency:  Monthly

The industrial production (IP) index measures the real output of all relevant establishments located in the United States, regardless of their ownership, but not those located in U.S. territories.

NAICS = 31152

Source Code: IP.N31152.N

Suggested Citation:
Board of Governors of the Federal Reserve System (US), Industrial Production: Nondurable Goods: Ice cream and frozen dessert [IPN31152N], retrieved from FRED, Federal Reserve Bank of St. Louis; https://fred.stlouisfed.org/series/IPN31152N, November 16, 2019.

# Project Tasks

**Read in the data set "Frozen_Dessert_Production.csv" from the Data folder. Figure out how to set the date as a datetime index column.**

In [None]:
path = "../input/frozen-dessert-production/Frozen_Dessert_Production.csv"

In [None]:
df = pd.read_csv(path, parse_dates = ["DATE"])

In [None]:
df.head(5)

In [None]:
df.info()

**Change the column name to Production**

In [None]:
df.columns

In [None]:
df = df.set_index('DATE')

In [None]:
df.columns = ['Production']

In [None]:
df.head(5)

**Plot out the time series**

In [None]:
# Create the figure/axes explicitly and hand the axes to pandas.
fig, ax = plt.subplots(figsize=(12, 6))
df.plot(ax=ax)
plt.show()

## Train Test Split

**Figure out the length of the data set**

In [None]:
# Number of rows (observations) in the series.
df.shape[0]

**Split the data into a train/test split where the test set is the last 24 months of data.**

In [None]:
test_size = 24

In [None]:
test_index  = len(df)-test_size
test_index

In [None]:
train = df.iloc[:test_index]
test = df.iloc[test_index:]

In [None]:
len(train)

In [None]:
len(test)

In [None]:
train.head(5)

In [None]:
test.head(5)

## Scale Data

**Use a MinMaxScaler to scale the train and test sets into scaled versions.**

In [None]:
scaler = MinMaxScaler()

In [None]:
# IGNORE WARNING ITS JUST CONVERTING TO FLOATS
# WE ONLY FIT TO TRAININ DATA, OTHERWISE WE ARE CHEATING ASSUMING INFO ABOUT TEST SET
scaler.fit(train)

In [None]:
scaled_train = scaler.fit_transform(train)
scaled_test = scaler.transform(test)

# Time Series Generator

**Create a TimeseriesGenerator object based off the scaled_train data. The batch length is up to you, but it should be at least 18 to capture a full year of seasonality.**

In [None]:
# Each training sample is a window of `length` consecutive time steps.
# 18 is longer than one yearly cycle and shorter than the 24-row test set.
length = 18
batch_size = 1
generator = TimeseriesGenerator(
    data=scaled_train,
    targets=scaled_train,
    length=length,
    batch_size=batch_size,
)

**Create a generator for the scaled test/validation set. NOTE: Double check that your batch length makes sense for the size of the test set**

In [None]:
# Same windowing as the training generator, applied to the scaled test data.
validation_generator = TimeseriesGenerator(
    data=scaled_test,
    targets=scaled_test,
    length=length,
    batch_size=batch_size,
)

### Create the Model

**Create a Keras Sequential Model with as many LSTM units you want and a final Dense Layer.**

In [None]:
n_features = 1

In [None]:
# define model
model = Sequential()

# Simple RNN layer
#model.add(SimpleRNN(50,input_shape=(length, n_features))) #50 neurons

# LSTM 
model.add(LSTM(100, activation = 'relu', input_shape = (length, n_features)))

# Final Prediction
model.add(Dense(1))

model.compile(optimizer = 'adam', loss = 'mse')

# Model Summary
model.summary()

**Create an EarlyStopping callback based on val_loss.**

In [None]:
# Watch validation loss and allow 3 epochs of patience before stopping.
early_stop = EarlyStopping(monitor="val_loss", patience=3)

**Fit the model to the generator, let the EarlyStopping dictate the amount of epochs, so feel free to set the parameter high.**

In [None]:
# Model.fit accepts generator / Sequence inputs directly; fit_generator is
# deprecated in TF 2.x and no longer available in newer Keras releases.
# epochs is an upper bound: early_stop ends training once val_loss stalls.
model.fit(generator,
          epochs = 25,
          validation_data = validation_generator,
          callbacks = [early_stop])

**Plot the history of the loss that occurred during training.**

In [None]:
losses = pd.DataFrame(model.history.history)
losses.plot()
plt.show()

In [None]:
losses

## Evaluate on Test Data

**Forecast predictions for your test data range (the last 24 months of the entire dataset). Remember to inverse your scaling transformations. Your final result should be a DataFrame with two columns, the true test values and the predictions.**

In [None]:
test_predictions = []

first_eval_batch = scaled_train[-length:]
current_batch = first_eval_batch.reshape((1, length, n_features))

for i in range(len(test)):
    
    # get prediction 1 time stamp ahead ([0] is for grabbing just the number instead of [array])
    current_pred = model.predict(current_batch)[0]
    
    # store prediction
    test_predictions.append(current_pred) 
    
    # update batch to now include prediction and drop first value
    current_batch = np.append(current_batch[:,1:,:], [[current_pred]], axis = 1)

In [None]:
true_predictions = scaler.inverse_transform(test_predictions)

In [None]:
test['Predictions'] = true_predictions

In [None]:
test.head()

**Plot your predictions versus the True test values. (Your plot may look different than ours).**

In [None]:
# Plot every column of `test` on one explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 4))
test.plot(ax=ax)
plt.show()

**Calculate your RMSE.**

In [None]:
# Root of the mean squared error between observed and predicted values.
mse = mean_squared_error(test["Production"], test["Predictions"])
np.sqrt(mse)

**Retrain & Forecasting**

In [None]:
full_scaler = MinMaxScaler()
scaled_full_data = full_scaler.fit_transform(df)

In [None]:
length = 12 # Length of the output sequences (in number of timesteps)
generator = TimeseriesGenerator(scaled_full_data, scaled_full_data, length = length, batch_size = 1)

In [None]:
# Fresh network with the same layout, sized for the current window `length`.
model = Sequential()
model.add(LSTM(100, activation = 'relu', input_shape = (length, n_features)))
model.add(Dense(1))
model.compile(optimizer = 'adam', loss = 'mse')


# Train on the full-data generator. No validation data is passed, so the
# epoch count is fixed rather than chosen by early stopping.
# Model.fit accepts generator / Sequence inputs directly; fit_generator is
# deprecated in TF 2.x and no longer available in newer Keras releases.
model.fit(generator, epochs = 8)

In [None]:
forecast = []
# Replace periods with whatever forecast length you want
periods = 12

first_eval_batch = scaled_full_data[-length:]
current_batch = first_eval_batch.reshape((1, length, n_features))

for i in range(periods):
    
    # get prediction 1 time stamp ahead ([0] is for grabbing just the number instead of [array])
    current_pred = model.predict(current_batch)[0]
    
    # store prediction
    forecast.append(current_pred) 
    
    # update batch to now include prediction and drop first value
    current_batch = np.append(current_batch[:,1:,:], [[current_pred]], axis = 1)

In [None]:
forecast = scaler.inverse_transform(forecast)

**Creating new timestamp index with pandas**

In [None]:
df

In [None]:
forecast_index = pd.date_range(start = '2019-10-01', periods = periods, freq = 'MS') #freq = 'MS' --> pandas frequency stings
# https://stackoverflow.com/questions/35339139/what-values-are-valid-in-pandas-freq-tags

In [None]:
forecast_index

In [None]:
forecast_df = pd.DataFrame(data = forecast,
                           index = forecast_index,
                           columns = ['Forecast'])

In [None]:
forecast_df

In [None]:
# Two panels side by side: observed series on the left, forecast on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))
ax_history, ax_forecast = axes

ax_history.plot(df, 'b')
ax_history.set_xlabel('Date')
ax_history.set_title('Production')

ax_forecast.plot(forecast_df, 'r')
ax_forecast.set_xlabel('Date')
ax_forecast.set_title('Forecast')

plt.show()

**Joining Pandas Plot**

In [None]:
# Draw the history first, then overlay the forecast on the same axes.
ax = df.plot(figsize=(15, 8))
forecast_df.plot(ax=ax)
plt.show()

In [None]:
# Overlay the forecast on the history, then zoom in on the most recent years.
ax = df.plot()
forecast_df.plot(figsize = (10, 6), ax = ax)
# Plain date strings with no trailing whitespace inside the literals.
plt.xlim('2018-01-01', '2020-12-01')
plt.show()