In [3]:
#Imports and API Key

import os

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import quandl
import scipy
from scipy import stats
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

%aimport dataclean

# Read the Quandl API key from the QUANDL_API_KEY environment variable so the
# secret is never stored in the notebook. A missing variable raises KeyError.
quandl.ApiConfig.api_key = os.environ['QUANDL_API_KEY']

Using TensorFlow backend.


In [4]:
# Pull every Quandl series listed in the staging CSV into a single DataFrame.

data = pd.read_csv('data_for_pull.csv')  # staging table: one row per series to pull
cols = list(data['Var_name'].astype('str'))  # column labels, in staging-file order
dataset = quandl.get(list(data['Quandl Key']))  # pass the full list of Quandl codes in one call
dataset.columns = cols  # replace the returned column labels with the staged variable names

In [5]:
# Load the Fed yield-curve file and key it by 'YYYY-MM' strings.

yields = pd.read_csv('Fed10Y_3M.csv')
# Parse the Date column, then reduce every timestamp to its year-month label.
yields['Date'] = pd.to_datetime(yields['Date']).apply(lambda ts: ts.strftime('%Y-%m'))
yields = (
    yields
    .set_index('Date')
    # Discard the columns that are not carried forward into the dataset.
    .drop(columns=['3 Month Treasury Yield', 'Rec_prob', 'NBER_Rec', 'Unnamed: 7'])
)

In [6]:
## back to working on the general data
# Relabel the datetime index as 'YYYY-MM' strings so rows from the same month share a key.
dataset.index = dataset.index.strftime('%Y-%m')
# Collapse to one row per month. Calling .sum() directly (instead of passing the
# Python builtin `sum` to .agg) avoids pandas' deprecated builtin-to-method
# aliasing while keeping the same group-wise sum.
# Note: with pandas' default min_count=0, a month with no observations for a
# series sums to 0 rather than NaN.
dataset = dataset.groupby(dataset.index).sum()

In [7]:
# Convert the quarterly series (GDP, then consumer sentiment) into monthly data
# with the project helper dataclean.convert_q_to_m, one column at a time.

for quarterly_col in ['GDP', 'CONS_SENT']:
    dataset = dataclean.convert_q_to_m(dataset, quarterly_col)

In [8]:
# Binary target: 1 when GDP is lower than it was three rows earlier, otherwise 0.
# The first three rows have no earlier value to compare against and come out as 0.
dataset['Recession'] = dataset['GDP'].diff(3).lt(0).astype(int)

In [9]:
# Merge the Fed yield-curve columns onto the dataset by index label.
# The outer join keeps index labels that appear in only one of the two frames.
dataset = dataset.join(yields, how='outer')

In [10]:
# Trim by position: discard the first 552 rows and the last 59 rows.
# Per the original note this removes the periods with mostly missing data
# (before 1959 and the most recent months; CPI/PPI are missing from 2016 onward).
# NOTE(review): both offsets are hard-coded and depend on the row count and
# order produced by the join above — confirm they still match the intended dates.
dataset = dataset.iloc[552:-59]

In [11]:
# Keep the Recession column as the target series before it is dropped from the features.
y = dataset['Recession'] #splitting off Y

In [12]:
# Remove the target and the GDP column it was computed from, leaving only feature columns.
# NOTE(review): the original comment also mentioned experimenting with removing the
# fed funds rate; no such column is dropped here.
dataset = dataset.drop(columns = ['GDP','Recession']) #dropping the GDP calc column and the Recession target

In [13]:
# Run dataclean.clean_zeros once for every column of the dataset.
# Per the original author's note, the helper substitutes the mean value for missing
# values and adds a dummy column flagging where it did so.
# NOTE(review): the helper's source is not shown here — confirm that behaviour.

for col in dataset.columns:
    dataclean.clean_zeros(col, dataset)

In [14]:
#adding momentum factors

# Candidate columns: everything except the last six columns of the dataset.
# NOTE(review): the -6 offset is positional; presumably it skips columns appended
# by earlier steps — confirm against the actual column order.
momentum_cols = list(dataset.columns[:-6])

momentum_cols.remove('PPI') #removing PPI and CPI because they need a different transformation
momentum_cols.remove('CPI')

# Call dataclean.create_momentum for every remaining column with i = 1, 3 and 12
# (presumably the look-back in months — confirm in the helper).
for i in [1,3,12]:
    for col in momentum_cols:
        dataclean.create_momentum(col,dataset,i)

In [15]:
#CPI Calcs
# CPI and PPI were excluded from the generic momentum step and are handled here by
# dataclean.infl_momentum, called with i = 1, 3 and 12.
# NOTE(review): the helper's transformation is not visible here — confirm what it computes.

for i in [1,3,12]:
    for col in ['CPI','PPI']:
        dataclean.infl_momentum(col,dataset,i)

In [16]:
# X is a second name for the same DataFrame object as `dataset` (no copy is made).
X = dataset

In [17]:
### Data Prep Finished Here ###

In [18]:
#implementing RNN

In [19]:
# shift(0) leaves y unchanged, so the target is the same-row Recession label;
# any missing labels are then filled with 0.
# NOTE(review): the original comment described forecasting 3 months out, which
# would need a non-zero shift — confirm the intended horizon.
y_shift = y.shift(0).fillna(0)

In [38]:
def windowize_data_x(data, n_prev):
    """Build overlapping look-back windows over the leading axis of ``data``.

    Window ``k`` holds rows ``k`` through ``k + n_prev - 1``. There are
    ``len(data) - n_prev`` windows, so the final row never appears inside a
    window.

    Parameters
    ----------
    data : array-like
        A NumPy array, pandas Series/DataFrame, or anything ``np.asarray``
        accepts. Rows are taken along the first axis.
    n_prev : int
        Number of consecutive rows in each window.

    Returns
    -------
    numpy.ndarray
        For 1-D input the shape is ``(len(data) - n_prev, n_prev, 1)``.
        For 2-D input with ``F`` columns the shape is
        ``(len(data) - n_prev, n_prev, 1, F)``.
    """
    # pandas objects reject a (2-D index array, None) key with
    # "TypeError: ... is an invalid key", so index the underlying ndarray.
    values = np.asarray(data)
    n_predictions = len(values) - n_prev
    # Broadcasting a row of offsets against a column of start positions yields
    # one row of positions per window.
    indices = np.arange(n_prev) + np.arange(n_predictions)[:, None]
    # The trailing None inserts a length-1 axis right after the window axis.
    return values[indices, None]

In [39]:
def split_and_windowize_x(data, n_prev, fraction_test=0.3):
    """Split ``data`` by position into train and test parts and window each part.

    The split point is computed from ``len(data) - 2*n_prev``: ``fraction_test``
    of that count (rounded down) is assigned to the test side, and the first
    ``n_train`` rows of ``data`` form the training part. Everything from row
    ``n_train`` onward forms the test part.

    Parameters
    ----------
    data : sliceable sequence
        Rows are sliced positionally with ``data[:n_train]`` / ``data[n_train:]``.
    n_prev : int
        Window length passed through to ``windowize_data_x``.
    fraction_test : float, optional
        Fraction used to size the test side (default 0.3).

    Returns
    -------
    tuple
        ``(x_train, x_test)`` as returned by ``windowize_data_x`` for each part.
    """
    n_predictions = len(data) - 2*n_prev

    n_test = int(fraction_test * n_predictions)
    n_train = n_predictions - n_test

    # windowize_data_x returns only the window array (no targets), so its
    # result is used directly rather than unpacked into an (x, y) pair.
    x_train = windowize_data_x(data[:n_train], n_prev)
    x_test = windowize_data_x(data[n_train:], n_prev)
    return x_train, x_test

In [43]:
# Window length: each sample would cover 50 consecutive rows.
n_prev = 50

# NOTE(review): `split_and_windowize` is not defined in any visible cell (only
# `split_and_windowize_x`, which returns two values), and the recorded run of this
# cell ended in "TypeError: ... is an invalid key". The windows produced here are
# not used by the later cells, which slice X with .iloc instead.
x_train, x_test, y_train = split_and_windowize(X, n_prev)
x_train.shape, x_test.shape

TypeError: '(array([[  0,   1,   2, ...,  47,  48,  49],
       [  1,   2,   3, ...,  48,  49,  50],
       [  2,   3,   4, ...,  49,  50,  51],
       ...,
       [350, 351, 352, ..., 397, 398, 399],
       [351, 352, 353, ..., 398, 399, 400],
       [352, 353, 354, ..., 399, 400, 401]]), None)' is an invalid key

In [20]:
# Split by row position, keeping order: rows 12-549 train, rows 550 onward test.
# NOTE(review): both cut points are hard-coded; presumably the first 12 rows are
# skipped because the 12-period momentum features are not defined there — confirm.
X_train, X_test = X.iloc[12:550], X.iloc[550:]
y_train, y_test = y_shift.iloc[12:550], y_shift.iloc[550:]

In [22]:
# Display the training matrix dimensions as (rows, columns).
X_train.shape

(538, 98)

In [26]:
# Number of feature columns; this is the last dimension of the LSTM input.
X_train.shape[1]

98

In [28]:
# Preview the 3-D layout with a length-1 middle axis.
# NOTE(review): the shape is hard-coded to the (538, 98) training matrix shown above
# and the cell echoes the entire array; the next cell derives the shape instead.
np.reshape(X_train.values,(538,1,98))

array([[[61.5       ,  5.2       , 10.9       , ...,  0.        ,
          1.03448276, -0.31545741]],

       [[52.3       ,  4.8       , 10.2       , ...,  0.31746032,
          1.73010381, -0.31545741]],

       [[47.8       ,  5.4       , 11.5       , ...,  0.95238095,
          1.73010381,  0.31545741]],

       ...,

       [[58.5       ,  5.4       , 11.5       , ...,  0.81743869,
          2.65438787,  7.24637681]],

       [[57.4       ,  5.4       , 11.7       , ...,  0.33967391,
          2.53779698,  6.64259928]],

       [[56.3       ,  5.5       , 12.1       , ...,  1.76390773,
          3.18918919,  7.68126346]]])

In [29]:
# Insert a length-1 middle axis: (rows, features) -> (rows, 1, features),
# matching the (1, 98) input_shape the LSTM model is built with.
trainX = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))

In [30]:
# Confirm the reshaped training input is 3-D.
trainX.shape

(538, 1, 98)

In [40]:
# Confirm there is one training label per training row.
y_train.shape

(538,)

In [41]:
# Three stacked 32-unit LSTM layers feeding a single sigmoid output unit.
model = keras.Sequential([
    # Each input sample is one time step of 98 features.
    keras.layers.LSTM(32, input_shape=(1,98), return_sequences=True),
    keras.layers.LSTM(32, return_sequences=True),
    # The last LSTM returns only its final output, giving a flat 32-vector per sample.
    keras.layers.LSTM(32, return_sequences=False),
    keras.layers.Dense(1, activation='sigmoid'),
])
# Binary cross-entropy is the log loss used for scoring later in the notebook.
model.compile(optimizer='rmsprop', loss='binary_crossentropy')

In [42]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_10 (LSTM)               (None, 1, 32)             16768     
_________________________________________________________________
lstm_11 (LSTM)               (None, 1, 32)             8320      
_________________________________________________________________
lstm_12 (LSTM)               (None, 32)                8320      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 33,441
Trainable params: 33,441
Non-trainable params: 0
_________________________________________________________________


In [43]:
# Train for 5 epochs in mini-batches of 32 on the reshaped training input.
# NOTE(review): no random seed is set in the visible cells and no validation data
# is passed, so results may differ from run to run.
model.fit(trainX, y_train.values, batch_size=32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a364f9c50>

In [45]:
# Give the test rows the same (rows, 1, features) layout as the training input;
# X_test is 2-D, so the reshape only inserts the length-1 middle axis.
testX = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))

In [46]:
# Sigmoid outputs for the test rows: one predicted probability per row.
y_pred = model.predict(testX)

In [48]:
# Log loss of the predicted probabilities against the held-out labels (lower is better).
log_loss(y_test,y_pred)

0.3055185180604458