# 000 One-time creation of excel sheet with quarterly GDP Joined on (imputation to be done in Excel) 

In [None]:
import pandas as pd
df = pd.read_excel("unemployment_rate_AR.xls") #unemployment-only data (with date)
df_GDP = pd.read_excel("A191RL1Q225SBEA.xls") #gdp quarterly-only data (with date)
df_left = df.merge(df_GDP, how = 'left', on = 'observation_date') 
df_left.to_excel("gdp_merged.xls")

# 000 Mounting of dataframe with updated gdp and flows

In [47]:
import pandas as pd
df_UR = pd.read_excel("unemployment_rate_AR.xls") #unemployment-only data (with date)
df_GDP_flows = pd.read_excel("Labor_Force_Flows.xls", sheet_name = 1) # labor force flows and new gdp growth data (with date)
df_GDP_flows.dropna(inplace = True)
df_right = df_UR.merge(df_GDP_flows, how = 'right', left_on = 'observation_date', right_on = 'DATE')

# 001 AR(6) model

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.ar_model import AutoReg
import math 
import random
from sklearn.metrics import mean_squared_error

#df = pd.read_excel("unemployment_rate_AR.xls") #option to use old data 
df = df_right #from the above box (updated data)
result_arr = np.zeros((800,2)) #to store predictions and actual values 
index = 0 #for indexing the above array

#The below loop will, from data point number 100 to the end, train up to that point, generate a prediction, and then store
#the actual value at that time step. The starting value will change based on the dataset used..
for i in range(100, len(df)):
    train_data = df["UNRATE_20221202"][:i] 
    ar_model = AutoReg(train_data, lags = 6).fit()
    pred = ar_model.forecast(1)
    result_arr[index][0] = pred
    result_arr[index][1] = df["UNRATE_20221202"][i]
    index = index + 1
    
RMSE = mean_squared_error(result_arr[:][1], result_arr[:][0], squared = False)
print(RMSE)

0.022844253571604106


# 001 Naive Model 

In [33]:
df = df_right #from the above box (updated data)
index = 0
#The below loop will, from data point number 100 to the end store the previous data point and the current data point
for i in range(100, len(df)):
    result_arr[index][0] = df["UNRATE_20221202"][i-1]
    result_arr[index][1] = df["UNRATE_20221202"][i]
    index = index + 1
    
RMSE = mean_squared_error(result_arr[:][1], result_arr[:][0], squared = False)
print(RMSE)


0.0707106781186545


# 001 Preprocess for LSTM 

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.ar_model import AutoReg
import math 
import random
from sklearn.metrics import mean_squared_error

#Other options - 
#Example Option A - old dataset
#df_gdf_processed = pd.read_excel("gdp_merged.xls")
#selected = df_gdf_processed[['UNRATE_20221202','A191RL1Q225SBEA']]
#Example Option B - old dataset with just employment 
#selected = df_gdf_processed[['UNRATE_20221202']]

selected = df_right[['UNRATE_20221202','BBKMGDP_PCH']] #up to date dataset with just GDP 

#train-test split cutoff
cutoff = math.floor(len(selected) * .8) 
selected_train = selected.iloc[0:cutoff,0:] 
selected_test = selected.iloc[cutoff:,0:]
selected_test.reset_index(inplace = True)

from sklearn.preprocessing import StandardScaler

#scale train inputs and targets
sc_train_x = StandardScaler()
selected_train_scaled_x = sc_train_x.fit_transform(selected_train)
sc_train_y = StandardScaler()
selected_train_scaled_y  = sc_train_y.fit_transform(selected_train[['UNRATE_20221202']])

#develop train inputs and targets by looping (in each loop pick, as input, data at the previous 6 time-steps and 
#as output, the unemployment rate at the next time-step) until cutoff
X_train = []
y_train = []
for i in range (6, cutoff):
    X_train.append(selected_train_scaled_x[i-6:i])
    y_train.append(selected_train_scaled_y[i])
X_train, y_train = np.array(X_train), np.array(y_train)

X_train_shaped = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], X_train.shape[2]))

# 002 Hyperparameter Optimized LSTM

In [42]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
import keras
import sklearn
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

#each individual model
def model_builder(hp_unit, hp_learning_rate):

    model = Sequential()
    model.add(LSTM(units = hp_unit, return_sequences = True, input_shape = (6,2))) #This will change depending on df used
    model.add(Dropout(0.2))
    model.add(LSTM(units = hp_unit))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(optimizer = keras.optimizers.Adam(learning_rate = hp_learning_rate), loss = 'mean_squared_error', metrics = ["accuracy"])
    return model


model = KerasClassifier(build_fn = model_builder)

#grid search 
params = {'batch_size':[10,25,50,75,100], 
          'nb_epoch': [100],
          'hp_unit':[10, 20, 30], 
          'hp_learning_rate': [1e-2,1e-4]}
gs = GridSearchCV(estimator = model, param_grid = params, cv = 5)
gs = gs.fit(X_train_shaped, y_train)

#return the best estimator
model = gs.best_estimator_

  model = KerasClassifier(build_fn = model_builder)










# 002 Non-Hyperparameter Optimized LSTM

In [35]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
import keras

#define the neural net
model = Sequential()
model.add(LSTM(units = 50, return_sequences = True, input_shape = (6,2))) #This will also change depending on df used
model.add(Dropout(0.2))
model.add(LSTM(units = 50))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(optimizer = 'adam', loss = 'mean_squared_error')

#fit the model with early stopping
callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
model.fit(X_train_shaped, y_train, epochs = 100, batch_size = 100, validation_split = .2, callbacks = [callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100


<keras.callbacks.History at 0x234e2739c70>

# 003 LSTM Evaluation

In [43]:
#tack on the the last 6 data points from the train since those correspond to the first test target and then transform
df_train_last = selected_train[-6:] 
full_df = pd.concat((df_train_last, selected_test), axis = 0)[["UNRATE_20221202",'BBKMGDP_PCH']]
#full_df = pd.concat((df_train_last, selected_test), axis = 0)[["UNRATE_20221202"]] #option to just use the unemployment rate 
full_df = sc_train_x.transform(full_df)

#generate test input by looping consecutive 6's in the input until the last data point
x_test = []
for i in range(6, len(selected) - cutoff + 6):
    x_test.append(full_df[i-6:i])
x_test = np.array(x_test)

#predict and inverse scale since the output is trained off of scaled targets
y_test = model.predict(x_test)
y_final_pred = sc_train_y.inverse_transform(y_test)

from sklearn.metrics import mean_squared_error
mean_squared_error(selected_test['UNRATE_20221202'],y_final_pred, squared = False)



2.256271148082802

# 001 VAR Model

In [54]:
from statsmodels.tsa.api import VAR
import statistics
import numpy as np
from sklearn.metrics import mean_squared_error

#Different input options - 
#Example Option A - the full updated dataset
#selected = df_right[['UNRATE_20221202','BBKMGDP_PCH','pUtoE', 'pNLtoE' , 'pEtoU', 'pNLtoU', 'pEtoNL', 'pUtoNL']]
#Example Option B - GDP + Unemployment from updated dataset
#selected = df_right[['UNRATE_20221202','BBKMGDP_PCH']] 
df_gdp_processed = pd.read_excel("gdp_merged.xls")
selected = df_gdp_processed[['UNRATE_20221202','A191RL1Q225SBEA']]

result_arr = np.zeros((800,2)) #array to store the predictions and the actual value
index = 0 #indexing the above result array 

#The below loop will, from data point number 100 to the end, train up to that point, generate a prediction, and then store
#the actual value at that time step. The starting value will change based on the dataframe used. 
for i in range(605
               , len(selected)):
    train_data = selected[:i]
    var = VAR(selected)
    
    #this section checks the different measurement errors and selected the lag parameter that corresponds to the most minimums
    order = var.select_order()
    selected_order = statistics.mode([order.selected_orders['aic'], order.selected_orders['bic'], 
                            order.selected_orders['hqic'], order.selected_orders['fpe']])
    final_model = var.fit(selected_order)
    
    pred = final_model.forecast(train_data.values[-selected_order:],1)[0][0]
    result_arr[index][0] = pred
    result_arr[index][1] = selected["UNRATE_20221202"][i]
    index = index + 1
    
RMSE = mean_squared_error(result_arr[:][1], result_arr[:][0], squared = False)
print(RMSE)

0.07416993731232889


895-cutoff
