In [1]:
import pandas as pd
import numpy as np
import scipy.optimize as optimize
import os
import sys
import collections
import itertools
import matplotlib.pyplot as plt
import torch
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from scipy.spatial import distance
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import sklearn.metrics
from scipy.stats import mode
from scipy.spatial.distance import squareform
from fastdtw import fastdtw
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# Data Cleaning (from raw) -- longer time horizon

In [2]:
# Read the three raw input series (GDP growth, fed funds rate, CPI) from CSV,
# one DataFrame per file, in that order.
gdp, ffer, cpi = map(
    pd.read_csv,
    (
        '../Data/Raw/input/GDP.csv',
        '../Data/Raw/input/FEDFUNDS.csv',
        '../Data/Raw/input/CPIAUCSL.csv',
    ),
)

In [3]:
# Upsample the quarterly GDP_PCH series to month-start frequency by linear
# interpolation between consecutive quarterly observations.
# NOTE(review): a month lying between two quarter starts is interpolated using
# the *later* quarter's value as well, so its GDP_PCH depends on data dated
# after that month -- confirm this look-ahead is acceptable for the models below.
gdp['DATE'] = pd.to_datetime(gdp['DATE'])
print(gdp.dtypes)

# np.interp needs numeric x-coordinates, so the dates are handled as int64
# timestamps while interpolating and converted back to datetimes afterwards.
timestamps_quarterly = gdp['DATE'].astype(np.int64)
gdp_quarterly = np.array(gdp['GDP_PCH'])
timestamps_monthly = pd.date_range(
    start=timestamps_quarterly.min(),
    end=timestamps_quarterly.max(),
    freq='MS',
).astype(np.int64)

print(timestamps_quarterly.dtype, timestamps_monthly.dtype, gdp_quarterly.dtype)

gdp_monthly = np.interp(timestamps_monthly, timestamps_quarterly, gdp_quarterly)
timestamps_monthly = pd.to_datetime(timestamps_monthly)

df_interpolated_gdp = pd.DataFrame(
    {
        'DATE': timestamps_monthly,
        'GDP_PCH': gdp_monthly,
    }
)

# Show the series before and after upsampling for a visual sanity check.
print("Original Quarterly Data:")
print(gdp[['DATE', 'GDP_PCH']])
print("\nMonthly Data with Linear Interpolation:")
print(df_interpolated_gdp)

DATE       datetime64[ns]
GDP_PCH           float64
dtype: object
int64 int64 float64
Original Quarterly Data:
          DATE  GDP_PCH
0   1947-04-01  1.15313
1   1947-07-01  1.47052
2   1947-10-01  4.07076
3   1948-01-01  2.30880
4   1948-04-01  2.56828
..         ...      ...
301 2022-07-01  1.76308
302 2022-10-01  1.59174
303 2023-01-01  1.53434
304 2023-04-01  0.93017
305 2023-07-01  2.07121

[306 rows x 2 columns]

Monthly Data with Linear Interpolation:
          DATE   GDP_PCH
0   1947-04-01  1.153130
1   1947-05-01  1.257764
2   1947-06-01  1.365886
3   1947-07-01  1.470520
4   1947-08-01  2.346688
..         ...       ...
911 2023-03-01  1.138273
912 2023-04-01  0.930170
913 2023-05-01  1.306337
914 2023-06-01  1.695043
915 2023-07-01  2.071210

[916 rows x 2 columns]


In [4]:
# Parse the DATE columns into datetimes so both frames can be joined on DATE
# with the (datetime-typed) interpolated GDP frame.
ffer['DATE'] = ffer['DATE'].pipe(pd.to_datetime)
cpi['DATE'] = cpi['DATE'].pipe(pd.to_datetime)


In [5]:
# Build the feature table: inner-join monthly GDP, fed funds and CPI on DATE,
# keeping only the dates present in all three series.
input_data = (
    df_interpolated_gdp
    .merge(ffer, on='DATE', how='inner')
    .merge(cpi, on='DATE', how='inner')
)

In [6]:
# Display the merged feature table (DATE plus one column per input series).
input_data

Unnamed: 0,DATE,GDP_PCH,FEDFUNDS,CPIAUCSL_PC1
0,1954-07-01,1.262560,0.80,0.26129
1,1954-08-01,1.590166,1.22,0.00000
2,1954-09-01,1.917772,1.07,-0.29751
3,1954-10-01,2.234810,0.85,-0.85343
4,1954-11-01,2.606190,0.83,-0.26071
...,...,...,...,...
824,2023-03-01,1.138273,4.65,4.98692
825,2023-04-01,0.930170,4.83,4.95719
826,2023-05-01,1.306337,5.06,4.12884
827,2023-06-01,1.695043,5.08,3.09200


In [7]:
# output_data = pd.read_csv('../Data/Raw/output/UNRATE.csv')
# output_data
# output_data['DATE'] = pd.to_datetime(output_data['DATE'])
# output_data['UNRATE_ADDCH'] = output_data['UNRATE'].diff()

In [8]:
# Load the target series (USPRIV) and parse its DATE column into datetimes.
output_data = (
    pd.read_csv('../Data/Raw/output/USPRIV.csv')
    .assign(DATE=lambda frame: pd.to_datetime(frame['DATE']))
)

In [9]:
# Display the raw target table (DATE and the target column).
output_data

Unnamed: 0,DATE,USPRIV_PCH
0,1939-02-01,0.63235
1,1939-03-01,0.68585
2,1939-04-01,-0.70782
3,1939-05-01,0.77035
4,1939-06-01,0.78348
...,...,...
1012,2023-06-01,0.06450
1013,2023-07-01,0.10867
1014,2023-08-01,0.08535
1015,2023-09-01,0.18401


In [10]:
# Discard the row labelled 0 (the first observation) and renumber the rows from 0.
# NOTE(review): this cell is not idempotent -- every re-run removes one more row.
output_data = (
    output_data
    .drop(index=0)
    .reset_index(drop=True)
)

In [11]:
# Restrict the target series to the date range covered by the feature table.
# Both bounds are applied in a single mask and the index is reset once at the
# end, so the result always carries a clean 0..n-1 index. Later cells pair
# features and target by row position / index, so a gap-free index matters.
in_input_range = output_data['DATE'].between(
    input_data['DATE'].min(),
    input_data['DATE'].max(),
)
output_data = output_data[in_input_range].reset_index(drop=True)

# Fail loudly if features and target are not aligned row-for-row on DATE;
# otherwise the models below would silently train on mismatched months.
assert np.array_equal(
    output_data['DATE'].to_numpy(), input_data['DATE'].to_numpy()
), "input_data and output_data are not aligned on DATE"

output_data

Unnamed: 0,DATE,USPRIV_PCH
0,1954-07-01,-0.19501
1,1954-08-01,-0.07863
2,1954-09-01,0.13831
3,1954-10-01,0.11907
4,1954-11-01,0.41387
...,...,...
824,2023-03-01,0.11827
825,2023-04-01,0.13468
826,2023-05-01,0.19161
827,2023-06-01,0.06450


In [12]:
# Display the second column of the target table, which is what the model
# cells below select as y.
output_data.iloc[:,1]

0     -0.19501
1     -0.07863
2      0.13831
3      0.11907
4      0.41387
        ...   
824    0.11827
825    0.13468
826    0.19161
827    0.06450
828    0.10867
Name: USPRIV_PCH, Length: 829, dtype: float64

In [13]:
# add lag variables to input data
# input_data['UNRATE_ADDCH_lag_1'] = output_data['UNRATE_ADDCH'].shift(1)
# input_data['UNRATE_ADDCH_lag_2'] = output_data['UNRATE_ADDCH'].shift(2)
# input_data = input_data.fillna(0)

In [14]:
# Append the target's one- and two-month lags as extra feature columns.
# The lagged series are matched to input_data by row index (not by DATE), so
# both frames must share the same 0..n-1 index over the same dates.
# fillna(0) runs over the whole frame: it zero-fills the leading lag rows that
# have no history, and would equally zero-fill any other missing value.
input_data = input_data.assign(
    USPRIV_PCH_lag_1=output_data['USPRIV_PCH'].shift(1),
    USPRIV_PCH_lag_2=output_data['USPRIV_PCH'].shift(2),
).fillna(0)

# Linear Regression

In [15]:
# Ordinary least squares on a random 70/30 split; the cell's output is the
# regressor's score on the held-out 30%.
# NOTE(review): the split shuffles rows of a time series, so test months are
# interleaved with training months -- confirm this is intended.
X = input_data.iloc[:, 1:]   # every column except the first one (DATE)
y = output_data.iloc[:, 1]   # use output_data.iloc[:, 2] for UNRATE_ADDCH instead
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3
)
model = LinearRegression().fit(X_train, y_train)
pred = model.predict(X_test)
model.score(X_test, y_test)

0.2210452166074477

In [16]:
# Treat the regression as a direction classifier: compare the sign of each
# prediction with the sign of the true value on the test set.
# |sign difference| is 2 for opposite signs, so half the summed differences
# counts the errors (a value of exactly 0 on one side counts as half an error).
y_test_sign = np.sign(y_test)
pred_sign = np.sign(pred)
classification_accuracy = 1 - 0.5 * np.abs(pred_sign - y_test_sign).sum() / y_test_sign.size
classification_accuracy

0.7751004016064257

In [17]:
# Same sign-agreement accuracy as for the test set, computed on the training
# rows, to compare in-sample and out-of-sample performance of the regressor.
y_train_sign = np.sign(y_train)
pred_train = model.predict(X_train)
pred_train_sign = np.sign(pred_train)
train_accuracy = 1 - 0.5 * np.abs(pred_train_sign - y_train_sign).sum() / y_train_sign.size
train_accuracy  # train accuracy for classification via linear regression

0.7913793103448276

# Logistic Regression

In [18]:
# Logistic regression on the direction of the target: labels are the sign of
# the target (-1 / 0 / +1), features are every input column except DATE.
X = input_data.iloc[:, 1:]
y = np.sign(output_data.iloc[:, 1])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3
)
model = LogisticRegression().fit(X_train, y_train)
model.score(X_test, y_test)  # accuracy on the held-out 30% (sign labels)

0.8232931726907631

# Decision Tree Classifier

In [19]:
# Decision tree on the direction of the target: labels are the sign of the
# target (-1 / 0 / +1), features are every input column except DATE.
X = input_data.iloc[:,1:]
y = output_data.iloc[:,1]
y = np.sign(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Seed the tree as well as the split: an unseeded DecisionTreeClassifier permutes
# features randomly when choosing splits, so the reported accuracy could change
# from one run to the next.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)
model.score(X_test, y_test)  # accuracy on the held-out 30% (sign labels)

0.891566265060241

# SVM Classifier

In [20]:
# Support-vector classifier (library defaults) on the direction of the target:
# labels are the sign of the target, features are every input column except DATE.
X = input_data.iloc[:, 1:]
y = np.sign(output_data.iloc[:, 1])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3
)
model = SVC().fit(X_train, y_train)
model.score(X_test, y_test)  # accuracy on the held-out 30% (sign labels)

0.8353413654618473

# KNN (with dynamic time warping)

In [21]:
def calculate_dtw(series1, series2):
    """Return the FastDTW distance between two sequences.

    ``fastdtw`` returns a ``(distance, path)`` pair; only the distance is kept
    so the function can be passed as a pairwise ``metric`` callable.

    NOTE(review): when used as the KNN metric below, the two arguments are rows
    of the feature table, so the warping runs across heterogeneous feature
    columns rather than along time -- confirm this is the intended use of DTW.
    """
    distance_dtw, _ = fastdtw(series1, series2)
    return distance_dtw


# Build features and sign labels explicitly instead of relying on X / y left
# over from whichever cell happened to run before this one.
X = input_data.iloc[:,1:]
y = np.sign(output_data.iloc[:,1])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# 4-nearest-neighbour vote using the DTW distance defined above.
knn_model = KNeighborsClassifier(n_neighbors=4, metric=calculate_dtw)
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8995983935742972


# Naive Bayes

In [22]:
# Gaussian Naive Bayes on a binary target: 1 when the target is strictly
# positive, 0 otherwise. Features and labels are rebuilt from the source frames
# so the cell does not depend on X / y left over from an earlier cell.
X = input_data.iloc[:,1:]
y_labels = (output_data.iloc[:,1] > 0).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y_labels, test_size=0.3, random_state=0)
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Display classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8473895582329317
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.59      0.63        54
           1       0.89      0.92      0.90       195

    accuracy                           0.85       249
   macro avg       0.78      0.76      0.77       249
weighted avg       0.84      0.85      0.84       249


# Random Forest

In [23]:
# Random forest (100 trees) on the sign of the target. Features and labels are
# rebuilt from the source frames so the cell does not depend on X / y left over
# from an earlier cell.
# NOTE(review): this cell splits with random_state=3 while several earlier model
# cells use random_state=0, so the accuracies are measured on different test sets.
X = input_data.iloc[:,1:]
y_sign = np.sign(output_data.iloc[:,1])

X_train, X_test, y_train, y_test = train_test_split(X, y_sign, test_size=0.3, random_state=3)
rf_model = RandomForestClassifier(n_estimators=100, random_state=3)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Display classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8674698795180723
Classification Report:
              precision    recall  f1-score   support

        -1.0       0.75      0.57      0.65        53
         1.0       0.89      0.95      0.92       196

    accuracy                           0.87       249
   macro avg       0.82      0.76      0.78       249
weighted avg       0.86      0.87      0.86       249


# Baseline

In [24]:
# Persistence baseline: predict that each month's class equals the previous
# month's class, and score it on a random 30% test split.
# NOTE(review): this split uses random_state=3 while several model cells above
# use random_state=0, so the accuracies are measured on different test sets.
X = input_data.iloc[:,1:]
y = output_data.iloc[:,1]

y_labels = (y > 0).astype(int)

# predict next month's class to be the same as this month's class;
# the first month has no predecessor and stays NaN
y_pred = y_labels.shift(1)

# split the data into training and testing sets along with corresp. row positions
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    X, y_labels, np.arange(len(y_labels)), test_size=0.3, random_state=3
)

# indices_test holds row *positions*, so select with .iloc. Label-based lookup
# on a series whose first row had been dropped breaks whenever position 0
# lands in the test split.
y_pred_test = y_pred.iloc[indices_test].to_numpy()
y_true_test = y_test.to_numpy()

# check accuracy of baseline, skipping any test sample without a previous month
has_previous = ~np.isnan(y_pred_test)
accuracy = np.sum(y_pred_test[has_previous] == y_true_test[has_previous]) / np.sum(has_previous)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8473895582329317
