In [1]:
#1.1 Initial datapoints
#1.2 Momentum factors, CPI/PPI Calculations, modified timeframe.

In [118]:
#Imports and API Key

import pandas as pd
import quandl
from scipy import stats
import scipy
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

%aimport dataclean

quandl.ApiConfig.api_key = 'm8FYMyoCaJSbTrBASNHh'

In [3]:
#pulling data from Quandl

data = pd.read_csv('data_for_pull.csv') #staging the QUANDL keys to pull in CSV
cols = list(data['Var_name'].astype('str'))
dataset = quandl.get([val for val in data['Quandl Key']]) #looping through the QUANDL keys to pull it into one DF
dataset.columns = cols

In [4]:
#pulling FED Yield Curve data

yields = pd.read_csv('Fed10Y_3M.csv')
yields['Date'] = pd.to_datetime(yields['Date'])
yields['Date'] = yields['Date'].apply(lambda x: x.strftime('%Y-%m'))
yields = yields.set_index('Date')
yields = yields.drop(['3 Month Treasury Yield', 'Rec_prob', 'NBER_Rec','Unnamed: 7'], axis=1) 

In [5]:
## back to working on the general data
dataset.index = dataset.index.strftime('%Y-%m') #converting the datetime index to Y/M so it is collapsable
dataset = dataset.groupby(dataset.index, as_index=True).agg(sum) #collapsing by Y/M

In [6]:
#converting GDP quarterly data into monthly

dataset = dataclean.convert_q_to_m(dataset, 'GDP')

#converting consumer sentiment into monthly

dataset = dataclean.convert_q_to_m(dataset, 'CONS_SENT')

In [7]:
#calculating change in GDP and converting Y into categorical values 
dataset['Recession'] = ((dataset['GDP'] - dataset['GDP'].shift(3)) < 0).astype(int)

In [8]:
#merge fed interest rate data here
dataset = dataset.join(yields, how='outer')

In [9]:
#cutoff most of missing data, Post March 2019, Prior 1959. CPI/PPI missing 2016 onward so need to cut that off
dataset = dataset.iloc[552:]
dataset = dataset.iloc[:-59]

In [10]:
y = dataset['Recession'] #splitting off Y

In [106]:
y_shift = y.shift(-12) #shifting y to forecast 3 months out

In [107]:
y_shift = y_shift.fillna(0)

In [13]:
dataset = dataset.drop(columns = ['GDP','Recession']) #dropping calc column and recession column from dataset, experimenting with taking out fed funds rate

In [14]:
#substituting mean value in for missing values and adding dummy column to indicate where done

for col in dataset.columns:
    dataclean.clean_zeros(col, dataset)

In [15]:
#adding momentum factors

momentum_cols = list(dataset.columns[:-6])

momentum_cols.remove('PPI') #removing PPI and CPI because they need a different transformation
momentum_cols.remove('CPI')

for i in [1,3,12]:
    for col in momentum_cols:
        dataclean.create_momentum(col,dataset,i)

In [16]:
#CPI Calcs

for i in [1,3,12]:
    for col in ['CPI','PPI']:
        dataclean.infl_momentum(col,dataset,i)

In [17]:
X = dataset

In [18]:
### Data Prep Finished Here ###

In [108]:
X_train = X.iloc[12:550]
X_test = X.iloc[550:]
y_train = y_shift.iloc[12:550]
y_test = y_shift.iloc[550:]

In [109]:
X_train

Unnamed: 0,PMI,UNR,YUNR,CONS_SENT,HOME_SALES,PART_TIME,CPPR_PRICE,HOUS_PERMS,HOUS_STARTS,CAP_UTIL,...,3YRT_12m_shift,10 Year Treasury Yield_12m_shift,3 Month Treasury Yield (Bond Equivalent Basis)_12m_shift,Spread_12m_shift,CPI_1m_shift,PPI_1m_shift,CPI_3m_shift,PPI_3m_shift,CPI_12m_shift,PPI_12m_shift
1960-01,61.500,5.200,10.900,93.800,3219.348,1015.000,152.022,1092.000,1460.000,69.088,...,0.000,0.700,1.580,-0.880,-0.340,0.317,-0.340,0.000,1.034,-0.315
1960-02,52.300,4.800,10.200,93.800,3219.348,1062.000,152.022,1088.000,1503.000,69.088,...,0.000,0.530,1.300,-0.770,0.341,0.000,0.000,0.317,1.730,-0.315
1960-03,47.800,5.400,11.500,93.800,3219.348,888.000,152.022,955.000,1109.000,69.088,...,0.000,0.260,0.520,-0.260,0.000,0.633,0.000,0.952,1.730,0.315
1960-04,45.300,5.200,10.900,93.800,3219.348,1041.000,152.022,1016.000,1289.000,69.088,...,0.000,0.160,0.290,-0.130,0.340,0.000,0.683,0.633,1.724,0.000
1960-05,42.600,5.100,10.700,93.300,3219.348,988.000,152.022,1052.000,1271.000,69.088,...,0.000,0.040,0.460,-0.420,0.000,-0.314,0.340,0.316,1.724,-0.314
1960-06,44.400,5.400,11.000,93.300,3219.348,966.000,152.022,958.000,1247.000,69.088,...,0.000,-0.190,-0.770,0.580,0.339,0.000,0.680,-0.314,1.718,0.000
1960-07,43.700,5.500,10.800,93.300,3219.348,1013.000,152.022,999.000,1197.000,69.088,...,0.000,-0.500,-0.920,0.420,0.000,0.000,0.339,-0.314,1.370,0.000
1960-08,47.600,5.600,11.400,97.200,3219.348,1018.000,152.022,994.000,1344.000,69.088,...,0.000,-0.630,-1.110,0.480,0.000,-0.315,0.339,-0.315,1.370,0.000
1960-09,45.400,5.500,11.000,97.200,3219.348,1027.000,152.022,984.000,1097.000,69.088,...,0.000,-0.880,-1.610,0.730,0.000,0.000,0.000,-0.315,1.024,-0.315
1960-10,46.000,6.100,11.800,97.200,3219.348,1017.000,152.022,972.000,1246.000,69.088,...,0.000,-0.640,-1.800,1.160,0.676,0.316,0.676,0.000,1.361,0.316


In [114]:
model = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=45) #grid search on max depth
model.fit(X_train, y_train) #fitting model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=45, verbose=0, warm_start=False)

In [115]:
probs = model.predict_proba(X_test)

In [116]:
log_loss(y_test, probs)

0.2737747429020189

In [119]:
roc_auc_score(y_test.values, probs[:,1:])

0.8949115044247787

In [113]:
results = pd.DataFrame(probs)
results['actual'] = y_test.values
results.index = y_test.index
pd.set_option('display.float_format', lambda x: '%.3f' % x)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(results)

            0     1  actual
2004-11 0.942 0.058   0.000
2004-12 0.958 0.042   0.000
2005-01 0.939 0.061   0.000
2005-02 0.940 0.060   0.000
2005-03 0.931 0.069   0.000
2005-04 0.957 0.043   0.000
2005-05 0.936 0.064   0.000
2005-06 0.943 0.057   0.000
2005-07 0.941 0.059   0.000
2005-08 0.963 0.037   0.000
2005-09 0.954 0.046   0.000
2005-10 0.935 0.065   0.000
2005-11 0.906 0.094   0.000
2005-12 0.899 0.101   0.000
2006-01 0.902 0.098   0.000
2006-02 0.903 0.097   0.000
2006-03 0.885 0.115   0.000
2006-04 0.892 0.108   0.000
2006-05 0.912 0.088   0.000
2006-06 0.849 0.151   0.000
2006-07 0.866 0.134   0.000
2006-08 0.806 0.194   0.000
2006-09 0.885 0.115   0.000
2006-10 0.776 0.224   0.000
2006-11 0.803 0.197   0.000
2006-12 0.892 0.108   0.000
2007-01 0.799 0.201   1.000
2007-02 0.866 0.134   1.000
2007-03 0.870 0.130   1.000
2007-04 0.856 0.144   0.000
2007-05 0.831 0.169   0.000
2007-06 0.860 0.140   0.000
2007-07 0.844 0.156   0.000
2007-08 0.830 0.170   0.000
2007-09 0.882 0.118 