In [1]:
#1.1 Initial data points
#1.2 Momentum factors, CPI/PPI calculations, modified time frame
#1.3 Grid search on the depth parameter, and feature subspace sampling

In [19]:
#Imports and API Key

import pandas as pd
import quandl
from scipy import stats
import scipy
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, make_scorer

%aimport dataclean

quandl.ApiConfig.api_key = 'm8FYMyoCaJSbTrBASNHh'

In [63]:
# Pull every series listed in the staging CSV from Quandl into one DataFrame.

data = pd.read_csv('datasources2.csv')  # staging file: one row per series ('Quandl Key', 'Var_name')
dataset = quandl.get(data['Quandl Key'].tolist())  # a single request covering all listed keys
# Replace the Quandl-generated column labels with the short names from the staging file.
# assumes each key yields exactly one column, in staging-file order - TODO confirm
cols = data['Var_name'].astype('str').tolist()
dataset.columns = cols

In [64]:
# Load the Fed yield-curve file, key it by 'YYYY-MM' strings (the same format
# given to `dataset`'s index) and drop the columns that are not used as features.

yields = (
    pd.read_csv('Fed10Y_3M.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']).apply(lambda ts: ts.strftime('%Y-%m')))
    .set_index('Date')
    .drop(columns=['3 Month Treasury Yield', 'Rec_prob', 'NBER_Rec', 'Unnamed: 7'])
)

In [65]:
## back to working on the general data
# Re-label the datetime index as 'YYYY-MM' strings so that all observations
# from the same calendar month share one key.
# NOTE: this cell is not idempotent - once it has run, the index holds plain
# strings and no longer has `.strftime`, so re-running it raises.
dataset.index = dataset.index.strftime('%Y-%m')
# Collapse to one row per month. `.sum()` replaces `.agg(sum)`: pandas 2.1+
# deprecates passing the builtin `sum` to groupby.agg, while the result is the
# same pandas group-wise sum. With the default min_count=0, a month in which a
# series has no observation sums to 0 rather than NaN.
dataset = dataset.groupby(dataset.index).sum()

In [66]:
# Run the quarterly-to-monthly conversion helper on GDP first, then on consumer
# sentiment. Each call receives the frame returned by the previous one.
for quarterly_col in ('GDP', 'CONS_SENT'):
    dataset = dataclean.convert_q_to_m(dataset, quarterly_col)

In [67]:
# Binary target: 1 when GDP is below its value three rows (months) earlier, else 0.
# Rows with no value three rows back yield NaN, compare as False and are labelled 0.
gdp_change_3m = dataset['GDP'].diff(periods=3)
dataset['Recession'] = gdp_change_3m.lt(0).astype(int)

In [68]:
# Release lag, in rows (months), for each series. The goal is to align every
# value with the month in which it actually becomes available: positive lags
# shift a column down (later), negative lags would shift it up (earlier).
# Each key appears once; the literal previously listed 'US_NHOME_SALES' twice.
offset_dict = {
    'PMI': 1,
    'UNR': 1,
    'YUNR': 1,
    'US_NHOME_SALES': 1,
    'PART_TIME': 1,
    'CPPR_PRICE': 1,
    'HOUS_PERMS': 1,
    'HOUS_STARTS': 1,
    'CAP_UTIL': 1,
    'PERS_SAVINGS': 3,
    'EXPORTS': 3,
    'IMPORTS': 3,
    'TRADE_BALANCE': 3,
    'US_M2': 1,
    'PPI': 1,
    'CPI': 1,
}

# Shift each listed column by its lag; a missing column raises KeyError.
for column, lag in offset_dict.items():
    dataset[column] = dataset[column].shift(lag)

In [69]:
# Attach the Fed yield-curve columns by index label. The outer join keeps
# every month that appears in either frame.
dataset = dataset.join(other=yields, how='outer')

In [70]:
# Trim the sparse ends of the sample by position: skip the first 552 rows and
# the last 59. Author's stated rationale: data are mostly missing before 1959
# and after March 2019, and CPI/PPI are missing from 2016 onward.
# NOTE(review): these positions depend on the exact extent of the downloaded
# data - confirm they still match the intended dates after a fresh pull.
dataset = dataset.iloc[552:-59]

In [71]:
# Keep the target aside before it is removed from the feature frame.
y = dataset.loc[:, 'Recession']

In [72]:
# Remove the target and the GDP column it was computed from, leaving only features.
dataset = dataset.drop(['GDP', 'Recession'], axis='columns')

In [73]:
# Apply dataclean.clean_zeros to every column that exists when the loop starts.
# Per the author's note, the helper substitutes the mean for missing values and
# adds a dummy column marking where that was done. It is called for its effect
# on `dataset`; any return value is ignored.

for column_name in tuple(dataset.columns):
    dataclean.clean_zeros(column_name, dataset)

In [74]:
# Momentum features at 1-, 3- and 12-row horizons for the base columns.

# Every column except the last six is a candidate.
# NOTE(review): which columns the trailing six are is not visible here - confirm.
momentum_cols = dataset.columns[:-6].tolist()

# PPI and CPI get a different transformation, so take them out of this pass.
for inflation_col in ('PPI', 'CPI'):
    momentum_cols.remove(inflation_col)

for horizon in (1, 3, 12):
    for feature in momentum_cols:
        dataclean.create_momentum(feature, dataset, horizon)

In [75]:
# Inflation-specific momentum for CPI and PPI at the same 1-, 3- and 12-row horizons.

for horizon in (1, 3, 12):
    for price_index in ('CPI', 'PPI'):
        dataclean.infl_momentum(price_index, dataset, horizon)

In [76]:
# Feature matrix for modelling. This binds a second name to the same DataFrame
# object (no copy), so in-place changes to `dataset` are also visible through `X`.
X = dataset

In [77]:
### Data Prep Finished Here ###

In [100]:
# Shift the target up by 12 rows so each row's features are paired with the
# Recession flag from 12 rows later (12 months ahead on this one-row-per-month index).
# NOTE(review): this was previously described as a 3-month horizon; if that is
# the intent the shift should be -3 - confirm.
y_shift = y.shift(-12)
# The last 12 rows have no future label after the shift; they are filled with 0.
# NOTE(review): those rows are assumed negatives, not observed outcomes, and any
# of them that land in the test slice are scored as true labels - confirm this
# is acceptable.
y_shift = y_shift.fillna(0)

In [101]:
# Chronological split by row position: rows 12-549 train, rows 550 onward test.
# The first 12 rows are skipped.
train_rows = slice(12, 550)
test_rows = slice(550, None)

X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y_shift.iloc[train_rows], y_shift.iloc[test_rows]

In [102]:
# Preview the first rows only instead of rendering the whole training frame.
X_train.head(10)

Unnamed: 0,PMI,UNR,YUNR,CONS_SENT,PART_TIME,CPPR_PRICE,HOUS_PERMS,HOUS_STARTS,CAP_UTIL,PERS_SAVINGS,...,3 Month Treasury Yield (Bond Equivalent Basis)_12m_shift,Spread_12m_shift,CPPR_PRICE_PXY_12m_shift,HOUS_PERMS_PXY_12m_shift,CPI_1m_shift,PPI_1m_shift,CPI_3m_shift,PPI_3m_shift,CPI_12m_shift,PPI_12m_shift
1960-01,58.200,5.300,11.100,93.800,1000.000,151.480,1336.630,1601.000,68.973,9.400,...,1.580,-0.880,0.000,0.000,0.000,0.000,0.341,-0.631,1.730,-0.316
1960-02,61.500,5.200,10.900,93.800,1015.000,151.480,1092.000,1460.000,68.973,10.100,...,1.300,-0.770,0.000,-1.000,-0.340,0.317,-0.340,0.000,1.034,-0.315
1960-03,52.300,4.800,10.200,93.800,1062.000,151.480,1088.000,1503.000,68.973,11.000,...,0.520,-0.260,0.000,-1.000,0.341,0.000,0.000,0.317,1.730,-0.315
1960-04,47.800,5.400,11.500,93.800,888.000,151.480,955.000,1109.000,68.973,10.900,...,0.290,-0.130,0.000,-1.000,0.000,0.633,0.000,0.952,1.730,0.315
1960-05,45.300,5.200,10.900,93.300,1041.000,151.480,1016.000,1289.000,68.973,10.600,...,0.460,-0.420,0.000,-1.000,0.340,0.000,0.683,0.633,1.724,0.000
1960-06,42.600,5.100,10.700,93.300,988.000,151.480,1052.000,1271.000,68.973,9.400,...,-0.770,0.580,0.000,-1.000,0.000,-0.314,0.340,0.316,1.724,-0.314
1960-07,44.400,5.400,11.000,93.300,966.000,151.480,958.000,1247.000,68.973,8.400,...,-0.920,0.420,0.000,-1.000,0.339,0.000,0.680,-0.314,1.718,0.000
1960-08,43.700,5.500,10.800,97.200,1013.000,151.480,999.000,1197.000,68.973,10.400,...,-1.110,0.480,0.000,-1.000,0.000,0.000,0.339,-0.314,1.370,0.000
1960-09,47.600,5.600,11.400,97.200,1018.000,151.480,994.000,1344.000,68.973,10.400,...,-1.610,0.730,0.000,-1.000,0.000,-0.315,0.339,-0.315,1.370,0.000
1960-10,45.400,5.500,11.000,97.200,1027.000,151.480,984.000,1097.000,68.973,10.400,...,-1.800,1.160,0.000,-1.000,0.000,0.000,0.000,-0.315,1.024,-0.315


In [103]:
# Baseline random forest with fixed hyper-parameters; the seed makes the fit repeatable.
rf_settings = dict(
    n_estimators=500,
    max_depth=4,
    max_features='sqrt',
    random_state=45,
)
model = RandomForestClassifier(**rf_settings)
model.fit(X_train, y_train)  # last expression: the fitted estimator's repr is the cell output

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=45, verbose=0, warm_start=False)

In [104]:
# Class-probability estimates for the test rows: one row per sample, one column
# per class, with columns ordered as in `model.classes_`.
probs = model.predict_proba(X_test)

In [105]:
# Test-set log loss. Passing `labels=model.classes_` ties each probability
# column to its class explicitly instead of letting log_loss infer the label
# set from `y_test`, which fails when the test window contains a single class.
log_loss(y_test, probs, labels=model.classes_)

0.539681091763301

In [106]:
# Test-set ROC AUC from the second probability column (class 1 here, since
# the labels are 0/1). For a binary target roc_auc_score documents a 1-D score
# vector, so select the column with `[:, 1]` rather than the (n, 1) slice `[:, 1:]`.
roc_auc_score(y_test.values, probs[:, 1])

0.5961221122112211

In [99]:
# Show floats with three decimals. This is a global pandas display option and
# stays in effect for the cells that follow.
pd.set_option('display.float_format', lambda value: '%.3f' % value)

# One row per test month: the two class-probability columns plus the realised label.
results = pd.DataFrame(probs, index=y_test.index).assign(actual=y_test.values)

# Temporarily lift the row/column display limits so the whole table is printed.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(results)

            0     1  actual
2004-11 0.944 0.056   0.000
2004-12 0.941 0.059   0.000
2005-01 0.946 0.054   0.000
2005-02 0.954 0.046   0.000
2005-03 0.946 0.054   0.000
2005-04 0.925 0.075   0.000
2005-05 0.930 0.070   0.000
2005-06 0.930 0.070   0.000
2005-07 0.949 0.051   0.000
2005-08 0.940 0.060   0.000
2005-09 0.875 0.125   0.000
2005-10 0.815 0.185   0.000
2005-11 0.869 0.131   0.000
2005-12 0.928 0.072   0.000
2006-01 0.946 0.054   0.000
2006-02 0.933 0.067   0.000
2006-03 0.932 0.068   0.000
2006-04 0.937 0.063   0.000
2006-05 0.857 0.143   0.000
2006-06 0.888 0.112   0.000
2006-07 0.877 0.123   0.000
2006-08 0.849 0.151   0.000
2006-09 0.886 0.114   0.000
2006-10 0.864 0.136   0.000
2006-11 0.827 0.173   0.000
2006-12 0.852 0.148   0.000
2007-01 0.883 0.117   0.000
2007-02 0.827 0.173   0.000
2007-03 0.834 0.166   0.000
2007-04 0.844 0.156   0.000
2007-05 0.861 0.139   0.000
2007-06 0.861 0.139   0.000
2007-07 0.873 0.127   0.000
2007-08 0.892 0.108   0.000
2007-09 0.884 0.116 

In [141]:
#Implementing grid search to find the optimal max depth
#A hand-rolled grid search is also needed, because this model is validated on the held-out test window rather than by k-fold cross-validation

In [173]:
# Hyper-parameter grid: 6 depths x 5 forest sizes = 30 candidates.
param_grid = {
    'max_depth': [1, 3, 4, 5, 7, 10],
    'n_estimators': [100, 200, 300, 500, 1000],
}

# Built-in scorer name for negated log loss (higher is better). It is equivalent
# to make_scorer(log_loss, greater_is_better=False, needs_proba=True), but also
# works on recent scikit-learn releases, which no longer accept `needs_proba`.
scorer = 'neg_log_loss'

# Seeded like the other forests in this notebook so the search is repeatable.
rf = RandomForestClassifier(random_state=45)
# NOTE(review): cv=5 uses scikit-learn's default k-fold splitter for classifiers,
# which does not respect the time ordering of the rows - confirm that is intended.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring=scorer,
)

In [174]:
# Run the search (fit returns the search object itself) and show the winning parameter combination.
grid_search.fit(X_train, y_train).best_params_

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   44.1s finished


{'max_depth': 1, 'n_estimators': 200}

In [181]:
# Hand-rolled grid search: fit one forest per (max_depth, n_estimators) pair on
# the training window and score it on the chronologically later test window.
# NOTE(review): picking hyper-parameters by their test-window score makes that
# score an optimistic estimate of out-of-sample performance.

max_depth = [1, 3, 4, 5, 7, 10]
n_est = [100, 200, 300, 500, 1000]

grid_records = []  # one dict per candidate, so results can be sorted instead of copied by hand

for depth in max_depth:
    for n in n_est:

        model = RandomForestClassifier(n_estimators=n, max_depth=depth, random_state=45, max_features='sqrt')
        model.fit(X_train, y_train)
        probs = model.predict_proba(X_test)
        # Explicit labels keep the probability columns aligned with their classes.
        ll = log_loss(y_test, probs, labels=model.classes_)
        # Second probability column (class 1 for 0/1 labels) as a 1-D score vector.
        auc = roc_auc_score(y_test.values, probs[:, 1])
        grid_records.append({'max_depth': depth, 'n_estimators': n, 'log_loss': ll, 'auc': auc})
        print("Testing max depth {} and n_est of {}, yielding log loss of {} and AUC of {}".format(depth, n, ll, auc))

# After the loop `model` and `probs` refer to the last candidate fitted, not the best one.
manual_grid_results = pd.DataFrame(grid_records).sort_values('log_loss')
manual_grid_results.head()

Testing max depth 1 and n_est of 100, yielding log loss of 0.30605535441171733 and AUC of 0.8613569321533924
Testing max depth 1 and n_est of 200, yielding log loss of 0.3124820674444686 and AUC of 0.8783185840707964
Testing max depth 1 and n_est of 300, yielding log loss of 0.3162731796720755 and AUC of 0.8679941002949851
Testing max depth 1 and n_est of 500, yielding log loss of 0.3184693551200969 and AUC of 0.8325958702064897
Testing max depth 1 and n_est of 1000, yielding log loss of 0.3125202037382147 and AUC of 0.8488200589970503
Testing max depth 3 and n_est of 100, yielding log loss of 0.2695262131515273 and AUC of 0.8856932153392331
Testing max depth 3 and n_est of 200, yielding log loss of 0.26352082161025686 and AUC of 0.9107669616519174
Testing max depth 3 and n_est of 300, yielding log loss of 0.26942846949993965 and AUC of 0.8827433628318584
Testing max depth 3 and n_est of 500, yielding log loss of 0.275730266439535 and AUC of 0.8620943952802359
Testing max depth 3 and n

In [None]:
#Note: the results below were obtained without changing max_features

#Best AUC
#Testing max depth 7 and n_est of 200, yielding log loss of 0.2658659820011174 and AUC of 0.9174041297935103

#Best log loss
#Testing max depth 4 and n_est of 100, yielding log loss of 0.2608396722765084 and AUC of 0.8856932153392331
