In [1]:
#1.1 Initial datapoints. log loss 0.64
#1.2 Momentum factors, CPI/PPI Calculations, modified timeframe. LL: 1yr - 10.64 3M - 1.64
#1.3 Standardizing continuous features

In [20]:
#Imports and API Key

import pandas as pd
import quandl
from scipy import stats
import scipy
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

from sklearn.pipeline import Pipeline

from basis_expansions.basis_expansions import (
    Polynomial, LinearSpline)

from regression_tools.dftransformers import (
    ColumnSelector, Identity, FeatureUnion, MapFeature, Intercept)

from sklearn.preprocessing import StandardScaler

%aimport dataclean

quandl.ApiConfig.api_key = 'm8FYMyoCaJSbTrBASNHh'

In [21]:
#pulling data from Quandl

data = pd.read_csv('data_for_pull.csv') #staging the QUANDL keys to pull in CSV
cols = list(data['Var_name'].astype('str'))
dataset = quandl.get([val for val in data['Quandl Key']]) #looping through the QUANDL keys to pull it into one DF
dataset.columns = cols

In [22]:
#pulling FED Yield Curve data

yields = pd.read_csv('Fed10Y_3M.csv')
yields['Date'] = pd.to_datetime(yields['Date'])
yields['Date'] = yields['Date'].apply(lambda x: x.strftime('%Y-%m'))
yields = yields.set_index('Date')
yields = yields.drop(['3 Month Treasury Yield', 'Rec_prob', 'NBER_Rec','Unnamed: 7'], axis=1) 

In [23]:
## back to working on the general data
dataset.index = dataset.index.strftime('%Y-%m') #converting the datetime index to Y/M so it is collapsable
dataset = dataset.groupby(dataset.index, as_index=True).agg(sum) #collapsing by Y/M

In [24]:
#converting GDP quarterly data into monthly

dataset = dataclean.convert_q_to_m(dataset, 'GDP')

#converting consumer sentiment into monthly

dataset = dataclean.convert_q_to_m(dataset, 'CONS_SENT')

In [25]:
#calculating change in GDP and converting Y into categorical values 
dataset['Recession'] = ((dataset['GDP'] - dataset['GDP'].shift(3)) < 0).astype(int)

In [26]:
#merge fed interest rate data here
dataset = dataset.join(yields, how='outer')

In [27]:
#cutoff most of missing data, Post March 2019, Prior 1959. CPI/PPI missing 2016 onward so need to cut that off
dataset = dataset.iloc[552:]
dataset = dataset.iloc[:-59]

In [28]:
y = dataset['Recession'] #splitting off Y

In [29]:
dataset = dataset.drop(columns = ['GDP','Recession']) #dropping calc column and recession column from dataset, experimenting with taking out fed funds rate

In [30]:
#substituting mean value in for missing values and adding dummy column to indicate where done

for col in dataset.columns:
    dataclean.clean_zeros(col, dataset)

In [31]:
#adding momentum factors

momentum_cols = list(dataset.columns[:-6])

momentum_cols.remove('PPI') #removing PPI and CPI because they need a different transformation
momentum_cols.remove('CPI')

for i in [1,3,12]:
    for col in momentum_cols:
        dataclean.create_momentum(col,dataset,i)

In [32]:
#CPI Calcs

for i in [1,3,12]:
    for col in ['CPI','PPI']:
        dataclean.infl_momentum(col,dataset,i)

In [37]:
dataset.columns

Index(['PMI', 'UNR', 'YUNR', 'CONS_SENT', 'HOME_SALES', 'PART_TIME',
       'CPPR_PRICE', 'HOUS_PERMS', 'HOUS_STARTS', 'CAP_UTIL', 'PERS_SAVINGS',
       'EXPORTS', 'IMPORTS', 'TRADE_BALANCE', 'INT_RATE', 'US_M2',
       'US_NHOME_SALES', 'PPI', 'CPI', '3YRT', '10 Year Treasury Yield',
       '3 Month Treasury Yield (Bond Equivalent Basis)', 'Spread',
       'HOME_SALES_PXY', 'CPPR_PRICE_PXY', 'HOUS_PERMS_PXY', 'CAP_UTIL_PXY',
       'US_NHOME_SALES_PXY', '3YRT_PXY', 'PMI_1m_shift', 'UNR_1m_shift',
       'YUNR_1m_shift', 'CONS_SENT_1m_shift', 'HOME_SALES_1m_shift',
       'PART_TIME_1m_shift', 'CPPR_PRICE_1m_shift', 'HOUS_PERMS_1m_shift',
       'HOUS_STARTS_1m_shift', 'CAP_UTIL_1m_shift', 'PERS_SAVINGS_1m_shift',
       'EXPORTS_1m_shift', 'IMPORTS_1m_shift', 'TRADE_BALANCE_1m_shift',
       'INT_RATE_1m_shift', 'US_M2_1m_shift', 'US_NHOME_SALES_1m_shift',
       '3YRT_1m_shift', '10 Year Treasury Yield_1m_shift',
       '3 Month Treasury Yield (Bond Equivalent Basis)_1m_shift',
    

In [43]:
#spline time

#individual splines

CPPR_PRICE_fit = Pipeline([
    ('CPPR_PRICE', ColumnSelector(name='CPPR_PRICE')),
    ('CPPR_PRICE_spline', LinearSpline(knots=[160]))
])

Spread_fit = Pipeline([
    ('Spread', ColumnSelector(name='Spread')),
    ('Spread_spline', LinearSpline(knots=[0,0.25]))
])

EXPORTS_1m_shift_fit = Pipeline([
    ('EXPORTS_1m_shift', ColumnSelector(name='EXPORTS_1m_shift')),
    ('EXPORT1m_spline', LinearSpline(knots=[700,900]))
])

ThreeYT_1m_shift_fit = Pipeline([
    ('3YT_1m_shift', ColumnSelector(name='3YRT_1m_shift')),
    ('3YT_1m_spline', LinearSpline(knots=[-15]))
])

US_M2_1m_shift = Pipeline([
    ('US_M2_1m_shift', ColumnSelector(name='US_M2_1m_shift')),
    ('US_M2_1m_spline', LinearSpline(knots=[17]))
])

HOME_SALES_3m_shift = Pipeline([
    ('HOME_SALES_3m_shift', ColumnSelector(name='HOME_SALES_3m_shift')),
    ('HOME_SALES_3m_spline', LinearSpline(knots=[-500]))
])

PART_TIME_3m_shift = Pipeline([
    ('PART_TIME_3m_shift', ColumnSelector(name='PART_TIME_3m_shift')),
    ('PART_TIME_3m_spline', LinearSpline(knots=[-160]))
])

CAP_UTIL_3m_shift = Pipeline([
    ('CAP_UTIL_3m_shift', ColumnSelector(name='CAP_UTIL_3m_shift')),
    ('CAP_UTIL_3m_spline', LinearSpline(knots=[-0.8]))
])

EXPORTS_3m_shift = Pipeline([
    ('EXPORTS_3m_shift', ColumnSelector(name='EXPORTS_3m_shift')),
    ('EXPORTS_3m_spline', LinearSpline(knots=[1500,1600]))
])

IMPORTS_3m_shift = Pipeline([
    ('IMPORTS_3m_shift', ColumnSelector(name='IMPORTS_3m_shift')),
    ('IMPORTS_3m_spline', LinearSpline(knots=[2000]))
])

TRADE_BALANCE_3m_shift = Pipeline([
    ('TRADE_BALANCE_3m_shift', ColumnSelector(name='TRADE_BALANCE_3m_shift')),
    ('TRADE_BALANCE_3m_spline', LinearSpline(knots=[-2500]))
])

US_M2_3m_shift = Pipeline([
    ('US_M2_3m_shift', ColumnSelector(name='US_M2_3m_shift')),
    ('US_M2_3m_spline', LinearSpline(knots=[60]))
])

HOME_SALES_12m_shift = Pipeline([
    ('HOME_SALES_12m_shift', ColumnSelector(name='HOME_SALES_12m_shift')),
    ('HOME_SALES_12m_spline', LinearSpline(knots=[60]))
])

PART_TIME_12m_shift = Pipeline([
    ('PART_TIME_12m_shift', ColumnSelector(name='PART_TIME_12m_shift')),
    ('PART_TIME_12m_spline', LinearSpline(knots=[-225, -187.5, -180,-140]))
])

CPPR_PRICE_12m_shift = Pipeline([
    ('CPPR_PRICE_12m_shift', ColumnSelector(name='CPPR_PRICE_12m_shift')),
    ('CPPR_PRICE_12m_spline', LinearSpline(knots=[-30]))
])

CAP_UTIL_12m_shift = Pipeline([
    ('CAP_UTIL_12m_shift', ColumnSelector(name='CAP_UTIL_12m_shift')),
    ('CAP_UTIL_12m_spline', LinearSpline(knots=[-2]))
])

Spread_12m_shift = Pipeline([
    ('Spread_12m_shift', ColumnSelector(name='Spread_12m_shift')),
    ('Spread_12m_spline', LinearSpline(knots=[-1]))
])





#union features together

feature_pipeline = FeatureUnion([
    ('intercept', Intercept()),
    ('CPPR_PRICE_fit', CPPR_PRICE_fit),
    ('Spread_fit', Spread_fit),
    ('EXPORTS_1m_shift_fit', EXPORTS_1m_shift_fit),
    ('ThreeYT_1m_shift_fit', ThreeYT_1m_shift_fit),
    ("US_M2_1m_shift", US_M2_1m_shift),
    ("HOME_SALES_3m_shift", HOME_SALES_3m_shift),
    ("PART_TIME_3m_shift", PART_TIME_3m_shift),
    ("CAP_UTIL_3m_shift", CAP_UTIL_3m_shift),
    ("EXPORTS_3m_shift", EXPORTS_3m_shift),
    ("IMPORTS_3m_shift", IMPORTS_3m_shift),
    ("TRADE_BALANCE_3m_shift", TRADE_BALANCE_3m_shift),
    ("HOME_SALES_12m_shift", HOME_SALES_12m_shift),
    ("PART_TIME_12m_shift", PART_TIME_12m_shift),
    ("CPPR_Price_12m_shift", CPPR_PRICE_12m_shift),
    ("CAP_UTIL_12m_shift", CAP_UTIL_12m_shift),
    ("Spread_12m_shift", Spread_12m_shift)
])


feature_pipeline.fit(dataset)
features = feature_pipeline.transform(dataset)

#add back into the dataset

In [45]:
features

Unnamed: 0,intercept,CPPR_PRICE_spline_linear,CPPR_PRICE_spline_0,Spread_spline_linear,Spread_spline_0,Spread_spline_1,EXPORTS_1m_shift_spline_linear,EXPORTS_1m_shift_spline_0,EXPORTS_1m_shift_spline_1,3YRT_1m_shift_spline_linear,...,PART_TIME_12m_shift_spline_0,PART_TIME_12m_shift_spline_1,PART_TIME_12m_shift_spline_2,PART_TIME_12m_shift_spline_3,CPPR_PRICE_12m_shift_spline_linear,CPPR_PRICE_12m_shift_spline_0,CAP_UTIL_12m_shift_spline_linear,CAP_UTIL_12m_shift_spline_0,Spread_12m_shift_spline_linear,Spread_12m_shift_spline_0
1959-01,1.0,152.021926,0.0,1.14,1.14,0.89,,,,,...,,,,,,,,,,
1959-02,1.0,152.021926,0.0,1.20,1.20,0.95,-58.0,0.0,0.0,0.00,...,,,,,,,,,,
1959-03,1.0,152.021926,0.0,1.13,1.13,0.88,70.0,0.0,0.0,0.00,...,,,,,,,,,,
1959-04,1.0,152.021926,0.0,1.11,1.11,0.86,-21.0,0.0,0.0,0.00,...,,,,,,,,,,
1959-05,1.0,152.021926,0.0,1.41,1.41,1.16,15.0,0.0,0.0,0.00,...,,,,,,,,,,
1959-06,1.0,152.021926,0.0,1.06,1.06,0.81,37.0,0.0,0.0,0.00,...,,,,,,,,,,
1959-07,1.0,152.021926,0.0,1.13,1.13,0.88,40.0,0.0,0.0,0.00,...,,,,,,,,,,
1959-08,1.0,152.021926,0.0,0.97,0.97,0.72,35.0,0.0,0.0,0.00,...,,,,,,,,,,
1959-09,1.0,152.021926,0.0,0.54,0.54,0.29,96.0,0.0,0.0,0.00,...,,,,,,,,,,
1959-10,1.0,152.021926,0.0,0.38,0.38,0.13,-200.0,0.0,0.0,0.00,...,,,,,,,,,,


In [15]:
dataset.columns

Index(['PMI', 'UNR', 'YUNR', 'CONS_SENT', 'HOME_SALES', 'PART_TIME',
       'CPPR_PRICE', 'HOUS_PERMS', 'HOUS_STARTS', 'CAP_UTIL', 'PERS_SAVINGS',
       'EXPORTS', 'IMPORTS', 'TRADE_BALANCE', 'INT_RATE', 'US_M2',
       'US_NHOME_SALES', 'PPI', 'CPI', '3YRT', '10 Year Treasury Yield',
       '3 Month Treasury Yield (Bond Equivalent Basis)', 'Spread',
       'HOME_SALES_PXY', 'CPPR_PRICE_PXY', 'HOUS_PERMS_PXY', 'CAP_UTIL_PXY',
       'US_NHOME_SALES_PXY', '3YRT_PXY', 'PMI_1m_shift', 'UNR_1m_shift',
       'YUNR_1m_shift', 'CONS_SENT_1m_shift', 'HOME_SALES_1m_shift',
       'PART_TIME_1m_shift', 'CPPR_PRICE_1m_shift', 'HOUS_PERMS_1m_shift',
       'HOUS_STARTS_1m_shift', 'CAP_UTIL_1m_shift', 'PERS_SAVINGS_1m_shift',
       'EXPORTS_1m_shift', 'IMPORTS_1m_shift', 'TRADE_BALANCE_1m_shift',
       'INT_RATE_1m_shift', 'US_M2_1m_shift', 'US_NHOME_SALES_1m_shift',
       '3YRT_1m_shift', '10 Year Treasury Yield_1m_shift',
       '3 Month Treasury Yield (Bond Equivalent Basis)_1m_shift',
    

In [37]:
#standardizing in the logistic regression model specifically as that's the only model it will impact

stand_cols = ['PMI', 'UNR', 'YUNR', 'CONS_SENT', 'HOME_SALES', 'PART_TIME',
       'CPPR_PRICE', 'HOUS_PERMS', 'HOUS_STARTS', 'CAP_UTIL', 'PERS_SAVINGS',
       'EXPORTS', 'IMPORTS', 'TRADE_BALANCE', 'INT_RATE', 'US_M2',
       'US_NHOME_SALES', 'PPI', 'CPI', '3YRT', '10 Year Treasury Yield',
       '3 Month Treasury Yield (Bond Equivalent Basis)', 'Spread','PMI_1m_shift', 'UNR_1m_shift',
       'YUNR_1m_shift', 'CONS_SENT_1m_shift', 'HOME_SALES_1m_shift',
       'PART_TIME_1m_shift', 'CPPR_PRICE_1m_shift', 'HOUS_PERMS_1m_shift',
       'HOUS_STARTS_1m_shift', 'CAP_UTIL_1m_shift', 'PERS_SAVINGS_1m_shift',
       'EXPORTS_1m_shift', 'IMPORTS_1m_shift', 'TRADE_BALANCE_1m_shift',
       'INT_RATE_1m_shift', 'US_M2_1m_shift', 'US_NHOME_SALES_1m_shift',
       '3YRT_1m_shift', '10 Year Treasury Yield_1m_shift',
       '3 Month Treasury Yield (Bond Equivalent Basis)_1m_shift',
       'Spread_1m_shift', 'PMI_3m_shift', 'UNR_3m_shift', 'YUNR_3m_shift',
       'CONS_SENT_3m_shift', 'HOME_SALES_3m_shift', 'PART_TIME_3m_shift',
       'CPPR_PRICE_3m_shift', 'HOUS_PERMS_3m_shift', 'HOUS_STARTS_3m_shift',
       'CAP_UTIL_3m_shift', 'PERS_SAVINGS_3m_shift', 'EXPORTS_3m_shift',
       'IMPORTS_3m_shift', 'TRADE_BALANCE_3m_shift', 'INT_RATE_3m_shift',
       'US_M2_3m_shift', 'US_NHOME_SALES_3m_shift', '3YRT_3m_shift',
       '10 Year Treasury Yield_3m_shift',
       '3 Month Treasury Yield (Bond Equivalent Basis)_3m_shift',
       'Spread_3m_shift', 'PMI_12m_shift', 'UNR_12m_shift', 'YUNR_12m_shift',
       'CONS_SENT_12m_shift', 'HOME_SALES_12m_shift', 'PART_TIME_12m_shift',
       'CPPR_PRICE_12m_shift', 'HOUS_PERMS_12m_shift', 'HOUS_STARTS_12m_shift',
       'CAP_UTIL_12m_shift', 'PERS_SAVINGS_12m_shift', 'EXPORTS_12m_shift',
       'IMPORTS_12m_shift', 'TRADE_BALANCE_12m_shift', 'INT_RATE_12m_shift',
       'US_M2_12m_shift', 'US_NHOME_SALES_12m_shift', '3YRT_12m_shift',
       '10 Year Treasury Yield_12m_shift',
       '3 Month Treasury Yield (Bond Equivalent Basis)_12m_shift',
       'Spread_12m_shift', 'CPI_1m_shift', 'PPI_1m_shift', 'CPI_3m_shift',
       'PPI_3m_shift', 'CPI_12m_shift', 'PPI_12m_shift']

dataset[stand_cols] = StandardScaler().fit_transform(dataset[stand_cols])


In [38]:
X = dataset

In [39]:
### Data Prep Finished Here ###

In [40]:
y_shift = y.shift(0) #shifting y to forecast 3 months out

In [41]:
y_shift = y_shift.fillna(0)

In [42]:
X_train = X.iloc[12:550]
X_test = X.iloc[550:]
y_train = y_shift.iloc[12:550]
y_test = y_shift.iloc[550:]

In [43]:
X_train.columns

Index(['PMI', 'UNR', 'YUNR', 'CONS_SENT', 'HOME_SALES', 'PART_TIME',
       'CPPR_PRICE', 'HOUS_PERMS', 'HOUS_STARTS', 'CAP_UTIL', 'PERS_SAVINGS',
       'EXPORTS', 'IMPORTS', 'TRADE_BALANCE', 'INT_RATE', 'US_M2',
       'US_NHOME_SALES', 'PPI', 'CPI', '3YRT', '10 Year Treasury Yield',
       '3 Month Treasury Yield (Bond Equivalent Basis)', 'Spread',
       'HOME_SALES_PXY', 'CPPR_PRICE_PXY', 'HOUS_PERMS_PXY', 'CAP_UTIL_PXY',
       'US_NHOME_SALES_PXY', '3YRT_PXY', 'PMI_1m_shift', 'UNR_1m_shift',
       'YUNR_1m_shift', 'CONS_SENT_1m_shift', 'HOME_SALES_1m_shift',
       'PART_TIME_1m_shift', 'CPPR_PRICE_1m_shift', 'HOUS_PERMS_1m_shift',
       'HOUS_STARTS_1m_shift', 'CAP_UTIL_1m_shift', 'PERS_SAVINGS_1m_shift',
       'EXPORTS_1m_shift', 'IMPORTS_1m_shift', 'TRADE_BALANCE_1m_shift',
       'INT_RATE_1m_shift', 'US_M2_1m_shift', 'US_NHOME_SALES_1m_shift',
       '3YRT_1m_shift', '10 Year Treasury Yield_1m_shift',
       '3 Month Treasury Yield (Bond Equivalent Basis)_1m_shift',
    

In [64]:
model = LogisticRegression() #try throwing in a bigger C than 1
model.fit(X_train, y_train) #fitting model



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [65]:
probs = model.predict_proba(X_test)

In [66]:
log_loss(y_test, probs)

5.930068219021127

In [67]:
roc_auc_score(y_test.values, probs[:,1:])

0.7912979351032449

In [68]:
for object in zip(dataset.columns,model.coef_[0] ):
    print(object)

('PMI', -0.5577581345059722)
('UNR', -0.005793428131188654)
('YUNR', 0.16261354189189647)
('CONS_SENT', -0.2962351563321862)
('HOME_SALES', -0.38970432370944547)
('PART_TIME', -0.01998451273796755)
('CPPR_PRICE', 1.2766653987212835)
('HOUS_PERMS', -0.49900938269381173)
('HOUS_STARTS', -0.1714564716487508)
('CAP_UTIL', -0.6261149105404844)
('PERS_SAVINGS', -0.19429642007678607)
('EXPORTS', 0.5071290638835023)
('IMPORTS', 0.5052557348145503)
('TRADE_BALANCE', -0.4496205528634843)
('INT_RATE', -0.22905258285221383)
('US_M2', 0.3268464317083054)
('US_NHOME_SALES', -0.5971097747775221)
('PPI', 0.11533946625709293)
('CPI', -0.19901845631168136)
('3YRT', 0.15775302232099553)
('10 Year Treasury Yield', -0.4004110132370056)
('3 Month Treasury Yield (Bond Equivalent Basis)', -0.2803050711129759)
('Spread', -0.1522306004660534)
('HOME_SALES_PXY', -0.30414118837917586)
('CPPR_PRICE_PXY', -0.2377555779702628)
('HOUS_PERMS_PXY', 0.0)
('CAP_UTIL_PXY', -0.2377555779702628)
('US_NHOME_SALES_PXY', 0.513

In [28]:
results = pd.DataFrame(probs)
results['actual'] = y_test.values
results.index = y_test.index
pd.set_option('display.float_format', lambda x: '%.3f' % x)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(results)

            0     1  actual
2004-11 1.000 0.000   0.000
2004-12 1.000 0.000   0.000
2005-01 1.000 0.000   0.000
2005-02 1.000 0.000   0.000
2005-03 1.000 0.000   0.000
2005-04 1.000 0.000   0.000
2005-05 1.000 0.000   0.000
2005-06 0.262 0.738   0.000
2005-07 1.000 0.000   0.000
2005-08 1.000 0.000   0.000
2005-09 1.000 0.000   0.000
2005-10 1.000 0.000   0.000
2005-11 0.000 1.000   0.000
2005-12 1.000 0.000   0.000
2006-01 1.000 0.000   0.000
2006-02 1.000 0.000   0.000
2006-03 0.997 0.003   0.000
2006-04 1.000 0.000   0.000
2006-05 1.000 0.000   0.000
2006-06 0.001 0.999   0.000
2006-07 0.495 0.505   0.000
2006-08 0.000 1.000   0.000
2006-09 0.000 1.000   0.000
2006-10 0.000 1.000   0.000
2006-11 0.000 1.000   0.000
2006-12 0.000 1.000   0.000
2007-01 0.000 1.000   1.000
2007-02 0.000 1.000   1.000
2007-03 0.000 1.000   1.000
2007-04 0.000 1.000   0.000
2007-05 0.000 1.000   0.000
2007-06 0.000 1.000   0.000
2007-07 0.000 1.000   0.000
2007-08 0.000 1.000   0.000
2007-09 0.000 1.000 