In [927]:
#Imports and API Key
#building in offsets

#Imports and API Key

import pandas as pd
import quandl
from scipy import stats
import scipy
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

from sklearn.pipeline import Pipeline

from basis_expansions.basis_expansions import (
    Polynomial, LinearSpline)

from regression_tools.dftransformers import (
    ColumnSelector, Identity, FeatureUnion, MapFeature, Intercept)

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeClassifier

import xgboost as xgb

%aimport dataclean

quandl.ApiConfig.api_key = 'm8FYMyoCaJSbTrBASNHh'

In [928]:
#pulling data from Quandl

data = pd.read_csv('data_for_pull.csv') #staging the QUANDL keys to pull in CSV
cols = list(data['Var_name'].astype('str'))
dataset = quandl.get([val for val in data['Quandl Key']]) #looping through the QUANDL keys to pull it into one DF
dataset.columns = cols

In [929]:
#pulling FED Yield Curve data

yields = pd.read_csv('Fed10Y_3M.csv')
yields['Date'] = pd.to_datetime(yields['Date'])
yields['Date'] = yields['Date'].apply(lambda x: x.strftime('%Y-%m'))
yields = yields.set_index('Date')
yields = yields.drop(['3 Month Treasury Yield', 'Rec_prob', 'NBER_Rec','Unnamed: 7'], axis=1) 

In [930]:
#need to write in special logic to factor in for 3YRT being a daily

treas = dataset['3YRT']
dataset = dataset.drop(columns = ['3YRT'])

In [931]:
treas = treas.resample('MS').mean()

In [932]:
## back to working on the general data
dataset.index = dataset.index.strftime('%Y-%m') #converting the datetime index to Y/M so it is collapsable
dataset = dataset.groupby(dataset.index, as_index=True).agg(sum) #collapsing by Y/M

In [933]:
#readd 3YRT back into data

dataset = dataset.join(treas, how='outer')

In [934]:
#converting GDP quarterly data into monthly - need to convert it so it fills in the following 3 months

dataset = dataclean.convert_q_to_m(dataset, 'GDP')

#converting consumer sentiment into monthly

dataset = dataclean.convert_q_to_m(dataset, 'CONS_SENT')

In [935]:
offset_dict = {'PMI':1, 'UNR':1, 'YUNR':1,'HOME_SALES':1, 'PART_TIME':1, 'CPPR_PRICE':1, 'HOUS_PERMS':1, 'HOUS_STARTS':1, 'CAP_UTIL':1, 'PERS_SAVINGS':3, 'EXPORTS':3, 'IMPORTS':3, 'TRADE_BALANCE':3, 'US_M2':1, 'US_NHOME_SALES':1,'PPI':1,'CPI':1} #positives shift down, negatives shift up. Goal of this dict is to align data to when it gets released

In [936]:
for entry in offset_dict:
    #print(entry)
    #print(offset_dict[entry])
    dataset[str(entry)] = dataset[str(entry)].shift(offset_dict[entry])

In [937]:
#calculating change in GDP and converting Y into categorical values 
dataset['Recession'] = ((dataset['GDP'] - dataset['GDP'].shift(3)) < 0).astype(int)
#dataset = dataset.drop(columns = ['GDP','Recession']) #dropping calc column and recession column from dataset, experimenting with taking out fed funds rate

In [938]:
#merge fed interest rate data here
dataset = dataset.join(yields, how='outer')

In [939]:
dataset.shape

(1286, 25)

In [940]:
dataset = dataset[552:]

In [941]:
dataset = dataset[:-12]

In [942]:
#substituting mean value in for missing values and adding dummy column to indicate where done

for col in dataset.columns:
    if str(col)=='Recession':
        continue
    dataclean.clean_zeros(col, dataset)

In [943]:
dataset['3YRT'] = dataset['3YRT'].fillna(dataset['3YRT'].mean())

In [944]:
dataset.head()

Unnamed: 0,PMI,UNR,YUNR,GDP,CONS_SENT,HOME_SALES,PART_TIME,CPPR_PRICE,HOUS_PERMS,HOUS_STARTS,CAP_UTIL,PERS_SAVINGS,EXPORTS,IMPORTS,TRADE_BALANCE,INT_RATE,US_M2,US_NHOME_SALES,PPI,CPI,3YRT,Recession,10 Year Treasury Yield,3 Month Treasury Yield (Bond Equivalent Basis),Spread,HOME_SALES_PXY,CPPR_PRICE_PXY,HOUS_PERMS_PXY,HOUS_STARTS_PXY,CAP_UTIL_PXY,PERS_SAVINGS_PXY,US_M2_PXY,US_NHOME_SALES_PXY,CPI_PXY
1959-01-01,60.5,6.2,12.1,510.33,90.8,3353.324,1081.0,165.089,1331.222,1429.532,69.48,8.779,1349.0,1091.0,258.0,2.48,4028.724,606.03,31.6,13.254,5.572,0.0,4.02,2.88,1.14,1,1,1,1,1,1,1,1,0
1959-02-01,64.4,6.0,11.6,510.33,90.8,3353.324,1022.0,165.089,1331.222,1657.0,69.48,8.779,1401.0,1156.0,245.0,2.43,286.6,606.03,31.7,13.299,5.572,0.0,3.96,2.76,1.2,1,1,1,0,1,1,0,1,0
1959-03-01,66.9,5.9,11.1,510.33,90.8,3353.324,973.0,165.089,1331.222,1667.0,69.48,8.779,1339.0,1139.0,200.0,2.8,287.7,606.03,31.7,13.254,5.572,0.0,3.99,2.86,1.13,1,1,1,0,1,1,0,1,0
1959-04-01,67.1,5.6,11.1,522.653,90.8,3353.324,1102.0,165.089,1331.222,1620.0,69.48,11.3,1314.0,1166.0,148.0,2.96,289.2,606.03,31.7,13.254,5.572,0.0,4.12,3.01,1.11,1,1,1,0,1,0,0,1,0
1959-05-01,66.9,5.2,10.8,522.653,95.3,3353.324,1086.0,165.089,1331.222,1590.0,69.48,10.6,1256.0,1202.0,54.0,2.9,290.1,606.03,31.8,13.299,5.572,0.0,4.31,2.9,1.41,1,1,1,0,1,0,0,1,0


In [945]:
#adding momentum factors

momentum_cols = list(dataset.columns[:-6])

momentum_cols.remove('PPI') #removing PPI and CPI because they need a different transformation
momentum_cols.remove('CPI')
momentum_cols.remove('Recession')

for i in [1,3,12]:
    for col in momentum_cols:
        dataclean.create_momentum(col,dataset,i)

In [946]:
#CPI Calcs

for i in [1,3,12]:
    for col in ['CPI','PPI']:
        dataclean.infl_momentum(col,dataset,i)

In [947]:
#spline time - splines seriously impede the model, Time Horizon of 1 goes from LL of 4.9 to 8.19, AUC degreades by .04
#stickiness remains

#individual splines

CPPR_PRICE_fit = Pipeline([
    ('CPPR_PRICE', ColumnSelector(name='CPPR_PRICE')),
    ('CPPR_PRICE_spline', LinearSpline(knots=[160]))
])

Spread_fit = Pipeline([
    ('Spread', ColumnSelector(name='Spread')),
    ('Spread_spline', LinearSpline(knots=[0,0.25]))
])

EXPORTS_1m_shift_fit = Pipeline([
    ('EXPORTS_1m_shift', ColumnSelector(name='EXPORTS_1m_shift')),
    ('EXPORT1m_spline', LinearSpline(knots=[700,900]))
])

ThreeYT_1m_shift_fit = Pipeline([
    ('3YT_1m_shift', ColumnSelector(name='3YRT_1m_shift')),
    ('3YT_1m_spline', LinearSpline(knots=[-15]))
])

US_M2_1m_shift = Pipeline([
    ('US_M2_1m_shift', ColumnSelector(name='US_M2_1m_shift')),
    ('US_M2_1m_spline', LinearSpline(knots=[17]))
])

HOME_SALES_3m_shift = Pipeline([
    ('HOME_SALES_3m_shift', ColumnSelector(name='HOME_SALES_3m_shift')),
    ('HOME_SALES_3m_spline', LinearSpline(knots=[-500]))
])

PART_TIME_3m_shift = Pipeline([
    ('PART_TIME_3m_shift', ColumnSelector(name='PART_TIME_3m_shift')),
    ('PART_TIME_3m_spline', LinearSpline(knots=[-160]))
])

CAP_UTIL_3m_shift = Pipeline([
    ('CAP_UTIL_3m_shift', ColumnSelector(name='CAP_UTIL_3m_shift')),
    ('CAP_UTIL_3m_spline', LinearSpline(knots=[-0.8]))
])

EXPORTS_3m_shift = Pipeline([
    ('EXPORTS_3m_shift', ColumnSelector(name='EXPORTS_3m_shift')),
    ('EXPORTS_3m_spline', LinearSpline(knots=[1500,1600]))
])

IMPORTS_3m_shift = Pipeline([
    ('IMPORTS_3m_shift', ColumnSelector(name='IMPORTS_3m_shift')),
    ('IMPORTS_3m_spline', LinearSpline(knots=[2000]))
])

TRADE_BALANCE_3m_shift = Pipeline([
    ('TRADE_BALANCE_3m_shift', ColumnSelector(name='TRADE_BALANCE_3m_shift')),
    ('TRADE_BALANCE_3m_spline', LinearSpline(knots=[-2500]))
])

US_M2_3m_shift = Pipeline([
    ('US_M2_3m_shift', ColumnSelector(name='US_M2_3m_shift')),
    ('US_M2_3m_spline', LinearSpline(knots=[60]))
])

HOME_SALES_12m_shift = Pipeline([
    ('HOME_SALES_12m_shift', ColumnSelector(name='HOME_SALES_12m_shift')),
    ('HOME_SALES_12m_spline', LinearSpline(knots=[60]))
])

PART_TIME_12m_shift = Pipeline([
    ('PART_TIME_12m_shift', ColumnSelector(name='PART_TIME_12m_shift')),
    ('PART_TIME_12m_spline', LinearSpline(knots=[-225, -187.5, -180,-140]))
])

CPPR_PRICE_12m_shift = Pipeline([
    ('CPPR_PRICE_12m_shift', ColumnSelector(name='CPPR_PRICE_12m_shift')),
    ('CPPR_PRICE_12m_spline', LinearSpline(knots=[-30]))
])

CAP_UTIL_12m_shift = Pipeline([
    ('CAP_UTIL_12m_shift', ColumnSelector(name='CAP_UTIL_12m_shift')),
    ('CAP_UTIL_12m_spline', LinearSpline(knots=[-2]))
])

Spread_12m_shift = Pipeline([
    ('Spread_12m_shift', ColumnSelector(name='Spread_12m_shift')),
    ('Spread_12m_spline', LinearSpline(knots=[-1]))
])





#union features together

feature_pipeline = FeatureUnion([
    ('intercept', Intercept()),
    ('CPPR_PRICE_fit', CPPR_PRICE_fit),
    ('Spread_fit', Spread_fit),
    ('EXPORTS_1m_shift_fit', EXPORTS_1m_shift_fit),
    ('ThreeYT_1m_shift_fit', ThreeYT_1m_shift_fit),
    ("US_M2_1m_shift", US_M2_1m_shift),
    ("HOME_SALES_3m_shift", HOME_SALES_3m_shift),
    ("PART_TIME_3m_shift", PART_TIME_3m_shift),
    ("CAP_UTIL_3m_shift", CAP_UTIL_3m_shift),
    ("EXPORTS_3m_shift", EXPORTS_3m_shift),
    ("IMPORTS_3m_shift", IMPORTS_3m_shift),
    ("TRADE_BALANCE_3m_shift", TRADE_BALANCE_3m_shift),
    ("HOME_SALES_12m_shift", HOME_SALES_12m_shift),
    ("PART_TIME_12m_shift", PART_TIME_12m_shift),
    ("CPPR_Price_12m_shift", CPPR_PRICE_12m_shift),
    ("CAP_UTIL_12m_shift", CAP_UTIL_12m_shift),
    ("Spread_12m_shift", Spread_12m_shift)
])


feature_pipeline.fit(dataset)
features = feature_pipeline.transform(dataset)

In [948]:
#dropping columns from OG dataset that were splined

splined_cols = ['CPPR_PRICE','Spread','EXPORTS_1m_shift','3YRT_1m_shift','US_M2_1m_shift','PART_TIME_3m_shift',
'CAP_UTIL_3m_shift',
'EXPORTS_3m_shift',
'IMPORTS_3m_shift',
'TRADE_BALANCE_3m_shift',
'US_M2_3m_shift',
'HOME_SALES_12m_shift',
'PART_TIME_12m_shift',
'CPPR_PRICE_12m_shift',
'CAP_UTIL_12m_shift',
'Spread_12m_shift']

dataset = dataset.drop(columns = splined_cols)

In [949]:
dataset.shape

(722, 99)

In [950]:
dataset[12:]

Unnamed: 0,PMI,UNR,YUNR,GDP,CONS_SENT,HOME_SALES,PART_TIME,HOUS_PERMS,HOUS_STARTS,CAP_UTIL,PERS_SAVINGS,EXPORTS,IMPORTS,TRADE_BALANCE,INT_RATE,US_M2,US_NHOME_SALES,PPI,CPI,3YRT,Recession,10 Year Treasury Yield,3 Month Treasury Yield (Bond Equivalent Basis),HOME_SALES_PXY,CPPR_PRICE_PXY,HOUS_PERMS_PXY,HOUS_STARTS_PXY,CAP_UTIL_PXY,PERS_SAVINGS_PXY,US_M2_PXY,US_NHOME_SALES_PXY,CPI_PXY,PMI_1m_shift,UNR_1m_shift,YUNR_1m_shift,GDP_1m_shift,CONS_SENT_1m_shift,HOME_SALES_1m_shift,PART_TIME_1m_shift,CPPR_PRICE_1m_shift,HOUS_PERMS_1m_shift,HOUS_STARTS_1m_shift,CAP_UTIL_1m_shift,PERS_SAVINGS_1m_shift,IMPORTS_1m_shift,TRADE_BALANCE_1m_shift,INT_RATE_1m_shift,US_NHOME_SALES_1m_shift,10 Year Treasury Yield_1m_shift,3 Month Treasury Yield (Bond Equivalent Basis)_1m_shift,Spread_1m_shift,HOME_SALES_PXY_1m_shift,CPPR_PRICE_PXY_1m_shift,HOUS_PERMS_PXY_1m_shift,PMI_3m_shift,UNR_3m_shift,YUNR_3m_shift,GDP_3m_shift,CONS_SENT_3m_shift,HOME_SALES_3m_shift,CPPR_PRICE_3m_shift,HOUS_PERMS_3m_shift,HOUS_STARTS_3m_shift,PERS_SAVINGS_3m_shift,INT_RATE_3m_shift,US_NHOME_SALES_3m_shift,3YRT_3m_shift,10 Year Treasury Yield_3m_shift,3 Month Treasury Yield (Bond Equivalent Basis)_3m_shift,Spread_3m_shift,HOME_SALES_PXY_3m_shift,CPPR_PRICE_PXY_3m_shift,HOUS_PERMS_PXY_3m_shift,PMI_12m_shift,UNR_12m_shift,YUNR_12m_shift,GDP_12m_shift,CONS_SENT_12m_shift,HOUS_PERMS_12m_shift,HOUS_STARTS_12m_shift,PERS_SAVINGS_12m_shift,EXPORTS_12m_shift,IMPORTS_12m_shift,TRADE_BALANCE_12m_shift,INT_RATE_12m_shift,US_M2_12m_shift,US_NHOME_SALES_12m_shift,3YRT_12m_shift,10 Year Treasury Yield_12m_shift,3 Month Treasury Yield (Bond Equivalent Basis)_12m_shift,HOME_SALES_PXY_12m_shift,CPPR_PRICE_PXY_12m_shift,HOUS_PERMS_PXY_12m_shift,CPI_1m_shift,PPI_1m_shift,CPI_3m_shift,PPI_3m_shift,CPI_12m_shift,PPI_12m_shift
1960-01-01,58.200,5.300,11.100,542.648,93.800,3353.324,1000.000,1331.222,1601.000,69.480,9.400,1328.000,1184.000,144.000,3.990,297.800,606.030,31.500,13.483,5.572,0.000,4.720,4.460,1,1,1,0,1,0,0,1,0,7.600,-0.500,-0.200,14.048,0.000,0.000,7.000,0.000,0.000,185.000,0.000,0.700,-227.000,27.000,0.000,0.000,0.030,-0.140,0.170,0.000,0.000,0.000,9.900,-0.200,-0.200,14.048,-1.500,0.000,0.000,0.000,61.000,-1.300,0.010,0.000,0.000,0.190,0.310,-0.120,0.000,0.000,0.000,-2.300,-0.900,-1.000,32.318,3.000,0.000,171.468,0.621,-21.000,93.000,-114.000,1.510,-3730.924,0.000,0.000,0.700,1.580,0.000,0.000,0.000,0.000,0.000,0.341,-0.631,1.730,-0.316
1960-02-01,61.500,5.200,10.900,542.648,93.800,3353.324,1015.000,1092.000,1460.000,69.480,10.100,1376.000,1292.000,84.000,3.970,298.200,606.030,31.600,13.437,5.572,0.000,4.490,4.060,1,1,0,0,1,0,0,1,0,3.300,-0.100,-0.200,0.000,0.000,0.000,15.000,0.000,-239.222,-141.000,0.000,0.700,108.000,-60.000,-0.020,0.000,-0.230,-0.400,0.170,0.000,0.000,-1.000,11.800,-0.500,-0.600,14.048,0.000,0.000,0.000,-239.222,105.000,0.500,-0.030,0.000,0.000,-0.040,-0.190,0.150,0.000,0.000,-1.000,-2.900,-0.800,-0.700,32.318,3.000,-239.222,-197.000,1.321,-25.000,136.000,-161.000,1.540,11.600,0.000,0.000,0.530,1.300,0.000,0.000,-1.000,-0.340,0.317,-0.340,0.000,1.034,-0.315
1960-03-01,52.300,4.800,10.200,542.648,93.800,3353.324,1062.000,1088.000,1503.000,69.480,11.000,1493.000,1353.000,140.000,3.840,298.500,606.030,31.600,13.483,5.572,0.000,4.250,3.380,1,1,0,0,1,0,0,1,0,-9.200,-0.400,-0.700,0.000,0.000,0.000,47.000,0.000,-4.000,43.000,0.000,0.900,61.000,56.000,-0.130,0.000,-0.240,-0.680,0.440,0.000,0.000,0.000,1.700,-1.000,-1.100,14.048,0.000,0.000,0.000,-243.222,87.000,2.300,-0.150,0.000,0.000,-0.440,-1.220,0.780,0.000,0.000,-1.000,-14.600,-1.100,-0.900,32.318,3.000,-243.222,-164.000,2.221,154.000,214.000,-60.000,1.040,10.800,0.000,0.000,0.260,0.520,0.000,0.000,-1.000,0.341,0.000,0.000,0.317,1.730,-0.315
1960-04-01,47.800,5.400,11.500,541.080,93.800,3353.324,888.000,955.000,1109.000,69.480,10.900,2048.000,1883.000,165.000,3.920,299.400,606.030,31.800,13.483,5.572,1.000,4.280,3.300,1,1,0,0,1,0,0,1,0,-4.500,0.600,1.300,-1.568,0.000,0.000,-174.000,0.000,-133.000,-394.000,0.000,-0.100,530.000,25.000,0.080,0.000,0.030,-0.080,0.110,0.000,0.000,0.000,-10.400,0.100,0.400,-1.568,0.000,0.000,0.000,-376.222,-492.000,1.500,-0.070,0.000,0.000,-0.440,-1.160,0.720,0.000,0.000,-1.000,-19.300,-0.200,0.400,18.427,3.000,-376.222,-511.000,-0.400,734.000,717.000,17.000,0.960,10.200,0.000,0.000,0.160,0.290,0.000,0.000,-1.000,0.000,0.633,0.000,0.952,1.730,0.315
1960-05-01,45.300,5.200,10.900,541.080,93.300,3353.324,1041.000,1016.000,1289.000,69.480,10.600,2068.000,1989.000,79.000,3.850,300.100,606.030,31.800,13.529,5.572,1.000,4.350,3.360,1,1,0,0,1,0,0,1,0,-2.500,-0.200,-0.600,0.000,-0.500,0.000,153.000,0.000,61.000,180.000,0.000,-0.300,106.000,-86.000,-0.070,0.000,0.070,0.060,0.010,0.000,0.000,0.000,-16.200,0.000,0.000,-1.568,-0.500,0.000,0.000,-76.000,-171.000,0.500,-0.120,0.000,0.000,-0.140,-0.700,0.560,0.000,0.000,0.000,-21.600,0.000,0.100,18.427,-2.000,-315.222,-301.000,0.000,812.000,787.000,25.000,0.950,10.000,0.000,0.000,0.040,0.460,0.000,0.000,-1.000,0.340,0.000,0.683,0.633,1.724,0.000
1960-06-01,42.600,5.100,10.700,541.080,93.300,3353.324,988.000,1052.000,1271.000,69.480,9.400,2055.000,1927.000,128.000,3.320,300.900,606.030,31.700,13.529,5.572,1.000,4.150,2.510,1,1,0,0,1,0,0,1,0,-2.700,-0.100,-0.200,0.000,0.000,0.000,-53.000,0.000,36.000,-18.000,0.000,-1.200,-62.000,49.000,-0.530,0.000,-0.200,-0.850,0.650,0.000,0.000,0.000,-9.700,0.300,0.500,-1.568,-0.500,0.000,0.000,-36.000,-232.000,-1.600,-0.520,0.000,0.000,-0.100,-0.870,0.770,0.000,0.000,0.000,-25.600,0.000,0.700,18.427,-2.000,-279.222,-227.000,-0.900,729.000,707.000,22.000,-0.070,8.700,0.000,0.000,-0.190,-0.770,0.000,0.000,-1.000,0.000,-0.314,0.340,0.316,1.724,-0.314
1960-07-01,44.400,5.400,11.000,545.604,93.300,3353.324,966.000,958.000,1247.000,69.480,8.400,2199.000,1988.000,211.000,3.230,302.300,606.030,31.700,13.575,5.572,0.000,3.900,2.350,1,1,0,0,1,0,0,1,0,1.800,0.300,0.300,4.524,0.000,0.000,-22.000,0.000,-94.000,-24.000,0.000,-1.000,61.000,83.000,-0.090,0.000,-0.250,-0.160,-0.090,0.000,0.000,0.000,-3.400,0.000,-0.500,4.524,-0.500,0.000,0.000,3.000,138.000,-2.500,-0.690,0.000,0.000,-0.380,-0.950,0.570,0.000,0.000,0.000,-20.000,0.400,0.500,20.570,-2.000,-373.222,-256.000,-2.800,894.000,770.000,124.000,-0.240,8.200,0.000,0.000,-0.500,-0.920,0.000,0.000,-1.000,0.339,0.000,0.680,-0.314,1.718,0.000
1960-08-01,43.700,5.500,10.800,545.604,97.200,3353.324,1013.000,999.000,1197.000,69.480,10.400,2216.000,1913.000,303.000,2.980,304.100,606.030,31.700,13.575,5.572,0.000,3.800,2.350,1,1,0,0,1,0,0,1,0,-0.700,0.100,-0.200,0.000,3.900,0.000,47.000,0.000,41.000,-50.000,0.000,2.000,-75.000,92.000,-0.250,0.000,-0.100,0.000,-0.100,0.000,0.000,0.000,-1.600,0.300,-0.100,4.524,3.900,0.000,0.000,-17.000,-92.000,-0.200,-0.870,0.000,0.000,-0.550,-1.010,0.460,0.000,0.000,0.000,-17.800,0.400,0.300,20.570,1.900,-332.222,-350.000,-0.200,896.000,583.000,313.000,-0.520,8.900,0.000,0.000,-0.630,-1.110,0.000,0.000,-1.000,0.000,0.000,0.339,-0.314,1.370,0.000
1960-09-01,47.600,5.600,11.400,545.604,97.200,3353.324,1018.000,994.000,1344.000,69.480,10.400,2215.000,1911.000,304.000,2.600,306.900,606.030,31.600,13.575,5.572,0.000,3.800,2.530,1,1,0,0,1,0,0,1,0,3.900,0.100,0.600,0.000,0.000,0.000,5.000,0.000,-5.000,147.000,0.000,0.000,-2.000,1.000,-0.380,0.000,0.000,0.180,-0.180,0.000,0.000,0.000,5.000,0.500,0.700,4.524,3.900,0.000,0.000,-58.000,73.000,1.000,-0.720,0.000,0.000,-0.350,0.020,-0.370,0.000,0.000,0.000,-7.500,0.400,0.200,20.570,1.900,-337.222,-86.000,-0.100,858.000,610.000,248.000,-1.160,10.500,0.000,0.000,-0.880,-1.610,0.000,0.000,-1.000,0.000,-0.315,0.339,-0.315,1.370,0.000
1960-10-01,45.400,5.500,11.000,540.197,97.200,3353.324,1027.000,984.000,1097.000,69.480,10.400,2195.000,1925.000,270.000,2.470,308.400,606.030,31.600,13.575,5.572,1.000,3.890,2.350,1,1,0,0,1,0,0,1,0,-2.200,-0.100,-0.400,-5.407,0.000,0.000,9.000,0.000,-10.000,-247.000,0.000,0.000,14.000,-34.000,-0.130,0.000,0.090,-0.180,0.270,0.000,0.000,0.000,1.000,0.100,0.000,-5.407,3.900,0.000,0.000,26.000,-150.000,2.000,-0.760,0.000,0.000,-0.010,0.000,-0.010,0.000,0.000,0.000,-2.900,0.000,-0.300,11.597,1.900,-347.222,-443.000,-0.300,798.000,698.000,100.000,-1.510,11.700,0.000,0.000,-0.640,-1.800,0.000,0.000,-1.000,0.000,0.000,0.000,-0.315,1.024,-0.315


In [951]:
#cutoff most of missing data, Post March 2019, Prior 1959. CPI/PPI missing 2016 onward so need to cut that off
#dataset = dataset.iloc[552:]
#dataset = dataset.iloc[:-2]

#y = y.iloc[552:]
y = dataset['Recession']
dataset = dataset.drop(columns = ['Recession'])
X = dataset

In [952]:
### Data Prep Finished Here ###

In [961]:
y_shift = y.shift(-12) #needs to be negative to look forward
y_shift = y_shift.fillna(0)

In [962]:
X_train = X.iloc[12:550]
X_test = X.iloc[550:]
y_train = y_shift.iloc[12:550]
y_test = y_shift.iloc[550:]

In [963]:
model = LogisticRegression(penalty = 'l2', C=2000, max_iter = 100, solver = 'sag') #try throwing in a bigger C than 1
#RidgeClassifier().fit(X, y)
model.fit(X_train, y_train) #fitting model



LogisticRegression(C=2000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='sag',
          tol=0.0001, verbose=0, warm_start=False)

In [964]:
probs = model.predict_proba(X_test)

In [965]:
y_test.shape

(172,)

In [966]:
X_test.shape

(172, 98)

In [967]:
log_loss(y_test, probs)

0.3546175018821827

In [968]:
roc_auc_score(y_test.values, probs[:,1:])

0.8703124999999999

In [859]:
results = pd.DataFrame(probs)
results['actual'] = y_test.values
results.index = y_test.index
pd.set_option('display.float_format', lambda x: '%.3f' % x)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(results)

               0     1  actual
2004-11-01 0.994 0.006   0.000
2004-12-01 0.988 0.012   0.000
2005-01-01 0.994 0.006   0.000
2005-02-01 0.995 0.005   0.000
2005-03-01 0.985 0.015   0.000
2005-04-01 0.990 0.010   0.000
2005-05-01 0.990 0.010   0.000
2005-06-01 0.964 0.036   0.000
2005-07-01 0.992 0.008   0.000
2005-08-01 0.975 0.025   0.000
2005-09-01 0.972 0.028   0.000
2005-10-01 0.976 0.024   0.000
2005-11-01 0.971 0.029   0.000
2005-12-01 0.985 0.015   0.000
2006-01-01 0.971 0.029   0.000
2006-02-01 0.923 0.077   0.000
2006-03-01 0.969 0.031   0.000
2006-04-01 0.973 0.027   0.000
2006-05-01 0.905 0.095   0.000
2006-06-01 0.971 0.029   0.000
2006-07-01 0.929 0.071   0.000
2006-08-01 0.958 0.042   0.000
2006-09-01 0.924 0.076   0.000
2006-10-01 0.928 0.072   0.000
2006-11-01 0.941 0.059   0.000
2006-12-01 0.818 0.182   0.000
2007-01-01 0.715 0.285   0.000
2007-02-01 0.811 0.189   0.000
2007-03-01 0.849 0.151   0.000
2007-04-01 0.607 0.393   0.000
2007-05-01 0.714 0.286   0.000
2007-06-