In [2]:
#1.1 Initial datapoints. log loss 0.64
#1.2 Momentum factors, CPI/PPI Calculations, modified timeframe. LL: 1yr - 10.64 3M - 1.64
#1.3 Standardizing continuous features
#1.4 Inserting Linear Splines, Implement Ridge Regression to dampen spline overfitting

In [83]:
#Ridge Regression Gameplan

#Find Alpha


In [119]:
#Imports and API Key

import pandas as pd
import quandl
from scipy import stats
import scipy
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

from sklearn.pipeline import Pipeline

from basis_expansions.basis_expansions import (
    Polynomial, LinearSpline)

from regression_tools.dftransformers import (
    ColumnSelector, Identity, FeatureUnion, MapFeature, Intercept)

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeClassifier

%aimport dataclean

quandl.ApiConfig.api_key = 'm8FYMyoCaJSbTrBASNHh'

In [85]:
#pulling data from Quandl

data = pd.read_csv('data_for_pull.csv') #staging the QUANDL keys to pull in CSV
cols = list(data['Var_name'].astype('str'))
dataset = quandl.get([val for val in data['Quandl Key']]) #looping through the QUANDL keys to pull it into one DF
dataset.columns = cols

In [86]:
#pulling FED Yield Curve data

yields = pd.read_csv('Fed10Y_3M.csv')
yields['Date'] = pd.to_datetime(yields['Date'])
yields['Date'] = yields['Date'].apply(lambda x: x.strftime('%Y-%m'))
yields = yields.set_index('Date')
yields = yields.drop(['3 Month Treasury Yield', 'Rec_prob', 'NBER_Rec','Unnamed: 7'], axis=1) 

In [87]:
## back to working on the general data
dataset.index = dataset.index.strftime('%Y-%m') #converting the datetime index to Y/M so it is collapsable
dataset = dataset.groupby(dataset.index, as_index=True).agg(sum) #collapsing by Y/M

In [88]:
#converting GDP quarterly data into monthly

dataset = dataclean.convert_q_to_m(dataset, 'GDP')

#converting consumer sentiment into monthly

dataset = dataclean.convert_q_to_m(dataset, 'CONS_SENT')

In [89]:
#calculating change in GDP and converting Y into categorical values 
dataset['Recession'] = ((dataset['GDP'] - dataset['GDP'].shift(3)) < 0).astype(int)

In [90]:
#merge fed interest rate data here
dataset = dataset.join(yields, how='outer')

In [91]:
#we split off the recession data here because later we start purging 0s
y = dataset['Recession'] #splitting off Y
dataset = dataset.drop(columns = ['GDP','Recession'])

In [92]:
#substituting mean value in for missing values and adding dummy column to indicate where done

for col in dataset.columns:
    dataclean.clean_zeros(col, dataset)

In [93]:
#adding momentum factors

momentum_cols = list(dataset.columns[:-6])

momentum_cols.remove('PPI') #removing PPI and CPI because they need a different transformation
momentum_cols.remove('CPI')

for i in [1,3,12]:
    for col in momentum_cols:
        dataclean.create_momentum(col,dataset,i)

In [94]:
#CPI Calcs

for i in [1,3,12]:
    for col in ['CPI','PPI']:
        dataclean.infl_momentum(col,dataset,i)

In [95]:
#cutoff most of missing data, Post March 2019, Prior 1959. CPI/PPI missing 2016 onward so need to cut that off
dataset = dataset.iloc[552:]
dataset = dataset.iloc[:-59]

In [96]:
dataset.head()

Unnamed: 0,PMI,UNR,YUNR,CONS_SENT,HOME_SALES,PART_TIME,CPPR_PRICE,HOUS_PERMS,HOUS_STARTS,CAP_UTIL,...,PERS_SAVINGS_PXY_12m_shift,EXPORTS_PXY_12m_shift,IMPORTS_PXY_12m_shift,TRADE_BALANCE_PXY_12m_shift,CPI_1m_shift,PPI_1m_shift,CPI_3m_shift,PPI_3m_shift,CPI_12m_shift,PPI_12m_shift
1959-01,64.4,6.0,11.6,90.8,1901.732,1022.0,93.706,754.262,1657.0,39.376,...,-1.0,0.0,0.0,0.0,0.346,0.316,0.346,0.635,1.399,0.635
1959-02,66.9,5.9,11.1,90.8,1901.732,973.0,93.706,754.262,1667.0,39.376,...,-1.0,0.0,0.0,0.0,-0.345,0.0,-0.345,0.316,1.049,0.635
1959-03,67.1,5.6,11.1,90.8,1901.732,1102.0,93.706,754.262,1620.0,39.376,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.316,0.347,0.0
1959-04,66.9,5.2,10.8,90.8,1901.732,1086.0,93.706,754.262,1590.0,39.376,...,-1.0,0.0,0.0,0.0,0.346,0.315,0.0,0.315,0.346,0.633
1959-05,68.2,5.1,10.0,95.3,1901.732,968.0,93.706,754.262,1498.0,39.376,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.346,0.315,0.346,0.315


In [97]:


#spline time - splines seriously impede the model, Time Horizon of 1 goes from LL of 4.9 to 8.19, AUC degreades by .04
#stickiness remains

#individual splines

CPPR_PRICE_fit = Pipeline([
    ('CPPR_PRICE', ColumnSelector(name='CPPR_PRICE')),
    ('CPPR_PRICE_spline', LinearSpline(knots=[160]))
])

Spread_fit = Pipeline([
    ('Spread', ColumnSelector(name='Spread')),
    ('Spread_spline', LinearSpline(knots=[0,0.25]))
])

EXPORTS_1m_shift_fit = Pipeline([
    ('EXPORTS_1m_shift', ColumnSelector(name='EXPORTS_1m_shift')),
    ('EXPORT1m_spline', LinearSpline(knots=[700,900]))
])

ThreeYT_1m_shift_fit = Pipeline([
    ('3YT_1m_shift', ColumnSelector(name='3YRT_1m_shift')),
    ('3YT_1m_spline', LinearSpline(knots=[-15]))
])

US_M2_1m_shift = Pipeline([
    ('US_M2_1m_shift', ColumnSelector(name='US_M2_1m_shift')),
    ('US_M2_1m_spline', LinearSpline(knots=[17]))
])

HOME_SALES_3m_shift = Pipeline([
    ('HOME_SALES_3m_shift', ColumnSelector(name='HOME_SALES_3m_shift')),
    ('HOME_SALES_3m_spline', LinearSpline(knots=[-500]))
])

PART_TIME_3m_shift = Pipeline([
    ('PART_TIME_3m_shift', ColumnSelector(name='PART_TIME_3m_shift')),
    ('PART_TIME_3m_spline', LinearSpline(knots=[-160]))
])

CAP_UTIL_3m_shift = Pipeline([
    ('CAP_UTIL_3m_shift', ColumnSelector(name='CAP_UTIL_3m_shift')),
    ('CAP_UTIL_3m_spline', LinearSpline(knots=[-0.8]))
])

EXPORTS_3m_shift = Pipeline([
    ('EXPORTS_3m_shift', ColumnSelector(name='EXPORTS_3m_shift')),
    ('EXPORTS_3m_spline', LinearSpline(knots=[1500,1600]))
])

IMPORTS_3m_shift = Pipeline([
    ('IMPORTS_3m_shift', ColumnSelector(name='IMPORTS_3m_shift')),
    ('IMPORTS_3m_spline', LinearSpline(knots=[2000]))
])

TRADE_BALANCE_3m_shift = Pipeline([
    ('TRADE_BALANCE_3m_shift', ColumnSelector(name='TRADE_BALANCE_3m_shift')),
    ('TRADE_BALANCE_3m_spline', LinearSpline(knots=[-2500]))
])

US_M2_3m_shift = Pipeline([
    ('US_M2_3m_shift', ColumnSelector(name='US_M2_3m_shift')),
    ('US_M2_3m_spline', LinearSpline(knots=[60]))
])

HOME_SALES_12m_shift = Pipeline([
    ('HOME_SALES_12m_shift', ColumnSelector(name='HOME_SALES_12m_shift')),
    ('HOME_SALES_12m_spline', LinearSpline(knots=[60]))
])

PART_TIME_12m_shift = Pipeline([
    ('PART_TIME_12m_shift', ColumnSelector(name='PART_TIME_12m_shift')),
    ('PART_TIME_12m_spline', LinearSpline(knots=[-225, -187.5, -180,-140]))
])

CPPR_PRICE_12m_shift = Pipeline([
    ('CPPR_PRICE_12m_shift', ColumnSelector(name='CPPR_PRICE_12m_shift')),
    ('CPPR_PRICE_12m_spline', LinearSpline(knots=[-30]))
])

CAP_UTIL_12m_shift = Pipeline([
    ('CAP_UTIL_12m_shift', ColumnSelector(name='CAP_UTIL_12m_shift')),
    ('CAP_UTIL_12m_spline', LinearSpline(knots=[-2]))
])

Spread_12m_shift = Pipeline([
    ('Spread_12m_shift', ColumnSelector(name='Spread_12m_shift')),
    ('Spread_12m_spline', LinearSpline(knots=[-1]))
])





#union features together

feature_pipeline = FeatureUnion([
    ('intercept', Intercept()),
    ('CPPR_PRICE_fit', CPPR_PRICE_fit),
    ('Spread_fit', Spread_fit),
    ('EXPORTS_1m_shift_fit', EXPORTS_1m_shift_fit),
    ('ThreeYT_1m_shift_fit', ThreeYT_1m_shift_fit),
    ("US_M2_1m_shift", US_M2_1m_shift),
    ("HOME_SALES_3m_shift", HOME_SALES_3m_shift),
    ("PART_TIME_3m_shift", PART_TIME_3m_shift),
    ("CAP_UTIL_3m_shift", CAP_UTIL_3m_shift),
    ("EXPORTS_3m_shift", EXPORTS_3m_shift),
    ("IMPORTS_3m_shift", IMPORTS_3m_shift),
    ("TRADE_BALANCE_3m_shift", TRADE_BALANCE_3m_shift),
    ("HOME_SALES_12m_shift", HOME_SALES_12m_shift),
    ("PART_TIME_12m_shift", PART_TIME_12m_shift),
    ("CPPR_Price_12m_shift", CPPR_PRICE_12m_shift),
    ("CAP_UTIL_12m_shift", CAP_UTIL_12m_shift),
    ("Spread_12m_shift", Spread_12m_shift)
])


feature_pipeline.fit(dataset)
features = feature_pipeline.transform(dataset)

In [98]:
#dropping columns from OG dataset that were splined

splined_cols = ['CPPR_PRICE','Spread','EXPORTS_1m_shift','3YRT_1m_shift','US_M2_1m_shift','PART_TIME_3m_shift',
'CAP_UTIL_3m_shift',
'EXPORTS_3m_shift',
'IMPORTS_3m_shift',
'TRADE_BALANCE_3m_shift',
'US_M2_3m_shift',
'HOME_SALES_12m_shift',
'PART_TIME_12m_shift',
'CPPR_PRICE_12m_shift',
'CAP_UTIL_12m_shift',
'Spread_12m_shift']

dataset = dataset.drop(columns = splined_cols)

In [99]:
#merge splined features into dataset

#dataset = dataset.join(features, how='outer')

In [100]:
list(dataset.columns)

['PMI',
 'UNR',
 'YUNR',
 'CONS_SENT',
 'HOME_SALES',
 'PART_TIME',
 'HOUS_PERMS',
 'HOUS_STARTS',
 'CAP_UTIL',
 'PERS_SAVINGS',
 'EXPORTS',
 'IMPORTS',
 'TRADE_BALANCE',
 'INT_RATE',
 'US_M2',
 'US_NHOME_SALES',
 'PPI',
 'CPI',
 '3YRT',
 '10 Year Treasury Yield',
 '3 Month Treasury Yield (Bond Equivalent Basis)',
 'PMI_PXY',
 'UNR_PXY',
 'YUNR_PXY',
 'CONS_SENT_PXY',
 'HOME_SALES_PXY',
 'PART_TIME_PXY',
 'CPPR_PRICE_PXY',
 'HOUS_PERMS_PXY',
 'HOUS_STARTS_PXY',
 'CAP_UTIL_PXY',
 'PERS_SAVINGS_PXY',
 'EXPORTS_PXY',
 'IMPORTS_PXY',
 'TRADE_BALANCE_PXY',
 'INT_RATE_PXY',
 'US_M2_PXY',
 'US_NHOME_SALES_PXY',
 'PPI_PXY',
 'CPI_PXY',
 '3YRT_PXY',
 'PMI_1m_shift',
 'UNR_1m_shift',
 'YUNR_1m_shift',
 'CONS_SENT_1m_shift',
 'HOME_SALES_1m_shift',
 'PART_TIME_1m_shift',
 'CPPR_PRICE_1m_shift',
 'HOUS_PERMS_1m_shift',
 'HOUS_STARTS_1m_shift',
 'CAP_UTIL_1m_shift',
 'PERS_SAVINGS_1m_shift',
 'IMPORTS_1m_shift',
 'TRADE_BALANCE_1m_shift',
 'INT_RATE_1m_shift',
 'US_NHOME_SALES_1m_shift',
 '10 Year 

In [101]:
#not standardizing as splined columns are now in there

"""

#standardizing in the logistic regression model specifically as that's the only model it will impact

stand_cols = ['PMI',
 'UNR',
 'YUNR',
 'CONS_SENT',
 'HOUS_PERMS',
 'HOUS_STARTS',
 'CAP_UTIL',
 'PERS_SAVINGS',
 'INT_RATE',
 'PPI',
 'CPI',
 '3YRT',
 '10 Year Treasury Yield',
 '3 Month Treasury Yield (Bond Equivalent Basis)',
 'Spread',
 'PMI_1m_shift',
 'UNR_1m_shift',
 'YUNR_1m_shift',
 'CONS_SENT_1m_shift',
 'HOUS_PERMS_1m_shift',
 'HOUS_STARTS_1m_shift',
 'CAP_UTIL_1m_shift',
 'PERS_SAVINGS_1m_shift',
 'INT_RATE_1m_shift',
 '3YRT_1m_shift',
 '10 Year Treasury Yield_1m_shift',
 '3 Month Treasury Yield (Bond Equivalent Basis)_1m_shift',
 'Spread_1m_shift',
 'PMI_3m_shift',
 'UNR_3m_shift',
 'YUNR_3m_shift',
 'CONS_SENT_3m_shift',
 'HOUS_PERMS_3m_shift',
 'HOUS_STARTS_3m_shift',
 'CAP_UTIL_3m_shift',
 'PERS_SAVINGS_3m_shift',
 'INT_RATE_3m_shift',
 '3YRT_3m_shift',
 '10 Year Treasury Yield_3m_shift',
 '3 Month Treasury Yield (Bond Equivalent Basis)_3m_shift',
 'Spread_3m_shift',
 'PMI_12m_shift',
 'UNR_12m_shift',
 'YUNR_12m_shift',
 'CONS_SENT_12m_shift',
 'HOUS_PERMS_12m_shift',
 'HOUS_STARTS_12m_shift',
 'CAP_UTIL_12m_shift',
 'PERS_SAVINGS_12m_shift',
 'INT_RATE_12m_shift',
 '3YRT_12m_shift',
 '10 Year Treasury Yield_12m_shift',
 '3 Month Treasury Yield (Bond Equivalent Basis)_12m_shift',
 'Spread_12m_shift']

dataset[stand_cols] = StandardScaler().fit_transform(dataset[stand_cols])
#dataset[list(dataset.columns)] = StandardScaler().fit_transform(list(dataset.columns))

"""


"\n\n#standardizing in the logistic regression model specifically as that's the only model it will impact\n\nstand_cols = ['PMI',\n 'UNR',\n 'YUNR',\n 'CONS_SENT',\n 'HOUS_PERMS',\n 'HOUS_STARTS',\n 'CAP_UTIL',\n 'PERS_SAVINGS',\n 'INT_RATE',\n 'PPI',\n 'CPI',\n '3YRT',\n '10 Year Treasury Yield',\n '3 Month Treasury Yield (Bond Equivalent Basis)',\n 'Spread',\n 'PMI_1m_shift',\n 'UNR_1m_shift',\n 'YUNR_1m_shift',\n 'CONS_SENT_1m_shift',\n 'HOUS_PERMS_1m_shift',\n 'HOUS_STARTS_1m_shift',\n 'CAP_UTIL_1m_shift',\n 'PERS_SAVINGS_1m_shift',\n 'INT_RATE_1m_shift',\n '3YRT_1m_shift',\n '10 Year Treasury Yield_1m_shift',\n '3 Month Treasury Yield (Bond Equivalent Basis)_1m_shift',\n 'Spread_1m_shift',\n 'PMI_3m_shift',\n 'UNR_3m_shift',\n 'YUNR_3m_shift',\n 'CONS_SENT_3m_shift',\n 'HOUS_PERMS_3m_shift',\n 'HOUS_STARTS_3m_shift',\n 'CAP_UTIL_3m_shift',\n 'PERS_SAVINGS_3m_shift',\n 'INT_RATE_3m_shift',\n '3YRT_3m_shift',\n '10 Year Treasury Yield_3m_shift',\n '3 Month Treasury Yield (Bond Equiv

In [102]:
X = dataset

In [103]:
### Data Prep Finished Here ###

In [104]:
#cutoff most of missing data, Post March 2019, Prior 1959. CPI/PPI missing 2016 onward so need to cut that off
y = y.iloc[552:]
y = y.iloc[:-59]

In [204]:
y_shift = y.shift(-12) #shifting y to forecast 3 months out

In [205]:
y_shift = y_shift.fillna(0)

In [206]:
X_train = X.iloc[12:550]
X_test = X.iloc[550:]
y_train = y_shift.iloc[12:550]
y_test = y_shift.iloc[550:]

In [207]:
X_train.columns

Index(['PMI', 'UNR', 'YUNR', 'CONS_SENT', 'HOME_SALES', 'PART_TIME',
       'HOUS_PERMS', 'HOUS_STARTS', 'CAP_UTIL', 'PERS_SAVINGS',
       ...
       'PERS_SAVINGS_PXY_12m_shift', 'EXPORTS_PXY_12m_shift',
       'IMPORTS_PXY_12m_shift', 'TRADE_BALANCE_PXY_12m_shift', 'CPI_1m_shift',
       'PPI_1m_shift', 'CPI_3m_shift', 'PPI_3m_shift', 'CPI_12m_shift',
       'PPI_12m_shift'],
      dtype='object', length=138)

In [208]:
X_train.shape

(538, 138)

In [209]:
model = LogisticRegression(penalty = 'l2', C=2000, max_iter = 100, solver = 'sag') #try throwing in a bigger C than 1
#RidgeClassifier().fit(X, y)
model.fit(X_train, y_train) #fitting model



LogisticRegression(C=2000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='sag',
          tol=0.0001, verbose=0, warm_start=False)

In [210]:
probs = model.predict_proba(X_test)

In [211]:
log_loss(y_test, probs)

0.2718447688132093

In [212]:
roc_auc_score(y_test.values, probs[:,1:])

0.9557522123893805

In [137]:
for object in zip(dataset.columns,model.coef_[0] ):
    print(object)

('PMI', -2.9533398781794177e-05)
('UNR', -2.7231107463532517e-06)
('YUNR', -5.779159773359312e-06)
('CONS_SENT', -4.513900111147545e-05)
('HOME_SALES', -0.0009983302351860238)
('PART_TIME', -0.0004952141776065778)
('HOUS_PERMS', -0.0006583264497515178)
('HOUS_STARTS', -0.000786358863190315)
('CAP_UTIL', -3.012437022295791e-05)
('PERS_SAVINGS', -6.183478912330797e-06)
('EXPORTS', -5.677324649719334e-06)
('IMPORTS', 3.7078530897625896e-05)
('TRADE_BALANCE', -4.2654436031569705e-05)
('INT_RATE', 1.6006184976256074e-07)
('US_M2', -0.00024552606150556316)
('US_NHOME_SALES', -0.00026347520783013287)
('PPI', -1.2787836034333423e-05)
('CPI', -6.795236345227047e-06)
('3YRT', -2.4760800238103704e-05)
('10 Year Treasury Yield', -1.7045267009027669e-06)
('3 Month Treasury Yield (Bond Equivalent Basis)', 9.51921603095906e-08)
('PMI_PXY', 0.0)
('UNR_PXY', 0.0)
('YUNR_PXY', 0.0)
('CONS_SENT_PXY', 0.0)
('HOME_SALES_PXY', -3.309181245132186e-07)
('PART_TIME_PXY', 0.0)
('CPPR_PRICE_PXY', -2.860402053352

In [213]:
results = pd.DataFrame(probs)
results['actual'] = y_test.values
results.index = y_test.index
pd.set_option('display.float_format', lambda x: '%.3f' % x)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(results)

            0     1  actual
2004-11 0.934 0.066   0.000
2004-12 0.885 0.115   0.000
2005-01 0.910 0.090   0.000
2005-02 0.925 0.075   0.000
2005-03 0.921 0.079   0.000
2005-04 0.929 0.071   0.000
2005-05 0.919 0.081   0.000
2005-06 0.882 0.118   0.000
2005-07 0.901 0.099   0.000
2005-08 0.895 0.105   0.000
2005-09 0.923 0.077   0.000
2005-10 0.854 0.146   0.000
2005-11 0.783 0.217   0.000
2005-12 0.819 0.181   0.000
2006-01 0.808 0.192   0.000
2006-02 0.768 0.232   0.000
2006-03 0.806 0.194   0.000
2006-04 0.770 0.230   0.000
2006-05 0.717 0.283   0.000
2006-06 0.613 0.387   0.000
2006-07 0.643 0.357   0.000
2006-08 0.587 0.413   0.000
2006-09 0.415 0.585   0.000
2006-10 0.433 0.567   0.000
2006-11 0.543 0.457   0.000
2006-12 0.625 0.375   0.000
2007-01 0.448 0.552   1.000
2007-02 0.605 0.395   1.000
2007-03 0.533 0.467   1.000
2007-04 0.465 0.535   0.000
2007-05 0.455 0.545   0.000
2007-06 0.460 0.540   0.000
2007-07 0.319 0.681   0.000
2007-08 0.273 0.727   0.000
2007-09 0.288 0.712 

In [116]:
#Testing on the training data to see if model fit got better
probs = model.predict_proba(X_train)

In [117]:
log_loss(y_train, probs)

0.012927259322249461

In [118]:
roc_auc_score(y_train.values, probs[:,1:])

1.0

In [173]:
#Grid Search

C = [1000,1500,2000,2500] #inverse of regularization strength, smaller means stronger regularization, defaults to 1
max_iter = [100, 50000, 100000,200000] #maximum number of iterations taken for the solver to converge
solver = ['lbfgs', 'sag'] #newtons method is computationally expensive so take out, lbfgs is the best for smaller data, 

for val in C:
    for n in max_iter:
        for s in solver:

            model = LogisticRegression(penalty = 'l2', C=val, max_iter = n, solver = s)
            model.fit(X_train, y_train) #fitting model
            probs = model.predict_proba(X_test)
            ll = log_loss(y_test, probs)
            auc = roc_auc_score(y_test.values, probs[:,1:])
            print("Testing C {}, max_iter {} and s of {}, yielding log loss of {} and AUC of {}".format(val,n, s, ll, auc))



Testing C 1000, max_iter 100 and s of lbfgs, yielding log loss of 3.312143078722374 and AUC of 0.9373156342182891
Testing C 1000, max_iter 100 and s of sag, yielding log loss of 0.33369214137174696 and AUC of 0.8893805309734513




Testing C 1000, max_iter 50000 and s of lbfgs, yielding log loss of 25.56731985577733 and AUC of 0.6017699115044248
Testing C 1000, max_iter 50000 and s of sag, yielding log loss of 1.2977805591886182 and AUC of 0.9115044247787611




Testing C 1000, max_iter 100000 and s of lbfgs, yielding log loss of 25.56731985577733 and AUC of 0.6017699115044248
Testing C 1000, max_iter 100000 and s of sag, yielding log loss of 1.2974593959851188 and AUC of 0.9115044247787611




Testing C 1000, max_iter 200000 and s of lbfgs, yielding log loss of 25.56731985577733 and AUC of 0.6017699115044248
Testing C 1000, max_iter 200000 and s of sag, yielding log loss of 1.297632346136051 and AUC of 0.9115044247787611
Testing C 1500, max_iter 100 and s of lbfgs, yielding log loss of 4.127371147883606 and AUC of 0.9380530973451329
Testing C 1500, max_iter 100 and s of sag, yielding log loss of 0.3348948467006205 and AUC of 0.890117994100295




Testing C 1500, max_iter 50000 and s of lbfgs, yielding log loss of 24.349199408268976 and AUC of 0.6150442477876106
Testing C 1500, max_iter 50000 and s of sag, yielding log loss of 1.2977537433074378 and AUC of 0.9115044247787611




Testing C 1500, max_iter 100000 and s of lbfgs, yielding log loss of 24.349199408268976 and AUC of 0.6150442477876106
Testing C 1500, max_iter 100000 and s of sag, yielding log loss of 1.2976223555596151 and AUC of 0.9115044247787611




Testing C 1500, max_iter 200000 and s of lbfgs, yielding log loss of 24.349199408268976 and AUC of 0.6150442477876106
Testing C 1500, max_iter 200000 and s of sag, yielding log loss of 1.298053432944702 and AUC of 0.9115044247787611
Testing C 2000, max_iter 100 and s of lbfgs, yielding log loss of 3.3462730484259624 and AUC of 0.9358407079646018
Testing C 2000, max_iter 100 and s of sag, yielding log loss of 0.3334209951454132 and AUC of 0.8908554572271387




Testing C 2000, max_iter 50000 and s of lbfgs, yielding log loss of 25.014409358885995 and AUC of 0.6283185840707964
Testing C 2000, max_iter 50000 and s of sag, yielding log loss of 1.2972165144026322 and AUC of 0.9115044247787611




Testing C 2000, max_iter 100000 and s of lbfgs, yielding log loss of 25.014409358885995 and AUC of 0.6283185840707964
Testing C 2000, max_iter 100000 and s of sag, yielding log loss of 1.2975352718645652 and AUC of 0.9115044247787611




Testing C 2000, max_iter 200000 and s of lbfgs, yielding log loss of 25.014409358885995 and AUC of 0.6283185840707964
Testing C 2000, max_iter 200000 and s of sag, yielding log loss of 1.2974643230237517 and AUC of 0.9115044247787611
Testing C 2500, max_iter 100 and s of lbfgs, yielding log loss of 3.7505483433130085 and AUC of 0.9402654867256637
Testing C 2500, max_iter 100 and s of sag, yielding log loss of 0.3342093523007668 and AUC of 0.8908554572271387


  np.exp(prob, prob)


Testing C 2500, max_iter 50000 and s of lbfgs, yielding log loss of 26.79057398669737 and AUC of 0.5752212389380531
Testing C 2500, max_iter 50000 and s of sag, yielding log loss of 1.297789562964393 and AUC of 0.9115044247787611


  np.exp(prob, prob)


Testing C 2500, max_iter 100000 and s of lbfgs, yielding log loss of 26.79057398669737 and AUC of 0.5752212389380531
Testing C 2500, max_iter 100000 and s of sag, yielding log loss of 1.2976890026295989 and AUC of 0.9115044247787611


  np.exp(prob, prob)


Testing C 2500, max_iter 200000 and s of lbfgs, yielding log loss of 26.79057398669737 and AUC of 0.5752212389380531
Testing C 2500, max_iter 200000 and s of sag, yielding log loss of 1.2973564034003342 and AUC of 0.9115044247787611


In [178]:
model = LogisticRegression(penalty = 'l2', C=2000, max_iter = 100, solver = 'sag')
model.fit(X_train, y_train) #fitting model
probs = model.predict_proba(X_test)
ll = log_loss(y_test, probs)
auc = roc_auc_score(y_test.values, probs[:,1:])



In [179]:
ll

0.33403368640904707

In [180]:
auc

0.890117994100295

In [181]:
results = pd.DataFrame(probs)
results['actual'] = y_test.values
results.index = y_test.index
pd.set_option('display.float_format', lambda x: '%.3f' % x)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(results)

            0     1  actual
2004-11 0.992 0.008   0.000
2004-12 0.982 0.018   0.000
2005-01 0.991 0.009   0.000
2005-02 0.988 0.012   0.000
2005-03 0.960 0.040   0.000
2005-04 0.990 0.010   0.000
2005-05 0.971 0.029   0.000
2005-06 0.969 0.031   0.000
2005-07 0.972 0.028   0.000
2005-08 0.973 0.027   0.000
2005-09 0.985 0.015   0.000
2005-10 0.973 0.027   0.000
2005-11 0.929 0.071   0.000
2005-12 0.967 0.033   0.000
2006-01 0.974 0.026   0.000
2006-02 0.928 0.072   0.000
2006-03 0.978 0.022   0.000
2006-04 0.943 0.057   0.000
2006-05 0.969 0.031   0.000
2006-06 0.943 0.057   0.000
2006-07 0.933 0.067   0.000
2006-08 0.948 0.052   0.000
2006-09 0.848 0.152   0.000
2006-10 0.735 0.265   0.000
2006-11 0.884 0.116   0.000
2006-12 0.914 0.086   0.000
2007-01 0.703 0.297   0.000
2007-02 0.807 0.193   0.000
2007-03 0.914 0.086   0.000
2007-04 0.773 0.227   0.000
2007-05 0.787 0.213   0.000
2007-06 0.811 0.189   0.000
2007-07 0.784 0.216   0.000
2007-08 0.696 0.304   0.000
2007-09 0.760 0.240 