# Data Gathering

In [1]:
# Import only the helper that is actually used instead of star-importing the
# module, so it is obvious where `getKey` comes from and no other names leak
# into the notebook namespace.
from getKey import getKey
import quandl

# The API key is obtained through the project-local getKey helper rather than
# being hardcoded in the notebook (presumably it reads the 'quandl.key' file --
# confirm against getKey.py).
quandl.ApiConfig.api_key = getKey('quandl.key')

# PMI: download the "ISM/MAN_PMI" dataset. The resulting frame is indexed by
# Date, has a single PMI column, and is the base table the following cells add
# columns to.
data = quandl.get("ISM/MAN_PMI")
data.head()

Unnamed: 0_level_0,PMI
Date,Unnamed: 1_level_1
1948-01-01,51.7
1948-02-01,50.2
1948-03-01,43.3
1948-04-01,45.4
1948-05-01,49.5


In [2]:
# S&P 500 returns
# Download the "MULTPL/SP500_REAL_PRICE_MONTH" series and derive the
# period-over-period percentage change of its "Value" column.
SPY = quandl.get("MULTPL/SP500_REAL_PRICE_MONTH")

# Shift the price column itself rather than the whole frame: assigning a
# shifted DataFrame to a single column only works while the frame has exactly
# one column, so the explicit Series form is the robust spelling.
SPY["shifted"] = SPY["Value"].shift(periods=1)

# Simple return relative to the previous row's price (first row is NaN).
SPY["returns"] = (SPY["Value"] - SPY["shifted"]) / SPY["shifted"]

# Column assignment aligns on the index, so only dates present in `data`
# receive a value; the return is stored in percent.
data["S&P 500 Returns"] = SPY["returns"] * 100

data.head()

Unnamed: 0_level_0,PMI,S&P 500 Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1948-01-01,51.7,-1.330672
1948-02-01,50.2,-4.922454
1948-03-01,43.3,1.41844
1948-04-01,45.4,7.692308
1948-05-01,49.5,4.87013


In [3]:
#Building permits: https://www.census.gov/construction/bps/uspermits.html
#NOTE: per the original author, the date format in the source data changes at
#Jan 2000 (row index 505); the CSV read here exposes separate Year/Month/Day columns.
import pandas as pd
permits = pd.read_csv('building_permits.csv')

# Combine the Year/Month/Day columns into a single YYYYMMDD number, then parse
# its string form into a proper datetime that can serve as the index.
date_codes = permits.Year*10000 + permits.Month*100 + permits.Day
permits["Date"] = pd.to_datetime(date_codes.astype(str), format='%Y%m%d')
permits = permits.set_index("Date")

# Index-aligned assignment: dates missing from the permits table become NaN.
data["Building Permits"] = permits["Total"]

# Restrict to the window covered by the permits data (first permits date up to
# and including 2018-07-01). Use .loc for an explicit label slice and take a
# copy so that the column assignments in later cells operate on an independent
# frame instead of a slice of the original (avoids SettingWithCopyWarning).
data = data.loc[permits.index[0]:'2018-07-01'].copy()
data.head()

Unnamed: 0_level_0,PMI,S&P 500 Returns,Building Permits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-01-01,64.4,3.982053,75.7
1959-02-01,66.9,-1.528227,79.0
1959-03-01,67.1,2.537886,119.0
1959-04-01,66.9,1.673789,127.9
1959-05-01,68.2,1.50613,120.8


# Feature Engineering

In [4]:
# Month-over-month change in PMI (current row minus the previous row).
pmi_series = data['PMI']
data['dPMI'] = pmi_series.diff(1)

# Percent change in building permits relative to the previous row.
permits_now = data['Building Permits']
permits_prev = permits_now.shift(1)
data['dBuilding Permits'] = 100*(permits_now - permits_prev)/permits_prev

# Lagged S&P 500 returns: one column per lag, from 1 up to and including n rows back.
n = 12
sp500_returns = data["S&P 500 Returns"]
for lag in range(1, n+1):
    lag_column = "S&P 500 Returns {} Months Ago".format(lag)
    data[lag_column] = sp500_returns.shift(lag)
data.head()

Unnamed: 0_level_0,PMI,S&P 500 Returns,Building Permits,dPMI,dBuilding Permits,S&P 500 Returns 1 Months Ago,S&P 500 Returns 2 Months Ago,S&P 500 Returns 3 Months Ago,S&P 500 Returns 4 Months Ago,S&P 500 Returns 5 Months Ago,S&P 500 Returns 6 Months Ago,S&P 500 Returns 7 Months Ago,S&P 500 Returns 8 Months Ago,S&P 500 Returns 9 Months Ago,S&P 500 Returns 10 Months Ago,S&P 500 Returns 11 Months Ago,S&P 500 Returns 12 Months Ago
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1959-01-01,64.4,3.982053,75.7,,,,,,,,,,,,,,
1959-02-01,66.9,-1.528227,79.0,2.5,4.359313,3.982053,,,,,,,,,,,
1959-03-01,67.1,2.537886,119.0,0.2,50.632911,-1.528227,3.982053,,,,,,,,,,
1959-04-01,66.9,1.673789,127.9,-0.2,7.478992,2.537886,-1.528227,3.982053,,,,,,,,,
1959-05-01,68.2,1.50613,120.8,1.3,-5.551212,1.673789,2.537886,-1.528227,3.982053,,,,,,,,


# Data Prep

In [5]:
# Add the Y (target) columns: for each horizon, the future return and a boolean
# flag telling whether that future return is positive.
n = 12
current_returns = data["S&P 500 Returns"]
for horizon in range(1, n):  # horizons 1 .. n-1
    # Negative shift pulls the value from `horizon` rows later onto this row.
    future_returns = current_returns.shift(-horizon)
    data["S&P 500 Returns {} Months Ahead".format(horizon)] = future_returns
    data["S&P 500 Binary Returns {} Months Ahead".format(horizon)] = future_returns > 0

# Drop every row that still has a missing value in any column (the leading rows
# lack lagged features, the trailing rows lack future returns).
data = data.dropna()
data.tail()

Unnamed: 0_level_0,PMI,S&P 500 Returns,Building Permits,dPMI,dBuilding Permits,S&P 500 Returns 1 Months Ago,S&P 500 Returns 2 Months Ago,S&P 500 Returns 3 Months Ago,S&P 500 Returns 4 Months Ago,S&P 500 Returns 5 Months Ago,...,S&P 500 Returns 7 Months Ahead,S&P 500 Binary Returns 7 Months Ahead,S&P 500 Returns 8 Months Ahead,S&P 500 Binary Returns 8 Months Ahead,S&P 500 Returns 9 Months Ahead,S&P 500 Binary Returns 9 Months Ahead,S&P 500 Returns 10 Months Ahead,S&P 500 Binary Returns 10 Months Ahead,S&P 500 Returns 11 Months Ahead,S&P 500 Binary Returns 11 Months Ahead
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-04-01,55.3,-0.317303,105.0,-1.3,-8.933218,1.584181,2.408225,1.268122,3.770918,1.025189,...,1.431756,True,2.727087,True,4.708858,True,-3.033909,False,-0.08835,False
2017-05-01,55.5,1.527565,115.3,0.2,9.809524,-0.317303,1.584181,2.408225,1.268122,3.770918,...,2.727087,True,4.708858,True,-3.033909,False,-0.08835,False,-1.818135,False
2017-06-01,56.7,1.613125,130.8,1.2,13.443192,1.527565,-0.317303,1.584181,2.408225,1.268122,...,4.708858,True,-3.033909,False,-0.08835,False,-1.818135,False,1.803567,True
2017-07-01,56.5,0.826215,102.9,-0.2,-21.330275,1.613125,1.527565,-0.317303,1.584181,2.408225,...,-3.033909,False,-0.08835,False,-1.818135,False,1.803567,True,1.956698,True
2017-08-01,59.3,0.086386,122.1,2.8,18.658892,0.826215,1.613125,1.527565,-0.317303,1.584181,...,-0.08835,False,-1.818135,False,1.803567,True,1.956698,True,-0.644072,False


In [6]:
# Print each column name together with its positional index.
for position, column_name in enumerate(data.columns):
    print((position, column_name))

(0, 'PMI')
(1, 'S&P 500 Returns')
(2, 'Building Permits')
(3, 'dPMI')
(4, 'dBuilding Permits')
(5, 'S&P 500 Returns 1 Months Ago')
(6, 'S&P 500 Returns 2 Months Ago')
(7, 'S&P 500 Returns 3 Months Ago')
(8, 'S&P 500 Returns 4 Months Ago')
(9, 'S&P 500 Returns 5 Months Ago')
(10, 'S&P 500 Returns 6 Months Ago')
(11, 'S&P 500 Returns 7 Months Ago')
(12, 'S&P 500 Returns 8 Months Ago')
(13, 'S&P 500 Returns 9 Months Ago')
(14, 'S&P 500 Returns 10 Months Ago')
(15, 'S&P 500 Returns 11 Months Ago')
(16, 'S&P 500 Returns 12 Months Ago')
(17, 'S&P 500 Returns 1 Months Ahead')
(18, 'S&P 500 Binary Returns 1 Months Ahead')
(19, 'S&P 500 Returns 2 Months Ahead')
(20, 'S&P 500 Binary Returns 2 Months Ahead')
(21, 'S&P 500 Returns 3 Months Ahead')
(22, 'S&P 500 Binary Returns 3 Months Ahead')
(23, 'S&P 500 Returns 4 Months Ahead')
(24, 'S&P 500 Binary Returns 4 Months Ahead')
(25, 'S&P 500 Returns 5 Months Ahead')
(26, 'S&P 500 Binary Returns 5 Months Ahead')
(27, 'S&P 500 Returns 6 Months Ahead')

# Machine Learning

## SVM, Regression

In [7]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

np.set_printoptions(precision=2)

# Features are all columns that are not look-ahead targets. Selecting them by
# name (instead of a hard-coded positional slice) keeps the cell correct if
# columns are added or reordered upstream.
feature_columns = data.columns[~data.columns.str.endswith("Months Ahead")]

# The feature matrix does not depend on the horizon, so build it once.
# Use the builtin `float`: the `np.float` alias was removed from NumPy.
X = data[feature_columns].values.astype(float)

# Hyper-parameter grid searched exhaustively for every horizon.
param_dist = {'gamma':['auto', 'scale'], 'C':[1, 10, 100, 1000], 'epsilon':[0.01, 0.1, 1], 'shrinking':[False, True]}

for months_ahead in range(1, 12):
    print("Months Ahead: " + str(months_ahead))

    # Target: the continuous return `months_ahead` months in the future,
    # looked up by column name rather than by computed position.
    target_column = "S&P 500 Returns " + str(months_ahead) + " Months Ahead"
    Y = data[target_column].values.astype(float)

    # Chronological split (no shuffling): the last 20% of rows are held out.
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, shuffle = False)

    # Fit the scaler on the training rows only, then apply it to both splits,
    # so no information from the test rows leaks into the scaling.
    scaler = MinMaxScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    # Train and score model.
    # `random_search` keeps its historical name but is an exhaustive grid search.
    # The `iid` argument is no longer passed: it was removed from GridSearchCV,
    # and passing it raises a TypeError on current scikit-learn.
    # NOTE(review): cv=5 uses plain K-fold on time-ordered rows; consider
    # TimeSeriesSplit if look-ahead within the training folds is a concern.
    print("Training Model")
    model = SVR(kernel='rbf')
    random_search = GridSearchCV(model, param_grid = param_dist, cv = 5)
    random_search.fit(x_train, y_train)
    # Score of the best estimator found, evaluated on the held-out rows.
    print(random_search.score(x_test, y_test))
    print()

Months Ahead: 1
X Columns: Index(['PMI', 'S&P 500 Returns', 'Building Permits', 'dPMI',
       'dBuilding Permits', 'S&P 500 Returns 1 Months Ago',
       'S&P 500 Returns 2 Months Ago', 'S&P 500 Returns 3 Months Ago',
       'S&P 500 Returns 4 Months Ago', 'S&P 500 Returns 5 Months Ago',
       'S&P 500 Returns 6 Months Ago', 'S&P 500 Returns 7 Months Ago',
       'S&P 500 Returns 8 Months Ago', 'S&P 500 Returns 9 Months Ago',
       'S&P 500 Returns 10 Months Ago', 'S&P 500 Returns 11 Months Ago',
       'S&P 500 Returns 12 Months Ago'],
      dtype='object')
Y Column: S&P 500 Returns 1 Months Ahead
Training Model
0.03478272735177945

Months Ahead: 2
X Columns: Index(['PMI', 'S&P 500 Returns', 'Building Permits', 'dPMI',
       'dBuilding Permits', 'S&P 500 Returns 1 Months Ago',
       'S&P 500 Returns 2 Months Ago', 'S&P 500 Returns 3 Months Ago',
       'S&P 500 Returns 4 Months Ago', 'S&P 500 Returns 5 Months Ago',
       'S&P 500 Returns 6 Months Ago', 'S&P 500 Returns 7 Months 

## SVM, Classification

In [8]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

np.set_printoptions(precision=2)

# Features are all columns that are not look-ahead targets. Selecting them by
# name (instead of a hard-coded positional slice) keeps the cell correct if
# columns are added or reordered upstream.
feature_columns = data.columns[~data.columns.str.endswith("Months Ahead")]

# The feature matrix does not depend on the horizon, so build it once.
# Use the builtin `float`: the `np.float` alias was removed from NumPy.
X = data[feature_columns].values.astype(float)

# Hyper-parameter grid searched exhaustively for every horizon.
param_dist = {'gamma':['auto', 'scale'], 'C':[1, 10, 100, 1000], 'shrinking':[False, True]}

# Fraction of rows (taken from the end, since the split is not shuffled)
# held out for testing.
testProportion = 0.2

for months_ahead in range(1, 12):
    print("Months Ahead: " + str(months_ahead))

    # Target: the boolean "future return is positive" flag for this horizon,
    # looked up by column name; the float cast turns True/False into 1.0/0.0.
    target_column = "S&P 500 Binary Returns " + str(months_ahead) + " Months Ahead"
    Y = data[target_column].values.astype(float)
    print("X Columns: " + str(feature_columns))
    print("Y Column: " + str(target_column))

    # Chronological split (no shuffling).
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = testProportion, shuffle = False)

    # Fit the scaler on the training rows only, then apply it to both splits.
    scaler = MinMaxScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    # Share of positive labels in the test rows, printed next to the model's
    # score so the two numbers can be compared directly.
    positive_share = np.count_nonzero(y_test > 0)/len(y_test)
    print("Proportion of positive returns: " + str(positive_share))

    # Train and score model.
    # `random_search` keeps its historical name but is an exhaustive grid search.
    # The `iid` argument is no longer passed: it was removed from GridSearchCV,
    # and passing it raises a TypeError on current scikit-learn.
    # NOTE(review): cv=5 uses plain K-fold on time-ordered rows; consider
    # TimeSeriesSplit if look-ahead within the training folds is a concern.
    print("Training Model")
    model = SVC(kernel='rbf')
    random_search = GridSearchCV(model, param_grid = param_dist, cv = 5)
    random_search.fit(x_train, y_train)
    # Score of the best estimator found, evaluated on the held-out rows.
    print(random_search.score(x_test, y_test))
    print()

Months Ahead: 1
X Columns: Index(['PMI', 'S&P 500 Returns', 'Building Permits', 'dPMI',
       'dBuilding Permits', 'S&P 500 Returns 1 Months Ago',
       'S&P 500 Returns 2 Months Ago', 'S&P 500 Returns 3 Months Ago',
       'S&P 500 Returns 4 Months Ago', 'S&P 500 Returns 5 Months Ago',
       'S&P 500 Returns 6 Months Ago', 'S&P 500 Returns 7 Months Ago',
       'S&P 500 Returns 8 Months Ago', 'S&P 500 Returns 9 Months Ago',
       'S&P 500 Returns 10 Months Ago', 'S&P 500 Returns 11 Months Ago',
       'S&P 500 Returns 12 Months Ago'],
      dtype='object')
Y Column: S&P 500 Binary Returns 1 Months Ahead
Proportion of positive returns: 0.6618705035971223
Training Model
0.6690647482014388

Months Ahead: 2
X Columns: Index(['PMI', 'S&P 500 Returns', 'Building Permits', 'dPMI',
       'dBuilding Permits', 'S&P 500 Returns 1 Months Ago',
       'S&P 500 Returns 2 Months Ago', 'S&P 500 Returns 3 Months Ago',
       'S&P 500 Returns 4 Months Ago', 'S&P 500 Returns 5 Months Ago',
       '