# Initial Setup

In [30]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

Import Processed data as Samples

In [40]:
# Read the processed sample table from disk (file is decoded as latin-1).
samples = pd.read_csv(
    "data/processed/samples.csv",
    encoding="latin-1",
)

In [41]:
# Remove the "Unnamed: 0" column (presumably the index written out with the CSV).
# errors="ignore" keeps this cell idempotent: re-running it after the column has
# already been dropped no longer raises a KeyError.
samples = samples.drop("Unnamed: 0", axis=1, errors="ignore")
# Preview the first rows to confirm the remaining columns.
samples.head()

Unnamed: 0,establishments,employees,growth,exemptions_stay,exemptions_in_is,exemptions_in_oos,exemptions_out_is,exemptions_out_oos,returns_stay,returns_in_is,returns_in_oos,returns_out_is,returns_out_oos,agi_stay,agi_in_is,agi_in_oos,agi_out_is,agi_out_oos
0,35,157,False,35901.0,2185.0,4184.0,1883.0,3338.0,15062.0,951.0,1410.0,853.0,1245.0,714261.0,32399.0,73510.0,26576.0,59705.0
1,34,175,False,36850.0,2519.0,4209.0,2102.0,3545.0,15473.0,1139.0,1551.0,971.0,1307.0,756692.0,38883.0,82027.0,31657.0,70658.0
2,30,109,True,37767.0,2394.0,4168.0,2127.0,3443.0,15944.0,1072.0,1447.0,1025.0,1284.0,827611.0,37153.0,79737.0,36112.0,67092.0
3,38,116,False,39518.0,2548.0,3863.0,2047.0,3601.0,16791.0,1185.0,1458.0,989.0,1398.0,901200.0,43332.0,83366.0,34063.0,69089.0
4,37,0,False,40719.0,2230.0,3873.0,2118.0,3660.0,17385.0,1038.0,1462.0,1018.0,1375.0,936888.0,36050.0,79697.0,34539.0,75861.0


Split off a hold-out test set of data

In [4]:
# Features are every column except "growth"; "growth" itself is the target.
X = samples.drop("growth", axis="columns")
y = samples["growth"]
# Hold out 25% of the rows for final evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=36,
)

# Testing Various Models
## Model 1: Logistic Regression

In [5]:
# Pipeline stages: standardise the features, project onto principal
# components, then fit a logistic regression on the projection.
steps = [
    ("scaler", preprocessing.StandardScaler()),
    ("pca", PCA()),
    ("model", LogisticRegression()),
]
# Grid to search: keep anywhere from 1 to 17 principal components.
parameters = {"pca__n_components": np.arange(1, 18)}

In [6]:
# Assemble the pipeline, grid-search it on the training split,
# then predict labels for the hold-out split.
pipeline = Pipeline(steps=steps)
cv = GridSearchCV(estimator=pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('model', LogisticRegression(C=1.0, class_weight=None, d...y='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'pca__n_components': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [7]:
# Report hold-out performance of the tuned model.
# NOTE: the value labelled "Best Score" is cv.score on the test split,
# not the cross-validated cv.best_score_.
print("Best Score:", cv.score(X_test, y_test))
print("Best Params:", cv.best_params_)
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Best Score: 0.652959898154
Best Params: {'pca__n_components': 16}
Classification Report:
             precision    recall  f1-score   support

      False       0.66      0.97      0.78      5044
       True       0.60      0.09      0.16      2811

avg / total       0.64      0.65      0.56      7855



## Model 2: Nearest Neighbors

In [8]:
# Pipeline stages: standardise -> PCA -> k-nearest-neighbours classifier.
steps = [
    ("scaler", preprocessing.StandardScaler()),
    ("pca", PCA()),
    ("model", KNeighborsClassifier()),
]
# Grid to search: 1-17 principal components crossed with k = 1-9 neighbours.
parameters = {
    "pca__n_components": np.arange(1, 18),
    "model__n_neighbors": np.arange(1, 10),
}

In [9]:
# Assemble the pipeline, grid-search it on the training split,
# then predict labels for the hold-out split.
pipeline = Pipeline(steps=steps)
cv = GridSearchCV(estimator=pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

In [10]:
# Score the tuned model on the hold-out split.
# y_pred was already produced from this fitted `cv` at the end of the training
# cell, so it is reused here instead of running a second, identical (and, for
# nearest neighbours, slow) prediction pass.
# NOTE: the value labelled "Best Score" is cv.score on the test split,
# not the cross-validated cv.best_score_.
print("Best Score:", cv.score(X_test, y_test))
print("Best Params:", cv.best_params_)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Best Score: 0.624188415022
Best Params: {'model__n_neighbors': 8, 'pca__n_components': 4}
Classification Report:
             precision    recall  f1-score   support

      False       0.66      0.87      0.75      5044
       True       0.44      0.18      0.26      2811

avg / total       0.58      0.62      0.57      7855



## Model 3: Naive Bayes

In [14]:
# Pipeline stages: standardise -> PCA -> Gaussian naive Bayes classifier.
steps = [
    ("scaler", preprocessing.StandardScaler()),
    ("pca", PCA()),
    ("model", GaussianNB()),
]
# Grid to search: keep anywhere from 1 to 17 principal components.
parameters = {"pca__n_components": np.arange(1, 18)}

In [15]:
# Assemble the pipeline, grid-search it on the training split,
# then predict labels for the hold-out split.
pipeline = Pipeline(steps=steps)
cv = GridSearchCV(estimator=pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

In [16]:
# Score the tuned model on the hold-out split.
# y_pred was already produced from this fitted `cv` at the end of the training
# cell, so it is reused here instead of being recomputed.
# NOTE: the value labelled "Best Score" is cv.score on the test split,
# not the cross-validated cv.best_score_.
print("Best Score:", cv.score(X_test, y_test))
print("Best Params:", cv.best_params_)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Best Score: 0.64455760662
Best Params: {'pca__n_components': 11}
Classification Report:
             precision    recall  f1-score   support

      False       0.65      0.95      0.77      5044
       True       0.52      0.11      0.17      2811

avg / total       0.61      0.64      0.56      7855



## Model 4: Random Forest

In [27]:
# Pipeline stages: standardise -> PCA -> random forest classifier.
# The forest is seeded (same seed as the train/test split) so that the grid
# search and the reported scores are reproducible from run to run; without
# a random_state every execution grows different trees.
steps = [("scaler", preprocessing.StandardScaler()),
        ("pca", PCA()),
        ("model", RandomForestClassifier(random_state=36))]
# Grid to search: 10-17 principal components, a fixed forest size of 50 trees,
# two feature-subsampling rules and three minimum leaf sizes.
parameters = {"pca__n_components":np.arange(10,18),
             "model__n_estimators":[50],
             "model__max_features":["sqrt", "log2"],
             "model__min_samples_leaf":[20, 30, 40]}

In [28]:
# Assemble the pipeline, grid-search it on the training split,
# then predict labels for the hold-out split.
pipeline = Pipeline(steps=steps)
cv = GridSearchCV(estimator=pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

In [29]:
# Score the tuned model on the hold-out split.
# y_pred was already produced from this fitted `cv` at the end of the training
# cell, so it is reused here instead of being recomputed.
# NOTE: the value labelled "Best Score" is cv.score on the test split,
# not the cross-validated cv.best_score_.
print("Best Score:", cv.score(X_test, y_test))
print("Best Params:", cv.best_params_)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Best Score: 0.660980267346
Best Params: {'model__max_features': 'log2', 'model__min_samples_leaf': 30, 'model__n_estimators': 50, 'pca__n_components': 17}
Classification Report:
             precision    recall  f1-score   support

      False       0.67      0.92      0.78      5044
       True       0.58      0.20      0.29      2811

avg / total       0.64      0.66      0.60      7855



## Model 5: Support Vector Machine

In [34]:
# Pipeline stages: standardise -> PCA -> support vector classifier.
steps = [
    ("scaler", preprocessing.StandardScaler()),
    ("pca", PCA()),
    ("model", SVC()),
]
# Grid to search: 10-17 principal components crossed with four values of
# the SVC penalty parameter C.
parameters = {
    "pca__n_components": np.arange(10, 18),
    "model__C": [0.01, 0.1, 1, 10],
}

In [35]:
# Assemble the pipeline, grid-search it on the training split,
# then predict labels for the hold-out split.
pipeline = Pipeline(steps=steps)
cv = GridSearchCV(estimator=pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

In [36]:
# Score the tuned model on the hold-out split.
# y_pred was already produced from this fitted `cv` at the end of the training
# cell, so it is reused here instead of running a second, identical (and, for
# an SVC, slow) prediction pass.
# NOTE: the value labelled "Best Score" is cv.score on the test split,
# not the cross-validated cv.best_score_.
print("Best Score:", cv.score(X_test, y_test))
print("Best Params:", cv.best_params_)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Best Score: 0.655378739656
Best Params: {'model__C': 10, 'pca__n_components': 17}
Classification Report:
             precision    recall  f1-score   support

      False       0.66      0.97      0.78      5044
       True       0.62      0.10      0.17      2811

avg / total       0.64      0.66      0.56      7855



# Interpretation

In [None]:
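# The models all produce hold-out accuracy in the low-to-mid 60s (62-66%), which is
# barely above the ~64% obtained by always predicting the majority class
# (5044 of the 7855 test rows are False). Recall on the True class never exceeds 0.20.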

In [39]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of `samples`.
# In the recorded output the maxima sit far above the 75th percentiles, and two
# agi_* columns have negative minima.
samples.describe()

Unnamed: 0,establishments,employees,exemptions_stay,exemptions_in_is,exemptions_in_oos,exemptions_out_is,exemptions_out_oos,returns_stay,returns_in_is,returns_in_oos,returns_out_is,returns_out_oos,agi_stay,agi_in_is,agi_in_oos,agi_out_is,agi_out_oos
count,31420.0,31420.0,31420.0,31420.0,31420.0,31420.0,31420.0,31420.0,31420.0,31420.0,31420.0,31420.0,31420.0,31420.0,31420.0,31420.0,31420.0
mean,114.962222,641.940388,73211.57,2289.236219,2935.86316,2289.167632,2913.232654,33517.73,1222.222629,1550.652642,1222.191598,1531.74626,2085755.0,54290.66,75607.89,54266.44,75726.41
std,466.492941,3096.071891,234528.4,6121.617496,6917.110254,6672.119041,6913.923344,107315.2,3341.004634,3879.945076,3523.109907,3714.271337,7401976.0,169016.9,215525.4,187726.4,222673.2
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-433717.0,0.0,-349925.0
25%,6.0,0.0,7996.75,188.0,312.0,215.0,301.0,3513.75,91.0,153.0,108.0,153.0,153503.8,2784.0,5068.0,3263.0,4870.0
50%,19.0,40.0,18673.5,556.0,775.0,583.0,723.0,8301.5,272.0,386.0,293.0,368.0,388081.5,8874.5,14294.5,9282.0,13178.5
75%,60.0,231.0,49056.75,1685.0,2417.0,1619.0,2382.25,22152.75,856.25,1224.0,850.0,1238.0,1139224.0,31407.5,51632.75,29359.5,50467.5
max,14544.0,92897.0,7807472.0,120219.0,187441.0,169531.0,147905.0,3527047.0,65835.0,95120.0,80329.0,72316.0,262581700.0,3774457.0,5170328.0,4549605.0,6824326.0
