# Initial Setup

In [30]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

Import Processed data as Samples

In [2]:
samples = pd.read_csv("data/processed/samples.csv", encoding="latin-1")

In [17]:
samples = samples.drop("Unnamed: 0", axis=1)
samples.head()

ValueError: labels ['Unnamed: 0'] not contained in axis

Split off a hold-out test set of data

In [4]:
y = samples["growth"]
X = samples.drop("growth", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=36)

# Testing Various Models
## Model 1: Logistic Regression

In [5]:
# define steps and hyperparameter ranges
steps = [("scaler", preprocessing.StandardScaler()),
        ("pca", PCA()),
        ("model", LogisticRegression())]
parameters = {"pca__n_components":np.arange(1,18)}

In [6]:
# create pipeline and train model
pipeline = Pipeline(steps)
cv = GridSearchCV(pipeline, parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('model', LogisticRegression(C=1.0, class_weight=None, d...y='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'pca__n_components': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [7]:
# score the model
print("Best Score:", cv.score(X_test, y_test))
print("Best Params:", cv.best_params_)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Best Score: 0.652959898154
Best Params: {'pca__n_components': 16}
Classification Report:
             precision    recall  f1-score   support

      False       0.66      0.97      0.78      5044
       True       0.60      0.09      0.16      2811

avg / total       0.64      0.65      0.56      7855



## Model 2: Nearest Neighbors

In [8]:
# define steps and hyperparameter ranges
steps = [("scaler", preprocessing.StandardScaler()),
        ("pca", PCA()),
        ("model", KNeighborsClassifier())]
parameters = {"pca__n_components":np.arange(1,18),
             "model__n_neighbors":np.arange(1,10)}

In [9]:
# create pipeline and train model
pipeline = Pipeline(steps)
cv = GridSearchCV(pipeline, parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

In [10]:
#score the model
y_pred = cv.predict(X_test)
print("Best Score:", cv.score(X_test, y_test))
print("Best Params:", cv.best_params_)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Best Score: 0.624188415022
Best Params: {'model__n_neighbors': 8, 'pca__n_components': 4}
Classification Report:
             precision    recall  f1-score   support

      False       0.66      0.87      0.75      5044
       True       0.44      0.18      0.26      2811

avg / total       0.58      0.62      0.57      7855



## Model 3: Naive Bayes

In [14]:
# define steps and hyperparameter ranges
steps = [("scaler", preprocessing.StandardScaler()),
        ("pca", PCA()),
        ("model", GaussianNB())]
parameters = {"pca__n_components":np.arange(1,18)}

In [15]:
# create pipeline and train model
pipeline = Pipeline(steps)
cv = GridSearchCV(pipeline, parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

In [16]:
#score the model
y_pred = cv.predict(X_test)
print("Best Score:", cv.score(X_test, y_test))
print("Best Params:", cv.best_params_)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Best Score: 0.64455760662
Best Params: {'pca__n_components': 11}
Classification Report:
             precision    recall  f1-score   support

      False       0.65      0.95      0.77      5044
       True       0.52      0.11      0.17      2811

avg / total       0.61      0.64      0.56      7855



## Model 4: Random Forest

In [27]:
# define steps and hyperparameter ranges
steps = [("scaler", preprocessing.StandardScaler()),
        ("pca", PCA()),
        ("model", RandomForestClassifier())]
parameters = {"pca__n_components":np.arange(10,18),
             "model__n_estimators":[50],
             "model__max_features":["sqrt", "log2"],
             "model__min_samples_leaf":[20, 30, 40]}

In [28]:
# create pipeline and train model
pipeline = Pipeline(steps)
cv = GridSearchCV(pipeline, parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

In [29]:
#score the model
y_pred = cv.predict(X_test)
print("Best Score:", cv.score(X_test, y_test))
print("Best Params:", cv.best_params_)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Best Score: 0.660980267346
Best Params: {'model__max_features': 'log2', 'model__min_samples_leaf': 30, 'model__n_estimators': 50, 'pca__n_components': 17}
Classification Report:
             precision    recall  f1-score   support

      False       0.67      0.92      0.78      5044
       True       0.58      0.20      0.29      2811

avg / total       0.64      0.66      0.60      7855



## Model 5: Support Vector Machine

In [34]:
# define steps and hyperparameter ranges
steps = [("scaler", preprocessing.StandardScaler()),
        ("pca", PCA()),
        ("model", SVC())]
parameters = {"pca__n_components":np.arange(10,18),
             "model__C":[0.01, 0.1, 1, 10]}

In [35]:
# create pipeline and train model
pipeline = Pipeline(steps)
cv = GridSearchCV(pipeline, parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

In [36]:
#score the model
y_pred = cv.predict(X_test)
print("Best Score:", cv.score(X_test, y_test))
print("Best Params:", cv.best_params_)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Best Score: 0.655378739656
Best Params: {'model__C': 10, 'pca__n_components': 17}
Classification Report:
             precision    recall  f1-score   support

      False       0.66      0.97      0.78      5044
       True       0.62      0.10      0.17      2811

avg / total       0.64      0.66      0.56      7855



# Intepretation

In [None]:
The models all produce accuracy in the mid 60s.