## Logistic Regression

In [131]:
# Import libraries and load data
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

## Load in data
dems = pd.read_csv('../data/dem_with_dummies_renamed.csv').drop(['Unnamed: 0', 'Candidate', 'State', 'Primary_Vote_Percentage'], axis=1)

In [117]:
# Split into training and testing data
train_features, test_features, train_outcome, test_outcome = train_test_split(
    dems.drop('Primary_Status_Advanced', axis = 1),
    dems['Primary_Status_Advanced'],
    test_size = 0.3,
    random_state = 11
)

In [118]:
# Find best value for k-fold cross validation for Logistic Regression classifier
accuracies = {}
for k in range(2,15):
    lr_clf = LogisticRegressionCV(cv=k)
    lr_preds = lr_clf.fit(train_features, train_outcome).predict(test_features)
    accuracies[k] = lr_clf.score(test_features, test_outcome.values)
accuracies

{2: 0.7459016393442623,
 3: 0.75,
 4: 0.75,
 5: 0.7459016393442623,
 6: 0.7459016393442623,
 7: 0.7459016393442623,
 8: 0.7459016393442623,
 9: 0.7459016393442623,
 10: 0.7459016393442623,
 11: 0.7459016393442623,
 12: 0.7459016393442623,
 13: 0.7459016393442623,
 14: 0.7459016393442623}

In [119]:
lr_clf.get_params().keys()

dict_keys(['Cs', 'class_weight', 'cv', 'dual', 'fit_intercept', 'intercept_scaling', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'refit', 'scoring', 'solver', 'tol', 'verbose'])

In [132]:
# Create logistic regression classifier
linear_clf = LinearRegression()

# Build backward feature selection
sfs1 = sfs(linear_clf,
           k_features=10,
           floating=False,
           verbose=2,
           scoring='accuracy',
           cv=5)

sfs1 = sfs1.fit(train_features, train_outcome)

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [124]:
# Create logistic regression classifier
lr_clf = LogisticRegressionCV(cv=4)

# Build backward feature selection
sfs1 = sfs(lr_clf,
           k_features=10,
           floating=False,
           verbose=2,
           scoring='accuracy',
           cv=5)

sfs1 = sfs1.fit(train_features, train_outcome)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:    4.9s finished

[2018-12-01 16:40:12] Features: 1/10 -- score: 0.7106271642939592[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  23 out of  23 | elapsed:    7.4s finished

[2018-12-01 16:40:19] Features: 2/10 -- score: 0.7142285494420931[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  22 out of  22 | elapsed:    7.6s finished

[2018-12-01 16:40:27] Features: 3/10 -- score: 0.7320200076952673[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  21 out of  21 | elapsed:    8.2s finished

[2018-12-01 16:40:35] Features: 4/10 -- score: 0.7514274721046557[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    

In [127]:
feat_cols = list(sfs1.k_feature_idx_)

In [128]:
lr_clf.fit(train_features.iloc[:, feat_cols], train_outcome)
train_outcome_pred = lr_clf.predict(train_features.iloc[:, feat_cols])
lr_clf.score(train_features.iloc[:, feat_cols], train_outcome.values)

0.7513227513227513

In [129]:
train_outcome_pred = lr_clf.predict(test_features.iloc[:, feat_cols])
lr_clf.score(test_features.iloc[:, feat_cols], test_outcome.values)

0.7418032786885246