## Logistic Regression

In [4]:
# Import libraries and load data
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

# Load in data
dems = pd.read_csv('../data/dem_with_dummies_renamed.csv').drop(['Unnamed: 0', 'Primary_Vote_Percentage'], axis=1)

In [5]:
# Split into training and testing data
cat_train_features, cat_test_features, train_outcome, test_outcome = train_test_split(
    dems.drop('Primary_Status_Advanced', axis = 1),
    dems['Primary_Status_Advanced'],
    test_size = 0.3,
    random_state = 11
)

In [6]:
train_features = cat_train_features.drop(['Candidate', 'State'], axis=1)
test_features = cat_test_features.drop(['Candidate', 'State'], axis=1)

In [7]:
# Find best value for k-fold cross validation for Logistic Regression classifier
accuracies = {}
for k in range(2,15):
    lr_clf = LogisticRegressionCV(cv=k)
    lr_preds = lr_clf.fit(train_features, train_outcome).predict(test_features)
    accuracies[k] = lr_clf.score(test_features, test_outcome.values)
accuracies

{2: 0.7459016393442623,
 3: 0.75,
 4: 0.75,
 5: 0.7459016393442623,
 6: 0.7459016393442623,
 7: 0.7459016393442623,
 8: 0.7459016393442623,
 9: 0.7459016393442623,
 10: 0.7459016393442623,
 11: 0.7459016393442623,
 12: 0.7459016393442623,
 13: 0.7459016393442623,
 14: 0.7459016393442623}

In [8]:
# Create logistic regression classifier
lr_clf = LogisticRegressionCV(cv=4)

# Build and execute backward feature selection
sfs1 = sfs(lr_clf,
           k_features=10,
           floating=False,
           verbose=2,
           scoring='accuracy',
           cv=5)

sfs1 = sfs1.fit(train_features, train_outcome)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:   11.3s finished

[2018-12-03 23:17:31] Features: 1/10 -- score: 0.7106271642939592[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  23 out of  23 | elapsed:   17.6s finished

[2018-12-03 23:17:48] Features: 2/10 -- score: 0.7142285494420931[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  22 out of  22 | elapsed:   27.2s finished

[2018-12-03 23:18:16] Features: 3/10 -- score: 0.7320200076952673[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  21 out of  21 | elapsed:   31.3s finished

[2018-12-03 23:18:47] Features: 4/10 -- score: 0.7514274721046557[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:   2

In [10]:
feat_cols = list(sfs1.k_feature_idx_)

# View feature names selected
sfs1.k_feature_names_

('Neutral_Endorsements',
 'Yes_Endorsements',
 'No_Endorsements',
 'Race_Unknown',
 'Veteran__No',
 'Veteran__Unknown',
 'Veteran__Yes',
 'LGBTQ__Unknown',
 'STEM__Unknown',
 'Party_Support__Yes')

In [11]:
lr_clf.fit(train_features.iloc[:, feat_cols], train_outcome)
train_outcome_pred = lr_clf.predict(train_features.iloc[:, feat_cols])
lr_clf.score(train_features.iloc[:, feat_cols], train_outcome.values)

0.7513227513227513

In [12]:
test_outcome_pred = lr_clf.predict(test_features.iloc[:, feat_cols])
lr_clf.score(test_features.iloc[:, feat_cols], test_outcome.values)

0.7418032786885246

In [13]:
test_outcome.values

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0])

In [14]:
test_outcome_pred

array([0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0])

In [15]:
# Create data frame of test outcomes and predictions for candidates in test set
test_results = cat_test_features.iloc[:,0:2]
test_results['outcome'] = test_outcome.values
test_results['lr_pred'] = test_outcome_pred

In [16]:
test_results

Unnamed: 0,Candidate,State,outcome,lr_pred
274,Andrew Duck,MD,0,0
123,Bobby Kaple,GA,0,0
25,Brianna Westbrook,AZ,0,1
522,Janet Everhard,OH,0,0
46,Mike Barkley,CA,0,0
576,Marc Friedenberg,PA,1,1
424,Henry Thorns,NV,0,0
588,Madeleine Dean,PA,1,1
622,Eric Graben,SC,0,0
288,Mark Eves,ME,0,0


In [13]:
# Export results to csv
test_results.to_csv('../logistic_test_predictions.csv')

In [14]:
# Create data frame of statistical model scores
stats_model_scores = {'Model Type': ['Logistic regression'], 'Score':[ lr_clf.score(test_features.iloc[:, feat_cols], test_outcome.values)]}
stats_scores_df = pd.DataFrame(stats_model_scores)
stats_scores_df

Unnamed: 0,Model Type,Score
0,Logistic regression,0.741803


In [177]:
# Export scores to csv
stats_scores_df.to_csv('../model_accuracy_stats.csv')

In [17]:
#make a confusion matrix
confm = metrics.confusion_matrix(test_results['outcome'] , test_results['lr_pred'])

In [18]:
confm

array([[157,  13],
       [ 50,  24]])

In [19]:
#calculate the sensitivity, specificity, and type1 and type2 errors 
sensitivity = confm[1, 1] / (confm[1,1] + confm[1,0])
print(sensitivity)
specificity = confm[0,0]/ (confm[0,0] + confm[0, 1])
print(specificity)
type1error = 1 - specificity
print(type1error)
type2error = 1 - sensitivity
print(type2error)

0.32432432432432434
0.9235294117647059
0.07647058823529407
0.6756756756756757
