In [304]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [305]:
#read data
donations = pd.read_csv("contributions.csv")

In [306]:
#drop na rows
donations = donations[pd.notnull(donations['Transaction amt'])]
donations = donations[pd.notnull(donations['Won Primary'])]

#split data based on party

dem = donations[donations['Obama Alum?'].notnull()]
rep = donations[donations['Obama Alum?'].isnull()]

In [307]:


# clean data: replace string values with binary var
dem = dem[['Transaction amt','Won Primary', 'Obama Alum?','Party Support?', 'Emily Endorsed?',
       'Guns Sense Candidate?', 'Biden Endorsed?', 'Warren Endorsed? ',
       'Sanders Endorsed?', 'Our Revolution Endorsed?',
       'Justice Dems Endorsed?', 'PCCC Endorsed?', 'Indivisible Endorsed?',
       'WFP Endorsed?', 'VoteVets Endorsed?', 'No Labels Support?',
        'Rep Party Support?', 'Trump Endorsed?', 'Bannon Endorsed?',
       'Great America Endorsed?', 'NRA Endorsed?', 'Right to Life Endorsed?',
       'Susan B. Anthony Endorsed?', 'Club for Growth Endorsed?',
       'Koch Support?', 'House Freedom Support?', 'Tea Party Endorsed?',
       'Main Street Endorsed?', 'Chamber Endorsed?']].replace(['Yes', 'No'], [1, 0]).fillna(0)




# clean data: replace string values with binary var
rep = rep[['Transaction amt','Won Primary', 'Obama Alum?','Party Support?', 'Emily Endorsed?',
       'Guns Sense Candidate?', 'Biden Endorsed?', 'Warren Endorsed? ',
       'Sanders Endorsed?', 'Our Revolution Endorsed?',
       'Justice Dems Endorsed?', 'PCCC Endorsed?', 'Indivisible Endorsed?',
       'WFP Endorsed?', 'VoteVets Endorsed?', 'No Labels Support?',
        'Rep Party Support?', 'Trump Endorsed?', 'Bannon Endorsed?',
       'Great America Endorsed?', 'NRA Endorsed?', 'Right to Life Endorsed?',
       'Susan B. Anthony Endorsed?', 'Club for Growth Endorsed?',
       'Koch Support?', 'House Freedom Support?', 'Tea Party Endorsed?',
       'Main Street Endorsed?', 'Chamber Endorsed?']].replace(['Yes', 'No'], [1, 0]).fillna(0)


## GLM: Logistic Regression

### A. Democratic Election

#### Features: Transaction amt, Obama Alum, Biden Endorsed, Sanders Endorsed

We do not include "Obama Alum?" and similar credentials as features because they result in coeffficients very close to 0 after fitting a logistic regression model.

In [320]:
#split into x and y

#x: try features: Transaction amt and endorsements from top endorsers
# top endorsers as listed in the table by https://fivethirtyeight.com/features/the-establishment-is-beating-the-progressive-wing-in-democratic-primaries-so-far/

y = dem['Won Primary']
x = dem[['Transaction amt','Obama Alum?','Biden Endorsed?','Emily Endorsed?','PCCC Endorsed?']]



In [321]:
#split to train and test sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [322]:
fit_model = sm.Logit(y_train, X_train).fit_regularized()
print(fit_model.summary())

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.46679546073332123
            Iterations: 53
            Function evaluations: 61
            Gradient evaluations: 53
                           Logit Regression Results                           
Dep. Variable:            Won Primary   No. Observations:                 2916
Model:                          Logit   Df Residuals:                     2911
Method:                           MLE   Df Model:                            4
Date:                Mon, 10 May 2021   Pseudo R-squ.:                  0.1543
Time:                        20:47:34   Log-Likelihood:                -1361.2
converged:                       True   LL-Null:                       -1609.4
Covariance Type:            nonrobust   LLR p-value:                3.795e-106
                      coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


In [323]:
predicted = fit_model.predict(X_test)
p = list(map(round, predicted))
print('Test accuracy = ', accuracy_score(y_test, p))

Test accuracy =  0.6068493150684932


The feature "Biden Endorsed?" leads to a high standard error, since data on Biden's endorsements are rare; all 10 candidates receiving a Biden endorsement in 2018 won the election.

In [325]:
#x: try features: Transaction amt and endorsements from top non-Biden endorsers
# top endorsers as listed in the table by https://fivethirtyeight.com/features/the-establishment-is-beating-the-progressive-wing-in-democratic-primaries-so-far/

y = dem['Won Primary']
x = dem[['Transaction amt','Emily Endorsed?','PCCC Endorsed?']]

#split to train and test sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

fit_model = sm.Logit(y_train, X_train).fit_regularized()
print(fit_model.summary())

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.5228808055152547
            Iterations: 20
            Function evaluations: 28
            Gradient evaluations: 20
                           Logit Regression Results                           
Dep. Variable:            Won Primary   No. Observations:                 2916
Model:                          Logit   Df Residuals:                     2913
Method:                           MLE   Df Model:                            2
Date:                Mon, 10 May 2021   Pseudo R-squ.:                 0.05264
Time:                        20:47:48   Log-Likelihood:                -1524.7
converged:                       True   LL-Null:                       -1609.4
Covariance Type:            nonrobust   LLR p-value:                 1.619e-37
                      coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


In [326]:
predicted = fit_model.predict(X_test)
p = list(map(round, predicted))
print('Test accuracy = ', accuracy_score(y_test, p))

Test accuracy =  0.5726027397260274


When we include endorsements from top endorsers as a feature in our model, test accuracy decreases compared to when using just Obama alum status and contributions.

In [327]:
#x: try features: Transaction amt, Obama alum, and endorsements from top non-Biden endorsers
# top endorsers as listed in the table by https://fivethirtyeight.com/features/the-establishment-is-beating-the-progressive-wing-in-democratic-primaries-so-far/

y = dem['Won Primary']
x = dem[['Transaction amt']]

#split to train and test sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

fit_model = sm.Logit(y_train, X_train).fit_regularized()
print(fit_model.summary())

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.6927535908805859
            Iterations: 5
            Function evaluations: 13
            Gradient evaluations: 5
                           Logit Regression Results                           
Dep. Variable:            Won Primary   No. Observations:                 2916
Model:                          Logit   Df Residuals:                     2915
Method:                           MLE   Df Model:                            0
Date:                Mon, 10 May 2021   Pseudo R-squ.:                 -0.2551
Time:                        20:47:51   Log-Likelihood:                -2020.1
converged:                       True   LL-Null:                       -1609.4
Covariance Type:            nonrobust   LLR p-value:                       nan
                      coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


In [328]:
predicted = fit_model.predict(X_test)
p = list(map(round, predicted))
print('Test accuracy = ', accuracy_score(y_test, p))

Test accuracy =  0.7205479452054795


## Nonparametric method: Random Forest

In [329]:
forest = RandomForestClassifier(n_estimators=50, max_depth=8)
forest.fit(X_train, y_train)

RandomForestClassifier(max_depth=8, n_estimators=50)

In [330]:
y_pred_test = forest.predict(X_test)
accuracy_score(y_test, y_pred_test)

0.736986301369863

In [331]:
y_pred_train = forest.predict(X_train)
accuracy_score(y_train, y_pred_train)

0.7962962962962963

In [332]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.59      0.12      0.19       199
           1       0.75      0.97      0.84       531

    accuracy                           0.74       730
   macro avg       0.67      0.54      0.52       730
weighted avg       0.70      0.74      0.67       730

