In [143]:
# import packages
!pip install dmba
import dmba
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import matplotlib.pylab as plt
from dmba import plotDecisionTree, classificationSummary, regressionSummary

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [144]:
# load dataset
voter_df = pd.read_csv('FX_indicators_2020.csv', on_bad_lines='skip')

In [145]:
voter_df.head()

Unnamed: 0,VOTER_ID,SET_NO,OPP_SEX,AGE,HH_ND,HH_NR,HH_NI,MED_AGE,MED_AGE_M,MED_AGE_F,...,MOVED_RDMA,MOVED_DRMA,MOVED_AWMA,MOVED_ADMA,MOVED_ARMA,MOVED_RDMB,MOVED_DRMB,MOVED_AWMB,MOVED_ADMB,MOVED_ARMB
0,84508,3,0,25,4,0,0,38,39,38,...,,,,,,,,,,
1,608312,1,0,35,0,0,3,46,46,46,...,,,,,,,,,,
2,222821,3,0,73,3,0,0,42,36,48,...,,,,,,,,,,
3,137882,2,0,54,1,1,0,37,34,51,...,,,,,,,,,,
4,531303,2,0,51,0,3,0,46,46,46,...,,,,,,,,,,


In [146]:
voter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384987 entries, 0 to 384986
Columns: 152 entries, VOTER_ID to MOVED_ARMB
dtypes: float64(2), int64(105), object(45)
memory usage: 446.5+ MB


In [147]:
column_names = voter_df.columns.tolist()
column_names

['VOTER_ID',
 'SET_NO',
 'OPP_SEX',
 'AGE',
 'HH_ND',
 'HH_NR',
 'HH_NI',
 'MED_AGE',
 'MED_AGE_M',
 'MED_AGE_F',
 'NH_WHITE',
 'NH_AA',
 'NH_NATAM',
 'NH_ASIAN',
 'NH_HPI',
 'NH_OTHER',
 'NH_MULT',
 'HISP',
 'COMM_LT10',
 'COMM_609P',
 'MED_HH_INC',
 'COMM_CAR',
 'COMM_CP',
 'COMM_PT',
 'COMM_WALK',
 'KIDS',
 'KIDS_MC',
 'M_NEV_MAR',
 'M_MAR',
 'M_MAR_SP',
 'M_MAR_SNP',
 'F_NEV_MAR',
 'F_MAR',
 'F_MAR_SP',
 'F_MAR_SNP',
 'ED_ASSOC',
 'ED_BACH',
 'ED_MD',
 'ED_PROF',
 'ED_DOC',
 'ED_4COL',
 'GENDER_F',
 'GENDER_M',
 'H_AFDLN3P',
 'H_AFSSLN3P',
 'H_F1',
 'H_FFDLN2',
 'H_FFSLN2',
 'H_M1',
 'H_MFDLN2',
 'H_MFDLN3P',
 'H_MFSLN2',
 'H_MFSLN3P',
 'H_MFSSLN3P',
 'H_MMDLN2',
 'H_MMSLN2',
 'PARTY_D',
 'PARTY_I',
 'PARTY_R',
 'HHP_D',
 'HHP_DD',
 'HHP_DI',
 'HHP_DR',
 'HHP_I',
 'HHP_II',
 'HHP_R',
 'HHP_RI',
 'HHP_RR',
 'VPP_12',
 'VPP_16',
 'VPR_12',
 'VPR_14',
 'VPR_16',
 'VG_08',
 'VG_10',
 'VG_12',
 'VG_14',
 'VG_16',
 'PP_PELIG',
 'PR_PELIG',
 'AP_PELIG',
 'G_PELIG',
 'E_PELIG',
 'NL5G',
 '

In [148]:
voter_df = voter_df.fillna(0)

In [149]:
voter_df.head()

Unnamed: 0,VOTER_ID,SET_NO,OPP_SEX,AGE,HH_ND,HH_NR,HH_NI,MED_AGE,MED_AGE_M,MED_AGE_F,...,MOVED_RDMA,MOVED_DRMA,MOVED_AWMA,MOVED_ADMA,MOVED_ARMA,MOVED_RDMB,MOVED_DRMB,MOVED_AWMB,MOVED_ADMB,MOVED_ARMB
0,84508,3,0,25,4,0,0,38,39,38,...,0,0,0,0,0,0,0,0,0,0
1,608312,1,0,35,0,0,3,46,46,46,...,0,0,0,0,0,0,0,0,0,0
2,222821,3,0,73,3,0,0,42,36,48,...,0,0,0,0,0,0,0,0,0,0
3,137882,2,0,54,1,1,0,37,34,51,...,0,0,0,0,0,0,0,0,0,0
4,531303,2,0,51,0,3,0,46,46,46,...,0,0,0,0,0,0,0,0,0,0


In [150]:
voter_df = voter_df.replace({'Y': 1, 'N': 0})

In [151]:
voter_df = voter_df.apply(pd.to_numeric, errors='coerce').fillna(0)

In [152]:
# predictors = ['AGE', 'HH_ND', 'HH_NR', 'HH_NI', 'MED_AGE', 'MED_AGE_M', 'MED_AGE_F',  'NH_WHITE', 'NH_AA', 'NH_NATAM', 'NH_ASIAN', 'NH_HPI', 'NH_OTHER', 'NH_MULT', 'HISP', 'COMM_LT10', 'COMM_609P', 'MED_HH_INC', 'COMM_CAR', 'COMM_CP', 'COMM_PT', 'COMM_WALK', 'KIDS', 'KIDS_MC', 'M_NEV_MAR', 'M_MAR', 'M_MAR_SP', 'M_MAR_SNP', 'F_NEV_MAR', 'F_MAR', 'F_MAR_SP', 'F_MAR_SNP', 'ED_ASSOC', 'ED_BACH', 'ED_MD', 'ED_PROF', 'ED_DOC', 'ED_4COL', 'GENDER_F', 'GENDER_M', 'H_AFDLN3P', 'H_AFSSLN3P', 'H_F1', 'H_FFDLN2', 'H_FFSLN2', 'H_M1', 'H_MFDLN2', 'H_MFDLN3P', 'H_MFSLN2', 'H_MFSLN3P', 'H_MFSSLN3P', 'H_MMDLN2', 'H_MMSLN2']
outcome = 'PARTY_D'

In [153]:
voter_df.drop(voter_df[~voter_df['PARTY_D'].isin([0, 1,'0','1'])].index, inplace=True)

In [154]:
X = pd.get_dummies(voter_df.drop(['PARTY_D', 'PARTY_I', 'PARTY_R', 'D3', 'D2', 'R2', 'R3','I3'], axis=1))
# X = voter_df[predictors]
y = voter_df[outcome]
y=y.astype('int')

In [155]:
# partition the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.4, random_state = 1)

In [156]:
dtc = DecisionTreeClassifier(max_depth = 20, random_state = 1)

In [157]:
# train the model
dtc.fit(X_train, y_train)

In [158]:
classificationSummary(y_train, dtc.predict(X_train))

Confusion Matrix (Accuracy 0.7286)

       Prediction
Actual      0      1
     0 100589  20417
     1  42275  67711


In [159]:
classificationSummary(y_valid, dtc.predict(X_valid))

Confusion Matrix (Accuracy 0.5716)

       Prediction
Actual     0     1
     0 55067 25579
     1 40394 32955


In [160]:
importances = dtc.feature_importances_

In [161]:
importance_df = pd.DataFrame({'feature': X_train.columns, 'importance':importances})
importance_df = importance_df.sort_values('importance')
print(importance_df)

        feature  importance
104      CAND1S    0.000000
106     MESSAGE    0.000000
91   RELIGIOUSM    0.000000
105      CAND2S    0.000000
129    MOVED_DR    0.000000
..          ...         ...
100    PRS16_PD    0.023041
3           AGE    0.066730
84     REG_DAYS    0.077966
0      VOTER_ID    0.113461
101    PRS16_PR    0.201651

[144 rows x 2 columns]


In [162]:
# Initially, the model was run with every column except the 3 party columns. However, it seemed that the D2, D3, R2, R3, and I3 columns were too closely related (near 100% for D3 and D2) to the dependent variable PARTY_D 
# So I ran the analysis again with the D2, D3, R2, R3, and I3 columns excluded

In [163]:
voter_ids = X_valid['VOTER_ID']
model_scores = dtc.predict_proba(X_valid)[:, 1] * 100
output = pd.DataFrame({'VOTER_ID': voter_ids, 'model_score': model_scores})
output.to_csv('dtc_model_scores.csv', sep='\t', index=False)

In [164]:
output['model_score'].min()

0.0

In [165]:
output['model_score'].max()

100.0

In [166]:
output['model_score'].value_counts()

0.000000      18320
100.000000    17819
45.761060      6758
42.434870      3978
31.057112      3944
              ...  
31.578947         8
64.705882         8
21.052632         8
79.310345         7
72.222222         5
Name: model_score, Length: 800, dtype: int64

In [167]:
# training logistic regression model
lr = LogisticRegression()

In [168]:
lr.fit(X_train, y_train)

In [169]:
classificationSummary(y_train, lr.predict(X_train))

Confusion Matrix (Accuracy 0.5239)

       Prediction
Actual      0      1
     0 120999      7
     1 109979      7


In [170]:
classificationSummary(y_valid, lr.predict(X_valid))

Confusion Matrix (Accuracy 0.5237)

       Prediction
Actual     0     1
     0 80641     5
     1 73343     6


In [171]:
voter_ids = X_valid['VOTER_ID']
model_scores = lr.predict_proba(X_valid)[:, 1] * 100
output = pd.DataFrame({'VOTER_ID': voter_ids, 'model_score': model_scores})
output.to_csv('lr_model_scores.csv', sep='\t', index=False)

In [172]:
output['model_score'].min()

43.58631798937074

In [173]:
output['model_score'].max()

50.12167918770909

In [174]:
output['model_score'].value_counts()

48.247220    1
47.323185    1
46.227673    1
48.481906    1
48.672653    1
            ..
48.803163    1
47.697353    1
46.515515    1
48.216542    1
47.404076    1
Name: model_score, Length: 153995, dtype: int64

In [175]:
# create model to predict turnout
outcome = 'VG_10'

In [176]:
X = pd.get_dummies(voter_df.drop([ 'VPP_12', 'VPP_16', 'VPR_12', 'VPR_14', 'VPR_16', 'VG_08', 'VG_10', 'VG_12', 'VG_14', 'VG_16','PARTY_D', 'PARTY_I', 'PARTY_R', 'D3', 'D2', 'R2', 'R3','I3'], axis=1))
y = voter_df[outcome]
y=y.astype('int')

In [177]:
# partition the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.4, random_state = 1)

In [178]:
# train the model
dtc.fit(X_train, y_train)

In [179]:
classificationSummary(y_train, dtc.predict(X_train))

Confusion Matrix (Accuracy 0.9834)

       Prediction
Actual      0      1
     0 153205    242
     1   3595  73950


In [180]:
classificationSummary(y_valid, dtc.predict(X_valid))

Confusion Matrix (Accuracy 0.9482)

       Prediction
Actual     0     1
     0 99312  3009
     1  4967 46707


In [181]:
importances = dtc.feature_importances_

In [182]:
importance_df = pd.DataFrame({'feature': X_train.columns, 'importance':importances})
importance_df = importance_df.sort_values('importance')
print(importance_df)

        feature  importance
96      MESSAGE    0.000000
95       CAND2S    0.000000
130  MOVED_DRMB    0.000000
128  MOVED_ARMA    0.000000
94       CAND1S    0.000000
..          ...         ...
0      VOTER_ID    0.009488
3           AGE    0.012365
68      G_PELIG    0.038173
97     VG_14_DV    0.049176
70         NL5G    0.821874

[134 rows x 2 columns]


In [183]:
voter_ids = X_valid['VOTER_ID']
model_scores = dtc.predict_proba(X_valid)[:, 1] * 100
output = pd.DataFrame({'VOTER_ID': voter_ids, 'model_score': model_scores})
output.to_csv('dtc_turnout_scores.csv', sep='\t', index=False)

In [184]:
lr.fit(X_train, y_train)

In [185]:
classificationSummary(y_train, lr.predict(X_train))

Confusion Matrix (Accuracy 0.8560)

       Prediction
Actual      0      1
     0 135226  18221
     1  15032  62513


In [186]:
classificationSummary(y_valid, lr.predict(X_valid))

Confusion Matrix (Accuracy 0.8561)

       Prediction
Actual     0     1
     0 90282 12039
     1 10117 41557


In [187]:
voter_ids = X_valid['VOTER_ID']
model_scores = lr.predict_proba(X_valid)[:, 1] * 100
output = pd.DataFrame({'VOTER_ID': voter_ids, 'model_score': model_scores})
output.to_csv('lr_turnout_scores.csv', sep='\t', index=False)

In [188]:
# prepare smaller dataset for use with the models from questions 1 and 2
small_df = pd.read_csv('FX_indicators_2020_rand_10k.csv', on_bad_lines='skip')

In [189]:
small_df = small_df.fillna(0)

In [190]:
small_df = small_df.replace({'Y': 1, 'N': 0})

In [191]:
small_df = small_df.apply(pd.to_numeric, errors='coerce').fillna(0)

In [192]:
small_df.head()

Unnamed: 0,VOTER_ID,SET_NO,OPP_SEX,AGE,HH_ND,HH_NR,HH_NI,MED_AGE,MED_AGE_M,MED_AGE_F,...,MOVED_RDMA,MOVED_DRMA,MOVED_AWMA,MOVED_ADMA,MOVED_ARMA,MOVED_RDMB,MOVED_DRMB,MOVED_AWMB,MOVED_ADMB,MOVED_ARMB
0,84508,3,0,25,4,0,0,38,39,38,...,0,0,0,0,0,0,0,0,0,0
1,35758,2,0,45,3,0,1,34,33,35,...,0,0,0,0,0,0,0,0,0,0
2,32007,2,0,23,1,1,0,37,34,51,...,0,0,0,0,0,0,0,0,0,0
3,631210,1,0,82,1,0,0,44,40,51,...,0,0,0,0,0,0,0,0,0,0
4,154033,1,55,78,0,2,0,42,36,48,...,0,0,0,0,0,0,0,0,0,0


In [193]:
small_df.drop(small_df[~small_df['PARTY_D'].isin([0, 1,'0','1'])].index, inplace=True)

In [194]:
X = pd.get_dummies(small_df.drop(['PARTY_D', 'PARTY_I', 'PARTY_R', 'D3', 'D2', 'R2', 'R3','I3', 'VG_10'], axis=1))
# X = voter_df[predictors]
y = small_df[outcome]
y=y.astype('int')

In [195]:
# partition the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.4, random_state = 1)

In [196]:
# train the model
dtc.fit(X_train, y_train)

In [197]:
classificationSummary(y_train, dtc.predict(X_train))

Confusion Matrix (Accuracy 1.0000)

       Prediction
Actual    0    1
     0 4031    0
     1    0 1969


In [198]:
classificationSummary(y_valid, dtc.predict(X_valid))

Confusion Matrix (Accuracy 0.9962)

       Prediction
Actual    0    1
     0 2641    7
     1    8 1344


In [199]:
importances = dtc.feature_importances_

In [200]:
importance_df = pd.DataFrame({'feature': X_train.columns, 'importance':importances})
importance_df = importance_df.sort_values('importance')
print(importance_df)

       feature  importance
0     VOTER_ID    0.000000
107  CAND1_UND    0.000000
105    MESSAGE    0.000000
104     CAND2S    0.000000
103     CAND1S    0.000000
..         ...         ...
71       VG_12    0.035109
73       VG_16    0.050010
72       VG_14    0.052312
70       VG_08    0.057098
79        NL5G    0.746561

[143 rows x 2 columns]


In [201]:
voter_ids = X_valid['VOTER_ID']
model_scores = dtc.predict_proba(X_valid)[:, 1] * 100
output = pd.DataFrame({'VOTER_ID': voter_ids, 'model_score': model_scores})
output.to_csv('small_dtc_model_scores.csv', sep='\t', index=False)

In [202]:
output['model_score'].min()

0.0

In [203]:
output['model_score'].max()

100.0

In [204]:
output['model_score'].value_counts()

0.0      2649
100.0    1351
Name: model_score, dtype: int64

In [205]:
# training logistic regression model
lr = LogisticRegression()

In [206]:
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [207]:
classificationSummary(y_train, lr.predict(X_train))

Confusion Matrix (Accuracy 0.8447)

       Prediction
Actual    0    1
     0 3585  446
     1  486 1483


In [208]:
classificationSummary(y_valid, lr.predict(X_valid))

Confusion Matrix (Accuracy 0.8448)

       Prediction
Actual    0    1
     0 2353  295
     1  326 1026


In [209]:
voter_ids = X_valid['VOTER_ID']
model_scores = lr.predict_proba(X_valid)[:, 1] * 100
output = pd.DataFrame({'VOTER_ID': voter_ids, 'model_score': model_scores})
output.to_csv('small_lr_model_scores.csv', sep='\t', index=False)

In [210]:
output['model_score'].min()

0.05782679464133481

In [211]:
output['model_score'].max()

99.43140368432428

In [212]:
output['model_score'].value_counts()

64.881032    1
42.653375    1
65.288987    1
4.690406     1
55.825653    1
            ..
80.981870    1
70.751866    1
13.827898    1
39.512842    1
0.221002     1
Name: model_score, Length: 4000, dtype: int64