In [1]:
# import packages
!pip install dmba
import dmba
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pylab as plt
from dmba import plotDecisionTree, classificationSummary, regressionSummary

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dmba
  Downloading dmba-0.1.0-py3-none-any.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dmba
Successfully installed dmba-0.1.0
no display found. Using non-interactive Agg backend


In [2]:
# load dataset
voter_df = pd.read_csv('FX_indicators_2020.csv', on_bad_lines='skip')

In [3]:
voter_df.head()

Unnamed: 0,VOTER_ID,SET_NO,OPP_SEX,AGE,HH_ND,HH_NR,HH_NI,MED_AGE,MED_AGE_M,MED_AGE_F,...,MOVED_RDMA,MOVED_DRMA,MOVED_AWMA,MOVED_ADMA,MOVED_ARMA,MOVED_RDMB,MOVED_DRMB,MOVED_AWMB,MOVED_ADMB,MOVED_ARMB
0,84508,3,0,25,4,0,0,38,39,38,...,,,,,,,,,,
1,608312,1,0,35,0,0,3,46,46,46,...,,,,,,,,,,
2,222821,3,0,73,3,0,0,42,36,48,...,,,,,,,,,,
3,137882,2,0,54,1,1,0,37,34,51,...,,,,,,,,,,
4,531303,2,0,51,0,3,0,46,46,46,...,,,,,,,,,,


In [4]:
voter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54311 entries, 0 to 54310
Columns: 152 entries, VOTER_ID to MOVED_ARMB
dtypes: float64(28), int64(79), object(45)
memory usage: 63.0+ MB


In [5]:
column_names = voter_df.columns.tolist()
column_names

['VOTER_ID',
 'SET_NO',
 'OPP_SEX',
 'AGE',
 'HH_ND',
 'HH_NR',
 'HH_NI',
 'MED_AGE',
 'MED_AGE_M',
 'MED_AGE_F',
 'NH_WHITE',
 'NH_AA',
 'NH_NATAM',
 'NH_ASIAN',
 'NH_HPI',
 'NH_OTHER',
 'NH_MULT',
 'HISP',
 'COMM_LT10',
 'COMM_609P',
 'MED_HH_INC',
 'COMM_CAR',
 'COMM_CP',
 'COMM_PT',
 'COMM_WALK',
 'KIDS',
 'KIDS_MC',
 'M_NEV_MAR',
 'M_MAR',
 'M_MAR_SP',
 'M_MAR_SNP',
 'F_NEV_MAR',
 'F_MAR',
 'F_MAR_SP',
 'F_MAR_SNP',
 'ED_ASSOC',
 'ED_BACH',
 'ED_MD',
 'ED_PROF',
 'ED_DOC',
 'ED_4COL',
 'GENDER_F',
 'GENDER_M',
 'H_AFDLN3P',
 'H_AFSSLN3P',
 'H_F1',
 'H_FFDLN2',
 'H_FFSLN2',
 'H_M1',
 'H_MFDLN2',
 'H_MFDLN3P',
 'H_MFSLN2',
 'H_MFSLN3P',
 'H_MFSSLN3P',
 'H_MMDLN2',
 'H_MMSLN2',
 'PARTY_D',
 'PARTY_I',
 'PARTY_R',
 'HHP_D',
 'HHP_DD',
 'HHP_DI',
 'HHP_DR',
 'HHP_I',
 'HHP_II',
 'HHP_R',
 'HHP_RI',
 'HHP_RR',
 'VPP_12',
 'VPP_16',
 'VPR_12',
 'VPR_14',
 'VPR_16',
 'VG_08',
 'VG_10',
 'VG_12',
 'VG_14',
 'VG_16',
 'PP_PELIG',
 'PR_PELIG',
 'AP_PELIG',
 'G_PELIG',
 'E_PELIG',
 'NL5G',
 '

In [6]:
voter_df = voter_df.fillna(0)

In [7]:
voter_df.head()

Unnamed: 0,VOTER_ID,SET_NO,OPP_SEX,AGE,HH_ND,HH_NR,HH_NI,MED_AGE,MED_AGE_M,MED_AGE_F,...,MOVED_RDMA,MOVED_DRMA,MOVED_AWMA,MOVED_ADMA,MOVED_ARMA,MOVED_RDMB,MOVED_DRMB,MOVED_AWMB,MOVED_ADMB,MOVED_ARMB
0,84508,3,0,25,4,0,0,38,39,38,...,0,0,0,0,0,0,0,0,0,0
1,608312,1,0,35,0,0,3,46,46,46,...,0,0,0,0,0,0,0,0,0,0
2,222821,3,0,73,3,0,0,42,36,48,...,0,0,0,0,0,0,0,0,0,0
3,137882,2,0,54,1,1,0,37,34,51,...,0,0,0,0,0,0,0,0,0,0
4,531303,2,0,51,0,3,0,46,46,46,...,0,0,0,0,0,0,0,0,0,0


In [8]:
voter_df = voter_df.replace({'Y': 1, 'N': 0})

In [9]:
voter_df = voter_df.apply(pd.to_numeric, errors='coerce').fillna(0)

In [10]:
# predictors = ['AGE', 'HH_ND', 'HH_NR', 'HH_NI', 'MED_AGE', 'MED_AGE_M', 'MED_AGE_F',  'NH_WHITE', 'NH_AA', 'NH_NATAM', 'NH_ASIAN', 'NH_HPI', 'NH_OTHER', 'NH_MULT', 'HISP', 'COMM_LT10', 'COMM_609P', 'MED_HH_INC', 'COMM_CAR', 'COMM_CP', 'COMM_PT', 'COMM_WALK', 'KIDS', 'KIDS_MC', 'M_NEV_MAR', 'M_MAR', 'M_MAR_SP', 'M_MAR_SNP', 'F_NEV_MAR', 'F_MAR', 'F_MAR_SP', 'F_MAR_SNP', 'ED_ASSOC', 'ED_BACH', 'ED_MD', 'ED_PROF', 'ED_DOC', 'ED_4COL', 'GENDER_F', 'GENDER_M', 'H_AFDLN3P', 'H_AFSSLN3P', 'H_F1', 'H_FFDLN2', 'H_FFSLN2', 'H_M1', 'H_MFDLN2', 'H_MFDLN3P', 'H_MFSLN2', 'H_MFSLN3P', 'H_MFSSLN3P', 'H_MMDLN2', 'H_MMSLN2']
outcome = 'PARTY_D'

In [11]:
voter_df.drop(voter_df[~voter_df['PARTY_D'].isin([0, 1,'0','1'])].index, inplace=True)

In [12]:
X = pd.get_dummies(voter_df.drop(['PARTY_D', 'PARTY_I', 'PARTY_R', 'D3', 'D2', 'R2', 'R3','I3'], axis=1))
y = voter_df[outcome]
y=y.astype('int')

In [13]:
# partition the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.4, random_state = 1)

In [14]:
dtc = DecisionTreeClassifier(max_depth = 20, random_state = 1)

In [15]:
# train the model
dtc.fit(X_train, y_train)

In [16]:
classificationSummary(y_train, dtc.predict(X_train))

Confusion Matrix (Accuracy 0.8583)

       Prediction
Actual     0     1
     0 15840  1249
     1  3370 12127


In [17]:
classificationSummary(y_valid, dtc.predict(X_valid))

Confusion Matrix (Accuracy 0.5481)

       Prediction
Actual    0    1
     0 7306 4247
     1 5571 4601


In [18]:
importances = dtc.feature_importances_

In [19]:
importance_df = pd.DataFrame({'feature': X_train.columns, 'importance':importances})
importance_df = importance_df.sort_values('importance')
print(importance_df)

        feature  importance
85   UPSCALEBUY    0.000000
105      CAND2S    0.000000
104      CAND1S    0.000000
140  MOVED_DRMB    0.000000
139  MOVED_RDMB    0.000000
..          ...         ...
100    PRS16_PD    0.031967
3           AGE    0.069349
84     REG_DAYS    0.091167
0      VOTER_ID    0.106457
101    PRS16_PR    0.126582

[144 rows x 2 columns]


In [20]:
# Initially, the model was run with every column except the 3 party columns. However, it seemed that the D2, D3, R2, R3, and I3 columns were too closely related (near 100% for D3 and D2) to the dependent variable PARTY_D 
# So I ran the analysis again with the D2, D3, R2, R3, and I3 columns excluded

In [21]:
voter_ids = X_valid['VOTER_ID']
model_scores = dtc.predict_proba(X_valid)[:, 1] * 100
output = pd.DataFrame({'VOTER_ID': voter_ids, 'model_score': model_scores, 'partisanship': y_valid})

In [22]:
output['model_score'].min()

0.0

In [23]:
output['model_score'].max()

100.0

In [24]:
output['model_score'].value_counts()

100.000000    6171
0.000000      5989
41.120886     2124
27.240978      588
61.809045      246
              ... 
86.666667        5
62.500000        5
61.538462        4
58.333333        4
54.545455        3
Name: model_score, Length: 189, dtype: int64

In [25]:
# introduce a small amount of noise in order to prevent an error that arises when bin edges are not unique
output['model_score'] += np.random.normal(0, 0.0001, len(output))

In [26]:
output['quintile'] = pd.qcut(output['model_score'], q=5, labels=['Quintile 1', 'Quintile 2', 'Quintile 3', 'Quintile 4', 'Quintile 5'])

print(output.head())

       VOTER_ID  model_score  partisanship    quintile
28035    324817   100.000027             1  Quintile 5
21205    346173    55.319074             1  Quintile 4
39319    206216    41.120866             0  Quintile 3
23432    278908     0.000004             0  Quintile 1
2951     585259    41.120843             0  Quintile 3


In [27]:
# calculate y percentage for each quantile in the test set
test_set_y_percent = output.groupby('quintile')['partisanship'].mean()

In [28]:
# calculate y percentage for the whole test set
test_set_y_total_percent = y_valid.mean()

In [29]:
print(test_set_y_percent)
print('Test set Y percentage for entire test set:', test_set_y_total_percent)

quintile
Quintile 1    0.467894
Quintile 2    0.412428
Quintile 3    0.416571
Quintile 4    0.504948
Quintile 5    0.539241
Name: partisanship, dtype: float64
Test set Y percentage for entire test set: 0.46821634062140394


In [30]:
lift = test_set_y_percent / test_set_y_total_percent

In [31]:
# Question 1 answer:
print(lift)

quintile
Quintile 1    0.999312
Quintile 2    0.880849
Quintile 3    0.889697
Quintile 4    1.078451
Quintile 5    1.151691
Name: partisanship, dtype: float64


In [32]:
# training logistic regression model
lr = LogisticRegression()

In [33]:
lr.fit(X_train, y_train)

In [34]:
classificationSummary(y_train, lr.predict(X_train))

Confusion Matrix (Accuracy 0.5245)

       Prediction
Actual     0     1
     0 17084     5
     1 15489     8


In [35]:
classificationSummary(y_valid, lr.predict(X_valid))

Confusion Matrix (Accuracy 0.5317)

       Prediction
Actual     0     1
     0 11547     6
     1 10168     4


In [36]:
voter_ids = X_valid['VOTER_ID']
model_scores = lr.predict_proba(X_valid)[:, 1] * 100
output = pd.DataFrame({'VOTER_ID': voter_ids, 'model_score': model_scores, 'partisanship': y_valid})

In [37]:
output['model_score'].min()

42.4405340233531

In [38]:
output['model_score'].max()

50.71249155731452

In [39]:
output['model_score'].value_counts()

48.075866    1
49.414825    1
47.911459    1
45.621799    1
47.753229    1
            ..
48.379408    1
47.510973    1
47.099470    1
45.622787    1
48.251537    1
Name: model_score, Length: 21725, dtype: int64

In [40]:
# introduce a small amount of noise in order to prevent an error that arises when bin edges are not unique
output['model_score'] += np.random.normal(0, 0.0001, len(output))

In [41]:
output['quintile'] = pd.qcut(output['model_score'], q=5, labels=['Quintile 1', 'Quintile 2', 'Quintile 3', 'Quintile 4', 'Quintile 5'])

print(output.head())

       VOTER_ID  model_score  partisanship    quintile
28035    324817    48.075902             1  Quintile 3
21205    346173    48.040476             1  Quintile 3
39319    206216    47.913450             0  Quintile 3
23432    278908    47.965127             0  Quintile 3
2951     585259    47.791430             0  Quintile 3


In [42]:
# calculate y percentage for each quantile in the test set
test_set_y_percent = output.groupby('quintile')['partisanship'].mean()

In [43]:
# calculate y percentage for the whole test set
test_set_y_total_percent = y_valid.mean()

In [44]:
print(test_set_y_percent)
print('Test set Y percentage for entire test set:', test_set_y_total_percent)

quintile
Quintile 1    0.464212
Quintile 2    0.468354
Quintile 3    0.461680
Quintile 4    0.479402
Quintile 5    0.467434
Name: partisanship, dtype: float64
Test set Y percentage for entire test set: 0.46821634062140394


In [45]:
lift = test_set_y_percent / test_set_y_total_percent

In [46]:
# Question 1 answer:
print(lift)

quintile
Quintile 1    0.991447
Quintile 2    1.000295
Quintile 3    0.986040
Quintile 4    1.023889
Quintile 5    0.998329
Name: partisanship, dtype: float64


In [47]:
# Question 2 answer: combine two models into an ensemble to make predictions
# gather predictions to combine into an ensemble model
dtc_predictions = dtc.predict(X_valid)
lr_predictions = lr.predict(X_valid)
dtc_scores = dtc.predict_proba(X_valid)[:, 1] * 100
lr_scores = lr.predict_proba(X_valid)[:, 1] * 100

# assign weights to each model based on their accuracy
dtc_weight = 0.55
lr_weight = 0.45

# combine predictions using a weighted average
ensemble_predictions = (dtc_weight * dtc_predictions) + (lr_weight * lr_predictions)
ensemble_scores = (dtc_weight * dtc_scores) + (lr_weight * lr_scores)

# set threshold for final prediction
threshold = 0.5

# generate final predictions based on the threshold
final_predictions = np.where(ensemble_predictions > threshold, 1, 0)


In [48]:
classificationSummary(y_valid, final_predictions)

Confusion Matrix (Accuracy 0.5481)

       Prediction
Actual    0    1
     0 7306 4247
     1 5571 4601


In [49]:
voter_ids = X_valid['VOTER_ID']
model_scores = ensemble_predictions * 100
output = pd.DataFrame({'VOTER_ID': voter_ids, 'model_score': ensemble_scores, 'partisanship': y_valid})

In [50]:
output['model_score'].value_counts()

76.634140    1
22.236671    1
21.560157    1
22.921114    1
76.488953    1
            ..
21.770734    1
43.996426    1
21.194761    1
40.995370    1
44.329679    1
Name: model_score, Length: 21725, dtype: int64

In [51]:
# introduce a small amount of noise in order to prevent an error that arises when bin edges are not unique
output['model_score'] += np.random.normal(0, 0.0001, len(output))

In [52]:
output['quintile'] = pd.qcut(output['model_score'], q=5, labels=['Quintile 1', 'Quintile 2', 'Quintile 3', 'Quintile 4', 'Quintile 5'])

print(output.head())

       VOTER_ID  model_score  partisanship    quintile
28035    324817    76.634246             1  Quintile 5
21205    346173    52.043600             1  Quintile 4
39319    206216    44.177514             0  Quintile 3
23432    278908    21.584361             0  Quintile 1
2951     585259    44.122687             0  Quintile 3


In [53]:
# calculate y percentage for each quantile in the test set
test_set_y_percent = output.groupby('quintile')['partisanship'].mean()

In [54]:
# calculate y percentage for the whole test set
test_set_y_total_percent = y_valid.mean()

In [55]:
print(test_set_y_percent)
print('Test set Y percentage for entire test set:', test_set_y_total_percent)

quintile
Quintile 1    0.466743
Quintile 2    0.415880
Quintile 3    0.414960
Quintile 4    0.510702
Quintile 5    0.532796
Name: partisanship, dtype: float64
Test set Y percentage for entire test set: 0.46821634062140394


In [56]:
lift = test_set_y_percent / test_set_y_total_percent

In [57]:
# Question 3 answer:
print(lift)

quintile
Quintile 1    0.996854
Quintile 2    0.888223
Quintile 3    0.886256
Quintile 4    1.090739
Quintile 5    1.137928
Name: partisanship, dtype: float64


In [58]:
# create a model to predict candidate support rather than partisanship
# load dataset
voter_df = pd.read_csv('FX_indicators_2020.csv', on_bad_lines='skip')

In [59]:
voter_df = voter_df.replace({'Y': 1, 'N': 0})

In [60]:
numeric_cols = voter_df.select_dtypes(include=np.number).columns
voter_df[numeric_cols] = voter_df[numeric_cols].fillna(voter_df[numeric_cols].mean())

In [61]:
voter_df = voter_df.fillna(0)

In [62]:
voter_df['CAND1S'].value_counts()

0     54433
SR     6468
SD     3556
U      3429
LR     1944
LD     1455
Name: CAND1S, dtype: int64

In [63]:
outcome = 'CAND1S'

In [64]:
X = pd.get_dummies(voter_df.drop(['CAND1S','CAND2S','CAND1_LD2','CAND1_UND', 'CAND1_SRA','CAND1_LRA','CAND1_SDA','CAND1_SFT','CAND1_LR2','CAND1_LDA','CAND1_SR2','CAND1_SD2'], axis=1))
y = voter_df[outcome]

In [65]:
le = LabelEncoder()

y = y.astype(str)

# fit the encoder to your categorical data
le.fit(y)

# transform your categorical data into numeric labels
y_encoded = le.transform(y)

In [66]:
# partition the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.4, random_state = 1)

In [67]:
dtc = DecisionTreeClassifier(max_depth = 20, random_state = 1)

In [68]:
# train the model
dtc.fit(X_train, y_train)

In [69]:
classificationSummary(y_train, dtc.predict(X_train))

Confusion Matrix (Accuracy 0.9558)

       Prediction
Actual     0     1     2     3     4     5
     0 32553     1     0     1     2     1
     1   123   605     1    76    20    30
     2   142     4   938    85    20    12
     3   324    14     5  1764     8    16
     4   402     5     5    96  3436    22
     5   296    21     7   128    23  1585


In [70]:
#Question 4 answer:
classificationSummary(y_valid, dtc.predict(X_valid))

Confusion Matrix (Accuracy 0.8690)

       Prediction
Actual     0     1     2     3     4     5
     0 21348    45    49    53   302    78
     1   114   148    26    97    55   160
     2   123    13   275    85   162    85
     3   261    48    34   897    67   118
     4   439    53   147   150  1538   175
     5   233   148    93   162   161   572


In [71]:
importances = dtc.feature_importances_

In [72]:
importance_df = pd.DataFrame({'feature': X_train.columns, 'importance':importances})
importance_df = importance_df.sort_values('importance')
print(importance_df)

        feature  importance
118   CAND2_SR2    0.000000
137  MOVED_ADMB    0.000000
136  MOVED_AWMB    0.000000
134  MOVED_RDMB    0.000000
133  MOVED_ARMA    0.000000
..          ...         ...
114   CAND2_SD2    0.032012
119   CAND2_SRA    0.032264
125    MOVED_AW    0.058271
124    MOVED_DR    0.119631
139   MESSAGE_0    0.411576

[142 rows x 2 columns]


In [73]:
# Question 5: Build a model predicting persuadability. 
# I decided to use the MOVED_AW column as the dependent variable

In [74]:
outcome = 'MOVED_AW'

In [75]:
X = pd.get_dummies(voter_df.drop(['CAND2S','MOVED_AW','MOVED_RD','MOVED_DR','MOVED_U','MOVED_AD','MOVED_AR','MOVED_RDMA','MOVED_DRMA','MOVED_AWMA','MOVED_ADMA','MOVED_ARMA','MOVED_RDMB','MOVED_DRMB','MOVED_AWMB','MOVED_ADMB','MOVED_ARMB','CAND2_UND','CAND2_SD2','CAND2_SDA','CAND2_LD2','CAND2_LDA','CAND2_SR2','CAND2_SRA','CAND2_LRA','CAND2_LR2','CAND2_SFT'], axis=1))
y = voter_df[outcome].astype('int')

In [76]:
# partition the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.4, random_state = 1)

In [77]:
dtc = DecisionTreeClassifier(max_depth = 20, random_state = 1)

In [78]:
# train the model
dtc.fit(X_train, y_train)

In [79]:
classificationSummary(y_train, dtc.predict(X_train))

Confusion Matrix (Accuracy 0.9882)

       Prediction
Actual     0     1
     0 40872    60
     1   445  1394


In [80]:
classificationSummary(y_valid, dtc.predict(X_valid))

Confusion Matrix (Accuracy 0.9478)

       Prediction
Actual     0     1
     0 26664   660
     1   829   361


In [81]:
importances = dtc.feature_importances_

In [82]:
importance_df = pd.DataFrame({'feature': X_train.columns, 'importance':importances})
importance_df = importance_df.sort_values('importance')
print(importance_df)

        feature  importance
15     NH_OTHER    0.000000
43    H_AFDLN3P    0.000000
96   CULINARYIN    0.000000
52    H_MFSLN3P    0.000000
55     H_MMSLN2    0.000000
..          ...         ...
103    PRS16_PD    0.034227
3           AGE    0.037160
87     REG_DAYS    0.054434
0      VOTER_ID    0.062762
122   CAND1_SFT    0.381837

[132 rows x 2 columns]


In [83]:
# Question 6: Build two uplift models predicting how likely it is that a voter will become more likely to support the Democratic candidate based on the test mailings for message A and message B. 
# I decided to use the MOVED_ADMA and MOVED_ADMB columns as the dependent variables

In [84]:
# first, the model predicting motion after receiving message A
outcome = 'MOVED_ADMA'

In [85]:
X = pd.get_dummies(voter_df.drop(['CAND2S','MOVED_AW','MOVED_RD','MOVED_DR','MOVED_U','MOVED_AD','MOVED_AR','MOVED_RDMA','MOVED_DRMA','MOVED_AWMA','MOVED_ADMA','MOVED_ARMA','MOVED_RDMB','MOVED_DRMB','MOVED_AWMB','MOVED_ADMB','MOVED_ARMB','CAND2_UND','CAND2_SD2','CAND2_SDA','CAND2_LD2','CAND2_LDA','CAND2_SR2','CAND2_SRA','CAND2_LRA','CAND2_LR2','CAND2_SFT'], axis=1))
y = voter_df[outcome].astype('int')

In [86]:
# partition the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.4, random_state = 1)

In [87]:
dtc = DecisionTreeClassifier(max_depth = 20, random_state = 1)

In [88]:
# train the model
dtc.fit(X_train, y_train)

In [89]:
classificationSummary(y_train, dtc.predict(X_train))

Confusion Matrix (Accuracy 0.9999)

       Prediction
Actual     0     1
     0 42200     0
     1     4   567


In [90]:
classificationSummary(y_valid, dtc.predict(X_valid))

Confusion Matrix (Accuracy 0.9843)

       Prediction
Actual     0     1
     0 27940   234
     1   214   126


In [91]:
importances = dtc.feature_importances_

In [92]:
importance_df = pd.DataFrame({'feature': X_train.columns, 'importance':importances})
importance_df = importance_df.sort_values('importance')
print(importance_df)

       feature  importance
65       HHP_R    0.000000
55    H_MMSLN2    0.000000
61      HHP_DI    0.000000
64      HHP_II    0.000000
68      VPP_12    0.000000
..         ...         ...
87    REG_DAYS    0.044022
0     VOTER_ID    0.075723
3          AGE    0.083428
122  CAND1_SFT    0.126497
130  MESSAGE_A    0.157928

[132 rows x 2 columns]


In [93]:
# next, the model predicting motion after receiving message B
outcome = 'MOVED_ADMB'

In [94]:
X = pd.get_dummies(voter_df.drop(['CAND2S','MOVED_AW','MOVED_RD','MOVED_DR','MOVED_U','MOVED_AD','MOVED_AR','MOVED_RDMA','MOVED_DRMA','MOVED_AWMA','MOVED_ADMA','MOVED_ARMA','MOVED_RDMB','MOVED_DRMB','MOVED_AWMB','MOVED_ADMB','MOVED_ARMB','CAND2_UND','CAND2_SD2','CAND2_SDA','CAND2_LD2','CAND2_LDA','CAND2_SR2','CAND2_SRA','CAND2_LRA','CAND2_LR2','CAND2_SFT'], axis=1))
y = voter_df[outcome].astype('int')

In [95]:
# partition the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.4, random_state = 1)

In [96]:
dtc = DecisionTreeClassifier(max_depth = 20, random_state = 1)

In [97]:
# train the model
dtc.fit(X_train, y_train)

In [98]:
classificationSummary(y_train, dtc.predict(X_train))

Confusion Matrix (Accuracy 1.0000)

       Prediction
Actual     0     1
     0 42323     0
     1     0   448


In [99]:
classificationSummary(y_valid, dtc.predict(X_valid))

Confusion Matrix (Accuracy 0.9852)

       Prediction
Actual     0     1
     0 28000   216
     1   205    93


In [100]:
importances = dtc.feature_importances_

In [101]:
importance_df = pd.DataFrame({'feature': X_train.columns, 'importance':importances})
importance_df = importance_df.sort_values('importance')
print(importance_df)

        feature  importance
131   MESSAGE_B    0.000000
34    F_MAR_SNP    0.000000
90   UPSCALEFEM    0.000000
89   UPSCALEMAL    0.000000
37        ED_MD    0.000000
..          ...         ...
103    PRS16_PD    0.052214
87     REG_DAYS    0.052751
0      VOTER_ID    0.098604
106       MSG_B    0.123959
122   CAND1_SFT    0.128589

[132 rows x 2 columns]


In [102]:
# prepare smaller dataset for use with the models from questions 1 and 2
small_df = pd.read_csv('FX_indicators_2020_rand_10k.csv', on_bad_lines='skip')

In [103]:
small_df = small_df.fillna(0)

In [104]:
small_df = small_df.replace({'Y': 1, 'N': 0})

In [105]:
small_df = small_df.apply(pd.to_numeric, errors='coerce').fillna(0)

In [106]:
small_df.head()

Unnamed: 0,VOTER_ID,SET_NO,OPP_SEX,AGE,HH_ND,HH_NR,HH_NI,MED_AGE,MED_AGE_M,MED_AGE_F,...,MOVED_RDMA,MOVED_DRMA,MOVED_AWMA,MOVED_ADMA,MOVED_ARMA,MOVED_RDMB,MOVED_DRMB,MOVED_AWMB,MOVED_ADMB,MOVED_ARMB
0,84508,3,0,25,4,0,0,38,39,38,...,0,0,0,0,0,0,0,0,0,0
1,35758,2,0,45,3,0,1,34,33,35,...,0,0,0,0,0,0,0,0,0,0
2,32007,2,0,23,1,1,0,37,34,51,...,0,0,0,0,0,0,0,0,0,0
3,631210,1,0,82,1,0,0,44,40,51,...,0,0,0,0,0,0,0,0,0,0
4,154033,1,55,78,0,2,0,42,36,48,...,0,0,0,0,0,0,0,0,0,0


In [107]:
small_df.drop(small_df[~small_df['PARTY_D'].isin([0, 1,'0','1'])].index, inplace=True)

In [108]:
X = pd.get_dummies(small_df.drop(['PARTY_D', 'PARTY_I', 'PARTY_R', 'D3', 'D2', 'R2', 'R3','I3', 'VG_10'], axis=1))
# X = voter_df[predictors]
y = small_df[outcome]
y=y.astype('int')

In [109]:
# partition the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.4, random_state = 1)

In [110]:
# train the model
dtc.fit(X_train, y_train)

In [111]:
classificationSummary(y_train, dtc.predict(X_train))

Confusion Matrix (Accuracy 1.0000)

       Prediction
Actual    0    1
     0 5932    0
     1    0   68


In [112]:
classificationSummary(y_valid, dtc.predict(X_valid))

Confusion Matrix (Accuracy 1.0000)

       Prediction
Actual    0    1
     0 3952    0
     1    0   48


In [113]:
importances = dtc.feature_importances_

In [114]:
importance_df = pd.DataFrame({'feature': X_train.columns, 'importance':importances})
importance_df = importance_df.sort_values('importance')
print(importance_df)

        feature  importance
0      VOTER_ID         0.0
104      CAND2S         0.0
103      CAND1S         0.0
102       MSG_B         0.0
101       MSG_A         0.0
..          ...         ...
49     H_MFDLN2         0.0
50    H_MFDLN3P         0.0
51     H_MFSLN2         0.0
70        VG_08         0.0
141  MOVED_ADMB         1.0

[143 rows x 2 columns]


In [115]:
voter_ids = X_valid['VOTER_ID']
model_scores = dtc.predict_proba(X_valid)[:, 1] * 100
output = pd.DataFrame({'VOTER_ID': voter_ids, 'model_score': model_scores, 'partisanship': y_valid})

In [116]:
output['model_score'].min()

0.0

In [117]:
output['model_score'].max()

100.0

In [118]:
output['model_score'].value_counts()

0.0      3952
100.0      48
Name: model_score, dtype: int64

In [119]:
output['model_score'].value_counts()

0.0      3952
100.0      48
Name: model_score, dtype: int64

In [120]:
# introduce a small amount of noise in order to prevent an error that arises when bin edges are not unique
output['model_score'] += np.random.normal(0, 0.0001, len(output))

In [121]:
output['quintile'] = pd.qcut(output['model_score'], q=5, labels=['Quintile 1', 'Quintile 2', 'Quintile 3', 'Quintile 4', 'Quintile 5'])

print(output.head())

      VOTER_ID  model_score  partisanship    quintile
9953    490806    -0.000121             0  Quintile 1
3850    263092     0.000230             0  Quintile 5
4962    137807    -0.000035             0  Quintile 2
3886     63543     0.000089             0  Quintile 5
5437    255245     0.000081             0  Quintile 4


In [122]:
# calculate y percentage for each quantile in the test set
test_set_y_percent = output.groupby('quintile')['partisanship'].mean()

In [123]:
# calculate y percentage for the whole test set
test_set_y_total_percent = y_valid.mean()

In [124]:
print(test_set_y_percent)
print('Test set Y percentage for entire test set:', test_set_y_total_percent)

quintile
Quintile 1    0.00
Quintile 2    0.00
Quintile 3    0.00
Quintile 4    0.00
Quintile 5    0.06
Name: partisanship, dtype: float64
Test set Y percentage for entire test set: 0.012


In [125]:
lift = test_set_y_percent / test_set_y_total_percent

In [126]:
print(lift)

quintile
Quintile 1    0.0
Quintile 2    0.0
Quintile 3    0.0
Quintile 4    0.0
Quintile 5    5.0
Name: partisanship, dtype: float64


In [127]:
# training logistic regression model
lr = LogisticRegression()

In [128]:
lr.fit(X_train, y_train)

In [129]:
classificationSummary(y_train, lr.predict(X_train))

Confusion Matrix (Accuracy 0.9887)

       Prediction
Actual    0    1
     0 5932    0
     1   68    0


In [130]:
classificationSummary(y_valid, lr.predict(X_valid))

Confusion Matrix (Accuracy 0.9880)

       Prediction
Actual    0    1
     0 3952    0
     1   48    0


In [141]:
voter_ids = X_valid['VOTER_ID']
model_scores = lr.predict_proba(X_valid)[:, 1] * 100
output = pd.DataFrame({'VOTER_ID': voter_ids, 'model_score': model_scores, 'partisanship': y_valid})

In [142]:
output['model_score'].min()

0.23932981644299595

In [143]:
output['model_score'].max()

4.026334486066209

In [144]:
output['model_score'].value_counts()

1.160556    1
0.912713    1
0.737289    1
0.860941    1
1.009293    1
           ..
1.222559    1
0.988428    1
1.165308    1
0.705895    1
0.631179    1
Name: model_score, Length: 4000, dtype: int64

In [145]:
# introduce a small amount of noise in order to prevent an error that arises when bin edges are not unique
output['model_score'] += np.random.normal(0, 0.0001, len(output))

In [146]:
output['quintile'] = pd.qcut(output['model_score'], q=5, labels=['Quintile 1', 'Quintile 2', 'Quintile 3', 'Quintile 4', 'Quintile 5'])

print(output.head())

      VOTER_ID  model_score  partisanship    quintile
9953    490806     1.160779             0  Quintile 4
3850    263092     1.313196             0  Quintile 4
4962    137807     1.182765             0  Quintile 4
3886     63543     2.545763             0  Quintile 5
5437    255245     0.644683             0  Quintile 1


In [147]:
# calculate y percentage for each quantile in the test set
test_set_y_percent = output.groupby('quintile')['partisanship'].mean()

In [148]:
print(test_set_y_percent)
print('Test set Y percentage for entire test set:', test_set_y_total_percent)

quintile
Quintile 1    0.00875
Quintile 2    0.01375
Quintile 3    0.01375
Quintile 4    0.01375
Quintile 5    0.01000
Name: partisanship, dtype: float64
Test set Y percentage for entire test set: 0.012


In [149]:
lift = test_set_y_percent / test_set_y_total_percent

In [150]:
print(lift)

quintile
Quintile 1    0.729167
Quintile 2    1.145833
Quintile 3    1.145833
Quintile 4    1.145833
Quintile 5    0.833333
Name: partisanship, dtype: float64
