In [2]:
import pandas as pd
import numpy as np
import pprint as pp # for printing
import scipy.stats as st # for Normal PDF

#  Build a Naive Bayesian Classifier



**Can we predict whether someone will vote or not?**

This notebook builds a naive Bayes classifier on a dataset of people of
different ages, education levels, incomes, and races to predict their propensity to vote.

In [11]:
# Load the voter-turnout data
dat = pd.read_csv('turnout.csv')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train = dat.sample(frac=0.8, random_state=323)
test = dat.drop(train.index)

# Renumber both frames from 0 so that index labels match row positions
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

# Preview the training data
train.head()

Unnamed: 0,id,age,educate,income,vote,white
0,1353,46,9.0,1.8429,0,0
1,122,25,15.0,3.8606,1,1
2,1530,69,17.0,13.3041,1,1
3,162,53,10.0,3.58,1,1
4,1807,34,16.0,5.4713,1,0


In [12]:
# Split the training rows by outcome: y1 holds the voters, y0 the non-voters
y1, y0 = (train.query(expr) for expr in ("vote==1", "vote==0"))

In [13]:
# Class priors: the share of training rows that voted, and its complement
pr_y1 = train.vote.mean()
pr_y0 = 1 - pr_y1
pr_y0

0.25812500000000005

In [16]:
# P(white = w | vote = v), keyed as (v, w).
# The w = 1 entries are the observed share of white respondents in each class;
# the w = 0 entries are their complements.
white = {(label, 1): grp.white.mean() for label, grp in ((1, y1), (0, y0))}
white.update({(label, 0): 1 - white[(label, 1)] for label in (1, 0)})
white

{(1, 1): 0.8761583824768323,
 (0, 1): 0.7796610169491526,
 (1, 0): 0.12384161752316769,
 (0, 0): 0.22033898305084743}

In [17]:
# Calculate the conditional means/standard deviations
# Per-class mean and standard deviation of every continuous feature, keyed as
# (feature, vote). `.std()` is pandas' default sample standard deviation.
dist_locs = {
    (feature, label): {'mean': grp[feature].mean(), 'sd': grp[feature].std()}
    for feature in ("age", "educate", "income")
    for label, grp in ((1, y1), (0, y0))
}

# Print
pp.pprint(dist_locs)

{('age', 0): {'mean': 42.653753026634384, 'sd': 19.127668078553704},
 ('age', 1): {'mean': 46.14827295703454, 'sd': 16.760134583719374},
 ('educate', 0): {'mean': 10.665859564164649, 'sd': 3.2326417027391363},
 ('educate', 1): {'mean': 12.59519797809604, 'sd': 3.249301731768977},
 ('income', 0): {'mean': 2.8083740920096854, 'sd': 2.2222018360245834},
 ('income', 1): {'mean': 4.258665796124673, 'sd': 2.900978327757866}}


In [18]:
def predict(data, dist_locs, white_probs=None, priors=None):
    '''Classify every row of `data` as voter (1) or non-voter (0) with naive Bayes.

    For each class the score is the class prior, times the Gaussian density of
    every continuous feature described in `dist_locs`, times the conditional
    probability of the row's `white` value. The scores are unnormalised joint
    probabilities, not posteriors.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to score. Must contain a `white` column and one column for every
        feature named in `dist_locs`. Columns are looked up by name, so their
        order (and any extra columns such as `id` or `vote`) does not matter.
    dist_locs : dict
        Maps (feature_name, class_label) to {'mean': ..., 'sd': ...} for class
        labels 0 and 1.
    white_probs : dict, optional
        Maps (class_label, white_value) to P(white = white_value | class).
        Defaults to the notebook-level `white` table.
    priors : sequence of two floats, optional
        (P(vote = 0), P(vote = 1)). Defaults to the notebook-level
        (`pr_y0`, `pr_y1`).

    Returns
    -------
    pandas.DataFrame
        Columns `pr_0`, `pr_1` (class scores) and `pred` (predicted class),
        one row per input row, indexed 0..n-1 regardless of `data`'s index.

    Raises
    ------
    KeyError
        If a required column is missing from `data`, or a `white` value has
        no entry in `white_probs`.
    '''
    # Fall back to the estimates computed earlier in the notebook.
    if white_probs is None:
        white_probs = white
    if priors is None:
        priors = (pr_y0, pr_y1)

    # Continuous features are exactly those described in `dist_locs`
    # (first-seen order), instead of being guessed from column positions.
    features = list(dict.fromkeys(name for name, _ in dist_locs))
    race = data["white"].tolist()

    scores = {}
    for label in (0, 1):
        score = np.ones(len(data))
        # Gaussian likelihood of each continuous feature, for all rows at once
        for name in features:
            params = dist_locs[(name, label)]
            score = score * st.norm.pdf(data[name].to_numpy(dtype=float),
                                        loc=params['mean'],
                                        scale=params['sd'])
        # Add the binary white variable, then the class prior
        score = score * np.array([white_probs[(label, w)] for w in race],
                                 dtype=float)
        scores[label] = score * priors[label]

    # Assign the class with the higher score; ties go to class 0
    class_pred = np.where(scores[0] >= scores[1], 0, 1)

    return pd.DataFrame({"pr_0": scores[0], "pr_1": scores[1], "pred": class_pred})

# Score the training set with the fitted class-conditional parameters
preds_train = predict(data=train, dist_locs=dist_locs)

In [19]:
# Show the first 10 training predictions: the unnormalised class scores
# (pr_0, pr_1) and the predicted class (pred)
preds_train.head(10)

Unnamed: 0,pr_0,pr_1,pred
0,2.062282e-05,1.41539e-05,0
1,2.210363e-05,8.876913e-05,1
2,7.562946e-11,3.185128e-07,1
3,7.405178e-05,0.0001699555,1
4,2.965753e-06,1.502657e-05,1
5,7.832032e-05,0.0001732741,1
6,2.017537e-05,5.439231e-05,1
7,1.52751e-05,2.723129e-05,1
8,8.329932e-05,0.0001252433,1
9,3.836915e-06,4.930827e-05,1


In [21]:
# Training accuracy: the share of rows whose predicted class equals the observed vote
accuracy_train = (preds_train.pred == train.vote).mean()
accuracy_train

0.74125

In [22]:
# Score the held-out rows and measure out-of-sample accuracy
preds_test = predict(data=test, dist_locs=dist_locs)
accuracy_test = (preds_test.pred == test.vote).mean()
accuracy_test

0.72

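The prediction accuracy (0.74 on the training data, 0.72 on the test data) is better than a coin flip, but about 74% of the training respondents voted, so always predicting "vote" would score roughly as well; the model does not clearly beat this majority-class baseline. Also, I tried the model without the white variable, and the accuracy didn't change. That suggests race adds little predictive power beyond age, education, and income in this model; on its own, it does not show that race has no influence on one's propensity to vote.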