## Imports and Functions

#### The code in the file ML.py is modified from material for Rayid Ghani's Spring 2018 ML for Public Policy course.

In [1]:
from data_functions import *
from ML import *

import warnings
warnings.filterwarnings('ignore')

In [2]:
def temporal_train_test_data_split(df, features, filter_var, outcome_var):
    start_date = '2004'
    training_end_date = '2008'
    testing_end_date = '2010'

    y_df = df[[filter_var, outcome_var]]

    x_train_filter_by_date = filter_df_by_date_range(df, filter_var, training_end_date, testing_end_date)
    x_test_filter_by_date = filter_df_by_date_range(df, filter_var, start_date, training_end_date)

    y_train_filter_by_date = filter_df_by_date_range(y_df, filter_var, training_end_date, testing_end_date)
    y_test_filter_by_date = filter_df_by_date_range(y_df, filter_var, start_date, training_end_date)

    x_train = x_train_filter_by_date[features]
    x_test = x_test_filter_by_date[features]

    y_train = y_train_filter_by_date[outcome_var]
    y_test = y_test_filter_by_date[outcome_var]

    return x_test, x_train, y_test, y_train

In [3]:
def single_year(df, year):
    
    voting_cols = ['voted2004','voted2006', 'voted2008', 'voted2010']
    common = ['county', 'name', 'city', 'zip', 'address', 'phone', 'gender', 'age',
       'precinct', 'cong_dist', 'nc_senate', 'nc_house', 'idx', 'white',
       'black', 'hispanic', 'dem', 'rep']

    df['election_year'] = year
    col_name = "voted" + year
    voting_cols.remove(col_name)
    df = df[common + ['election_year'] + [col_name]]
    df.columns = common + ['election_year'] + ['voted']

    return df

### Open and reformat data

In [4]:
data = read('voterfileNC.dta')
data = data.rename(index=str, columns={"id": "idx"})
data.columns

Index(['county', 'name', 'city', 'zip', 'address', 'phone', 'gender', 'age',
       'precinct', 'cong_dist', 'nc_senate', 'nc_house', 'idx', 'white',
       'black', 'hispanic', 'dem', 'rep', 'regyear', 'voted2004', 'voted2006',
       'voted2008', 'voted2010'],
      dtype='object')

### Reshape the data to “long” format such that each row pertains to an individual in a given election year.

In [5]:
v04, v06, v08, v10 = data.copy(), data.copy(), data.copy(), data.copy()

In [6]:
v04_only = single_year(v04, '2004')
v06_only = single_year(v06, '2006')
v08_only = single_year(v08, '2008')
v10_only = single_year(v10, '2010')

In [7]:
frames = [v04_only, v06_only, v08_only, v10_only]
long = pd.concat(frames)

In [8]:
l = long.copy()
l.shape

(40000, 20)

## Part 1: Predict likely voters

### Briefly discuss relevant predictors in the voter file (voterfileNC.dta)

Factors that can be predictive of turnout include race (white, black), ethnicity (hispanic), gender, and age. Young people in particular generally have low turnout rates. Their address or location could be influential as well, but that would likely be due to and reflected in other factors. 

#### Add columns that could be useful.

In [9]:
l['is_midterm'] = l.election_year.apply(lambda x: 1 if (int(x) % 4 == 0) else 1)
l['gender'] = l.gender.apply(lambda x: 1 if x == 'F' else 0)

In [10]:
l.head().style

Unnamed: 0,county,name,city,zip,address,phone,gender,age,precinct,cong_dist,nc_senate,nc_house,idx,white,black,hispanic,dem,rep,election_year,voted,is_midterm
0,COLUMBUS,MICHELLE MCPHERSON,CLARENDON,28432,2184 BEAVERDAM RD,910 653 3698,1,43,P24,7,13,46,BC48282,1,0,0,0,0,2004,1,1
1,LENOIR,JUSTIN MAURICE RHODES,KINSTON,28504,2204 TANGLEWOOD DR,252 523 0324,0,22,K9,1,5,12,CM55969,0,1,0,1,0,2004,0,1
2,ROBESON,LOTTIE BARTON BULLARD,MAXTON,28364,238 BARTON RD,910 843 3619,1,65,34,7,13,47,DR39304,0,0,0,1,0,2004,1,1
3,CUMBERLAND,GARRETT CHRISTIAN ALLEN,FAYETTEVILLE,28311,522 FOXLAIR DR,910 922 0136,0,25,LI65,4,19,45,BE261083,1,0,0,0,1,2004,0,1
4,POLK,CAROLYN R PACE,SALUDA,28773,360 ROCKY MOUNTAIN LN,828 749 3471,1,70,SA04,10,47,113,DM5357,1,0,0,0,1,2004,1,1


In [11]:
l.columns

Index(['county', 'name', 'city', 'zip', 'address', 'phone', 'gender', 'age',
       'precinct', 'cong_dist', 'nc_senate', 'nc_house', 'idx', 'white',
       'black', 'hispanic', 'dem', 'rep', 'election_year', 'voted',
       'is_midterm'],
      dtype='object')

#### I narrowed down the features so that I did not double count any of the characteristiscs like white vs. black or rep vs. dem. 

In [12]:
rfeats = ['gender', 'white', 'hispanic', 'rep', 'is_midterm', 'age']

In [13]:
x_train, x_test, y_train, y_test  = temporal_train_test_data_split(l, rfeats , 'election_year', 'voted')

### Models

In [14]:
results_df = run_simple_loop(x_train, x_test, y_train, y_test, 'small')

DT
KNN
LR


### Test their predictive performance.

In [15]:
results_df = results_df[['model_type', 'clf', 'parameters', 'baseline_p', 'auc-roc', 'p_at_10', 'r_at_10', 'f1_at_10']]

In [16]:
results_df.to_pickle('results_df.pkl')
results_df = pd.read_pickle('results_df.pkl')

#### I decided to use the metric of precision at 10% to select voters to target. We are looking for 1000-2000 candidates out of 10,000 potential voters, so 10%. I also decided that being able to correctly identify a potential voter, precision, was more important than classifying all voters correctly. Knowing which models had the highest precision would be more helpful in making predictions for the future.  I looked at a total of 98 models: 36 decision trees, 36 K-nearest neighbors, and 10 logistic regressions. 

In [17]:
dt_results = results_df[results_df.model_type == 'DT']
knn_results = results_df[results_df.model_type == 'KNN']
lr_results = results_df[results_df.model_type == 'LR']

print("Best Decision Tree")
print(find_best_model(dt_results, criteria='p_at_10'))
print("\n" + "Best K Nearest Neighbor")
print(find_best_model(knn_results, criteria='p_at_10'))
print("\n" + "Best Logistoic Regression")
print(find_best_model(lr_results, criteria='p_at_10'))

Best Decision Tree
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=100,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

Best K Nearest Neighbor
KNeighborsClassifier(algorithm='kd_tree', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=100, p=2,
           weights='distance')

Best Logistoic Regression
LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [18]:
best_clf = find_best_model(results_df, criteria='p_at_10')
best_clf

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=100,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

### Rank the voters by their predicted propensity to vote

In [19]:
thresh, topN = 0.5, 5000

final,ids, cnt = [], [], 0
y_scores = best_clf.predict_proba(x_test)[:,1]
dfinal = x_test.copy()
dfinal['label_value'] = y_test
dfinal['scores'] = y_scores
dfinal['score'] = np.where(dfinal['scores'] > thresh, 1, 0)

dfinal = dfinal.sort_values(by='scores', ascending=False)
df_id = l[['idx']]
dF = pd.merge(df_id, dfinal, left_index=True, right_index=True)
dF = dF.sort_values(by='scores', ascending=False)
mini= dF[['idx', 'scores', 'score']]

top_lst = [tuple(x) for x in mini.values]

for idx, scores, score in top_lst:
    if idx not in ids and cnt <=topN-1:
        ids.append(idx)
        final.append(idx)
        cnt +=1

In [20]:
s = pd.DataFrame()
s['idx'] = final

In [21]:
demo = l[['idx', 'name', 'county', 'city', 'zip', 'address', 'phone', 'gender', 'age', 'white', 'black', 'hispanic', 'dem']]
demo = demo.drop_duplicates()

In [22]:
s = s.merge(demo, on=['idx'], how='left')

#### Remove Republicans to make sure you have 2000 potential democratic voters. 

In [23]:
dem_potentials = s[s.dem == 1.0][:2000]
dem_potentials.shape

(2000, 13)

### Make a list of the 1,000 people your door-to-door campaign should contact.

#### In order to minimize logistics issues, I decided to select the 1000 voters who lived in more similar areas for door to door contact. I counted how many people lived in each of the counties, and selected the 16 where the most likely voters lived. 

In [24]:
county_size = dem_potentials.county.value_counts()
counties = []

n = 0
for idx, val in county_size.iteritems():
    if n < 1000:
        print(idx, val)
        n += val
        counties.append(idx)

BUNCOMBE 150
NEW HANOVER 68
JOHNSTON 67
CUMBERLAND 66
CRAVEN 60
HARNETT 57
ROCKINGHAM 55
LENOIR 55
SURRY 52
HAYWOOD 49
CABARRUS 48
RUTHERFORD 46
HENDERSON 46
WILSON 46
WAYNE 46
BRUNSWICK 44
SAMPSON 41
CHATHAM 41


In [25]:
dem_potentials['gender'] = dem_potentials.gender.apply(lambda x: 'F' if x == 1 else 'M')

In [26]:
door = dem_potentials[dem_potentials.county.isin(counties)][:1000]
door.to_csv('PS4_door_to_door.csv')

### Make another list of the 1,000 people you would like to reach via direct mail. 

In [27]:
letters = dem_potentials[~dem_potentials.isin(door)].dropna()
letters.to_csv('PS4_by_direct_mail.csv')

## Part 2: Produce Content

### Describe how you would like to implement the door-to-door campaign: What is the message, who is the messenger etc.

#### This list is sorted by city so that volunteers from each city can focus on the people they need to target. 

In [28]:
door.sort_values(by=['city', 'black'])

Unnamed: 0,idx,name,county,city,zip,address,phone,gender,age,white,black,hispanic,dem
2101,AL207727,LUCY LORRAINE ROBERTSON,BUNCOMBE,ALEXANDER,28701,15 DONNA LN,828 484 9041,F,84,1.0,0.0,0.0,1.0
340,CJ113299,BETTY KING POOLE,JOHNSTON,ANGIER,27501,1625 MASSENGILL POND RD,919 639 6185,F,80,1.0,0.0,0.0,1.0
948,CA48278,PHYLLIS WOOD COZART,HARNETT,ANGIER,27501,115 HONEYCUTT DR #D,910 892 7040,F,62,1.0,0.0,0.0,1.0
1404,CA19263,RICHARD LEROY WELLS JR,HARNETT,ANGIER,27501,441 WELLS LN,919 639 4389,M,61,1.0,0.0,0.0,1.0
1681,CA91724,THERESA DEAN WILKINS,HARNETT,ANGIER,27501,2563 OAK GROVE CHURCH RD,919 639 4847,F,53,1.0,0.0,0.0,1.0
3219,CJ84411,ETHEL ROSE JOHNSON,JOHNSTON,ANGIER,27501,105 PLEASANT PINE DR,919 331 8085,F,66,1.0,0.0,0.0,1.0
3547,CA15102,BESSIE S PRUETT,HARNETT,ANGIER,27501,9767 NC 210 N,919 639 2284,F,88,1.0,0.0,0.0,1.0
3743,CA19055,EIRA TAYLOR WATKINS,HARNETT,ANGIER,27501,372 N BROAD ST E,919 639 4588,F,88,1.0,0.0,0.0,1.0
4615,CA82506,PAMELA TALLEY,HARNETT,ANGIER,27501,321 MANGUM RD,919 639 8167,F,49,1.0,0.0,0.0,1.0
378,AX25605,LAURIER ROSS BUSH,CHATHAM,APEX,27523,107 SHADY LANE CIR,919 828 6260,M,71,1.0,0.0,0.0,1.0


In [29]:
door.gender.value_counts()

F    625
M    375
Name: gender, dtype: int64

In [30]:
door.age.describe()

count    1000.000000
mean       66.056000
std        11.689633
min        36.000000
25%        58.000000
50%        65.000000
75%        74.000000
max       103.000000
Name: age, dtype: float64

In [31]:
"white", door.white.mean(), "black", door.black.mean(), "hispanic", door.hispanic.mean(), 

('white', 0.815, 'black', 0.174, 'hispanic', 0.003)

In [32]:
door[(door.white == 1) & (door.gender == 'F')].shape

(501, 13)

In [33]:
black_door = door[door.black == 1]
black_door.city.value_counts()[:5]

FAYETTEVILLE    20
WILSON          17
KINSTON         15
REIDSVILLE      11
GOLDSBORO       10
Name: city, dtype: int64

In [34]:
black_door.gender.value_counts()

F    117
M     57
Name: gender, dtype: int64

### Describe how you would like to implement the direct mail campaign: Write the letter you would like to be distributed.

In [35]:
"white", letters.white.mean(), "black", letters.black.mean(), "hispanic", letters.hispanic.mean(), 

('white', 0.764, 'black', 0.217, 'hispanic', 0.002)

In [36]:
letters.gender.value_counts()

F    628
M    372
Name: gender, dtype: int64

In [37]:
letters[letters.duplicated(keep=False)]

Unnamed: 0,idx,name,county,city,zip,address,phone,gender,age,white,black,hispanic,dem


In [38]:
letters.iloc[0]

idx                     CZ10393
name           ROBERT T VEASEY 
county                    MOORE
city                   ABERDEEN
zip                       28315
address     110  CAMPBELL ST   
phone              910 944 1666
gender                        M
age                          91
white                         1
black                         0
hispanic                      0
dem                           1
Name: 23, dtype: object

In [39]:
l[l.name == 'LEE E HINTON ']

Unnamed: 0,county,name,city,zip,address,phone,gender,age,precinct,cong_dist,...,nc_house,idx,white,black,hispanic,dem,rep,election_year,voted,is_midterm
1170,JOHNSTON,LEE E HINTON,SMITHFIELD,27577,410 E HOLT ST,919 934 7925,0,46,PR26,7.0,...,28.0,CJ90250,0.0,1.0,0.0,1.0,0.0,2004,1.0,1
1170,JOHNSTON,LEE E HINTON,SMITHFIELD,27577,410 E HOLT ST,919 934 7925,0,46,PR26,7.0,...,28.0,CJ90250,0.0,1.0,0.0,1.0,0.0,2006,0.0,1
1170,JOHNSTON,LEE E HINTON,SMITHFIELD,27577,410 E HOLT ST,919 934 7925,0,46,PR26,7.0,...,28.0,CJ90250,0.0,1.0,0.0,1.0,0.0,2008,0.0,1
1170,JOHNSTON,LEE E HINTON,SMITHFIELD,27577,410 E HOLT ST,919 934 7925,0,46,PR26,7.0,...,28.0,CJ90250,0.0,1.0,0.0,1.0,0.0,2010,0.0,1
