In [2]:
import pandas as pd
import numpy as np
import sys
import warnings

# Notebook-wide display configuration.
# threshold=sys.maxsize: numpy never summarizes printed arrays with "...".
np.set_printoptions(threshold=sys.maxsize)
# Use the fully qualified option key. The short form "max_columns" is resolved
# by pattern matching and is ambiguous on recent pandas (it also matches
# "styler.render.max_columns"), which makes set_option raise OptionError.
pd.set_option("display.max_columns", 500)
# NOTE(review): this silences *all* warnings, including pandas deprecation
# warnings that signal upcoming breakage — consider narrowing the filter.
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

In [3]:
# Load the cleaned dataset. The CSV carries a leftover 'Unnamed: 0' column,
# which is dropped right away.
# `columns=` is used instead of the positional axis argument (`.drop(name, 1)`):
# the positional form was deprecated and no longer works on pandas 2.x.
data = pd.read_csv('features_v1.csv').drop(columns='Unnamed: 0')
data

Unnamed: 0,county,state,totalpop,income,professional,service,office,drive,carpool,transit,workathome,meancommute,unemployment,party,perc_men,perc_white,perc_private_work,perc_citizen,perc_employed
0,Los Angeles,CA,10038388.0,56196.0,0.357,0.191,0.246,0.730,0.099,0.068,0.051,0.300,0.100,D,0.492644,0.275897,0.789211,0.602363,0.461774
1,Cook,IL,5236393.0,55251.0,0.383,0.182,0.244,0.621,0.087,0.184,0.042,0.323,0.107,D,0.484541,0.438454,0.839000,0.668383,0.470487
2,Harris,TX,4356362.0,54457.0,0.347,0.174,0.235,0.792,0.111,0.029,0.033,0.282,0.075,D,0.497371,0.321501,0.834835,0.572664,0.477896
3,Maricopa,AZ,4018143.0,54229.0,0.366,0.184,0.272,0.765,0.110,0.024,0.059,0.255,0.077,R,0.494297,0.585890,0.824176,0.661961,0.453204
4,Miami-Dade,FL,2639042.0,43129.0,0.315,0.210,0.282,0.769,0.092,0.055,0.043,0.299,0.100,D,0.485108,0.152371,0.819000,0.576567,0.456556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3040,McPherson,NE,433.0,54306.0,0.338,0.104,0.279,0.475,0.097,0.000,0.226,0.320,0.009,R,0.515012,0.990854,0.693307,0.787529,0.512702
3041,Clark,ID,901.0,33672.0,0.229,0.206,0.120,0.677,0.156,0.000,0.038,0.170,0.039,R,0.488346,0.584337,0.744000,0.479467,0.490566
3042,Arthur,NE,448.0,39375.0,0.549,0.109,0.041,0.522,0.156,0.000,0.199,0.195,0.040,R,0.497768,1.000000,0.544000,0.694196,0.430804
3043,Kenedy,TX,565.0,36438.0,0.249,0.341,0.205,0.946,0.000,0.000,0.000,0.166,0.000,D,0.522124,0.336000,0.519000,0.559292,0.327434


In [4]:
# professional, income, unemployment, and perc_employed are highly correlated.
# Keep income and unemployment as features and drop the other two.
# `columns=` replaces the positional axis argument (`.drop(cols, 1)`), which
# is no longer accepted on pandas 2.x.

data = data.drop(columns=['professional', 'perc_employed'])
data

Unnamed: 0,county,state,totalpop,income,service,office,drive,carpool,transit,workathome,meancommute,unemployment,party,perc_men,perc_white,perc_private_work,perc_citizen
0,Los Angeles,CA,10038388.0,56196.0,0.191,0.246,0.730,0.099,0.068,0.051,0.300,0.100,D,0.492644,0.275897,0.789211,0.602363
1,Cook,IL,5236393.0,55251.0,0.182,0.244,0.621,0.087,0.184,0.042,0.323,0.107,D,0.484541,0.438454,0.839000,0.668383
2,Harris,TX,4356362.0,54457.0,0.174,0.235,0.792,0.111,0.029,0.033,0.282,0.075,D,0.497371,0.321501,0.834835,0.572664
3,Maricopa,AZ,4018143.0,54229.0,0.184,0.272,0.765,0.110,0.024,0.059,0.255,0.077,R,0.494297,0.585890,0.824176,0.661961
4,Miami-Dade,FL,2639042.0,43129.0,0.210,0.282,0.769,0.092,0.055,0.043,0.299,0.100,D,0.485108,0.152371,0.819000,0.576567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3040,McPherson,NE,433.0,54306.0,0.104,0.279,0.475,0.097,0.000,0.226,0.320,0.009,R,0.515012,0.990854,0.693307,0.787529
3041,Clark,ID,901.0,33672.0,0.206,0.120,0.677,0.156,0.000,0.038,0.170,0.039,R,0.488346,0.584337,0.744000,0.479467
3042,Arthur,NE,448.0,39375.0,0.109,0.041,0.522,0.156,0.000,0.199,0.195,0.040,R,0.497768,1.000000,0.544000,0.694196
3043,Kenedy,TX,565.0,36438.0,0.341,0.205,0.946,0.000,0.000,0.000,0.166,0.000,D,0.522124,0.336000,0.519000,0.559292


In [6]:
# convert party into binary (R -> 1, anything else -> 0); county/state are dropped and columns reordered in the save cell below

def party_to_binary(party):
    """Encode a party label as an integer flag.

    Returns 1 when ``party`` equals ``'R'`` and 0 for every other value.
    """
    return 1 if party == 'R' else 0
    
# Encode the party column as 0/1. Only do so while the column is still
# non-numeric: re-running this cell on already-encoded values would turn every
# row into 0 (since 1 != 'R'), silently corrupting the target.
if not pd.api.types.is_numeric_dtype(data['party']):
    data['party'] = data['party'].apply(party_to_binary)
data

Unnamed: 0,county,state,totalpop,income,service,office,drive,carpool,transit,workathome,meancommute,unemployment,party,perc_men,perc_white,perc_private_work,perc_citizen
0,Los Angeles,CA,10038388.0,56196.0,0.191,0.246,0.730,0.099,0.068,0.051,0.300,0.100,0,0.492644,0.275897,0.789211,0.602363
1,Cook,IL,5236393.0,55251.0,0.182,0.244,0.621,0.087,0.184,0.042,0.323,0.107,0,0.484541,0.438454,0.839000,0.668383
2,Harris,TX,4356362.0,54457.0,0.174,0.235,0.792,0.111,0.029,0.033,0.282,0.075,0,0.497371,0.321501,0.834835,0.572664
3,Maricopa,AZ,4018143.0,54229.0,0.184,0.272,0.765,0.110,0.024,0.059,0.255,0.077,1,0.494297,0.585890,0.824176,0.661961
4,Miami-Dade,FL,2639042.0,43129.0,0.210,0.282,0.769,0.092,0.055,0.043,0.299,0.100,0,0.485108,0.152371,0.819000,0.576567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3040,McPherson,NE,433.0,54306.0,0.104,0.279,0.475,0.097,0.000,0.226,0.320,0.009,1,0.515012,0.990854,0.693307,0.787529
3041,Clark,ID,901.0,33672.0,0.206,0.120,0.677,0.156,0.000,0.038,0.170,0.039,1,0.488346,0.584337,0.744000,0.479467
3042,Arthur,NE,448.0,39375.0,0.109,0.041,0.522,0.156,0.000,0.199,0.195,0.040,1,0.497768,1.000000,0.544000,0.694196
3043,Kenedy,TX,565.0,36438.0,0.341,0.205,0.946,0.000,0.000,0.000,0.166,0.000,0,0.522124,0.336000,0.519000,0.559292


In [9]:
# Select the model columns and save to csv.
# county and state are not in the selection, so they are dropped here, and
# `party` is placed last.

feature_cols = [
    'totalpop', 'income', 'service', 'office', 'drive',
    'carpool', 'transit', 'workathome', 'meancommute', 'unemployment',
    'perc_men', 'perc_white', 'perc_private_work', 'perc_citizen',
]
data = data[feature_cols + ['party']]
data.to_csv('features_v2.csv', index=False)