## Transform
## USGS 2012 Presidential Election Results by County

Source: https://catalog.data.gov/dataset/presidential-general-election-results-2012-direct-download

In [33]:
import sys

import numpy as np
import pandas as pd
import pysal as ps

# Record the interpreter and library versions this notebook was executed with
version = ".".join(str(part) for part in sys.version_info[:3])
for label, release in (('python version ', version),
                       ('numpy version ', np.__version__),
                       ('pandas version ', pd.__version__)):
    print(label, release)

python version  3.5.2
numpy version  1.10.4
pandas version  0.18.1


In [9]:
# List the files in the USGS download directory (explicit line magic)
%ls ../data/usgs/elpo12p010g_nt00887/

[0m[01;32melpo12p010g.dbf[0m*  [01;32melpo12p010g.sbn[0m*  [01;32melpo12p010g.shp[0m*  [01;32melpo12p010g.txt[0m*
[01;32melpo12p010g.prj[0m*  [01;32melpo12p010g.sbx[0m*  [01;32melpo12p010g.shx[0m*  [01;32melpo12p010g.xml[0m*


In [10]:
# Path to the .dbf file inside the USGS download directory
filename_usgs_12 = (
    '../data/usgs/elpo12p010g_nt00887/elpo12p010g.dbf'
)

In [11]:
## Load .dbf file and create pandas DataFrame
# `ps` is presumably PySAL (it must be imported in the imports cell);
# the handle it returns exposes .header and .by_col(name).
db = ps.open(filename_usgs_12)
try:
    # One entry per column listed in the header
    d = {col: db.by_col(col) for col in db.header}
finally:
    # Release the file handle even if reading a column raises
    db.close()
usgs_12_df = pd.DataFrame(d)
usgs_12_df.shape

(3153, 14)

In [12]:
# Preview the first three rows of the loaded table
usgs_12_df.head(n=3)

Unnamed: 0,COUNTY,FIPS,OBAMA,OTHERS,PCT_OBM,PCT_OTHR,PCT_ROM,PCT_WNR,ROMNEY,STATE,STATE_FIPS,TTL_VT,WINNER,group
0,Autauga,1001,6363.0,231.0,26.54236,0.963584,72.494056,72.494056,17379.0,AL,1,23973.0,Romney,24
1,Barbour,1005,5912.0,55.0,51.332812,0.477555,48.189633,51.332812,5550.0,AL,1,11517.0,Obama,12
2,Bibb,1007,2202.0,86.0,26.152019,1.021378,72.826603,72.826603,6132.0,AL,1,8420.0,Romney,24


In [17]:
# Drop unwanted columns.
# The name is rebound instead of mutating in place, and labels that are
# already absent are ignored, so re-running this cell does not raise.
unwanted_columns = ['group', 'PCT_WNR', 'WINNER']
usgs_12_df = usgs_12_df.drop(labels=unwanted_columns,
                             axis=1,
                             errors='ignore')

In [18]:
# Report the frame's dimensions and its column labels
current_shape, current_columns = usgs_12_df.shape, usgs_12_df.columns
print('shape ', current_shape)
print('columns ', current_columns)

shape  (3153, 11)
columns  Index(['COUNTY', 'FIPS', 'OBAMA', 'OTHERS', 'PCT_OBM', 'PCT_OTHR', 'PCT_ROM',
       'ROMNEY', 'STATE', 'STATE_FIPS', 'TTL_VT'],
      dtype='object')


In [19]:
# Drop duplicate rows that are artifacts of multiple polygons per county.
# Keep the first row of each group of identical rows and discard the rest.
# Rebinding (rather than inplace=True) followed by reset_index leaves a
# contiguous 0..n-1 index even when rows are removed.
# NOTE(review): the recorded shape is (3153, 11) both before and after this
# cell, so no fully identical rows were removed in that run -- confirm whether
# de-duplication should key on a subset of columns (e.g. FIPS) instead.
usgs_12_df = (usgs_12_df
              .drop_duplicates(keep='first')
              .reset_index(drop=True))

In [20]:
# Report dimensions and column labels, then preview the first three rows
frame_shape = usgs_12_df.shape
frame_columns = usgs_12_df.columns
print('shape ', frame_shape)
print('columns ', frame_columns)
usgs_12_df.head(n=3)

shape  (3153, 11)
columns  Index(['COUNTY', 'FIPS', 'OBAMA', 'OTHERS', 'PCT_OBM', 'PCT_OTHR', 'PCT_ROM',
       'ROMNEY', 'STATE', 'STATE_FIPS', 'TTL_VT'],
      dtype='object')


Unnamed: 0,COUNTY,FIPS,OBAMA,OTHERS,PCT_OBM,PCT_OTHR,PCT_ROM,ROMNEY,STATE,STATE_FIPS,TTL_VT
0,Autauga,1001,6363.0,231.0,26.54236,0.963584,72.494056,17379.0,AL,1,23973.0
1,Barbour,1005,5912.0,55.0,51.332812,0.477555,48.189633,5550.0,AL,1,11517.0
2,Bibb,1007,2202.0,86.0,26.152019,1.021378,72.826603,6132.0,AL,1,8420.0


In [21]:
# Generate columns with the attributes
# year   {2016, 2012, 2008, 2004, 2000, ...}
# office {'president', 'senator', 'representative', ...}
# level  {'federal', 'state', 'county', ...}
# A scalar on the right-hand side is broadcast to every row, so no
# per-row lists have to be built first.
usgs_12_df['YEAR'] = 2012
usgs_12_df['OFFICE'] = 'president'
usgs_12_df['LEVEL'] = 'federal'

In [22]:
# Reorder the columns: identifying attributes first, then vote counts,
# vote percentages and the total
ordered_columns = [
    'YEAR', 'FIPS', 'STATE_FIPS', 'STATE', 'COUNTY', 'OFFICE', 'LEVEL',
    'OBAMA', 'ROMNEY', 'OTHERS',
    'PCT_OBM', 'PCT_ROM', 'PCT_OTHR',
    'TTL_VT',
]
usgs_12_df = usgs_12_df[ordered_columns]

In [25]:
# Rename the column labels.
# An explicit old -> new mapping ties every new label to its source column,
# so the result does not depend on the column order left by earlier cells.
# Columns not listed here (YEAR, FIPS, STATE_FIPS, OFFICE, LEVEL) keep
# their names; labels missing from the frame are silently skipped by rename.
column_labels = {
    'STATE': 'STATE_ABBR',
    'COUNTY': 'COUNTY_NAME',
    'OBAMA': 'VOTES_DEM',
    'ROMNEY': 'VOTES_REP',
    'OTHERS': 'VOTES_OTHER',
    'PCT_OBM': 'PERCENT_DEM',
    'PCT_ROM': 'PERCENT_REP',
    'PCT_OTHR': 'PERCENT_OTHER',
    'TTL_VT': 'TOTAL_VOTES',
}
usgs_12_df = usgs_12_df.rename(columns=column_labels)

In [30]:
## Round percentage of votes to one place
def round_percentage(x):
    """Return x converted to float and rounded to one decimal place."""
    return round(float(x), 1)

# Apply the same rounding to each of the three percentage columns
for percent_col in ('PERCENT_DEM', 'PERCENT_REP', 'PERCENT_OTHER'):
    usgs_12_df[percent_col] = usgs_12_df[percent_col].apply(round_percentage)

In [34]:
# Convert the vote counts from floats to integers.
# Cast back to float at the point of use if a calculation needs it.
for vote_col in ('VOTES_DEM', 'VOTES_REP', 'VOTES_OTHER', 'TOTAL_VOTES'):
    usgs_12_df[vote_col] = usgs_12_df[vote_col].astype('int')

In [35]:
# Report the transformed frame's dimensions and preview its first three rows
final_shape = usgs_12_df.shape
print('shape (num_rows,num_cols) ', final_shape)
usgs_12_df.head(n=3)

shape (num_rows,num_cols)  (3153, 14)


Unnamed: 0,YEAR,FIPS,STATE_FIPS,STATE_ABBR,COUNTY_NAME,OFFICE,LEVEL,VOTES_DEM,VOTES_REP,VOTES_OTHER,PERCENT_DEM,PERCENT_REP,PERCENT_OTHER,TOTAL_VOTES
0,2012,1001,1,AL,Autauga,president,federal,6363,17379,231,26.5,72.5,1.0,23973
1,2012,1005,1,AL,Barbour,president,federal,5912,5550,55,51.3,48.2,0.5,11517
2,2012,1007,1,AL,Bibb,president,federal,2202,6132,86,26.2,72.8,1.0,8420


## Write the transformed CSV file to the output directory
**TODO:** This section is a work in progress.

In [36]:
# Write the transformed table to CSV, leaving out the DataFrame index
filename_out = '../output/2012_pres_election_by_county.csv'
usgs_12_df.to_csv(path_or_buf=filename_out, index=False)