## Transform
## USGS 2008 Presidential Election Results by County

Source: https://catalog.data.gov/dataset/2008-presidential-general-election-county-results-direct-download

In [19]:
import os
import sys

import numpy as np
import pandas as pd
import pysal as ps

# Report the interpreter and core library versions used for this run
version = ".".join(str(part) for part in sys.version_info[:3])
for label, release in (('python version ', version),
                       ('numpy version ', np.__version__),
                       ('pandas version ', pd.__version__)):
    print(label, release)

python version  3.5.2
numpy version  1.10.4
pandas version  0.18.1


In [20]:
# List the files in the USGS 2008 dataset directory (IPython `ls` automagic)
ls ../data/usgs/elpo08p020_nt00335

elpo08p020.dbf  elpo08p020.txt


In [21]:
# Relative path to the .dbf table holding the USGS 2008 election results
filename_usgs_08 = (
    '../data/usgs/elpo08p020_nt00335/'
    'elpo08p020.dbf'
)

In [22]:
# Open the .dbf file and load it into a pandas DataFrame.
# try/finally guarantees the file handle is closed even if reading a
# column or building the frame raises.
db = ps.open(filename_usgs_08)
try:
    # Map every field name in the header to the values returned by by_col
    d = {col: db.by_col(col) for col in db.header}
    usgs_08_df = pd.DataFrame(d)
finally:
    db.close()
# shape returns a tuple with (num_rows, num_cols)
usgs_08_df.shape

(4755, 16)

In [23]:
# Preview the first three rows of the freshly loaded frame
usgs_08_df.head(3)

Unnamed: 0,AREA,COUNTY,EL2004P020,FIPS,OBJECTID,PERCENT_DE,PERCENT_OT,PERCENT_RE,PERIMETER,STATE,STATE_FIPS,SYMBOL_COD,TOTAL_VOTE,VOTE_DEM,VOTE_OTH,VOTE_REP
0,7.009898,"State House District 8, Denali-University",8.0,2008,1,48.4,3.31,48.28,18.211322,AK,2,10,10320,4995,342,4983
1,16.636605,"State House District 37, Bristol Bay-Aleuti",37.0,2037,2,40.04,2.92,57.04,168.791686,AK,2,4,4665,1868,136,2661
2,12.090892,"State House District 12, Richardson-Glenn H",12.0,2012,3,25.22,2.74,72.04,36.21083,AK,2,2,7589,1914,208,5467


In [24]:
# Show the column labels of the loaded frame
usgs_08_df.columns

Index(['AREA', 'COUNTY', 'EL2004P020', 'FIPS', 'OBJECTID', 'PERCENT_DE',
       'PERCENT_OT', 'PERCENT_RE', 'PERIMETER', 'STATE', 'STATE_FIPS',
       'SYMBOL_COD', 'TOTAL_VOTE', 'VOTE_DEM', 'VOTE_OTH', 'VOTE_REP'],
      dtype='object')

In [25]:
# Drop unwanted columns that are most likely geospatial.
# axis=1 means the labels refer to columns (axis=0 would refer to rows).
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' skips labels that are already gone, so re-running this
# cell is a no-op rather than an error.
usgs_08_df = usgs_08_df.drop(labels=['AREA', 'EL2004P020', 'OBJECTID', 'PERIMETER', 'SYMBOL_COD'],
                             axis=1,
                             errors='ignore')

In [26]:
# Report the frame's dimensions and remaining column labels
for caption, detail in (('shape ', usgs_08_df.shape),
                        ('columns ', usgs_08_df.columns)):
    print(caption, detail)

shape  (4755, 11)
columns  Index(['COUNTY', 'FIPS', 'PERCENT_DE', 'PERCENT_OT', 'PERCENT_RE', 'STATE',
       'STATE_FIPS', 'TOTAL_VOTE', 'VOTE_DEM', 'VOTE_OTH', 'VOTE_REP'],
      dtype='object')


In [27]:
# A county drawn as several polygons appears once per polygon, producing
# fully identical rows. Retain the first occurrence of each such row and
# remove the later repeats, modifying the frame in place.
usgs_08_df.drop_duplicates(inplace=True, keep='first')

In [28]:
# Report dimensions and column labels, then preview three rows
summary_lines = [('shape ', usgs_08_df.shape), ('columns ', usgs_08_df.columns)]
for summary_line in summary_lines:
    print(*summary_line)
usgs_08_df.head(3)

shape  (3166, 11)
columns  Index(['COUNTY', 'FIPS', 'PERCENT_DE', 'PERCENT_OT', 'PERCENT_RE', 'STATE',
       'STATE_FIPS', 'TOTAL_VOTE', 'VOTE_DEM', 'VOTE_OTH', 'VOTE_REP'],
      dtype='object')


Unnamed: 0,COUNTY,FIPS,PERCENT_DE,PERCENT_OT,PERCENT_RE,STATE,STATE_FIPS,TOTAL_VOTE,VOTE_DEM,VOTE_OTH,VOTE_REP
0,"State House District 8, Denali-University",2008,48.4,3.31,48.28,AK,2,10320,4995,342,4983
1,"State House District 37, Bristol Bay-Aleuti",2037,40.04,2.92,57.04,AK,2,4665,1868,136,2661
2,"State House District 12, Richardson-Glenn H",2012,25.22,2.74,72.04,AK,2,7589,1914,208,5467


In [29]:
# Generate constant-valued columns describing this data set:
#   YEAR   {2016, 2012, 2008, 2004, 2000, ...}
#   OFFICE {'president', 'senator', 'representative', ...}
#   LEVEL  {'federal', 'state', 'county', ...}
# Assigning a scalar broadcasts it to every row, so there is no need to
# build intermediate lists sized to the frame.
usgs_08_df['YEAR'] = 2008
usgs_08_df['OFFICE'] = 'president'
usgs_08_df['LEVEL'] = 'federal'

In [30]:
# Put the columns in presentation order: identifiers first, then the
# office description, then raw vote counts, percentages and the total.
usgs_08_df = usgs_08_df[[
    'YEAR', 'FIPS', 'STATE_FIPS', 'STATE', 'COUNTY',
    'OFFICE', 'LEVEL',
    'VOTE_DEM', 'VOTE_REP', 'VOTE_OTH',
    'PERCENT_DE', 'PERCENT_RE', 'PERCENT_OT',
    'TOTAL_VOTE',
]]

In [31]:
# Rename the column labels.
# An explicit old -> new mapping is used instead of assigning a positional
# list to .columns, so each label is renamed by name and cannot be silently
# attached to the wrong data if the column order changes. Labels absent
# from the mapping (YEAR, FIPS, STATE_FIPS, OFFICE, LEVEL) keep their names.
usgs_08_df = usgs_08_df.rename(columns={
    'STATE': 'STATE_ABBR',
    'COUNTY': 'COUNTY_NAME',
    'VOTE_DEM': 'VOTES_DEM',
    'VOTE_REP': 'VOTES_REP',
    'VOTE_OTH': 'VOTES_OTHER',
    'PERCENT_DE': 'PERCENT_DEM',
    'PERCENT_RE': 'PERCENT_REP',
    'PERCENT_OT': 'PERCENT_OTHER',
    'TOTAL_VOTE': 'TOTAL_VOTES',
})

In [32]:
# Summarise, per column, how many values are null alongside the column dtype
# (DataFrame.info() reports similar information)
null_counts = usgs_08_df.isnull().sum()
column_dtypes = usgs_08_df.dtypes
null_df = pd.DataFrame({'number of null values': null_counts,
                        'data type': column_dtypes})
null_df

Unnamed: 0,data type,number of null values
YEAR,int64,0
FIPS,object,0
STATE_FIPS,object,0
STATE_ABBR,object,0
COUNTY_NAME,object,0
OFFICE,object,0
LEVEL,object,0
VOTES_DEM,object,0
VOTES_REP,object,0
VOTES_OTHER,object,0


In [34]:
# Convert votes from floats to integers
# If you need a float for calculations you can cast it to a float
#usgs_08_df['VOTES_DEM'] = usgs_08_df['VOTES_DEM'].astype('int')
#usgs_08_df['VOTES_REP'] = usgs_08_df['VOTES_REP'].astype('int')
#usgs_08_df['VOTES_OTHER'] = usgs_08_df['VOTES_OTHER'].astype('int')
#usgs_08_df['TOTAL_VOTES'] = usgs_08_df['TOTAL_VOTES'].astype('int')

In [35]:
## Round percentage of votes to one place
def round_percentage(x):
    """Return ``x`` converted to a float and rounded to one decimal place."""
    as_float = float(x)
    return round(as_float, 1)
#usgs_08_df['PERCENT_DEM'] = usgs_08_df['PERCENT_DEM'].apply(round_percentage)
#usgs_08_df['PERCENT_REP'] = usgs_08_df['PERCENT_REP'].apply(round_percentage)
#usgs_08_df['PERCENT_OTHER'] = usgs_08_df['PERCENT_OTHER'].apply(round_percentage)

In [36]:
# Preview the first three rows after the reorder and rename steps
usgs_08_df.head(3)

Unnamed: 0,YEAR,FIPS,STATE_FIPS,STATE_ABBR,COUNTY_NAME,OFFICE,LEVEL,VOTES_DEM,VOTES_REP,VOTES_OTHER,PERCENT_DEM,PERCENT_REP,PERCENT_OTHER,TOTAL_VOTES
0,2008,2008,2,AK,"State House District 8, Denali-University",president,federal,4995,4983,342,48.4,48.28,3.31,10320
1,2008,2037,2,AK,"State House District 37, Bristol Bay-Aleuti",president,federal,1868,2661,136,40.04,57.04,2.92,4665
2,2008,2012,2,AK,"State House District 12, Richardson-Glenn H",president,federal,1914,5467,208,25.22,72.04,2.74,7589


## Write the transformed CSV file to the output directory
Work in progress

In [37]:
# Write the transformed frame to CSV, omitting the pandas row index.
filename_out = '../output/2008_pres_election_by_county.csv'
# to_csv does not create missing parent directories, so make sure the
# output directory exists first (exist_ok makes this a no-op on re-runs).
os.makedirs(os.path.dirname(filename_out), exist_ok=True)
usgs_08_df.to_csv(filename_out,
                  index=False)