## Transform
## USGS 2004 Presidential Election Results by County

Source: https://catalog.data.gov/dataset/2004-presidential-general-election-county-results-direct-download

In [1]:
import os
import sys

import numpy as np
import pandas as pd
import pysal as ps

# Record the interpreter and core library versions used for this run
version = ".".join(str(part) for part in sys.version_info[:3])
for label, release in (('python version ', version),
                       ('numpy version ', np.__version__),
                       ('pandas version ', pd.__version__)):
    print(label, release)

python version  3.5.2
numpy version  1.10.4
pandas version  0.18.1


In [2]:
# List the contents of the extracted USGS download directory
ls ../data/usgs/elpo04p020_nt00334/

elpo04p020.dbf  elpo04p020.txt


In [3]:
# Relative path to the USGS 2004 election results dBASE (.dbf) table
filename_usgs_04 = (
    '../data/usgs/elpo04p020_nt00334/elpo04p020.dbf'
)

In [4]:
# Read every column named in the table header into a dict keyed by column
# name, then build the DataFrame from it.
# The handle is closed in a `finally` clause so it is released even when
# reading a column or constructing the frame raises.
db = ps.open(filename_usgs_04)
try:
    d = {col: db.by_col(col) for col in db.header}
    usgs_04_df = pd.DataFrame(d)
finally:
    db.close()
usgs_04_df.shape

(4755, 16)

In [5]:
# Preview the first two rows of the freshly loaded table
usgs_04_df.head(n=2)

Unnamed: 0,AREA,COUNTY,EL2004P020,FIPS,OBJECTID,PERCENT_DE,PERCENT_OT,PERCENT_RE,PERIMETER,STATE,STATE_FIPS,SYMBOL_COD,TOTAL_VOTE,VOTE_DEM,VOTE_OTH,VOTE_REP
0,7.009898,"State House District 8, Denali-University",8.0,2008,1,44.5,4.4,51.1,18.211322,AK,2,4,12942,5758,569,6615
1,16.636605,"State House District 37, Bristol Bay-Aleuti",37.0,2037,2,61.7,2.3,36.0,168.791686,AK,2,8,8446,5208,192,3046


In [6]:
# Show the column labels of the loaded table
usgs_04_df.columns

Index(['AREA', 'COUNTY', 'EL2004P020', 'FIPS', 'OBJECTID', 'PERCENT_DE',
       'PERCENT_OT', 'PERCENT_RE', 'PERIMETER', 'STATE', 'STATE_FIPS',
       'SYMBOL_COD', 'TOTAL_VOTE', 'VOTE_DEM', 'VOTE_OTH', 'VOTE_REP'],
      dtype='object')

In [7]:
# Remove the columns that are most likely geospatial attributes and are
# not wanted in the transformed table (the frame is modified in place)
geospatial_columns = ['AREA', 'EL2004P020', 'OBJECTID', 'PERIMETER', 'SYMBOL_COD']
usgs_04_df.drop(labels=geospatial_columns, axis=1, inplace=True)

In [8]:
# Report the current shape and column labels of the frame
for label, value in (('shape ', usgs_04_df.shape),
                     ('columns ', usgs_04_df.columns)):
    print(label, value)

shape  (4755, 11)
columns  Index(['COUNTY', 'FIPS', 'PERCENT_DE', 'PERCENT_OT', 'PERCENT_RE', 'STATE',
       'STATE_FIPS', 'TOTAL_VOTE', 'VOTE_DEM', 'VOTE_OTH', 'VOTE_REP'],
      dtype='object')


In [9]:
# Multiple polygons per county leave fully duplicated rows behind.
# Compare on all columns (subset=None), keep the first occurrence of each
# row and discard the rest, modifying the frame in place.
usgs_04_df.drop_duplicates(subset=None, keep='first', inplace=True)

In [10]:
# Report the shape and column labels, then preview the first three rows
for label, value in (('shape ', usgs_04_df.shape),
                     ('columns ', usgs_04_df.columns)):
    print(label, value)
usgs_04_df.head(n=3)

shape  (3164, 11)
columns  Index(['COUNTY', 'FIPS', 'PERCENT_DE', 'PERCENT_OT', 'PERCENT_RE', 'STATE',
       'STATE_FIPS', 'TOTAL_VOTE', 'VOTE_DEM', 'VOTE_OTH', 'VOTE_REP'],
      dtype='object')


Unnamed: 0,COUNTY,FIPS,PERCENT_DE,PERCENT_OT,PERCENT_RE,STATE,STATE_FIPS,TOTAL_VOTE,VOTE_DEM,VOTE_OTH,VOTE_REP
0,"State House District 8, Denali-University",2008,44.5,4.4,51.1,AK,2,12942,5758,569,6615
1,"State House District 37, Bristol Bay-Aleuti",2037,61.7,2.3,36.0,AK,2,8446,5208,192,3046
2,"State House District 12, Richardson-Glenn H",2012,28.5,3.0,68.5,AK,2,11845,3387,350,8108


In [11]:
# Tag every row with constant attributes describing the contest
# year   {2016, 2012, 2008, 2004, 2000, ...}
# office {'president', 'senator', 'representative', ...}
# level  {'federal', 'state', 'county', ...}
# Assigning a scalar broadcasts it to every row, so no per-column list of
# length `number of rows` needs to be built first.
usgs_04_df['YEAR'] = 2004
usgs_04_df['OFFICE'] = 'president'
usgs_04_df['LEVEL'] = 'federal'

In [12]:
# Reorder the columns: identifying attributes first, then vote counts,
# vote percentages and the vote total
ordered_columns = ['YEAR', 'FIPS', 'STATE_FIPS', 'STATE', 'COUNTY',
                   'OFFICE', 'LEVEL',
                   'VOTE_DEM', 'VOTE_REP', 'VOTE_OTH',
                   'PERCENT_DE', 'PERCENT_RE', 'PERCENT_OT',
                   'TOTAL_VOTE']
usgs_04_df = usgs_04_df[ordered_columns]

In [13]:
# Rename the column labels.
# An explicit old -> new mapping is used instead of assigning a positional
# list to `.columns`, so a label can never be attached to the wrong data
# if the column order differs from what is expected. Columns not listed
# here (YEAR, FIPS, STATE_FIPS, OFFICE, LEVEL) keep their names.
usgs_04_df = usgs_04_df.rename(columns={'STATE': 'STATE_ABBR',
                                        'COUNTY': 'COUNTY_NAME',
                                        'VOTE_DEM': 'VOTES_DEM',
                                        'VOTE_REP': 'VOTES_REP',
                                        'VOTE_OTH': 'VOTES_OTHER',
                                        'PERCENT_DE': 'PERCENT_DEM',
                                        'PERCENT_RE': 'PERCENT_REP',
                                        'PERCENT_OT': 'PERCENT_OTHER',
                                        'TOTAL_VOTE': 'TOTAL_VOTES'})

In [14]:
# Summarise, per column, how many values are null and what dtype it has
# (DataFrame.info() reports similar information)
null_counts = usgs_04_df.isnull().sum()
column_dtypes = usgs_04_df.dtypes
null_df = pd.DataFrame({'number of null values': null_counts,
                        'data type': column_dtypes})
null_df

Unnamed: 0,data type,number of null values
YEAR,int64,0
FIPS,object,0
STATE_FIPS,object,0
STATE_ABBR,object,0
COUNTY_NAME,object,0
OFFICE,object,0
LEVEL,object,0
VOTES_DEM,object,0
VOTES_REP,object,0
VOTES_OTHER,object,0


In [15]:
# Preview the first three rows of the transformed table
usgs_04_df.head(n=3)

Unnamed: 0,YEAR,FIPS,STATE_FIPS,STATE_ABBR,COUNTY_NAME,OFFICE,LEVEL,VOTES_DEM,VOTES_REP,VOTES_OTHER,PERCENT_DEM,PERCENT_REP,PERCENT_OTHER,TOTAL_VOTES
0,2004,2008,2,AK,"State House District 8, Denali-University",president,federal,5758,6615,569,44.5,51.1,4.4,12942
1,2004,2037,2,AK,"State House District 37, Bristol Bay-Aleuti",president,federal,5208,3046,192,61.7,36.0,2.3,8446
2,2004,2012,2,AK,"State House District 12, Richardson-Glenn H",president,federal,3387,8108,350,28.5,68.5,3.0,11845


## Output the transformed CSV file to the output directory
Work in progress

In [16]:
# Write the transformed table to CSV without the DataFrame index
filename_out = '../output/2004_pres_election_by_county.csv'
# Create the output directory first so the export does not fail on a
# checkout where it does not exist yet; exist_ok avoids an error when it
# is already there.
os.makedirs(os.path.dirname(filename_out), exist_ok=True)
usgs_04_df.to_csv(filename_out,
                  index=False)