## Transform
## 2000 Presidential Election Results by County

David Lublin and D. Stephen Voss. 2001. "Federal Elections Project." American University, Washington, DC and the University of Kentucky, Lexington, KY.

source:  http://www.american.edu/spa/ccps/Data-Sets.cfm

In [1]:
import pandas as pd
import numpy as np
import sys

version = ".".join(map(str, sys.version_info[:3]))
print('python version ', version)
print('numpy version ', np.__version__)
print('pandas version ',pd.__version__)

python version  3.5.2
numpy version  1.10.4
pandas version  0.18.1


In [2]:
ls ../data/american-university/CY/

county.csv  COUNTY.csv  county.xlsx  CTYREADME.txt


In [30]:
## NOTE: edited encoded spaces. 
## Initially converted to an excel format, but the problem was encoding of
## some of the spaces in the STATE and COUNTY columns such as North Dakota,
## South Carolina, etc
filename = '../data/american-university/CY/county.csv'
y2k_df = pd.read_csv(filename)
print('shape (num_rows,num_cols) ', y2k_df.shape)
y2k_df.head(3)

shape (num_rows,num_cols)  (3155, 24)


Unnamed: 0,CID,STATE,COUNTY,PBUSH,PGORE,PNADER,POTHER,BUSH,GORE,NADER,...,HAGELIN,MCREYNOLDS,HARRIS,DODGE,NOTA,MOOREHEAD,BROWN,VENSON,YOUNGKEIT,LANE
0,1,Alabama,Autauga,0.6969,0.2872,0.0093,0.0066,11993,4942,160,...,5,0,0,0,0,0,0,0,0,0
1,2,Alabama,Baldwin,0.7237,0.2478,0.0183,0.0102,40872,13997,1033,...,14,0,0,0,0,0,0,0,0,0
2,3,Alabama,Barbour,0.4991,0.4902,0.0044,0.0063,5096,5188,46,...,2,0,0,0,0,0,0,0,0,0


In [31]:
# Generate columns with the attributes
# year {2016,2012,2008,2004,2000, ...}
# office {'president', 'senator', 'representative', ...}
# level  {'federal','state','county', ...}
num_rows = y2k_df.shape[0]
year_list = [2000] * num_rows
office_list = ['president'] * num_rows 
level_list = ['federal'] * num_rows
y2k_df['YEAR'] = year_list
y2k_df['OFFICE'] = office_list
y2k_df['LEVEL'] = level_list

In [32]:
# Number of null values and data types per column
# You can use DataFrame.info() as well
null_df = pd.DataFrame({'number of null values': y2k_df.isnull().sum(),
                        'data type' : y2k_df.dtypes })
null_df

Unnamed: 0,data type,number of null values
CID,int64,0
STATE,object,0
COUNTY,object,0
PBUSH,float64,0
PGORE,float64,0
PNADER,float64,0
POTHER,float64,0
BUSH,int64,0
GORE,int64,0
NADER,int64,0


In [33]:
## Convert decimal fractions to percentages
## Round percentage of votes to one place
round_percentage = lambda x: round(float(x),3)*100
y2k_df['PBUSH'] = y2k_df['PBUSH'].apply(round_percentage)
y2k_df['PGORE'] = y2k_df['PGORE'].apply(round_percentage)
y2k_df['PNADER'] = y2k_df['PNADER'].apply(round_percentage)
y2k_df['POTHER'] = y2k_df['POTHER'].apply(round_percentage)

In [34]:
y2k_df['STATE'] = y2k_df.STATE.astype('str')

In [35]:
state_abbr = { 'Alaska': 'AK',
           'Alabama': 'AL',
           'Arkansas': 'AR',
           'Arizona': 'AZ',
           'California': 'CA',
           'Colorado': 'CO',
           'Connecticut': 'CT',
           'District of Columbia': 'DC',
           'Delaware': 'DE',
           'Florida': 'FL',
           'Georgia': 'GA',
           'Guam': 'GU',
           'Hawaii': 'HI',
           'Iowa': 'IA',
           'Idaho': 'ID',
           'Illinois': 'IL',
           'Indiana': 'IN',
           'Kansas': 'KS',
           'Kentucky': 'KY',
           'Louisiana': 'LA',
           'Massachusetts': 'MA',
           'Maryland': 'MD',
           'Maine': 'ME',
           'Michigan': 'MI',
           'Minnesota': 'MN',
           'Missouri': 'MO',
           'Mississippi': 'MS',
           'Montana': 'MT',
           'North Carolina': 'NC',
           'North Dakota': 'ND',
           'Nebraska': 'NE',
           'New Hampshire': 'NH',
           'New Jersey': 'NJ',
           'New Mexico': 'NM',
           'Nevada': 'NV',
           'New York': 'NY',
           'Ohio': 'OH',
           'Oklahoma': 'OK',
           'Oregon': 'OR',
           'Pennsylvania': 'PA',
           'Puerto Rico': 'PR',
           'Rhode Island': 'RI',
           'South Carolina': 'SC',
           'South Dakota': 'SD',
           'Tennessee': 'TN',
           'Texas': 'TX',
           'Utah': 'UT',
           'Virginia': 'VA',
           'Virgin Islands': 'VI',
           'Vermont': 'VT',
           'Washington': 'WA',
           'Wisconsin': 'WI',
           'West Virginia': 'WV',
           'Wyoming': 'WY'
}

In [36]:
# Add state abbreviation to DataFrame
get_abbr = lambda s: state_abbr[str(s)]
y2k_df['STATE_ABBR'] = y2k_df['STATE'].apply(get_abbr)

In [39]:
y2k_df.columns

Index(['CID', 'STATE', 'COUNTY', 'PBUSH', 'PGORE', 'PNADER', 'POTHER', 'BUSH',
       'GORE', 'NADER', 'BUCHANAN', 'BROWNE', 'PHILLIPS', 'WRITEINS',
       'HAGELIN', 'MCREYNOLDS', 'HARRIS', 'DODGE', 'NOTA', 'MOOREHEAD',
       'BROWN', 'VENSON', 'YOUNGKEIT', 'LANE', 'YEAR', 'OFFICE', 'LEVEL',
       'STATE_ABBR'],
      dtype='object')

In [43]:
# Re order columns
y2k_df = y2k_df[['YEAR','STATE_ABBR','STATE','COUNTY',
              'OFFICE','LEVEL',
              'GORE','BUSH','NADER',
              'BUCHANAN','BROWNE','PHILLIPS',
              'WRITEINS','HAGELIN','MCREYNOLDS',
              'HARRIS', 'DODGE', 'NOTA','MOOREHEAD',
              'BROWN','VENSON','YOUNGKEIT','LANE',
              'PGORE','PBUSH','PNADER','POTHER']]

In [46]:
# Rename columns
y2k_df.columns = ['YEAR','STATE_ABBR','STATE','COUNTY_NAME',
                  'OFFICE','LEVEL',
                  'VOTES_DEM', 'VOTES_REP',
                  'NADER',
                  'BUCHANAN','BROWNE','PHILLIPS',
                  'WRITEINS','HAGELIN','MCREYNOLDS',
                  'HARRIS', 'DODGE', 'NOTA','MOOREHEAD',
                  'BROWN','VENSON','YOUNGKEIT','LANE',
                  'PERCENT_DEM', 'PERCENT_REP',
                  'PERCENT_NADER', 'PERCENT_OTHER'
                  ]

In [50]:
# Eliminate the STATE column and retain the STATE_ABBR for consistency 
# with other election by county files
del y2k_df['STATE']

In [51]:
print('shape (num_rows,num_cols) ', y2k_df.shape)
y2k_df.head(3)

shape (num_rows,num_cols)  (3155, 26)


Unnamed: 0,YEAR,STATE_ABBR,COUNTY_NAME,OFFICE,LEVEL,VOTES_DEM,VOTES_REP,NADER,BUCHANAN,BROWNE,...,NOTA,MOOREHEAD,BROWN,VENSON,YOUNGKEIT,LANE,PERCENT_DEM,PERCENT_REP,PERCENT_NADER,PERCENT_OTHER
0,2000,AL,Autauga,president,federal,4942,11993,160,43,51,...,0,0,0,0,0,0,28.7,69.7,0.9,0.7
1,2000,AL,Baldwin,president,federal,13997,40872,1033,287,226,...,0,0,0,0,0,0,24.8,72.4,1.8,1.0
2,2000,AL,Barbour,president,federal,5188,5096,46,27,27,...,0,0,0,0,0,0,49.0,49.9,0.4,0.6


In [38]:
## 2010 FIPS codes for states and counties from Census Bureau
## https://www.census.gov/geo/reference/codes/cou.html
## List of changes 1970,1980,1990,2000,2010
## https://www.census.gov/geo/reference/county-changes.html
## TODO: modify this for 2000
filename_fips = '../data/census/national_county.txt'
cols = ['STATE_ABBR','STATE_FIPS','COUNTY_FIPS','COUNTY_NAME','']
fips_df = pd.read_csv(filename_fips, header=None)
print('shape (num_rows, num_cols) ' ,fips_df.shape)
fips_df.head(3)

shape (num_rows, num_cols)  (3235, 5)


Unnamed: 0,0,1,2,3,4
0,AL,1,1,Autauga County,H1
1,AL,1,3,Baldwin County,H1
2,AL,1,5,Barbour County,H1


## Output csv file to output directory
Work in progress

Year 2000 does not have FIPS codes and is not in the same format

In [49]:
filename_out = '../output/2000_pres_election_by_county.csv'
y2k_df.to_csv(filename_out,
              index=False)