## Transform
## tonmcg 2016 Presidential Election Results by County
### Tony McGovern
source: https://github.com/tonmcg/County_Level_Election_Results_12-16
### TownHall.com
source: http://townhall.com/election/2016/president/

In [1]:
import pandas as pd
import numpy as np
import sys

version = ".".join(map(str, sys.version_info[:3]))
print('python version ', version)
print('numpy version ', np.__version__)
print('pandas version ',pd.__version__)

python version  3.5.2
numpy version  1.10.4
pandas version  0.18.1


In [2]:
ls ../data/tonmcg/

US_County_Level_Presidential_Results_12-16.csv


In [3]:
filename = '../data/tonmcg/US_County_Level_Presidential_Results_12-16.csv'
cols_subset = ['combined_fips','state_fips','state_abbr','county_name',
               'votes_dem_2016','votes_gop_2016',
               'per_dem_2016','per_gop_2016',
               'total_votes_2016']
tonmcg_16_df = pd.read_csv(filename,
                           usecols=cols_subset)
print('shape (num_rows, num_cols) ',tonmcg_16_df.shape)
tonmcg_16_df.head(3)

shape (num_rows, num_cols)  (3141, 9)


Unnamed: 0,combined_fips,votes_dem_2016,votes_gop_2016,total_votes_2016,per_dem_2016,per_gop_2016,state_abbr,county_name,state_fips
0,2013,93003.0,130413.0,246588.0,0.377159,0.52887,AK,Alaska,
1,2016,93003.0,130413.0,246588.0,0.377159,0.52887,AK,Alaska,
2,2020,93003.0,130413.0,246588.0,0.377159,0.52887,AK,Alaska,


In [4]:
# Generate columns with the attributes
# year {2016,2012,2008,2004,2000, ...}
# office {'president', 'senator', 'representative', ...}
# level  {'federal','state','county', ...}
num_rows = tonmcg_16_df.shape[0]
year_list = [2016] * num_rows
office_list = ['president'] * num_rows 
level_list = ['federal'] * num_rows
tonmcg_16_df['YEAR'] = year_list
tonmcg_16_df['OFFICE'] = office_list
tonmcg_16_df['LEVEL'] = level_list

In [5]:
# Re order the columns
tonmcg_16_df = tonmcg_16_df[['YEAR',
                             'combined_fips',
                             'state_fips',
                             'state_abbr',
                             'county_name',
                             'OFFICE',
                             'LEVEL',
                             'votes_dem_2016',
                             'votes_gop_2016',
                             'per_dem_2016',
                             'per_gop_2016',
                             'total_votes_2016']]

In [6]:
# Rename the column labels
tonmcg_16_df.columns = ['YEAR','FIPS','STATE_FIPS',
                        'STATE_ABBR','COUNTY_NAME',
                        'OFFICE','LEVEL',
                        'VOTES_DEM','VOTES_REP',
                        'PERCENT_DEM','PERCENT_REP',
                        'TOTAL_VOTES']

In [7]:
# Number of null values and data types per column
# You can use DataFrame.info() as well
null_df = pd.DataFrame({'number of null values': tonmcg_16_df.isnull().sum(),
                        'data type' : tonmcg_16_df.dtypes })
null_df

Unnamed: 0,data type,number of null values
YEAR,int64,0
FIPS,int64,0
STATE_FIPS,float64,29
STATE_ABBR,object,0
COUNTY_NAME,object,0
OFFICE,object,0
LEVEL,object,0
VOTES_DEM,float64,0
VOTES_REP,float64,0
PERCENT_DEM,float64,0


In [8]:
## Round percentage of votes to one place
round_percentage = lambda x: round(float(x),1)
tonmcg_16_df['PERCENT_DEM'] = tonmcg_16_df['PERCENT_DEM'].apply(round_percentage)
tonmcg_16_df['PERCENT_REP'] = tonmcg_16_df['PERCENT_REP'].apply(round_percentage)

In [9]:
# Convert votes from floats to integers
# If you need a float for calculations you can cast it to a float
tonmcg_16_df['STATE_FIPS'] = tonmcg_16_df.STATE_FIPS.astype('str')
tonmcg_16_df['VOTES_DEM'] = tonmcg_16_df.VOTES_DEM.astype('int')
tonmcg_16_df['VOTES_REP'] = tonmcg_16_df.VOTES_REP.astype('int')
tonmcg_16_df['TOTAL_VOTES'] = tonmcg_16_df.TOTAL_VOTES.astype('int')

In [10]:
print('shape (num_rows,num_cols) ', tonmcg_16_df.shape)
tonmcg_16_df.head(3)

shape (num_rows,num_cols)  (3141, 12)


Unnamed: 0,YEAR,FIPS,STATE_FIPS,STATE_ABBR,COUNTY_NAME,OFFICE,LEVEL,VOTES_DEM,VOTES_REP,PERCENT_DEM,PERCENT_REP,TOTAL_VOTES
0,2016,2013,,AK,Alaska,president,federal,93003,130413,0.4,0.5,246588
1,2016,2016,,AK,Alaska,president,federal,93003,130413,0.4,0.5,246588
2,2016,2020,,AK,Alaska,president,federal,93003,130413,0.4,0.5,246588


In [11]:
## 2010 FIPS codes for states and counties from Census Bureau
## https://www.census.gov/geo/reference/codes/cou.html
## List of changes 1970,1980,1990,2000,2010
## https://www.census.gov/geo/reference/county-changes.html
## TODO: modify this for 2000
filename_fips = '../data/census/national_county.txt'
cols = ['STATE_ABBR','STATE_FIPS','COUNTY_FIPS','COUNTY_NAME','FIPS_CLASS']
fips_df = pd.read_csv(filename_fips,
                      header=None,
                      names=cols,
                      dtype={'STATE_ABBR':str,
                             'STATE_FIPS':str,
                             'COUNTY_FIPS':str,
                             'COUNTY_NAME':str,
                             'FIPS_CLASS':str})
print('shape (num_rows, num_cols) ' ,fips_df.shape)
fips_df.head(5)

shape (num_rows, num_cols)  (3235, 5)


Unnamed: 0,STATE_ABBR,STATE_FIPS,COUNTY_FIPS,COUNTY_NAME,FIPS_CLASS
0,AL,1,1,Autauga County,H1
1,AL,1,3,Baldwin County,H1
2,AL,1,5,Barbour County,H1
3,AL,1,7,Bibb County,H1
4,AL,1,9,Blount County,H1


In [12]:
# Create a dictionary of state abbreviations to state fips codes
state_abbr = fips_df.STATE_ABBR.unique()
state_fips = fips_df.STATE_FIPS.unique()
state2state_fips_d = dict(zip(state_abbr,state_fips))

In [13]:
# Resolve missing STATE_FIPS encodings for AK/Alaska entries
tonmcg_16_df['STATE_FIPS'] = [state2state_fips_d[s] 
                              for s in tonmcg_16_df['STATE_ABBR']]

In [14]:
print('shape (num_rows,num_cols) ', tonmcg_16_df.shape)
tonmcg_16_df.head(3)

shape (num_rows,num_cols)  (3141, 12)


Unnamed: 0,YEAR,FIPS,STATE_FIPS,STATE_ABBR,COUNTY_NAME,OFFICE,LEVEL,VOTES_DEM,VOTES_REP,PERCENT_DEM,PERCENT_REP,TOTAL_VOTES
0,2016,2013,2,AK,Alaska,president,federal,93003,130413,0.4,0.5,246588
1,2016,2016,2,AK,Alaska,president,federal,93003,130413,0.4,0.5,246588
2,2016,2020,2,AK,Alaska,president,federal,93003,130413,0.4,0.5,246588


## Output transformed csv file to output directory
Work in progress

In [15]:
filename_out = '../output/2016_pres_election_by_county.csv'
tonmcg_16_df.to_csv(filename_out,
                    index=False)