## GPS Co-ordinates for all counties in the USA
We need the gps co-ordinates for plotting maps. The Bokeh sample data has this already.  
So let's just pull in what we need into a pkl file for later use.  

In [1]:
import pandas as pd

In [2]:
donations = pd.read_pickle('out/0/donations.pkl')

In [3]:
from bokeh.sampledata import us_states, us_counties
from bokeh.plotting import figure, show, output_file
import pandas as pd
import numpy as np

us_counties = us_counties.data.copy()
us_states = us_states.data.copy()


counties_list = list()
for county_id in us_counties:
    counties_list.append(
        [us_counties[county_id]['name'],
        us_counties[county_id]['state'].upper(),
        us_counties[county_id]['lats'],
        us_counties[county_id]['lons']])

states_list = list()
for code in us_states:
    states_list.append(
        [code,
        us_states[code]['lats'],
        us_states[code]['lons']])

counties_gps = pd.DataFrame(counties_list, columns=['county', 'state', 'lats', 'lons'])
states_gps = pd.DataFrame(states_list, columns=['state', 'lats', 'lons'])

In [4]:
def normalizecounty(county):
    '''
    Return a normalized county name.
    Different data sources seem to differ
    '''
    return county\
        .replace('County', '')\
        .replace('Parish', '')\
        .replace('City', '')\
        .replace('Borough', '')\
        .replace('.', '')\
        .replace(' ', '')\
        .lower()

pd.Series(['St. Lucie', 'Jefferson Parish', 'Anchorage Borough', 'King County']).apply(normalizecounty)

0      stlucie
1    jefferson
2    anchorage
3         king
dtype: object

In [5]:
# We have donations that have county names that don't match up with one of the sources.
missing = donations[
    donations.county.isin
    (set(donations.county.unique())\
          .difference(counties_gps.county.unique()))].county

# The normalizecounty function should fix the issue
missingafternormalization = donations[
    donations.county.isin
    (set(donations.county.apply(normalizecounty).unique())\
          .difference(counties_gps.county.apply(normalizecounty).unique()))].county

len(missing), len(missingafternormalization)

(2663, 0)

In [6]:
counties_gps['county_norm'] = counties_gps.county.apply(normalizecounty)
donations['county_norm'] = donations.county.apply(normalizecounty)

### Read in the Indian populations for each county

In [7]:
states_gps.head()

Unnamed: 0,state,lats,lons
0,WA,"[46.29443, 46.26451, 46.31405, 46.34919, 46.38...","[-124.03622, -124.16101, -124.15117, -124.1378..."
1,DE,"[39.63895, 39.68084, 39.72204, 39.72221, 39.72...","[-75.7878, -75.78909, -75.78861, -75.78861, -7..."
2,DC,"[38.97872, 38.98378, 38.96493, 38.95822, 38.95...","[-77.06276, -77.02561, -77.00141, -76.99288, -..."
3,WI,"[42.49273, 42.49433, 42.49562, 42.49561, 42.49...","[-87.8156, -87.93137, -88.10268, -88.20645, -8..."
4,WV,"[40.18683, 40.39711, 40.54795, 40.61628, 40.63...","[-80.67905, -80.62345, -80.64068, -80.57018, -..."


In [8]:
pop2014 = pd.read_csv('in/IndianPopulation_2010_2014.csv', names=['county_state', 'population'], skiprows=1)
pop2010 = pd.read_csv('in/IndianPopulation_2006_2010.csv', names=['county_state', 'population'], skiprows=1)
states = pd.read_csv('in/state_table.csv')

pop2010['county'] = pop2010.county_state.str.extract('([a-z|A-Z|\'|\s]+), ([a-z|A-Z|\s]+)')[0]
pop2014['county'] = pop2014.county_state.str.extract('([a-z|A-Z|\'|\s]+), ([a-z|A-Z|\s]+)')[0]
pop2010['state'] =  pop2010.county_state.str.extract('([a-z|A-Z|\'|\s]+), ([a-z|A-Z|\s]+)')[1]
pop2014['state'] =  pop2014.county_state.str.extract('([a-z|A-Z|\'|\s]+), ([a-z|A-Z|\s]+)')[1]

pop2010 = pop2010.rename(columns={'population': 2008}).drop('county_state', axis=1)
pop2014 = pop2014.rename(columns={'population': 2012}).drop('county_state', axis=1)
pop2010['county_norm'] = pop2010.county.apply(normalizecounty)
pop2014['county_norm'] = pop2014.county.apply(normalizecounty)
data = pop2010.merge(pop2014, how='outer', on=['county_norm', 'state'])[['county_norm', 'state', 2008, 2012]]

In [9]:
data.head()

Unnamed: 0,county_norm,state,2008,2012
0,autauga,Alabama,37,0
1,baldwin,Alabama,87,97
2,barbour,Alabama,10,42
3,bibb,Alabama,12,0
4,blount,Alabama,77,0


In [10]:
data = data\
    .merge(states, how='left', left_on=['state'], right_on=['name'])[np.append(data.columns.values, 'abbreviation')]\
    .drop('state', axis=1)\
    .rename(columns={'abbreviation': 'state'})\
    .fillna(0)

In [11]:
for yr in range(2008, 2016):
    if yr not in data.columns:
        data[yr] = np.nan

In [12]:
# Re-index and sort the column names so we can interpolate
data = data.set_index(['county_norm','state'])
data = data[np.sort(data.columns.values)]

In [13]:
data = data.T.interpolate(method='linear', limit_direction='backward').T.astype('int')

In [14]:
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,2008,2009,2010,2011,2012,2013,2014,2015
county_norm,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
autauga,AL,37,27,18,9,0,0,0,0
baldwin,AL,87,89,92,94,97,97,97,97
barbour,AL,10,18,26,34,42,42,42,42
bibb,AL,12,9,6,3,0,0,0,0
blount,AL,77,57,38,19,0,0,0,0


In [17]:
!mkdir -p in
!mkdir -p out/11

In [18]:
# Finally save the data to a pickle file
states_gps.to_pickle('out/11/states_gps.pkl')
counties_gps.to_pickle('out/11/counties_gps.pkl')
data.to_pickle('out/11/counties_population.pkl')
donations.to_pickle('out/11/donations.pkl')