# Cartogram Prep

Normalizing the State-tweet-counts by state population.

#### Llibraries

In [1]:
%run utilities.py

#### State Populations

In [2]:
census_dtypes= {'STATE': 'str'}
census = pd.read_csv('https://www2.census.gov/programs-surveys/popest/datasets/2010-2016/national/totals/nst-est2016-alldata.csv',
    dtype=census_dtypes)
    
census.head(2)

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,NAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,...,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RDOMESTICMIG2016,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016
0,10,0,0,0,United States,308745538,308758105,309348193,311663358,313998379,...,0.0,0.0,0.0,0.0,2.750744,2.875599,2.773619,3.080812,3.242191,3.102874
1,20,1,0,0,Northeast Region,55317240,55318353,55388056,55632766,55829059,...,-3.894853,-5.164953,-5.949518,-6.432749,1.419121,0.406543,0.34744,-0.452752,-1.054607,-1.622592


In [3]:
census = census[['STATE', 'NAME', 'POPESTIMATE2016']]
census.head(2)

Unnamed: 0,STATE,NAME,POPESTIMATE2016
0,0,United States,323127513
1,0,Northeast Region,56209510


#### State Abbreviation - State Crosswalk

In [4]:
census_cols = ['abbrv', 'STATE', 'STATEFP', 'COUNTYFP', 'COUNTYNAME']
xwalk_dtypes = {'STATE': 'str', 'STATEFP':'str'}
cnty_st_xwalk = pd.read_csv('https://www2.census.gov/geo/docs/reference/codes/files/national_county.txt',
                          names=census_cols,
                          dtype=xwalk_dtypes)
cnty_st_xwalk.head()

Unnamed: 0,abbrv,STATE,STATEFP,COUNTYFP,COUNTYNAME
0,AL,1,1,Autauga County,H1
1,AL,1,3,Baldwin County,H1
2,AL,1,5,Barbour County,H1
3,AL,1,7,Bibb County,H1
4,AL,1,9,Blount County,H1


In [5]:
## get just the unique ST:FIPS pairings
# groupby to get pairing
cnty_st_xwalk = cnty_st_xwalk[['abbrv', 'STATE', 'COUNTYFP']].groupby(['abbrv', 'STATE']).count()

# reset index to add the abbreviations as columns
cnty_st_xwalk.reset_index(inplace=True)

# Drop the counts
cnty_st_xwalk = cnty_st_xwalk[['abbrv', 'STATE']]

# reindex using the abbreviation
cnty_st_xwalk.set_index('abbrv', inplace=True)

# check the results
cnty_st_xwalk.tail(2)

Unnamed: 0_level_0,STATE
abbrv,Unnamed: 1_level_1
WV,54
WY,56


#### Normalization DataFrame

Create a merged dataframe from the above to use as our ground truth.

In [6]:
temp = cnty_st_xwalk.reset_index()
merged = census.join(temp, lsuffix='02')

merged.tail(2)

Unnamed: 0,STATE02,NAME,POPESTIMATE2016,abbrv,STATE
55,56,Wyoming,585501,WV,54
56,72,Puerto Rico,3411307,WY,56


In [7]:
# clean that up
merged = merged[['NAME', 'POPESTIMATE2016', 'abbrv', 'STATE']]
merged.rename(columns={
        'NAME': 'name', 
        'POPESTIMATE2016': 'pop', 
        'STATE': 'fips'}, inplace=True)

merged.set_index('abbrv', inplace=True)
merged.head()

Unnamed: 0_level_0,name,pop,fips
abbrv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,United States,323127513,2
AL,Northeast Region,56209510,1
AR,Midwest Region,67941429,5
AS,South Region,122319574,60
AZ,West Region,76657000,4


In [8]:
# quick lookup function
def get_pop_from_abbrv(idx, attr):
    try:
        val = merged.loc[idx][attr]
        return val
    except:
        print (idx + ' didnt work!')
        return

#### State Tweet Counts

In [9]:
cnts_f = name_file_path('state-tweet-counts.csv', processed_web_dir)
cnts = pd.read_csv(cnts_f)

cnts.head(1)

Unnamed: 0,st,week,cnt
0,AL,-2.0,204


In [10]:
# add state population
cnts['pop'] = cnts['st'].map(lambda x: get_pop_from_abbrv(x, 'pop'))

# Calculate the tweet rate
cnts['rate'] = cnts['cnt'] / cnts['pop']

# slim down this columns
rates = cnts[['st', 'week', 'cnt', 'rate']]

rates.head()

Unnamed: 0,st,week,cnt,rate
0,AL,-2.0,204,4e-06
1,AL,-1.0,2103,3.7e-05
2,AL,1.0,49415,0.000879
3,AL,2.0,23908,0.000425
4,AL,4.0,6939,0.000123


In [11]:
# write it out
rates_f = name_file_path('state-tweet-rates.csv', processed_web_dir)
rates.to_csv(rates_f, index=False)