In [16]:
import matplotlib

In [17]:
%matplotlib inline

In [18]:
#  import useful classes of pandas
import pandas as pd
from pandas import Series, DataFrame, Index

In [19]:
import settings
import census
import requests

c = census.Census(key=settings.CENSUS_KEY)

In [20]:
census.__version__

'0.7.1'

In [21]:
c.sf1.fields

<bound method SF1Client.fields of <census.core.SF1Client object at 0x10c1544a8>>

# Fields in SF1

In [69]:
sf1_fields = c.sf1.fields(year=2010)

In [23]:
sorted(sf1_fields.keys())

[]

Let's parse more of pieces that are in the fields

In [71]:
# let's just parse sf1.xml ourselves to get the concepts
# http://lxml.de/parsing.html

from lxml import etree
from itertools import islice
import re

def parse_concept_name(concept_name):
    if concept_name != 'Geographic Characteristics':
        m = re.search("(.+\.)\s+(.*)\s+(\[\d+\])",concept_name)
        if m: 
           return {'label':m.group(1),
                  'clean_name':m.group(2),
                  'num_vars':re.search("\[(\d+)\]", m.group(3)).group(1)
                  }
       # print m.groups()
        else:
           m1 = re.search("(.+\.)\s+(.*)\s+",concept_name)
           return {'label':m1.group(1),
                  'clean_name':m1.group(2),
                  'num_vars':0
                 }
    else:
        return None
    
def concepts_2010_sf1():
    # http://www.census.gov/developers/data/sf1.xml
    SF1_XML_PATH  = "/Users/raymondyee/D/Document/Working_with_Open_Data/working-open-data-2014/data/sf1.xml"

    doc = etree.parse(SF1_XML_PATH)
    for concept in doc.findall("//concept"):
        concept_name = concept.attrib['name']
        
        if concept_name != 'Geographic Characteristics':
            m = re.search("(.+\.)\s+(.*)\s+(\[\d+\])",concept_name)
            if m: 
               yield {'label':m.group(1),
                      'clean_name':m.group(2),
                      'num_vars':re.search("\[(\d+)\]", m.group(3)).group(1)
                      }
           # print m.groups()
            else:
               m1 = re.search("(.+\.)\s+(.*)\s+",concept_name)
               yield {'label':m1.group(1),
                      'clean_name':m1.group(2),
                      'num_vars':0
                     }
            
k = list(concepts_2010_sf1())    
k

[{'clean_name': 'Housing Units', 'label': 'H1.', 'num_vars': '1'},
 {'clean_name': 'TOTAL POPULATION IN OCCUPIED HOUSING UNITS',
  'label': 'H10.',
  'num_vars': '1'},
 {'clean_name': 'TOTAL POPULATION IN OCCUPIED HOUSING UNITS BY TENURE',
  'label': 'H11.',
  'num_vars': '4'},
 {'clean_name': 'TOTAL POPULATION IN OCCUPIED HOUSING UNITS BY TENURE (WHITE ALONE HOUSEHOLDER)',
  'label': 'H11A.',
  'num_vars': '4'},
 {'clean_name': 'TOTAL POPULATION IN OCCUPIED HOUSING UNITS BY TENURE (BLACK OR AFRICAN AMERICAN ALONE HOUSEHOLDER)',
  'label': 'H11B.',
  'num_vars': '4'},
 {'clean_name': 'TOTAL POPULATION IN OCCUPIED HOUSING UNITS BY TENURE (AMERICAN INDIAN AND ALASKA NATIVE ALONE HOUSEHOLDER)',
  'label': 'H11C.',
  'num_vars': '4'},
 {'clean_name': 'TOTAL POPULATION IN OCCUPIED HOUSING UNITS BY TENURE (ASIAN ALONE HOUSEHOLDER)',
  'label': 'H11D.',
  'num_vars': '4'},
 {'clean_name': 'TOTAL POPULATION IN OCCUPIED HOUSING UNITS BY TENURE (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE H

In [25]:
df = DataFrame(k, columns=('label','clean_name','num_vars'))
df.head()

Unnamed: 0,label,clean_name,num_vars
0,H1.,Housing Units,1
1,H10.,TOTAL POPULATION IN OCCUPIED HOUSING UNITS,1
2,H11.,TOTAL POPULATION IN OCCUPIED HOUSING UNITS BY ...,4
3,H11A.,TOTAL POPULATION IN OCCUPIED HOUSING UNITS BY ...,4
4,H11B.,TOTAL POPULATION IN OCCUPIED HOUSING UNITS BY ...,4


In [26]:
import re

def sort_label(label):
    (l1, l2, l3) = re.search("([A-Z,a-z]+)(\d+)([A-Z,a-z]*)\.",label).groups()
    return l1 + " " + "{l2:03d}".format(l2=int(l2)) + l3

df['sort_label'] = df.label.apply(sort_label)

In [27]:
df[df.label.str.startswith("P5")]

Unnamed: 0,label,clean_name,num_vars,sort_label
232,P5.,HISPANIC OR LATINO ORIGIN BY RACE,17,P 005
233,P50.,ALLOCATION OF RELATIONSHIP,3,P 050
234,P51.,ALLOCATION OF POPULATION ITEMS FOR THE POPULAT...,3,P 051


In [28]:
# let's go right for the variables and generate a dict, DF

from lxml import etree
from itertools import islice
from collections import OrderedDict

SF1_XML_PATH  = "/Users/raymondyee/D/Document/Working_with_Open_Data/working-open-data-2014/data/sf1.xml"

doc = etree.parse(SF1_XML_PATH)
variables = doc.findall("//variable")

variables_dict = OrderedDict([(v.attrib['name'], 
                               {'concept':v.attrib['concept'],
                                'text': v.text
                                }) for v in variables])



In [29]:
variables_dict['P0050001']

{'concept': 'P5. HISPANIC OR LATINO ORIGIN BY RACE [17]',
 'text': ' Total population '}

In [30]:
def P005_range(n0,n1): 
    return tuple(('P005'+ "{i:04d}".format(i=i) for i in range(n0,n1)))

P005_vars = P005_range(1,18)
P005_vars_str = ",".join(P005_vars)

[(v,variables_dict[v]['text']) for v in P005_vars]

[('P0050001', ' Total population '),
 ('P0050002', ' Not Hispanic or Latino: '),
 ('P0050003', ' Not Hispanic or Latino: !! White alone '),
 ('P0050004', ' Not Hispanic or Latino: !! Black or African American alone '),
 ('P0050005',
  ' Not Hispanic or Latino: !! American Indian and Alaska Native alone '),
 ('P0050006', ' Not Hispanic or Latino: !! Asian alone '),
 ('P0050007',
  ' Not Hispanic or Latino: !! Native Hawaiian and Other Pacific Islander alone '),
 ('P0050008', ' Not Hispanic or Latino: !! Some Other Race alone '),
 ('P0050009', ' Not Hispanic or Latino: !! Two or More Races '),
 ('P0050010', ' Hispanic or Latino: '),
 ('P0050011', ' Hispanic or Latino: !! White alone '),
 ('P0050012', ' Hispanic or Latino: !! Black or African American alone '),
 ('P0050013',
  ' Hispanic or Latino: !! American Indian and Alaska Native alone '),
 ('P0050014', ' Hispanic or Latino: !! Asian alone '),
 ('P0050015',
  ' Hispanic or Latino: !! Native Hawaiian and Other Pacific Islander alone '

In [31]:
variables_df = DataFrame(variables_dict)
variables_df.head()

Unnamed: 0,AIANHHCC,AIANHHFP,AIHHTLI,AITS,AITSCC,AITSCE,ANRC,CBSAPCI,CD113,CNECTA,...,PCT0090005,PCT0090006,PCT0090007,PCT0090008,PCT0090009,PCT0090010,PCT0090011,PCT0090012,PCT0090013,PCT0090014
concept,Geographic Characteristics,Geographic Characteristics,Geographic Characteristics,Geographic Characteristics,Geographic Characteristics,Geographic Characteristics,Geographic Characteristics,Geographic Characteristics,Geographic Characteristics,Geographic Characteristics,...,PCT9. Native Hawaiian And Other Pacific Island...,PCT9. Native Hawaiian And Other Pacific Island...,PCT9. Native Hawaiian And Other Pacific Island...,PCT9. Native Hawaiian And Other Pacific Island...,PCT9. Native Hawaiian And Other Pacific Island...,PCT9. Native Hawaiian And Other Pacific Island...,PCT9. Native Hawaiian And Other Pacific Island...,PCT9. Native Hawaiian And Other Pacific Island...,PCT9. Native Hawaiian And Other Pacific Island...,PCT9. Native Hawaiian And Other Pacific Island...
text,FIPS American Indian Area/Alaska Native Area/...,American Indian Area/Alaska Native Area/Hawai...,American Indian Trust Land/Hawaiian Home Land...,American Indian Tribal Subdivision (FIPS),FIPS American Indian Tribal Subdivision Class...,American Indian Tribal Subdivision (Census),Alaska Native Regional Corporation (FIPS),Metropolitan Statistical Area/Micropolitan St...,Congressional District - 113th Congress,New England City and Town Area,...,Polynesian (500-519) & (500-599): !! Tongan (...,Polynesian (500-519) & (500-599): !! Other Po...,"Micronesian (520-529, 531-541) & (500-599):","Micronesian (520-529, 531-541) & (500-599): !...","Micronesian (520-529, 531-541) & (500-599): !...","Micronesian (520-529, 531-541) & (500-599): !...",Melanesian (542-546) & (500-599):,Melanesian (542-546) & (500-599): !! Fijian (...,Melanesian (542-546) & (500-599): !! Other Me...,"Other Pacific Islander, not specified (530, 5..."


In [32]:
variables_df.T.concept.apply(parse_concept_name)

AIANHHCC                                                   None
AIANHHFP                                                   None
AIHHTLI                                                    None
AITS                                                       None
AITSCC                                                     None
AITSCE                                                     None
ANRC                                                       None
CBSAPCI                                                    None
CD113                                                      None
CNECTA                                                     None
CONCITSC                                                   None
GEOCOMP                                                    None
LOGRECNO                                                   None
MEMI                                                       None
METDIV                                                     None
NAME                                    

In [33]:
parse_concept_name(variables_dict['P0050001']['concept'])

{'clean_name': 'HISPANIC OR LATINO ORIGIN BY RACE',
 'label': 'P5.',
 'num_vars': '17'}

# api.json

In [34]:
# http://www.census.gov/developers/

import requests
url = "http://api.census.gov/data.json"
api_json = requests.get(url).json()
api_json

{'@context': 'https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld',
 '@id': 'http://api.census.gov/data/2010/acs5.json',
 '@type': 'dcat:Catalog',
 'conformsTo': 'https://project-open-data.cio.gov/v1.1/schema',
 'dataset': [{'@type': 'dcat:Dataset',
   'accessLevel': 'public',
   'bureauCode': ['006:07'],
   'c_dataset': ['acs5'],
   'c_documentationLink': 'http://www.census.gov/developers/',
   'c_geographyLink': 'http://api.census.gov/data/2010/acs5/geography.json',
   'c_isAggregate': True,
   'c_isAvailable': False,
   'c_isReleased': False,
   'c_tagsLink': 'http://api.census.gov/data/2010/acs5/tags.json',
   'c_unavailableMessage': 'This dataset is currently unavailable.',
   'c_variablesLink': 'http://api.census.gov/data/2010/acs5/variables.json',
   'c_vintage': 2010,
   'contactPoint': {'fn': 'Census Bureau Call Center',
    'hasEmail': 'pio@census.gov'},
   'description': "The American Community Survey (ACS) is a nationwide survey designed to provide communities a fr

In [35]:
len(api_json)

6

In [41]:
api_json.keys()

dict_keys(['@context', '@type', '@id', 'conformsTo', 'dataset', 'describedBy'])

In [43]:
len(api_json.get('dataset'))

95

In [46]:
df = DataFrame(api_json.get('dataset'))
df.columns

Index(['@type', 'accessLevel', 'bureauCode', 'c_dataset',
       'c_documentationLink', 'c_examplesLink', 'c_geographyLink',
       'c_isAggregate', 'c_isAvailable', 'c_isCube', 'c_isReleased',
       'c_isTimeseries', 'c_tagsLink', 'c_unavailableMessage',
       'c_variablesLink', 'c_vintage', 'contactPoint', 'description',
       'distribution', 'identifier', 'keyword', 'license', 'modified',
       'programCode', 'publisher', 'references', 'spatial', 'temporal',
       'title'],
      dtype='object')

In [53]:
# don't know why there's no 2010 census
df[df.title.apply(lambda s:'census' in s.lower())][['title']]

Unnamed: 0,title
2,2007 Economic Census - All Sectors: Economy-Wi...
14,2012 Economic Census - All Sectors: Economy-Wi...
20,1990 Decennial Census of Population and Housin...
25,2015 Census Planning Database: Block Group
87,2015 Census Planning Database: Tract Level
92,2002 Economic Census - All Sectors: Economy-Wi...


In [60]:
df[['c_vintage', 'title']]

Unnamed: 0,c_vintage,title
0,2010,2010 American Community Survey: 5-Year Estimates
1,2014,Vintage 2014 Population Estimates: County Popu...
2,2007,2007 Economic Census - All Sectors: Economy-Wi...
3,2013,2009-2013 American Community Survey 5-Year Est...
4,2013,2011-2013 American Community Survey - Summariz...
5,,Time Series International Database: Internatio...
6,2014,Vintage 2014 Population Estimates: State Popul...
7,2012,2012 Nonemployer Statistics: Nonemployer Stati...
8,,Economic Indicators Time Series - Manufacturer...
9,2011,2011 Nonemployer Statistics: Non Employer Stat...


In [62]:
# good way to see list of datasets
sorted(list(df['title'],))

['1990 Decennial Census of Population and Housing - Summary File 3: Summary File 3',
 '1990 Decennial: Summary File 1',
 '2000 Decennial: Summary File 1',
 '2000 Decennial: Summary File 3',
 '2002 Economic Census - All Sectors: Economy-Wide Key Statistics',
 '2007 Economic Census - All Sectors: Economy-Wide Key Statistics',
 '2008 County Business Patterns: Business Patterns',
 '2008 Nonemployer Statistics: Non Employer Statistics',
 '2008-2012 American Community Survey - Summarized Data: 5-Year Summary File',
 '2009 County Business Patterns: Business Patterns',
 '2009 Nonemployer Statistics: Non Employer Statistics',
 '2009-2013 American Community Survey 5-Year Estimates',
 '2009-2013 American Community Survey 5-Year Profiles',
 '2010 American Community Survey: 5-Year Estimates',
 '2010 County Business Patterns: Business Patterns',
 '2010 Decennial: Summary File 1',
 '2010 Nonemployer Statistics: Non Employer Statistics',
 '2011 American Community Survey 1-Year Profiles for the 113th C

# variables.json

In [63]:
import requests
url = "http://api.census.gov/data/2010/sf1/variables.json"
var_json = requests.get(url).json()
sorted(var_json['variables'].keys())

['AIANHH',
 'AIANHHCC',
 'AIANHHFP',
 'AIHHTLI',
 'AITS',
 'AITSCC',
 'AITSCE',
 'ANRC',
 'BLKGRP',
 'BLOCK',
 'CBSA',
 'CBSAPCI',
 'CD',
 'CD113',
 'CNECTA',
 'CONCIT',
 'CONCITSC',
 'COUNTY',
 'COUSUB',
 'CSA',
 'DIVISION',
 'GEOCOMP',
 'H00010001',
 'H0020001',
 'H0020002',
 'H0020003',
 'H0020004',
 'H0020005',
 'H0020006',
 'H0030001',
 'H0030002',
 'H0030003',
 'H0040001',
 'H0040002',
 'H0040003',
 'H0040004',
 'H0050001',
 'H0050002',
 'H0050003',
 'H0050004',
 'H0050005',
 'H0050006',
 'H0050007',
 'H0050008',
 'H0060001',
 'H0060002',
 'H0060003',
 'H0060004',
 'H0060005',
 'H0060006',
 'H0060007',
 'H0060008',
 'H0070001',
 'H0070002',
 'H0070003',
 'H0070004',
 'H0070005',
 'H0070006',
 'H0070007',
 'H0070008',
 'H0070009',
 'H0070010',
 'H0070011',
 'H0070012',
 'H0070013',
 'H0070014',
 'H0070015',
 'H0070016',
 'H0070017',
 'H0080001',
 'H0080002',
 'H0080003',
 'H0080004',
 'H0080005',
 'H0080006',
 'H0080007',
 'H0090001',
 'H0090002',
 'H0090003',
 'H0090004',
 'H0090

In [64]:
var_json['variables']['P0050002']

{'concept': 'P5. HISPANIC OR LATINO ORIGIN BY RACE [17]',
 'label': 'Not Hispanic or Latino:'}

In [65]:
from pandas import DataFrame
DataFrame(var_json['variables']).T

Unnamed: 0,concept,label,predicateOnly,predicateType
AIANHH,Geographic Summary Level,GEO PLACE HOLDER,,
AIANHHCC,Geographic Characteristics,GEO PLACE HOLDER,,
AIANHHFP,Geographic Characteristics,GEO PLACE HOLDER,,
AIHHTLI,Geographic Characteristics,GEO PLACE HOLDER,,
AITS,Geographic Characteristics,GEO PLACE HOLDER,,
AITSCC,Geographic Characteristics,GEO PLACE HOLDER,,
AITSCE,Geographic Characteristics,GEO PLACE HOLDER,,
ANRC,Geographic Characteristics,GEO PLACE HOLDER,,
BLKGRP,Geographic Summary Level,GEO PLACE HOLDER,,
BLOCK,,GEO PLACE HOLDER,,


# Plotting Age Distribution By Gender (Population Pyramid)

This example written by AJ Renold.  (and rewritten by R. Yee to adapt to changes in the census API.)

In [68]:
sf1_fields

{}

In [66]:
sf1_fields = c.sf1.fields(year=2010)

# Get the sf1 fields that are only P12 Sex By Age
gender_population_fields = sf1_fields.get('P12. Sex By Age [49]')

# Separate the by male and female
male_fields = { key: val for key, val in gender_population_fields.items() 
                                 if 'Male' in val and val != ' Male: ' }
female_fields = { key: val for key, val in gender_population_fields.items() 
                                   if 'Female' in val and val != ' Female: '}

AttributeError: 'NoneType' object has no attribute 'items'

In [None]:
# Query the census API with the gender_population_fields
query_results = c.sf1.get(('NAME', ','.join(gender_population_fields.keys())), geo={'for': 'state:*'})

# Create a DataFrame
gender_df = pd.DataFrame(query_results)

In [None]:
# Set the Index to the NAME column
gender_df = gender_df.set_index(gender_df['NAME'])

In [None]:
# Recast all numeric columns to be type int
for col in gender_df.columns:
    if col != "state" and col != "NAME":
        gender_df[col] = gender_df[col].astype(int)

In [None]:
from numpy import arange

def showPopulationPyramidPlot(df, state, male_fields, female_fields):
    
    # create a series with the row of the state
    s = Series(df.ix[state])
    #del s['NAME']
    #del s['state']
    
    # get the plot values and labels from the series
    male_list = sorted([ [key, s[key]] for key in s.keys() if key in male_fields ])
    female_list = sorted([ [key, s[key]] for key in s.keys() if key in female_fields ]) 
    
    # calculate the bar locations and the maximum value
    bar_ypos = arange(len(male_list))+.5
    max_val = max([ val for label, val in male_list + female_list ])
    
    # create the figures for the plots
    fig, (ax2, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(18,8))
    fig.suptitle('Population Age Pyramid for {state}'.format(state=state), fontsize=14)
    
    # plot the male populations
    bar1 = ax1.barh(bar_ypos, [ val for label, val in male_list ], align='center')
    ax1.set_xlim((0,max_val))
    ax1.set_yticks(bar_ypos)
    ax1.set_yticklabels([ male_fields[label][male_fields[label].find('!!')+3:] for label, val in male_list ])
    ax1.set_xlabel('People')
    ax1.set_title('Male Population by Age')
    ax1.grid(True)
    
    # plot the the female populations
    bar2 = ax2.barh(bar_ypos,[ val for label, val in female_list ], align='center', color='red')
    ax2.set_yticks([])
    #ax2.yaxis.tick_right()
    ax2.set_xlim(ax1.get_xlim()[::-1]) # reverses the x axis direction
    ax2.set_xlabel('People')
    ax2.set_title('Female Population by Age')
    ax2.grid(True)
    
    plt.subplots_adjust(wspace=0.22, hspace=0.0)
    plt.show()


In [None]:
showPopulationPyramidPlot(gender_df, 'Illinois', male_fields, female_fields)