In [1]:
%pylab --no-import-all inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
#  import useful classes of pandas
import pandas as pd
from pandas import Series, DataFrame, Index

In [3]:
import settings
import census
import requests

# Build the Census API client; the key is read from the local `settings` module
# instead of being written into the notebook.
c = census.Census(key=settings.CENSUS_KEY)

In [17]:
# Version of the census package used for this run
census.__version__

'0.7'

In [4]:
# No call parentheses here: this only displays the bound method, it does not fetch any fields
c.sf1.fields

<bound method SF1Client.fields of <census.core.SF1Client object at 0x10994bd90>>

# Fields in SF1

In [5]:
# NOTE(review): the recorded run of this cell failed with
# "NameError: global name 'requests' is not defined", so `sf1_fields` was not
# assigned here; confirm the installed census package before relying on it.
sf1_fields = c.sf1.fields(year=2010)

NameError: global name 'requests' is not defined

In [None]:
# Keys of the fields mapping in sorted order (presumably concept names,
# judging by the later lookup of 'P12. Sex By Age [49]' - confirm)
sorted(sf1_fields.keys())

Let's parse out more of the pieces that are in the fields.

In [None]:
# let's just parse sf1.xml ourselves to get the concepts
# http://lxml.de/parsing.html

from lxml import etree
from itertools import islice
import re

def parse_concept_name(concept_name):
    """Split an SF1 concept name into its label, clean name and variable count.

    A concept name is expected to look like
    ``"P5. HISPANIC OR LATINO ORIGIN BY RACE [17]"``: a label ending in a
    period, the descriptive name, and optionally the number of variables in
    square brackets.

    Parameters
    ----------
    concept_name : str
        Raw concept name.

    Returns
    -------
    dict or None
        ``{'label': ..., 'clean_name': ..., 'num_vars': ...}``.  ``num_vars``
        is the bracketed count as a string of digits, or the integer ``0``
        when the name carries no ``[n]`` suffix.  ``None`` is returned for
        ``'Geographic Characteristics'`` and for any other name that does not
        have the ``"<label>. <name>"`` shape (for example
        ``'Geographic Summary Level'``).
    """
    if concept_name == 'Geographic Characteristics':
        return None

    # Full form: "<label>. <name> [<count>]"
    m = re.search(r"(.+\.)\s+(.*)\s+\[(\d+)\]", concept_name)
    if m:
        return {'label': m.group(1),
                'clean_name': m.group(2),
                'num_vars': m.group(3)}

    # No bracketed count: keep the whole name up to its last non-blank
    # character, so a name without trailing whitespace is neither truncated
    # nor rejected.
    m1 = re.search(r"(.+\.)\s+(.*\S)", concept_name)
    if m1 is None:
        # Not a "<label>. <name>" concept at all
        return None
    return {'label': m1.group(1),
            'clean_name': m1.group(2),
            'num_vars': 0}
    
def concepts_2010_sf1(sf1_xml_path=None):
    """Yield one parsed record for each ``<concept>`` element of the SF1 XML file.

    Parameters
    ----------
    sf1_xml_path : str, optional
        Path of the sf1.xml file to read.  Defaults to the local copy this
        notebook was written against
        (source: http://www.census.gov/developers/data/sf1.xml).

    Yields
    ------
    dict
        The result of ``parse_concept_name`` for the concept's ``name``
        attribute.  Concepts for which ``parse_concept_name`` returns ``None``
        (such as 'Geographic Characteristics') are skipped.
    """
    if sf1_xml_path is None:
        sf1_xml_path = "/Users/raymondyee/D/Document/Working_with_Open_Data/working-open-data-2014/data/sf1.xml"

    doc = etree.parse(sf1_xml_path)
    for concept in doc.findall("//concept"):
        # Reuse the shared parser instead of repeating its regular expressions here
        parsed = parse_concept_name(concept.attrib['name'])
        if parsed is not None:
            yield parsed
            
# Run the generator to completion and keep the parsed concept records in a list
k = list(concepts_2010_sf1())    


In [None]:
# One row per parsed concept record; `columns` fixes the column order
df = DataFrame(k, columns=('label','clean_name','num_vars'))
df.head()

In [None]:
import re

def sort_label(label):
    """Return a sortable key for a concept label such as ``"P5."``.

    The label is split into its leading letters, its number and any trailing
    letters.  The number is zero-padded to three digits so that plain string
    sorting puts ``"P5."`` before ``"P12."``; for example ``"PCT12A."``
    becomes ``"PCT 012A"``.

    Parameters
    ----------
    label : str
        Concept label of the form ``<letters><digits>[<letters>].``

    Returns
    -------
    str
        ``"<letters> <3-digit number><trailing letters>"``.

    Raises
    ------
    ValueError
        If ``label`` does not contain that pattern.
    """
    # Letters only: a comma inside a character class is a literal comma, not a separator
    m = re.search(r"([A-Za-z]+)(\d+)([A-Za-z]*)\.", label)
    if m is None:
        raise ValueError("unrecognised concept label: {0!r}".format(label))
    prefix, number, suffix = m.groups()
    return "{0} {1:03d}{2}".format(prefix, int(number), suffix)

# Adds the sortable key as a new column of `df` (modifies `df` in place)
df['sort_label'] = df.label.apply(sort_label)

In [None]:
# Prefix match: keeps every row whose label begins with "P5", so a label
# such as "P50." would be selected as well as "P5."
df[df.label.str.startswith("P5")]

In [None]:
# let's go right for the variables and generate a dict, DF

from lxml import etree
from itertools import islice
from collections import OrderedDict

# Local copy of the SF1 variable definitions
SF1_XML_PATH = "/Users/raymondyee/D/Document/Working_with_Open_Data/working-open-data-2014/data/sf1.xml"

# Parse the file and collect every <variable> element
doc = etree.parse(SF1_XML_PATH)
variables = doc.findall("//variable")

# Map each variable name to its concept and text, in the order the elements were found
variables_dict = OrderedDict()
for variable in variables:
    variables_dict[variable.attrib['name']] = {
        'concept': variable.attrib['concept'],
        'text': variable.text,
    }



In [None]:
# Look up a single variable by name
variables_dict['P0050001']

In [None]:
def P005_range(n0,n1):
    """Return the variable names ``P005nnnn`` for every ``n`` with ``n0 <= n < n1``.

    Parameters
    ----------
    n0 : int
        First variable number (inclusive).
    n1 : int
        End of the range (exclusive).

    Returns
    -------
    tuple of str
        Names such as ``'P0050001'``, with the number zero-padded to four digits.
    """
    # range() exists on both Python 2 and 3; xrange() is Python 2 only
    return tuple("P005{0:04d}".format(i) for i in range(n0, n1))

# Names P0050001 through P0050017 (the upper bound is exclusive)
P005_vars = P005_range(1,18)
# The same names as one comma-separated string
P005_vars_str = ",".join(P005_vars)

# Pair every variable name with its descriptive text
[(v,variables_dict[v]['text']) for v in P005_vars]

In [None]:
# A dict of dicts gives a frame with one column per variable name and the
# inner keys ('concept', 'text') as the rows
variables_df = DataFrame(variables_dict)
variables_df.head()

In [None]:
# Transpose so that each variable is a row, then parse every concept string
variables_df.T.concept.apply(parse_concept_name)

In [None]:
# Spot check: parse the concept string of one variable
parse_concept_name(variables_dict['P0050001']['concept'])

# api.json

In [6]:
# http://www.census.gov/developers/

import requests
# Fetch the API's data.json document and decode the JSON body
url = "http://api.census.gov/data.json"
api_json = requests.get(url).json()
# NOTE(review): this displays the whole response; the recorded output is a
# long list of dataset records, so a slice would be easier to read
api_json

[{u'accessLevel': u'public',
  u'bureauCode': [u'006:07'],
  u'c_dataset': [u'acs5'],
  u'c_documentationLink': u'http://www.census.gov/developers/',
  u'c_geographyLink': u'http://api.census.gov/data/2010/acs5/geography.json',
  u'c_isAggregate': True,
  u'c_tagsLink': u'http://api.census.gov/data/2010/acs5/tags.json',
  u'c_variablesLink': u'http://api.census.gov/data/2010/acs5/variables.json',
  u'c_vintage': 2010,
  u'contactPoint': u'Census Bureau Call Center',
  u'description': u"The American Community Survey (ACS) is a nationwide survey designed to provide communities a fresh look at how they are changing. The ACS replaced the decennial census long form in 2010 and thereafter by collecting long form type information throughout the decade rather than only once every 10 years.  Questionnaires are mailed to a sample of addresses to obtain information about households -- that is, about each person and the housing unit itself.  The American Community Survey produces demographic, soci

In [7]:
# Number of records in the response
len(api_json)

76

In [9]:
# Keys present on the first record
api_json[0].keys()

[u'publisher',
 u'accessLevel',
 u'c_dataset',
 u'webService',
 u'identifier',
 u'title',
 u'temporal',
 u'spatial',
 u'c_variablesLink',
 u'c_documentationLink',
 u'c_isAggregate',
 u'bureauCode',
 u'references',
 u'c_geographyLink',
 u'contactPoint',
 u'c_tagsLink',
 u'mbox',
 u'c_vintage',
 u'description']

In [12]:
# (vintage, dataset path parts, variables URL) for every record;
# .get() yields None where a record has no such key
[(api.get('c_vintage'), api.get('c_dataset'), api.get('c_variablesLink')) for api in api_json]

[(2010, [u'acs5'], u'http://api.census.gov/data/2010/acs5/variables.json'),
 (2007, [u'ewks'], u'http://api.census.gov/data/2007/ewks/variables.json'),
 (2013, [u'acs5'], u'http://api.census.gov/data/2013/acs5/variables.json'),
 (2013, [u'acs3'], u'http://api.census.gov/data/2013/acs3/variables.json'),
 (None,
  [u'eits', u'm3'],
  u'http://api.census.gov/data/eits/m3/variables.json'),
 (2011, [u'nonemp'], u'http://api.census.gov/data/2011/nonemp/variables.json'),
 (1990, [u'sf1'], u'http://api.census.gov/data/1990/sf1/variables.json'),
 (2008, [u'cbp'], u'http://api.census.gov/data/2008/cbp/variables.json'),
 (2013,
  [u'pep', u'stchar6'],
  u'http://api.census.gov/data/2013/pep/stchar6/variables.json'),
 (2012, [u'ewks'], u'http://api.census.gov/data/2012/ewks/variables.json'),
 (None,
  [u'bds', u'firms'],
  u'http://api.census.gov/data/bds/firms/variables.json'),
 (2013,
  [u'pep', u'housing'],
  u'http://api.census.gov/data/2013/pep/housing/variables.json'),
 (None,
  [u'eits', u'

# variables.json

In [13]:
import requests
# Variable definitions of the 2010 SF1 dataset
url = "http://api.census.gov/data/2010/sf1/variables.json"
var_json = requests.get(url).json()
# The variable names are the keys of the 'variables' mapping
sorted(var_json['variables'].keys())

[u'AIANHH',
 u'AIANHHCC',
 u'AIANHHFP',
 u'AIHHTLI',
 u'AITS',
 u'AITSCC',
 u'AITSCE',
 u'ANRC',
 u'BLKGRP',
 u'BLOCK',
 u'CBSA',
 u'CBSAPCI',
 u'CD',
 u'CD113',
 u'CNECTA',
 u'CONCIT',
 u'CONCITSC',
 u'COUNTY',
 u'COUSUB',
 u'CSA',
 u'DIVISION',
 u'GEOCOMP',
 u'H00010001',
 u'H0020001',
 u'H0020002',
 u'H0020003',
 u'H0020004',
 u'H0020005',
 u'H0020006',
 u'H0030001',
 u'H0030002',
 u'H0030003',
 u'H0040001',
 u'H0040002',
 u'H0040003',
 u'H0040004',
 u'H0050001',
 u'H0050002',
 u'H0050003',
 u'H0050004',
 u'H0050005',
 u'H0050006',
 u'H0050007',
 u'H0050008',
 u'H0060001',
 u'H0060002',
 u'H0060003',
 u'H0060004',
 u'H0060005',
 u'H0060006',
 u'H0060007',
 u'H0060008',
 u'H0070001',
 u'H0070002',
 u'H0070003',
 u'H0070004',
 u'H0070005',
 u'H0070006',
 u'H0070007',
 u'H0070008',
 u'H0070009',
 u'H0070010',
 u'H0070011',
 u'H0070012',
 u'H0070013',
 u'H0070014',
 u'H0070015',
 u'H0070016',
 u'H0070017',
 u'H0080001',
 u'H0080002',
 u'H0080003',
 u'H0080004',
 u'H0080005',
 u'H0080006

In [14]:
# Definition (concept and label) of a single variable
var_json['variables']['P0050002']

{u'concept': u'P5. HISPANIC OR LATINO ORIGIN BY RACE [17]',
 u'label': u'Not Hispanic or Latino:'}

In [15]:
from pandas import DataFrame
# Transpose so that each variable is a row and its attributes are the columns
DataFrame(var_json['variables']).T

Unnamed: 0,concept,label,predicateOnly,predicateType
AIANHH,Geographic Summary Level,GEO PLACE HOLDER,,
AIANHHCC,Geographic Characteristics,GEO PLACE HOLDER,,
AIANHHFP,Geographic Characteristics,GEO PLACE HOLDER,,
AIHHTLI,Geographic Characteristics,GEO PLACE HOLDER,,
AITS,Geographic Characteristics,GEO PLACE HOLDER,,
AITSCC,Geographic Characteristics,GEO PLACE HOLDER,,
AITSCE,Geographic Characteristics,GEO PLACE HOLDER,,
ANRC,Geographic Characteristics,GEO PLACE HOLDER,,
BLKGRP,Geographic Summary Level,GEO PLACE HOLDER,,
BLOCK,,GEO PLACE HOLDER,,


# Plotting Age Distribution By Gender (Population Pyramid)

This example was written by AJ Renold.

In [16]:
# NOTE(review): the recorded output of this cell is
# "NameError: global name 'requests' is not defined" (presumably raised by the
# fields() call, as in the earlier cell making the same call), so the names
# below were not defined in the recorded run.
sf1_fields = c.sf1.fields(year=2010)

# Keep only the fields of the "P12. Sex By Age" concept
gender_population_fields = sf1_fields.get('P12. Sex By Age [49]')

# Split them into male and female fields, leaving out the entries whose
# text is exactly ' Male: ' or ' Female: '
male_fields = { key: val for key, val in gender_population_fields.items() 
                                 if 'Male' in val and val != ' Male: ' }
female_fields = { key: val for key, val in gender_population_fields.items() 
                                   if 'Female' in val and val != ' Female: '}

NameError: global name 'requests' is not defined

In [None]:
# Query the census API with the gender_population_fields
query_results = c.sf1.get(('NAME', ','.join(gender_population_fields.keys())), geo={'for': 'state:*'})

# Create a DataFrame
gender_df = pd.DataFrame(query_results)

In [None]:
# Index the rows by the values of the NAME column. The column itself is kept,
# because set_index is given the Series rather than the column label.
gender_df = gender_df.set_index(gender_df['NAME'])

In [None]:
# Recast every column except the "state" and "NAME" identifiers to int
# (presumably the API returns the counts as strings - confirm)
for col in gender_df.columns:
    if col != "state" and col != "NAME":
        gender_df[col] = gender_df[col].astype(int)

In [None]:
from numpy import arange

def showPopulationPyramidPlot(df, state, male_fields, female_fields):
    """Plot a population pyramid for one state: female bars on the left, male on the right.

    Parameters
    ----------
    df : DataFrame
        Table whose row labels include ``state`` and whose columns include
        the field names used as keys of ``male_fields`` / ``female_fields``.
    state : str
        Row label to plot; it is also shown in the figure title.
    male_fields, female_fields : dict
        Field name -> descriptive label.  Membership selects the columns that
        are plotted; the values of ``male_fields`` also supply the tick labels.

    Notes
    -----
    The same bar positions are used for both panels, so the two field sets
    are assumed to select the same number of columns.  Relies on ``plt``
    (matplotlib.pyplot) being available in the notebook namespace.
    """
    # row of the requested state (.loc is the label-based replacement for the removed .ix)
    s = Series(df.loc[state])

    # [field, value] pairs for each sex, sorted by field name
    male_list = sorted([ [key, s[key]] for key in s.keys() if key in male_fields ])
    female_list = sorted([ [key, s[key]] for key in s.keys() if key in female_fields ])

    # bar locations and the largest value over both sexes
    bar_ypos = arange(len(male_list))+.5
    max_val = max([ val for label, val in male_list + female_list ])

    # two panels side by side: ax2 is the left one, ax1 the right one
    fig, (ax2, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(18,8))
    fig.suptitle('Population Age Pyramid for {state}'.format(state=state), fontsize=14)

    # plot the male populations
    ax1.barh(bar_ypos, [ val for label, val in male_list ], align='center')
    ax1.set_xlim((0,max_val))
    ax1.set_yticks(bar_ypos)
    # tick text is the field description after its '!!' separator
    # (the slice skips '!!' and the one character that follows it)
    ax1.set_yticklabels([ male_fields[label][male_fields[label].find('!!')+3:] for label, val in male_list ])
    ax1.set_xlabel('People')
    ax1.set_title('Male Population by Age')
    ax1.grid(True)

    # plot the female populations
    ax2.barh(bar_ypos,[ val for label, val in female_list ], align='center', color='red')
    ax2.set_yticks([])
    ax2.set_xlim(ax1.get_xlim()[::-1]) # reverses the x axis direction
    ax2.set_xlabel('People')
    ax2.set_title('Female Population by Age')
    ax2.grid(True)

    plt.subplots_adjust(wspace=0.22, hspace=0.0)
    plt.show()


In [None]:
# Draw the pyramid for the row labelled 'Illinois'
showPopulationPyramidPlot(gender_df, 'Illinois', male_fields, female_fields)