In [174]:
import requests
import pandas as pd
import numpy as np
from ast import literal_eval
import pickle

"""
Summary of tables and concepts available in 2015 ACS 5-year estimates. 
Also includes detailed variable descriptions, including data collection methods:

http://old.socialexplorer.com/pub/reportdata/metabrowser.aspx?survey=ACS2015_5yr&header=True
"""

API_KEY = 'c8d52047c115ae3d605397c2e2ab846560365abd'

def get_acs_url(variable_names):
    return (
        'http://api.census.gov/data/2015/acs5'
        '?get='
        '%s'
#       '&for=block+group:*&in=state:06+county:037&key='
        '&for=tract:*&in=state:06+county:037&key='
#        '&for=zip+code+tabulation+area:*&in=state:06+county:037&key='
        '%s'
        % (','.join(variable_names), API_KEY)
    )

def get_acs_data(variable_names):
    """
    Retrieves all values for variable_names for all block groups in Los Angeles County.
    Source: American Community Survey, 2015 5-year estimates
    @return Pandas dataframe
    """
    # Split variables into sublists of <= 50 vars to comply with API limits
    # Then, build and merge dataframes with each set of 50 and return the whole.
    var_chunks = [variables[x:x+50] for x in range(0, len(variables), 50)]
    for i, chunk in enumerate(var_chunks):
        url = get_acs_url(chunk)
        data = requests.get(url).json()
        
        if i == 0:
            df = pd.DataFrame(data[1:], columns=data[0])
        else:
            df = pd.merge(df, pd.DataFrame(data[1:], columns=data[0]))

    # Convert cols to numeric types
    non_numeric = ('NAME', 'state', 'county', 'tract', 'block group',
                   'zip code tabulation area')
    for col in df.columns:
        if col not in non_numeric:
            df[col] = pd.to_numeric(df[col])
    return df
    
def get_variable_metadata(variable_name):
    """
    Retrieves metadata associated with variable_name.
    @return dict
    """
    url = ('http://api.census.gov/data/2015/acs5/variables/%s.json' 
           % variable_name)
    return requests.get(url).json()

def table_variables(table_name, number_list):
    """
    Generates list of variables by concatenating table_name with each number in number_list.
    """
    return tuple(
        '%s_%03dE' % (table_name, num)
        for num in number_list
    )

In [164]:
# Variables for 2015 ACS 5-Year Estimates Table
variables = (
    ()
    + ('B01003_001E',) # total population
    + ('B01002_001E',) # median age
    + table_variables('B01001', range(1,50)) # age brackets by sex
    + table_variables('B15003', (1, 21, 22, 23, 24, 25)) # educational attainment (higher ed degrees only)
    + ('B19301_001E',) # per capita income
    + ('B19013_001E',) # median household income
    + table_variables('B19001', range(1,18)) # household income
    + ('B25001_001E',) # total housing units
    + ('B25077_001E',) # median gross rent
    + ('B25064_001E',) # median home value (owner-occupied)
    + table_variables('B25002', range(1,4)) # housing vacancy status
)

In [165]:
# Record metadata for each variable (concept, label, data type)
variable_info = {
    v: get_variable_metadata(v)
    for v in variables
}

# Quick helper function to find variable tags
def find_vars(search_key):
    return [
        variable_info[v]
        for v in variable_info.keys()
        if search_key.lower() in variable_info[v]['concept'].lower()
    ]

In [166]:
# Get DataFrame
df = get_acs_data(variables)

# Create FIPS geoid column
df['geoid'] = df['state'] + df['county'] + df['tract'] # + df['block group']

In [167]:
# Determine which columns contain null values
from pprint import pprint
for v in df.columns[df.isnull().any()]:
    pprint(variable_info[v])

{'concept': 'B01002.  Median Age by Sex',
 'label': 'Median age --!!Total:',
 'name': 'B01002_001E',
 'predicateType': 'int'}
{'concept': 'B19301. Per Capita Income in the Past 12 Months (in 2015 '
            'Inflation-Adjusted Dollars)',
 'label': 'Per capita income in the past 12 months (in 2015 Inflation-adjusted '
          'dollars)',
 'name': 'B19301_001E',
 'predicateType': 'int'}
{'concept': 'B19013. Median Household Income in the Past 12 Months (in 2015 '
            'Inflation-Adjusted Dollars)',
 'label': 'Median household income in the past 12 months (in 2015 '
          'Inflation-adjusted dollars)',
 'name': 'B19013_001E',
 'predicateType': 'int'}
{'concept': 'B25077.  Median Value (Dollars) for Owner-Occupied Housing Units',
 'label': 'Median value (dollars)',
 'name': 'B25077_001E',
 'predicateType': 'int'}
{'concept': 'B25064.  Median Gross Rent (Dollars)',
 'label': 'Median gross rent',
 'name': 'B25064_001E',
 'predicateType': 'int'}


In [168]:
# Total population within tracts that contain at least 1 null value
df[pd.isnull(df).any(axis=1)]['B01003_001E'].sum()

353556

In [169]:
# Percent of population within tracts that contain at least 1 null value
df[pd.isnull(df).any(axis=1)]['B01003_001E'].sum() / df['B01003_001E'].sum()

0.035220395944049979

In [170]:
# Add neighborhood relationships to dataframe
data = [[t.geoid, t.neighborhood_id]
        for t in Tract.objects.all()]
df2 = pd.DataFrame(data, columns=('geoid', 'neighborhood_id'))
df = pd.merge(df, df2)

# Pickle the data and metadata
census_data = {
    'variable_info': variable_info,
    'dataframe': df
}
pickle.dump(census_data, open('../pickles/census_data.p', 'wb'))

In [171]:
# Test an aggregation: estimate the population of Beverly Hills
print('Tract population sum:', df[df['neighborhood_id'] == 35]['B01003_001E'].sum())
print('LA Times:', Neighborhood.objects.get(pk=35).fixed_data)

Tract population sum: 30892
LA Times: {'population_latimes_2000_census': 33829}


In [223]:
# Define a function to aggregate tract data relating to a neighborhood
def neighborhood_weighted_avg(total_field, target_field, neighborhood_id, make_percent=False):
    """
    Compute weighted average of statistic for neighborhood.
    Note: returns None if total_field column summation is not > 0
    
    @param total_field: the applicable totals field (e.g., total population in tract)
    @param target_field: the value of interest (e.g., # of college graduates)
    @param neighborhood_id: the neighborhood id to select
    @param make_percent: convert count-based target values to decimal fractions
    """
    nhood = df[df['neighborhood_id'] == neighborhood_id] 
    weights = nhood[total_field] / nhood[total_field].sum()
    if not nhood[total_field].sum() > 0:
        return None
    if (make_percent):
        # Return the weighted sum of all target values, each divided by the applicable total
        return_val = ((nhood[target_field] / nhood[total_field]) * weights).sum()
    else:
        # Return the weighted sum of all target values
        return_val = (nhood[target_field] * weights).sum()
    if np.isnan(return_val):
        return None
    else:
        return return_val


In [224]:
"""
Compute stats for neighborhoods. To avoid a lot of boilerplate code,
we'll define a dict that encapsulates the settings for each stat
that's automatable. (Some others may require a custom approach.)

Totals fields:

'B25001_001E' # total num. of housing units
'B25002_001E' # total housing units for vacancy status
'B01003_001E' # total population
'B15003_001E' # total pop for educational attainment
'B19001_001E' # total num. of households

Target fields:

'B01002_001E' # median age
'B19301_001E' # per capita income
'B15003_021E' # educat attain: assoc degree (%)
'B15003_022E' # educat attain: bachelor's degree (%)
'B15003_023E' # educat attain: masters degree (%)
'B15003_024E' # educat attain: professional school (%)
'B15003_025E' # educat attain: doctorate degree (%)
'B19013_001E' # median household income

'B25064_001E' # median gross rent
'B25064_001E' # median home value (owner-occupied)
'B25002_003E' # vacant homes (%)

"""

stats_meta = {
    'median_age': {
        'total_field': 'B01003_001E',
        'target_field': 'B01002_001E',
        'make_percent': False
    },
    'per_capita_income': {
        'total_field': 'B01003_001E',
        'target_field': 'B19301_001E',
        'make_percent': False
    },
    'median_household_income': {
        'total_field': 'B19301_001E',
        'target_field': 'B19013_001E',
        'make_percent': False
    },
    'median_gross_rent': {
        'total_field': 'B25001_001E',
        'target_field': 'B25064_001E',
        'make_percent': False
    },
    'median_home_value_owner_occupied': {
        'total_field': 'B25001_001E',
        'target_field': 'B25077_001E',
        'make_percent': False
    },
    'percent_homes_vacant': {
        'total_field': 'B25002_001E',
        'target_field': 'B25002_003E',
        'make_percent': True
    },
    'percent_associate_degree': {
        'total_field': 'B15003_001E',
        'target_field': 'B15003_021E',
        'make_percent': True
    },
    'percent_bachelors_degree': {
        'total_field': 'B15003_001E',
        'target_field': 'B15003_022E',
        'make_percent': True
    },
    'percent_masters_degree': {
        'total_field': 'B15003_001E',
        'target_field': 'B15003_023E',
        'make_percent': True
    },
    'percent_professional_degree': {
        'total_field': 'B15003_001E',
        'target_field': 'B15003_024E',
        'make_percent': True
    },
    'percent_doctoral_degree': {
        'total_field': 'B15003_001E',
        'target_field': 'B15003_025E',
        'make_percent': True
    },    
}

def compute_stats(neighborhood):
    return {
        k: neighborhood_weighted_avg(
            total_field=stats_meta[k]['total_field'],
            target_field=stats_meta[k]['target_field'],
            neighborhood_id=neighborhood.id,
            make_percent=stats_meta[k]['make_percent']
        )
        for k in stats_meta.keys()
    }

compute_stats(Neighborhood.objects.get(name="Tujunga"))

{'median_age': 41.425382923572386,
 'median_gross_rent': 1423.2760891590681,
 'median_home_value_owner_occupied': 417053.05977710237,
 'median_household_income': 62780.628095069173,
 'per_capita_income': 28672.086756735527,
 'percent_associate_degree': 0.11835049335049334,
 'percent_bachelors_degree': 0.15256327756327756,
 'percent_doctoral_degree': 0.012012012012012012,
 'percent_homes_vacant': 0.050962512664640323,
 'percent_masters_degree': 0.061400686400686404,
 'percent_professional_degree': 0.0086872586872586872}

In [None]:
# Compute stats for each Neighborhood, and save on the model
for n in Neighborhood.objects.all():
    stats = compute_stats(n)
    for key, value in stats.items():
        n.fixed_data[key] = value
    n.save()
    print(n.name, '*', end='')

In [226]:
Neighborhood.objects.first().data

{'avg_estimated_bookings_per_listing_per_month': 4.34049479166666,
 'avg_estimated_revenue_per_listing_per_month': 5307.615234375,
 'avg_host_experience_years': 0.543310082435003,
 'avg_listing_price': 451.0,
 'crime_count': 842,
 'listing_count': 3,
 'median_age': 45.1756062767475,
 'median_gross_rent': 670.6654991243432,
 'median_home_value_owner_occupied': 415123.50262697024,
 'median_household_income': 83798.29791631277,
 'per_capita_income': 34437.263908701854,
 'percent_associate_degree': 0.10245824732595235,
 'percent_bachelors_degree': 0.16513417151435544,
 'percent_doctoral_degree': 0.013698630136986302,
 'percent_homes_vacant': 0.09737302977232926,
 'percent_masters_degree': 0.08707074498029649,
 'percent_professional_degree': 0.012385062863576657,
 'population_latimes_2000_census': 6522,
 'total_estimated_bookings_per_month': 6.5107421875}