In [46]:
from bs4 import BeautifulSoup
import requests
import re
import json

In [7]:
# Build a lookup from each neighborhood's primary key to the slug used in
# LA Times URLs: lowercase the name, turn spaces into hyphens, drop
# slashes, and replace 'ñ' with a plain 'n'.
neighborhood_slugs = {}
for neighborhood in Neighborhood.objects.all():
    slug = neighborhood.name.lower()
    slug = slug.replace(' ', '-')
    slug = slug.replace('/', '')
    slug = slug.replace('ñ', 'n')
    neighborhood_slugs[neighborhood.pk] = slug

In [44]:
# Scrape the neighborhood profile pages for the 2000 census population.
# Result: population_counts maps each neighborhood pk to an int, or to None
# when the page has no accordion bullet list or its first bullet does not
# contain a "<number> population in 2000" figure.
population_pattern = re.compile(r'([\d,]+) population in 2000')  # compiled once, not per page
population_counts = {}
for primary_key, slug in neighborhood_slugs.items():
    # A timeout keeps a single unresponsive request from hanging the whole loop.
    r = requests.get(
        'http://maps.latimes.com/neighborhoods/neighborhood/%s' % slug,
        timeout=30,
    )
    soup = BeautifulSoup(r.content, "lxml")
    selection = soup.select('.neighborhoods-accordion-drawer li')
    # Only the first bullet is inspected. search() (rather than match())
    # tolerates leading whitespace or other text before the figure.
    m = population_pattern.search(selection[0].text) if selection else None
    # Record None instead of raising AttributeError when nothing matched.
    population_counts[primary_key] = (
        int(m.group(1).replace(',', '')) if m else None
    )
    

In [57]:
# Some neighborhoods have no reported population.
# These neighborhoods are mainly parks or commercial zones with no or very sparse housing.
# Displays the names of every neighborhood whose scraped population is None
# (one database lookup per such neighborhood).
[
    Neighborhood.objects.get(pk=pk).name
    for pk, population in population_counts.items()
    if population is None  # identity check is the idiomatic test for None
]

['Chatsworth Reservoir',
 'Griffith Park',
 'Hansen Dam',
 'Sepulveda Basin',
 'South Diamond Bar',
 'Universal City',
 'Whittier Narrows']

In [None]:
# Persist the scraped counts as JSON so they can be reloaded later
# without re-scraping.
# NOTE(review): JSON object keys are always strings, so non-string keys
# (presumably integer pks) will come back as strings when the file is loaded.
with open("../res/neighborhood_population_counts.json", "w") as output_file:
    json.dump(population_counts, output_file)

In [66]:
# Write each scraped population (or None) into the matching Neighborhood
# record's `data` mapping and save the record.
for neighborhood_pk, census_population in population_counts.items():
    neighborhood = Neighborhood.objects.get(pk=neighborhood_pk)
    neighborhood.data['population_latimes_2000_census'] = census_population
    neighborhood.save()
    