In [4]:
from ast import literal_eval
from datetime import datetime

import numpy as np
import pandas as pd
import pytz
from django.contrib.gis.db.models.functions import Distance
from django.contrib.gis.geos import Point

from main.models import BlockGroup, Crime, Neighborhood, Zipcode

In [2]:
# Define data types for initial parsing of CSV
# np.int64, str, np.float64
dtype = {
    'Date.Rptd': str,
    'DR.NO': np.int64,
    'DATE.OCC': str,
    'TIME.OCC': str,
    'Crm.Cd': int,
    'CrmCd.Desc': str,
    'Location.1': str, # parse as (float, float) pair
}
     
# Specify columns that should be parsed as dates
parse_dates = ['Date.Rptd', 'DATE.OCC']

# Specify conversion functions for particular columns
converters = {
    'TIME.OCC': lambda t: ('000' + t)[-4:] # Fill with leading zeros, e.g. '30' --> '0030'
}

In [3]:
# Read CSV contents into dataframe.
# Only the columns named in `dtype` are loaded. Columns that have a converter are
# left out of the `dtype` argument: pandas uses only the converter for such a column
# and emits a ParserWarning when both a dtype and a converter are supplied.
df = pd.read_csv(
    '../res/Crimes_2012-2015.csv.gz',
    usecols=list(dtype.keys()),
    dtype={column: kind for column, kind in dtype.items() if column not in converters},
    parse_dates=parse_dates,
    converters=converters
)

In [6]:
# Report, for each column, whether it contains at least one missing value
df.isna().any()

Date.Rptd     False
DR.NO         False
DATE.OCC      False
TIME.OCC      False
Crm.Cd        False
CrmCd.Desc    False
Location.1    False
dtype: bool

In [5]:
# Discard rows that lack a location or a crime description
required_columns = ['Location.1', 'CrmCd.Desc']
df.dropna(subset=required_columns, inplace=True)

In [9]:
# Delete all existing Crimes, so re-running this cell does not insert duplicates
Crime.objects.all().delete()

# A pytz zone must be attached with localize(). Passing it as tzinfo= (as
# datetime.replace(tzinfo=...) does) uses the zone's historical local-mean-time
# offset (-07:53 for Los Angeles) instead of the correct PST/PDT offset.
los_angeles = pytz.timezone('America/Los_Angeles')

# Rows are inserted in batches instead of one INSERT per row.
# NOTE(review): bulk_create() bypasses Model.save() and the pre/post-save signals;
# this assumes Crime defines no custom save logic -- confirm against the model.
BATCH_SIZE = 1000
batch = []

# Instantiate a Crime for each row in the dataframe
for _, row in df.iterrows():
    # 'Location.1' is the text of a pair; element 0 is used as y and element 1 as x
    coords = literal_eval(row['Location.1'])
    point = Point(x=coords[1], y=coords[0], srid=4326)

    # Combine date and time of occurrence; 'TIME.OCC' is an 'HHMM' string
    time_occurred = row['TIME.OCC']
    naive_occurred = row['DATE.OCC'].to_pydatetime().replace(
        hour=int(time_occurred[:2]),
        minute=int(time_occurred[2:]),
        tzinfo=None  # localize() only accepts naive datetimes
    )
    date_occurred = los_angeles.localize(naive_occurred)

    batch.append(Crime(
        report_number=row['DR.NO'],
        date_reported=row['Date.Rptd'],
        date_occurred=date_occurred,
        crime_code=row['Crm.Cd'],
        crime_code_desc=row['CrmCd.Desc'],
        point=point
    ))

    if len(batch) >= BATCH_SIZE:
        Crime.objects.bulk_create(batch)  # commit the batch to DB
        batch = []
        print('.', end='')  # one dot per batch inserted

# Commit whatever is left over from the final, partial batch
if batch:
    Crime.objects.bulk_create(batch)
    print('.', end='')

..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [None]:
# Tag each crime with neighborhood, block group, zipcode

# Tag Neighborhood: every crime whose point lies within a neighborhood's `mpoly`
# geometry is assigned to that neighborhood.
for neighborhood in Neighborhood.objects.all():
    # update() returns the number of rows it matched, so the tagged-crime count
    # is available without running the spatial filter a second time via count().
    tagged = Crime.objects.filter(point__within=neighborhood.mpoly).update(
        neighborhood=neighborhood
    )
    print(neighborhood, tagged)


In [26]:
# Tag Zipcode
# Assign each zipcode to the crimes whose point lies within its `mpoly` geometry,
# printing a progress dot on every 100th zipcode.
for position, zipcode in enumerate(Zipcode.objects.all()):
    Crime.objects.filter(point__within=zipcode.mpoly).update(zipcode=zipcode)
    if not position % 100:
        print('.', end='')

............................................................................................................................................................................................................................................................................................................................................

In [27]:
# Tag Block Group
# Assign each block group to the crimes whose point lies within its `mpoly`
# geometry, printing a progress dot on every 100th block group.
for position, block_group in enumerate(BlockGroup.objects.all()):
    inside = Crime.objects.filter(point__within=block_group.mpoly)
    inside.update(block_group=block_group)
    if not position % 100:
        print('.', end='')

........................................................................................................................................................................................................................................

In [14]:
# For every crime that has no neighborhood assigned, find the neighborhood
# whose `mpoly` geometry is closest to the crime's point and print the match.
# Nothing is written back to the database here.
unassigned_crimes = Crime.objects.filter(neighborhood__isnull=True)
for crime in unassigned_crimes:
    by_distance = Neighborhood.objects.annotate(
        distance=Distance('mpoly', crime.point)
    ).order_by('distance')
    nearest = by_distance.first()
    print(crime.pk, crime.point.coords, nearest, nearest.distance)

1583 (-118.2413, 33.7767) Wilmington 1.93488533 m
62988 (-118.2413, 33.7767) Wilmington 1.93488533 m
95847 (-118.2413, 33.7767) Wilmington 1.93488533 m
100001 (-118.3614, 34.0872) West Hollywood 0.06925057 m
115370 (-118.3614, 34.0872) West Hollywood 0.06925057 m
129461 (-117.6596, 34.4527) Northeast Antelope Valley 15.79142378 m
138998 (-118.2413, 33.7767) Wilmington 1.93488533 m
142039 (-118.3614, 34.0872) West Hollywood 0.06925057 m
220515 (-118.2737, 33.7589) Wilmington 9.95183629 m
258894 (-118.2762, 33.72) San Pedro 45.9989447 m
294805 (-118.2705, 33.7351) San Pedro 4.05969477 m
312159 (-118.2759, 33.7189) San Pedro 14.79602465 m
320162 (-118.3614, 34.0872) West Hollywood 0.06925057 m
350518 (-118.2413, 33.7767) Wilmington 1.93488533 m
373814 (-118.2705, 33.7351) San Pedro 4.05969477 m
392814 (-118.2494, 33.7667) Wilmington 3.18894499 m
402388 (-118.3614, 34.0872) West Hollywood 0.06925057 m
419332 (-118.3614, 34.0872) West Hollywood 0.06925057 m
428782 (-118.2705, 33.7351) San P

In [13]:
# Delete crimes with invalid location data, i.e. those located exactly at (0.0, 0.0)
origin = Point(0.0, 0.0, srid=4326)
Crime.objects.filter(point__equals=origin).delete()

(6771, {'main.Crime': 6771})

In [38]:
from django.db.models import Count

# Neighborhoods annotated with their number of crimes, fewest crimes first
with_crime_count = Neighborhood.objects.annotate(crime_count=Count('crime'))
with_crime_count.order_by('crime_count')

<QuerySet [<Neighborhood: Bradbury>, <Neighborhood: Val Verde>, <Neighborhood: North El Monte>, <Neighborhood: Vincent>, <Neighborhood: Whittier Narrows>, <Neighborhood: Lake Hughes>, <Neighborhood: West Whittier-Los Nietos>, <Neighborhood: Rolling Hills>, <Neighborhood: Quartz Hill>, <Neighborhood: Maywood>, <Neighborhood: La Cañada Flintridge>, <Neighborhood: Avocado Heights>, <Neighborhood: Charter Oak>, <Neighborhood: Hidden Hills>, <Neighborhood: East Pasadena>, <Neighborhood: West San Dimas>, <Neighborhood: Baldwin Park>, <Neighborhood: Lawndale>, <Neighborhood: South Diamond Bar>, <Neighborhood: East La Mirada>, '...(remaining elements truncated)...']>