In [8]:
from ast import literal_eval
from datetime import datetime

import numpy as np
import pandas as pd
import pytz
from dateutil import parser
from django.contrib.gis.db.models.functions import Distance
from django.contrib.gis.geos import Point
from django.db.models import Count

from main.models import Neighborhood, Zipcode, BlockGroup

In [16]:
# Column -> type mapping used when the CSVs are first parsed
# (candidate types: np.int64, str, np.float64)
dtype = dict(
    INCIDENT_DATE=str,
    INCIDENT_REPORTED_DATE=str,
    CATEGORY=str,
    STAT=int,
    STAT_DESC=str,
    X_COORDINATE=np.float64,
    Y_COORDINATE=np.float64,
    INCIDENT_ID=str,
)

# Columns pandas should parse as dates.
# INCIDENT_DATE is deliberately left as text because of inconsistencies in the column.
parse_dates = ['INCIDENT_REPORTED_DATE']


def _incident_id_to_int(raw_id):
    """Remove the dashes from an incident id string and return it as an int."""
    return int(raw_id.replace('-', ''))


# Per-column conversion functions applied while parsing
converters = {'INCIDENT_ID': _incident_id_to_int}

In [19]:
# Read CSV contents into a dataframe.
#
# The 2012-2015 county crime data comes in one file per year, so each file is
# parsed with the same options and the results are concatenated.

# pandas ignores `dtype` for any column that also has a converter and emits a
# ParserWarning about the conflict, so only pass dtypes for the other columns.
# `usecols` still lists every key of `dtype`, so the same columns are loaded.
csv_dtype = {
    column: column_type
    for column, column_type in dtype.items()
    if column not in converters
}

dataframes = (
    pd.read_csv(
        filename,
        usecols=list(dtype.keys()),
        dtype=csv_dtype,
        parse_dates=parse_dates,
        converters=converters,
    )
    for filename in (
        '../res/2012-PART_I_AND_II_CRIMES.csv.gz',
        '../res/2013-PART_I_AND_II_CRIMES.csv.gz',
        '../res/2014-PART_I_AND_II_CRIMES.csv.gz',
        '../res/2015-PART_I_AND_II_CRIMES.csv.gz',
    )
)

# ignore_index=True gives the combined frame one fresh 0..n-1 index
df = pd.concat(dataframes, ignore_index=True)


In [25]:
# Report, per column, whether at least one value is null
df.isnull().any(axis=0)

INCIDENT_DATE             False
INCIDENT_REPORTED_DATE    False
CATEGORY                  False
STAT                      False
STAT_DESC                 False
ADDRESS                   False
X_COORDINATE              False
Y_COORDINATE              False
INCIDENT_ID               False
dtype: bool

In [23]:
# Drop entries that are missing either coordinate. The result is reassigned
# rather than mutated in place (inplace=True), so `df` is rebound to a new frame.
df = df.dropna(subset=['X_COORDINATE', 'Y_COORDINATE'])

In [26]:
# Save the dataframe to disk as a pickle so it can be reloaded later
pickle_path = '../res/la_county_sheriff_crime_2012-2015.pickle'
df.to_pickle(pickle_path)

In [27]:
# Preview the first five rows
df.head(5)

Unnamed: 0,INCIDENT_DATE,INCIDENT_REPORTED_DATE,CATEGORY,STAT,STAT_DESC,ADDRESS,X_COORDINATE,Y_COORDINATE,INCIDENT_ID
0,01/01/2012 12:22:03 AM,2012-01-01,DRUNK / ALCOHOL / DRUGS,201,DRUNK: Alcohol,"4100 ADMIRALTY WAY, MARINA DEL REY, CA 90292",6422198.0,1815995.0,912000012760
1,01/01/2012 01:21:00 AM,2012-01-01,ROBBERY,47,"ROBBERY, STRONG-ARM: Other","16800 INYO ST, LA PUENTE, CA",6581071.0,1826603.0,912000031434
2,01/01/2012 12:45:00 AM,2012-01-01,BURGLARY,62,"BURGLARY, RESIDENCE: Night, Entry No Force","5100 GREER AVE, COVINA, CA 91724",6607301.0,1860788.0,912000020871
3,01/01/2012 01:45:00 AM,2012-01-01,NON-AGGRAVATED ASSAULTS,144,"ASSAULT, NON-AGG: Hands, Feet, Fist, Etc.","4900 SUMMERBREEZE CT, PALMDALE, CA",6549080.0,2020509.0,12000032603
4,01/01/2012 02:20:00 AM,2012-01-01,GRAND THEFT AUTO,91,GRAND THEFT VEHICLE (GTA): Automobile/Passenge...,"600 N COLONIA DE LOS CEDROS, LOS ANGELES, CA ...",6513313.0,1838244.0,912000070291


In [33]:
# Try the reprojection on the first record only
first_row = df.iloc[0]
point = Point(
    x=first_row['X_COORDINATE'],
    y=first_row['Y_COORDINATE'],
    srid=102645,  # State Plane 5 Coordinate Sys
)
point.transform(4326)  # Convert to 4326 in-place


In [None]:
# Instantiate and save a Crime for each row in the dataframe.
# NOTE(review): `Crime` is not imported in this notebook's import cell --
# presumably the kernel provides it (e.g. a shell that auto-imports models); confirm.

# The timezone object is the same for every row, so build it once up front.
la_timezone = pytz.timezone('America/Los_Angeles')

for index, row in df.iterrows():
    point = Point(x=row['X_COORDINATE'], y=row['Y_COORDINATE'], srid=102645) # CA State Plane 5 Coordinate System
    point.transform(4326) # Convert to standard lat/long projection
    crime = Crime(
        data_source='LACS',
        report_number=row['INCIDENT_ID'],
        # NOTE(review): this value stays timezone-naive, unlike date_occurred
        # below -- confirm that is intended.
        date_reported=row['INCIDENT_REPORTED_DATE'].to_pydatetime(),
        # INCIDENT_DATE was read as text, so parse it here and attach the
        # Los Angeles timezone.
        date_occurred=la_timezone.localize(parser.parse(row['INCIDENT_DATE'])),
        crime_code=row['STAT'],
        crime_code_desc=row['STAT_DESC'],
        category=row['CATEGORY'],
        point=point,
    )
    crime.save() # commit to DB

    # Print one progress dot for every row whose index is a multiple of 100
    if index % 100 == 0:
        print('.', end='')

In [61]:
# For each neighborhood, set it on every LACS crime whose point lies within its mpoly
lacs_crimes = Crime.objects.filter(data_source='LACS')
for neighborhood in Neighborhood.objects.all():
    crimes_inside = lacs_crimes.filter(point__within=neighborhood.mpoly)
    crimes_inside.update(neighborhood=neighborhood)

In [63]:
# For each zipcode, set it on every LACS crime whose point lies within its mpoly
lacs_crimes = Crime.objects.filter(data_source='LACS')
for zipcode in Zipcode.objects.all():
    crimes_inside = lacs_crimes.filter(point__within=zipcode.mpoly)
    crimes_inside.update(zipcode=zipcode)

In [64]:
# For each block group, set it on every LACS crime whose point lies within its mpoly
lacs_crimes = Crime.objects.filter(data_source='LACS')
for block_group in BlockGroup.objects.all():
    crimes_inside = lacs_crimes.filter(point__within=block_group.mpoly)
    crimes_inside.update(block_group=block_group)

In [65]:
# Call update_stats() on every neighborhood
all_neighborhoods = Neighborhood.objects.all()
for neighborhood in all_neighborhoods:
    neighborhood.update_stats()

In [67]:
# Print each neighborhood's name next to the number of crimes related to it.
# Count('crime') without an explicit alias is exposed as `crime__count`.
neighborhoods_with_counts = Neighborhood.objects.annotate(Count('crime'))
for neighborhood in neighborhoods_with_counts:
    print(neighborhood.name, neighborhood.crime__count)

Lake Balboa 4837
Walnut 3015
Rowland Heights 6260
Harbor Gateway 8142
La Cañada Flintridge 2626
Alondra Park 1590
Winnetka 9252
East Pasadena 1188
Paramount 14699
South Diamond Bar 5
Hidden Hills 116
Palms 7247
Harbor City 4844
Charter Oak 1063
Broadway-Manchester 9162
Glendora 207
West San Dimas 29
Santa Clarita 27651
Malibu 4730
Montecito Heights 2067
Carthay 1550
West Compton 1747
Koreatown 24592
Granada Hills 9815
Wilmington 12472
Irwindale 462
Del Aire 1153
Whittier Narrows 664
Silver Lake 6577
Pasadena 1004
Lawndale 5909
Playa Vista 1603
Avocado Heights 2799
Echo Park 7379
Baldwin Park 118
Bel-Air 989
Signal Hill 13
Lakewood 15895
Huntington Park 576
Rancho Park 910
West Whittier-Los Nietos 4482
Pacoima 13724
Marina del Rey 2499
South Pasadena 31
San Dimas 6446
Universal City 1225
Bellflower 17984
Toluca Lake 1969
Maywood 3394
Lopez/Kagel Canyons 178
Glendale 348
Northridge 14940
Santa Fe Springs 747
San Fernando 189
Quartz Hill 1614
Hollywood Hills West 3084
Rolling Hills 319
No