In [2]:
## imports
import pandas as pd
import numpy as np
import plotnine
from plotnine import *
import random

## display the value of every top-level expression in a cell, not only the last one
## (cells further down rely on this to show more than one table from a single cell)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from datetime import datetime, timedelta

## Load data

In [17]:
## pull the 2020 DC crime reports straight from the open-data portal
dc_crim_2020 = pd.read_csv("https://opendata.arcgis.com/datasets/f516e0dd7b614b088ad781b0c4002331_2.csv")

## parse the raw REPORT_DAT strings into a datetime column
dc_crim_2020['report_dt'] = pd.to_datetime(dc_crim_2020.REPORT_DAT)

## flag the focal crimes once (CCN compared as a string), then split on that flag:
## - crimes_lookfor: the focal crimes, trimmed to the columns we need
## - other_crimes: every remaining report, all columns kept
CCN_examples = ['20165648', '20123250']
is_focal = dc_crim_2020.CCN.astype(str).isin(CCN_examples)
focal_cols = ['CCN', 'WARD', 'OFFENSE', 'report_dt']
crimes_lookfor = dc_crim_2020.loc[is_focal, focal_cols].copy()
other_crimes = dc_crim_2020[~is_focal].copy()

## preview the focal crimes (other_crimes.head() previews the search pool instead)
crimes_lookfor.head()

Unnamed: 0,CCN,WARD,OFFENSE,report_dt
14416,20123250,2,MOTOR VEHICLE THEFT,2020-08-29 05:00:25+00:00
15322,20165648,6,MOTOR VEHICLE THEFT,2020-11-20 02:25:50+00:00


**Task**: we have two focal crimes. We want to search the remaining crime reports for reports that are:

- Located in the same ward as the two focal crimes
- Reported at the same time as the focal crime or up to 1000 minutes later. (The slides used a 20-minute window, but the crime IDs in the data have changed since then, so this wider window helps us find matches.)

The solutions below compare two ways to solve this:

- Using a for loop
- Using a function

## 1. Loop approach

In [32]:
## matches for each focal crime, keyed by that crime's CCN (as a string)
store_matches = {}

## visit each focal crime in turn
for _, focal_row in crimes_lookfor.iterrows():

    ## the search window opens at the focal report time and closes 1000 minutes later
    ## (interpreting "close in time" as at-or-after the focal report; could do either)
    window_start = focal_row.report_dt
    window_end = window_start + timedelta(minutes=1000)

    ## flag candidate reports in the focal crime's ward...
    in_ward = other_crimes.WARD == focal_row.WARD

    ## ...and those whose report time falls inside the window (both ends inclusive)
    in_window = ((other_crimes.report_dt >= window_start) &
                 (other_crimes.report_dt <= window_end))

    ## keep the reports satisfying both conditions
    store_matches[str(focal_row.CCN)] = other_crimes[in_ward & in_window].copy()

## stack the per-crime results; the dict keys become the outer index level
all_matches = pd.concat(store_matches)
all_matches.head()


Unnamed: 0,Unnamed: 1,X,Y,CCN,REPORT_DAT,SHIFT,METHOD,OFFENSE,BLOCK,XBLOCK,YBLOCK,...,CENSUS_TRACT,VOTING_PRECINCT,LATITUDE,LONGITUDE,BID,START_DATE,END_DATE,OBJECTID,OCTO_RECORD_ID,report_dt
20123250,14428,-77.050519,38.913357,20123422,2020/08/29 16:45:57+00,DAY,OTHERS,THEFT F/AUTO,2200 - 2399 BLOCK OF DECATUR PLACE NW,395618.81,138388.39,...,4100.0,Precinct 13,38.913349,-77.050517,,2020/08/26 22:00:29+00,2020/08/27 12:00:51+00,98429488,,2020-08-29 16:45:57+00:00
20123250,15883,-77.038482,38.913729,20401318,2020/08/29 14:29:59+00,DAY,OTHERS,THEFT/OTHER,1724 - 1799 BLOCK OF 17TH STREET NW,396662.8,138429.15,...,5302.0,Precinct 15,38.913721,-77.038479,,2020/08/28 20:55:00+00,2020/08/28 21:05:00+00,98433313,,2020-08-29 14:29:59+00:00
20123250,16137,-77.040082,38.909653,20123389,2020/08/29 16:05:18+00,DAY,OTHERS,THEFT F/AUTO,1700 - 1799 BLOCK OF P STREET NW,396523.77,137976.79,...,5303.0,Precinct 15,38.909645,-77.04008,,2020/08/28 22:00:23+00,2020/08/29 08:00:27+00,98433893,,2020-08-29 16:05:18+00:00
20123250,16140,-77.021919,38.899137,20123419,2020/08/29 17:15:19+00,DAY,OTHERS,THEFT/OTHER,700 - 799 BLOCK OF 7TH STREET NW,398098.85,136808.92,...,5801.0,Precinct 129,38.89913,-77.021917,DOWNTOWN,2020/08/29 16:05:40+00,2020/08/29 16:08:33+00,98433896,,2020-08-29 17:15:19+00:00
20165648,29,-76.999513,38.891484,20165709,2020/11/20 04:27:36+00,MIDNIGHT,OTHERS,MOTOR VEHICLE THEFT,100 - 199 BLOCK OF 5TH STREET NE,400042.45,135959.06,...,8200.0,Precinct 89,38.891476,-76.999511,,2020/11/20 03:02:27+00,,98177004,,2020-11-20 04:27:36+00:00


## 2. Function approach

### 2.1 define the function

In [27]:
def proximate_crimes(search_for: "pd.Series | pd.DataFrame",
                    search_in: pd.DataFrame,
                    time_cutoff: float = 1000) -> pd.DataFrame:
    """
    Find crimes reported in the same ward as a focal crime, at or shortly after it.

    Parameters
    ----------
    search_for:
        The focal crime: a single row (e.g. `crimes_lookfor.iloc[0]`) exposing
        `WARD`, `report_dt` and `CCN`. A DataFrame with exactly one row is
        also accepted and is unwrapped to that row.
    search_in:
        The crimes to search within; must have `WARD` and `report_dt` columns.
    time_cutoff:
        Length of the search window in minutes, counted forward from the focal
        crime's `report_dt`. Both ends of the window are inclusive.

    Returns
    -------
    pd.DataFrame
        A copy of the rows of `search_in` in the focal crime's ward whose
        `report_dt` lies in the window, with original index and row order,
        plus a `focal_crime` column holding the focal crime's CCN.
        `search_in` itself is not modified.

    Raises
    ------
    ValueError
        If `search_for` is a DataFrame that does not have exactly one row.
    """
    ## a multi-row frame would be compared element-wise against search_in and
    ## give misleading output (or an alignment error), so only unwrap one row
    if isinstance(search_for, pd.DataFrame):
        if search_for.shape[0] != 1:
            raise ValueError("search_for must be a single crime (one row); got "
                             + str(search_for.shape[0]) + " rows")
        search_for = search_for.iloc[0]

    ## first, flag crimes in the same ward as the focal crime
    in_same_ward = search_in.WARD == search_for.WARD

    ## second, flag crimes reported from the focal report time up to
    ## `time_cutoff` minutes later (inclusive on both ends)
    window_start = search_for.report_dt
    window_end = window_start + timedelta(minutes=time_cutoff)
    in_window = ((search_in.report_dt >= window_start) &
                 (search_in.report_dt <= window_end))

    ## keep rows meeting both conditions; copy so the caller's frame is untouched
    same_wards_sametime = search_in[in_same_ward & in_window].copy()

    ## record which focal crime these rows were matched to
    same_wards_sametime['focal_crime'] = search_for.CCN

    return same_wards_sametime
    

### 2.2 apply it to one of the focal crimes

In [28]:
## run the matcher for the first focal crime only, with the default time window
first_focal_crime = crimes_lookfor.iloc[0]
one_match = proximate_crimes(search_for=first_focal_crime, search_in=other_crimes)
one_match

Unnamed: 0,X,Y,CCN,REPORT_DAT,SHIFT,METHOD,OFFENSE,BLOCK,XBLOCK,YBLOCK,...,VOTING_PRECINCT,LATITUDE,LONGITUDE,BID,START_DATE,END_DATE,OBJECTID,OCTO_RECORD_ID,report_dt,focal_crime
14428,-77.050519,38.913357,20123422,2020/08/29 16:45:57+00,DAY,OTHERS,THEFT F/AUTO,2200 - 2399 BLOCK OF DECATUR PLACE NW,395618.81,138388.39,...,Precinct 13,38.913349,-77.050517,,2020/08/26 22:00:29+00,2020/08/27 12:00:51+00,98429488,,2020-08-29 16:45:57+00:00,20123250
15883,-77.038482,38.913729,20401318,2020/08/29 14:29:59+00,DAY,OTHERS,THEFT/OTHER,1724 - 1799 BLOCK OF 17TH STREET NW,396662.8,138429.15,...,Precinct 15,38.913721,-77.038479,,2020/08/28 20:55:00+00,2020/08/28 21:05:00+00,98433313,,2020-08-29 14:29:59+00:00,20123250
16137,-77.040082,38.909653,20123389,2020/08/29 16:05:18+00,DAY,OTHERS,THEFT F/AUTO,1700 - 1799 BLOCK OF P STREET NW,396523.77,137976.79,...,Precinct 15,38.909645,-77.04008,,2020/08/28 22:00:23+00,2020/08/29 08:00:27+00,98433893,,2020-08-29 16:05:18+00:00,20123250
16140,-77.021919,38.899137,20123419,2020/08/29 17:15:19+00,DAY,OTHERS,THEFT/OTHER,700 - 799 BLOCK OF 7TH STREET NW,398098.85,136808.92,...,Precinct 129,38.89913,-77.021917,DOWNTOWN,2020/08/29 16:05:40+00,2020/08/29 16:08:33+00,98433896,,2020-08-29 17:15:19+00:00,20123250


### 2.3 Use a list comprehension to apply it to every focal crime

In [36]:
## one result frame per focal crime, in the order the focal crimes appear
all_matches_list = [proximate_crimes(focal_row, other_crimes)
                    for _, focal_row in crimes_lookfor.iterrows()]

## stack the per-crime frames into a single dataframe
all_matches = pd.concat(all_matches_list)

## show the first and last matches, limited to the columns that identify a match
preview_cols = ["report_dt", "WARD", "focal_crime"]
all_matches[preview_cols].head()
all_matches[preview_cols].tail()

## a possible next step: attach attributes of the focal crime
## to the dataframe so matches are easier to compare

Unnamed: 0,report_dt,WARD,focal_crime
14428,2020-08-29 16:45:57+00:00,2,20123250
15883,2020-08-29 14:29:59+00:00,2,20123250
16137,2020-08-29 16:05:18+00:00,2,20123250
16140,2020-08-29 17:15:19+00:00,2,20123250
29,2020-11-20 04:27:36+00:00,6,20165648


Unnamed: 0,report_dt,WARD,focal_crime
35,2020-11-20 14:45:06+00:00,6,20165648
36,2020-11-20 15:06:04+00:00,6,20165648
41,2020-11-20 15:37:59+00:00,6,20165648
15328,2020-11-20 12:46:32+00:00,6,20165648
15474,2020-11-20 18:56:18+00:00,6,20165648
