In [1]:
import pandas as pd
import geopandas as gpd
import fiona

Police incidents come from two different record systems:
- PIMS: in use since the June 2018 migration
- CAPRS: in use before June 2018

In [2]:
# Source GeoJSON URL for each police-incident dataset, keyed by year.
# NOTE: 2018_1 and 2018_2 are integer literals written with a digit-group
# underscore, so those two keys are the ints 20181 and 20182 (not labels
# such as "2018_1"). 2018 is split in two because both systems cover it.
url_crime_dict = {  # key -> GeoJSON URL
2019: "https://opendata.arcgis.com/datasets/8cd15449ac344aa5a55be7840d67c52d_0.geojson",
2018_1: "https://opendata.arcgis.com/datasets/055e662af18c4488b54dcbd496f897b7_0.geojson", # 2018 records from PIMS
2018_2: "https://opendata.arcgis.com/datasets/58e6f399e0f04c568b3ba45086d15818_0.geojson", # 2018 records from CAPRS
2017: "https://opendata.arcgis.com/datasets/3d33a4f94a004fb5816936708642e045_0.geojson",
2016: "https://opendata.arcgis.com/datasets/0b12e290edb64816a7cd5270fdd6bacb_0.geojson",
2015: "https://opendata.arcgis.com/datasets/08ff2c3bec594dd2a7a8566b2a81d452_0.geojson",
2014: "https://opendata.arcgis.com/datasets/f0279f3673394c66a96c03e6e42287f4_0.geojson"
}

In [3]:
def merge_years(url_dict):
    """Read one GeoDataFrame per entry of ``url_dict``.

    Despite the name, nothing is merged here: every source is read with
    ``gpd.read_file`` and the resulting frame is stored under the same key.

    Parameters
    ----------
    url_dict : dict
        Maps a label (a year in this notebook) to a URL or path that
        ``gpd.read_file`` can open.

    Returns
    -------
    dict
        Same keys as ``url_dict``, in the same order; each value is the
        frame read from the corresponding source.
    """
    return {year: gpd.read_file(source) for year, source in url_dict.items()}

In [4]:
# Download every dataset. Despite the "df" prefix, dfCrime is a dict of frames keyed like url_crime_dict.
dfCrime=merge_years(url_crime_dict)

In [5]:
def merge_gpd_dfs(df_dict):
    """Stack all frames in ``df_dict`` into a single GeoDataFrame.

    Parameters
    ----------
    df_dict : dict
        Values are the frames to concatenate (keys are ignored). The first
        value must expose a ``crs`` attribute.

    Returns
    -------
    geopandas.GeoDataFrame
        Row-wise concatenation with a fresh 0..n-1 index. Columns are the
        sorted union of the input columns; cells a frame did not provide
        are left missing. The CRS is taken from the first frame only.
    """
    frames = [*df_dict.values()]
    stacked = pd.concat(frames, ignore_index=True, sort=True)
    # The first frame's CRS is applied to the whole result; the other
    # frames' CRS values are not checked against it.
    return gpd.GeoDataFrame(stacked, crs=frames[0].crs)

In [6]:
# Stack the per-dataset frames into one GeoDataFrame (columns are the sorted union across datasets).
df=merge_gpd_dfs(dfCrime)

In [7]:
# Column labels of the merged frame; several fields appear under two spellings (e.g. 'BeginDate' and 'beginDate').
df.columns

Index(['BeginDate', 'CCN', 'ControlNbr', 'Description', 'ESRI_OID',
       'EnteredDate', 'GBSID', 'LastChanged', 'LastUpdateDate',
       'LastUpdateDateETL', 'Lat', 'Long', 'Neighborhood', 'OBJECTID',
       'Offense', 'Precinct', 'PublicAddress', 'ReportedDate', 'Time',
       'UCRCode', 'X', 'Y', 'beginDate', 'beginTime', 'caseNumber',
       'centerLat', 'centerLong', 'centerX', 'centerY', 'centergbsid',
       'description', 'enteredDate', 'geometry', 'lastchanged', 'neighborhood',
       'offense', 'precinct', 'publicaddress', 'reportedDate',
       'reportedDateTime', 'reportedTime'],
      dtype='object')

In [8]:
# Gather every column whose label contains "Date", then show the dtypes of those columns.
dateCols = list(filter(lambda label: 'Date' in label, df.columns))
df[dateCols].dtypes

BeginDate            object
EnteredDate          object
LastUpdateDate       object
LastUpdateDateETL    object
ReportedDate         object
beginDate            object
enteredDate          object
reportedDate         object
reportedDateTime     object
dtype: object

In [9]:
# Display the raw (still unparsed) date columns.
df[dateCols]

Unnamed: 0,BeginDate,EnteredDate,LastUpdateDate,LastUpdateDateETL,ReportedDate,beginDate,enteredDate,reportedDate,reportedDateTime
0,,,,2019-08-25T08:15:46,,2019-08-12T00:00:00,2019-08-19T00:00:00,2019-08-19T00:00:00,2019-08-19T00:00:00
1,,,,2019-08-25T08:15:46,,2019-08-22T00:00:00,2019-08-24T00:00:00,2019-08-23T00:00:00,2019-08-23T11:50:00
2,,,,2019-08-25T08:15:46,,2019-08-23T00:00:00,2019-08-24T00:00:00,2019-08-23T00:00:00,2019-08-23T16:06:00
3,,,,2019-08-25T08:15:46,,2019-08-23T00:00:00,2019-08-24T00:00:00,2019-08-23T00:00:00,2019-08-23T14:20:00
4,,,,2019-08-25T08:15:46,,2019-08-10T00:00:00,2019-08-24T00:00:00,2019-08-23T00:00:00,2019-08-23T15:13:00
...,...,...,...,...,...,...,...,...,...
129683,2014-11-12T21:00:00,2014-11-13T08:15:11,2017-03-03T13:40:06,,2014-11-13T08:15:00,,,,
129684,2014-11-12T20:00:00,2014-11-13T08:30:15,2017-03-03T13:40:06,,2014-11-13T08:30:15,,,,
129685,2014-10-29T10:00:00,2014-11-13T09:16:32,2017-03-03T13:40:06,,2014-11-13T09:10:00,,,,
129686,2014-11-13T01:00:00,2014-11-13T09:51:39,2017-03-03T13:40:06,,2014-11-13T09:52:00,,,,


In [10]:
# Parse every *Date* column to datetime; values that cannot be parsed become NaT.
# Assign with df[cols] (column replacement) rather than df.loc[:, cols] = ...:
# since pandas 2.0 the .loc form writes into the existing object-dtype columns
# in place, so they would stay `object` instead of becoming datetime64.
df[dateCols] = df[dateCols].apply(pd.to_datetime, errors='coerce')

In [11]:
# Summary of the lower-case enteredDate column, before EnteredDate is folded into it in the next cell.
df.enteredDate.describe()

count                   38568
unique                    645
top       2019-09-16 00:00:00
freq                      133
first     2018-06-05 00:00:00
last      2020-03-10 00:00:00
Name: enteredDate, dtype: object

In [12]:
# Coalesce the two spellings of the entered date: keep enteredDate where it is
# present, otherwise fall back to EnteredDate, then discard the redundant column.
df = (
    df.assign(enteredDate=df['enteredDate'].fillna(df['EnteredDate']))
      .drop(columns='EnteredDate')
)

In [13]:
# Fill gaps in reportedDateTime from ReportedDate, then drop both other
# reported-date columns. reportedDate is discarded without being used as a fallback.
df = (
    df.assign(reportedDateTime=df['reportedDateTime'].fillna(df['ReportedDate']))
      .drop(columns=['ReportedDate', 'reportedDate'])
)

In [14]:
# Fill gaps in LastUpdateDateETL from LastUpdateDate, then drop the latter.
df = (
    df.assign(LastUpdateDateETL=df['LastUpdateDateETL'].fillna(df['LastUpdateDate']))
      .drop(columns='LastUpdateDate')
)

In [15]:
# Fill gaps in description from the capitalised Description column, then drop it.
df = (
    df.assign(description=df['description'].fillna(df['Description']))
      .drop(columns='Description')
)

In [16]:
# Fill gaps in offense from the capitalised Offense column, then drop it.
df = (
    df.assign(offense=df['offense'].fillna(df['Offense']))
      .drop(columns='Offense')
)

In [17]:
# Fill gaps in beginDate from BeginDate, then drop BeginDate and beginTime.
# NOTE(review): beginTime is discarded without being merged into beginDate. In the
# previews the lower-case beginDate values are all at midnight while BeginDate carries
# a time of day, so the combined column presumably mixes resolutions; confirm this is acceptable.
df = (
    df.assign(beginDate=df['beginDate'].fillna(df['BeginDate']))
      .drop(columns=['BeginDate', 'beginTime'])
)

In [None]:
# Disabled scratch work: attempts to build a combined beginDateTime from beginDate and beginTime.
# It cannot run as written at this point because beginTime is dropped in the preceding cell.
# pd.to_datetime(df['beginTime'],format='%H%M')
# pd.to_datetime(df[df['beginTime'].isna()==False]['beginTime'], format='%H%M')
# df[df['beginTime'].isna()==False].beginTime.astype(float).astype(int).astype(str)
# df['beginDateTime']=df['beginDate'].dt.strftime(date_format='%Y%m%d')+df[df['beginTime'].isna()==False].beginTime.astype(float).astype(int).astype(str)
# pd.to_datetime(df['beginDateTime'],format='%Y%m%d%H%M')

In [18]:
# Column labels remaining after the duplicated date/description/offense columns were consolidated.
df.columns

Index(['CCN', 'ControlNbr', 'ESRI_OID', 'GBSID', 'LastChanged',
       'LastUpdateDateETL', 'Lat', 'Long', 'Neighborhood', 'OBJECTID',
       'Precinct', 'PublicAddress', 'Time', 'UCRCode', 'X', 'Y', 'beginDate',
       'caseNumber', 'centerLat', 'centerLong', 'centerX', 'centerY',
       'centergbsid', 'description', 'enteredDate', 'geometry', 'lastchanged',
       'neighborhood', 'offense', 'precinct', 'publicaddress',
       'reportedDateTime', 'reportedTime'],
      dtype='object')

In [19]:
# Drop the raw coordinate columns (presumably redundant with the geometry column; confirm).
df = df.drop(columns=[
    'Lat', 'Long',
    'centerLat', 'centerLong',
    'centerX', 'centerY',
    'X', 'Y',
])

In [20]:
# Drop identifier, location-label and bookkeeping columns that are not used further on.
df = df.drop(columns=[
    "CCN", 'caseNumber', 'ControlNbr',
    'centergbsid', 'GBSID', 'ESRI_OID',
    'Precinct', 'PublicAddress',
    'neighborhood', 'Neighborhood',
    'precinct', 'publicaddress',
    'reportedTime', 'UCRCode', 'Time',
    'LastChanged', 'lastchanged',
    'OBJECTID',
])

In [21]:
# Raise the row display limit so the value_counts listing is not truncated.
pd.set_option('display.max_rows', 150)
# Frequency of each raw offense code.
# NOTE: the output lists some codes twice (e.g. THEFT); presumably the duplicates differ
# only in surrounding whitespace, which the next cell strips.
df.offense.value_counts()

THEFT       25669
TFMV        14099
BURGD       12794
THEFT        8401
AUTOTH       7825
TFMV         7571
AUTOTH       4828
BURGD        4278
SHOPLF       4262
ROBPAG       3896
BIKETF       3390
ASLT2        3185
ROBPER       2920
BURGB        2803
BIKETF       1833
SHOPLF       1644
TFPER        1437
THFTSW       1387
BURGB        1318
CSCR         1303
ASLT2        1247
DASTR        1220
ROBPAG       1150
DASLT2       1133
ROBPER        883
ASLT3         825
THFTSW        697
CSCR          665
DASTR         654
DASLT2        652
TMVP          617
TFPER         569
ROBBIZ        525
TMVP          459
ARSON         433
ASLT4         399
TBLDG         395
TBLDG         380
ASLT3         286
DASLT3        282
ASLT4         181
ROBBIZ        154
DASLT3        138
ARSON         120
MVTHFT        114
MURDR         112
ASLT1          97
MURDR          69
MVTHFT         61
ONLTHT         55
COINOP         46
NOPAY          37
ASLT1          35
POCKET         21
SCRAP          19
PETIT     

In [22]:
# Normalise offense codes: trim surrounding whitespace and lower-case them so that
# variants of the same code collapse into one value.
# The vectorised .str accessor leaves missing values as NaN, whereas calling
# row.strip() in a Python loop raises AttributeError on any non-string entry.
df['offense'] = df['offense'].str.strip().str.lower()

In [23]:
# Normalise descriptions: trim surrounding whitespace and lower-case them.
# The vectorised .str accessor leaves missing values as NaN, whereas calling
# row.strip() in a Python loop raises AttributeError on any non-string entry.
df['description'] = df['description'].str.strip().str.lower()

In [24]:
# Distinct description strings after normalisation.
df.description.unique()

array(['csc - rape', 'theft by swindle', 'theft from motr vehc',
       'shoplifting', 'burglary of dwelling',
       'obs-cscr - use ext 1, 2 or 3', 'automobile theft',
       'theft-motr veh parts', 'robbery per agg', '2nd deg domes aslt',
       'robbery of person', 'other theft',
       'domestic assault/strangulation', 'asslt w/dngrs weapon',
       'burglary of business', 'robbery of business',
       'aslt-sgnfcnt bdly hm', 'bike theft',
       'theft from person snatch/grab', 'aslt4-less than subst harm',
       'theft from building', 'murder (general)', 'aslt-great bodily hm',
       'on-line theft', 'csc - sodomy', 'other vehicle theft',
       '3rd deg domes aslt', 'pocket-picking', 'murder - 2nd degree',
       'arson', 'obs - petty theft', 'csc - penetrate with object',
       '1st deg domes aslt', 'hacking - theft of service',
       'aslt4-subst harm or weapon', 'obs - computer hacking',
       'theft/coinop device', 'murder - 1st degree',
       'gas station driv-off', 

In [None]:
# Disabled scratch work: parsing experiments on a beginDateTime column.
# That column is only created in commented-out code above, and `datetime` is not imported
# in the visible import cell, so these lines cannot run as written.
#datetime.strptime(df[df.beginDateTime.isna()==False].beginDateTime[0], '%Y%m%d%H%M')
#datetime.strptime(df[df.beginDateTime.isna()==False].beginDateTime.min(),'%Y%m%d%H%M')
#datetime.strptime(df[df.beginDateTime.isna()==False].beginDateTime[0][8:], '%H%M')

In [25]:
# Display every remaining column whose label contains "Date".
df[[col for col in df.columns if 'Date' in col]]

Unnamed: 0,LastUpdateDateETL,beginDate,enteredDate,reportedDateTime
0,2019-08-25 08:15:46,2019-08-12 00:00:00,2019-08-19 00:00:00,2019-08-19 00:00:00
1,2019-08-25 08:15:46,2019-08-22 00:00:00,2019-08-24 00:00:00,2019-08-23 11:50:00
2,2019-08-25 08:15:46,2019-08-23 00:00:00,2019-08-24 00:00:00,2019-08-23 16:06:00
3,2019-08-25 08:15:46,2019-08-23 00:00:00,2019-08-24 00:00:00,2019-08-23 14:20:00
4,2019-08-25 08:15:46,2019-08-10 00:00:00,2019-08-24 00:00:00,2019-08-23 15:13:00
...,...,...,...,...
129683,2017-03-03 13:40:06,2014-11-12 21:00:00,2014-11-13 08:15:11,2014-11-13 08:15:00
129684,2017-03-03 13:40:06,2014-11-12 20:00:00,2014-11-13 08:30:15,2014-11-13 08:30:15
129685,2017-03-03 13:40:06,2014-10-29 10:00:00,2014-11-13 09:16:32,2014-11-13 09:10:00
129686,2017-03-03 13:40:06,2014-11-13 01:00:00,2014-11-13 09:51:39,2014-11-13 09:52:00


In [26]:
# Display every remaining column whose label contains "Time".
df[[col for col in df.columns if 'Time' in col]]

Unnamed: 0,reportedDateTime
0,2019-08-19 00:00:00
1,2019-08-23 11:50:00
2,2019-08-23 16:06:00
3,2019-08-23 14:20:00
4,2019-08-23 15:13:00
...,...
129683,2014-11-13 08:15:00
129684,2014-11-13 08:30:15
129685,2014-11-13 09:10:00
129686,2014-11-13 09:52:00


In [27]:
# Offense frequencies after the codes were stripped and lower-cased.
df.offense.value_counts()

theft     34070
tfmv      21670
burgd     17072
autoth    12653
shoplf     5906
biketf     5223
robpag     5046
aslt2      4432
burgb      4121
robper     3803
thftsw     2084
tfper      2006
cscr       1968
dastr      1874
daslt2     1785
aslt3      1111
tmvp       1076
tbldg       775
robbiz      679
aslt4       580
arson       553
daslt3      420
murdr       181
mvthft      175
aslt1       132
onltht       66
coinop       58
nopay        40
comput       31
scrap        24
pocket       24
daslt1       21
petit        18
disarm        9
loot          2
Name: offense, dtype: int64

In [28]:
# Frequency of each normalised description.
df['description'].value_counts()

other theft                       34070
theft from motr vehc              21670
burglary of dwelling              17072
motor vehicle theft                7825
shoplifting                        5906
bike theft                         5223
robbery per agg                    5046
automobile theft                   4828
asslt w/dngrs weapon               4432
burglary of business               4121
robbery of person                  3803
theft by swindle                   2084
domestic assault/strangulation     1874
2nd deg domes aslt                 1785
theft from person                  1437
crim sex cond-rape                 1303
aslt-sgnfcnt bdly hm               1111
theft-motr veh parts               1076
theft from building                 775
robbery of business                 679
csc - rape                          578
theft from person snatch/grab       569
arson                               514
3rd deg domes aslt                  420
aslt-police/emerg p                 399


In [32]:
# Non-null counts of the remaining columns for every (offense, description) pair,
# which shows the descriptions that occur under each offense code.
df.groupby(['offense','description']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,LastUpdateDateETL,beginDate,enteredDate,geometry,reportedDateTime
offense,description,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
arson,arson,514,514,514,514,514
arson,arson-1st degree,39,39,39,39,39
aslt1,aslt-great bodily hm,132,132,132,132,132
aslt2,asslt w/dngrs weapon,4432,4432,4432,4432,4432
aslt3,aslt-sgnfcnt bdly hm,1111,1111,1111,1111,1111
aslt4,aslt-police/emerg p,399,399,399,399,399
aslt4,aslt4-less than subst harm,121,121,121,121,121
aslt4,aslt4-subst harm or weapon,11,11,11,11,11
aslt4,obs - aslt-police/emerg p,49,49,49,49,49
autoth,automobile theft,4828,4828,4828,4828,4828


Open question: should this be restricted to burglary only? That is what Chicago is doing.

In [35]:
# Keep only rows whose offense code is one of the theft, burglary or robbery codes listed here.
dfFinal = df.loc[df['offense'].isin([
    'autoth', 'biketf', 'burgb', 'burgd',
    'coinop', 'mvthft', 'robbiz', 'robpag',
    'robper', 'shoplf', 'tbldg', 'tfmv',
    'tfper', 'theft', 'thftsw', 'tmvp',
])]

In [30]:
# Preview the four columns of interest in the full (unfiltered) frame.
df[['beginDate','description','offense','geometry']]

Unnamed: 0,beginDate,description,offense,geometry
0,2019-08-12 00:00:00,csc - rape,cscr,POINT (-93.26503 44.97776)
1,2019-08-22 00:00:00,theft by swindle,thftsw,POINT (-93.24486 44.94565)
2,2019-08-23 00:00:00,theft from motr vehc,tfmv,POINT (-93.29320 45.03054)
3,2019-08-23 00:00:00,shoplifting,shoplf,POINT (-93.27880 44.94834)
4,2019-08-10 00:00:00,burglary of dwelling,burgd,POINT (-93.30521 44.94843)
...,...,...,...,...
129683,2014-11-12 21:00:00,motor vehicle theft,autoth,POINT (-93.28127 44.96001)
129684,2014-11-12 20:00:00,theft from motr vehc,tfmv,POINT (-93.28811 44.94748)
129685,2014-10-29 10:00:00,other theft,theft,POINT (-93.21638 44.96458)
129686,2014-11-13 01:00:00,burglary of dwelling,burgd,POINT (-93.23222 44.96097)


In [36]:
# Non-null counts per (offense, description) pair within the filtered frame.
dfFinal.groupby(['offense','description']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,LastUpdateDateETL,beginDate,enteredDate,geometry,reportedDateTime
offense,description,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
autoth,automobile theft,4828,4828,4828,4828,4828
autoth,motor vehicle theft,7825,7825,7825,7825,7825
biketf,bike theft,5223,5223,5223,5223,5223
burgb,burglary of business,4121,4121,4121,4121,4121
burgd,burglary of dwelling,17072,17072,17072,17072,17072
coinop,theft/coinop device,58,58,58,58,58
mvthft,other vehicle theft,175,175,175,175,175
robbiz,robbery of business,679,679,679,679,679
robpag,robbery per agg,5046,5046,5046,5046,5046
robper,robbery of person,3803,3803,3803,3803,3803


In [46]:
# Boolean flag per (reportedDateTime, offense) group: True where the group has more than
# one non-null geometry. The result is only displayed; nothing is filtered or stored.
dfFinal.groupby(['reportedDateTime','offense']).count()['geometry']>1

reportedDateTime     offense
2014-01-01 01:17:00  shoplf     False
2014-01-01 03:24:00  burgd      False
2014-01-01 04:02:00  robper     False
2014-01-01 04:39:00  robper     False
2014-01-01 07:45:00  tfper      False
                                ...  
2020-03-09 13:36:00  biketf     False
2020-03-09 14:00:00  tfmv       False
2020-03-09 14:33:00  theft      False
2020-03-09 20:27:00  theft      False
2020-03-09 20:45:00  burgb      False
Name: geometry, Length: 115738, dtype: bool

In [47]:
# First five rows of the filtered frame.
dfFinal.head()

Unnamed: 0,LastUpdateDateETL,beginDate,description,enteredDate,geometry,offense,reportedDateTime
1,2019-08-25 08:15:46,2019-08-22,theft by swindle,2019-08-24,POINT (-93.24486 44.94565),thftsw,2019-08-23 11:50:00
2,2019-08-25 08:15:46,2019-08-23,theft from motr vehc,2019-08-24,POINT (-93.29320 45.03054),tfmv,2019-08-23 16:06:00
3,2019-08-25 08:15:46,2019-08-23,shoplifting,2019-08-24,POINT (-93.27880 44.94834),shoplf,2019-08-23 14:20:00
4,2019-08-25 08:15:46,2019-08-10,burglary of dwelling,2019-08-24,POINT (-93.30521 44.94843),burgd,2019-08-23 15:13:00
6,2019-08-25 08:15:46,2019-08-23,automobile theft,2019-08-24,POINT (-93.22687 44.99325),autoth,2019-08-23 20:23:00


In [60]:
# Burglary subset: business ('burgb') and dwelling ('burgd') burglaries, taken from the full frame.
dfBurg = df.loc[df['offense'].isin(['burgb', 'burgd'])]

In [54]:
# Non-null count per column of the burglary subset.
dfBurg.count()

LastUpdateDateETL    21193
beginDate            21193
description          21193
enteredDate          21193
geometry             21193
offense              21193
reportedDateTime     21193
dtype: int64

In [61]:
# Narrow the burglary subset to the three columns that are written to disk in the next cell.
dfBurg = dfBurg.loc[:, ['beginDate', 'offense', 'geometry']]

In [62]:
# Persist the burglary subset to 'burglaries.pickle' in the current working directory.
# NOTE(review): pickle files may not load across pandas/geopandas versions and must only be
# loaded from trusted sources; consider a portable format if this file is shared.
dfBurg.to_pickle('burglaries.pickle')