# Sourcing Crime Data 

## Imports

In [1]:
# import pandas, numpy, matplotlib, seaborn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing the requests library
import requests

import urllib.request
from pathlib import Path
import os

## Fetch Crime Data from WakeGov (daily police incidents as a proxy)

In [2]:
# https://data-wake.opendata.arcgis.com/datasets/ral::raleigh-police-incidents-nibrs/explore?location=35.786000%2C-78.643000%2C4.74&showTable=true
def getCrimeDataDf(forceFetch=False, timeout=300):
    """Return the police incidents table as a DataFrame, using a local CSV cache.

    The table is cached as ``crimedata.csv`` in the current working directory.
    When that file exists and ``forceFetch`` is false it is read back from disk;
    otherwise the GeoJSON export is downloaded, reduced to one row per feature
    (the feature's ``properties`` dict), written to the cache and returned.

    Parameters
    ----------
    forceFetch : bool, default False
        Download the data even if the cached CSV is present.
    timeout : float or tuple, default 300
        Forwarded to ``requests.get`` so a stalled connection cannot hang the
        notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per incident.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an error status.
    """
    filename = Path() / 'crimedata.csv'

    # Reuse the cached CSV unless a fresh download was explicitly requested
    if filename.exists() and not forceFetch:
        print('Using pre-fetched crime data...')
        df = pd.read_csv(filename)
        print('crime df shape:', df.shape)
        return df

    print('Fetching police incidents data...')

    endpoint = 'https://opendata.arcgis.com/datasets/24c0b37fa9bb4e16ba8bcaa7e806c615_0.geojson'

    r = requests.get(url=endpoint, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page as GeoJSON
    r.raise_for_status()

    # Each GeoJSON feature carries its attribute record under 'properties'
    rows = [feature['properties'] for feature in r.json()['features']]
    df = pd.DataFrame(rows)
    print('crime df shape:', df.shape)

    # Write to the same path the cache check reads from
    df.to_csv(filename, index=False)
    print('Done')
    return df

In [3]:
# Load the incidents table; forceFetch=False lets getCrimeDataDf reuse crimedata.csv when it exists
crime_df_raw = getCrimeDataDf(forceFetch=False)
# Preview the first rows of the raw frame
display(crime_df_raw.head())

Using pre-fetched crime data...
crime df shape: (371514, 21)


Unnamed: 0,OBJECTID,GlobalID,case_number,crime_category,crime_code,crime_description,crime_type,reported_block_address,city_of_incident,city,...,reported_date,reported_year,reported_month,reported_day,reported_hour,reported_dayofwk,latitude,longitude,agency,updated_date
0,12001,9cdee08d-11c8-4789-864b-6965a1b2e620,,MISCELLANEOUS,81H,Miscellaneous/Missing Person (18 & over),,,,RALEIGH,...,2017-01-15T03:28:00Z,2017,1,14,22,Saturday,0.0,0.0,RPD,2017-01-19T20:11:28.950Z
1,12002,6f6731f4-dd64-44c7-895c-555de2703c8a,,MISCELLANEOUS,81A,Miscellaneous/All Other Non-Offenses,,,,RALEIGH,...,2017-07-29T12:35:00Z,2017,7,29,8,Saturday,0.0,0.0,RPD,2017-08-01T14:06:50.553Z
2,12003,f0fd0e92-448e-4ca8-86c9-e6594564318b,,MISCELLANEOUS,81F,Miscellaneous/Mental Commitment,,,,RALEIGH,...,2016-03-07T03:52:00Z,2016,3,6,22,Sunday,0.0,0.0,RPD,2016-04-14T14:43:38.923Z
3,12004,8a212e84-7b53-478a-b225-c212aa25d7fd,,MISCELLANEOUS,81A,Miscellaneous/All Other Non-Offenses,,,,RALEIGH,...,2015-03-24T04:59:00Z,2015,3,24,0,Tuesday,0.0,0.0,RPD,2015-03-25T19:24:28.430Z
4,12005,01614b98-48f5-4374-a561-17c4b29d8857,,MISCELLANEOUS,81A,Miscellaneous/All Other Non-Offenses,,,,RALEIGH,...,2015-12-23T00:57:00Z,2015,12,22,19,Tuesday,0.0,0.0,RPD,2016-01-13T19:29:51.767Z


In [4]:
# Print the column dtypes and non-null counts of the raw incidents frame
crime_df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371514 entries, 0 to 371513
Data columns (total 21 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   OBJECTID                371514 non-null  int64  
 1   GlobalID                371514 non-null  object 
 2   case_number             277959 non-null  object 
 3   crime_category          371514 non-null  object 
 4   crime_code              371514 non-null  object 
 5   crime_description       371514 non-null  object 
 6   crime_type              217738 non-null  object 
 7   reported_block_address  277913 non-null  object 
 8   city_of_incident        277958 non-null  object 
 9   city                    371512 non-null  object 
 10  district                371514 non-null  object 
 11  reported_date           371514 non-null  object 
 12  reported_year           371514 non-null  int64  
 13  reported_month          371514 non-null  int64  
 14  reported_day        

In [5]:
# Count the distinct values in each column of the raw frame
crime_df_raw.nunique()

OBJECTID                  371514
GlobalID                  371514
case_number               244078
crime_category                33
crime_code                   127
crime_description            131
crime_type                     3
reported_block_address     33364
city_of_incident              14
city                          41
district                       7
reported_date             319191
reported_year                  8
reported_month                12
reported_day                  31
reported_hour                 24
reported_dayofwk               7
latitude                   65570
longitude                  65572
agency                         1
updated_date              335285
dtype: int64

In [6]:
# Columns whose fraction of missing values is at least 0.25
print(crime_df_raw.columns[crime_df_raw.isna().mean() >= 0.25])
# Distinct raw values of the city column
print(crime_df_raw['city'].unique())

Index(['case_number', 'crime_type', 'reported_block_address',
       'city_of_incident'],
      dtype='object')
['RALEIGH' 'GARNER' 'WAKE FOREST' 'Raleigh'
 'Miscellaneous/Missing Person (16-over)' 'raleigh' 'Garner' 'MORRISVILLE'
 'Cary' 'MAITLAND' 'APEX' 'wake forest' 'garner' 'FUQUAY VARINA'
 'HOLLY SPRINGS' 'KNIGHTDALE' 'CARY' 'Fuquay Varina' 'ZEBULON' 'WENDELL'
 'Wilmington' 'COLUMBIA' 'MELBOURNE' 'SALEMBURG' 'BENTON HARBOR, MI'
 'GOSHEN' nan 'Knightdale' 'Wendell' 'Rolesville' 'Wake Forest' 'Durham'
 'cary' 'Youngsville' 'Morrisville' 'durham' 'Holly Springs' '`' 'Zebulon'
 'Clayton' 'SPARTA' 'DURHAM']


In [7]:
# Show the first rows of the raw frame
crime_df_raw.head()

Unnamed: 0,OBJECTID,GlobalID,case_number,crime_category,crime_code,crime_description,crime_type,reported_block_address,city_of_incident,city,...,reported_date,reported_year,reported_month,reported_day,reported_hour,reported_dayofwk,latitude,longitude,agency,updated_date
0,12001,9cdee08d-11c8-4789-864b-6965a1b2e620,,MISCELLANEOUS,81H,Miscellaneous/Missing Person (18 & over),,,,RALEIGH,...,2017-01-15T03:28:00Z,2017,1,14,22,Saturday,0.0,0.0,RPD,2017-01-19T20:11:28.950Z
1,12002,6f6731f4-dd64-44c7-895c-555de2703c8a,,MISCELLANEOUS,81A,Miscellaneous/All Other Non-Offenses,,,,RALEIGH,...,2017-07-29T12:35:00Z,2017,7,29,8,Saturday,0.0,0.0,RPD,2017-08-01T14:06:50.553Z
2,12003,f0fd0e92-448e-4ca8-86c9-e6594564318b,,MISCELLANEOUS,81F,Miscellaneous/Mental Commitment,,,,RALEIGH,...,2016-03-07T03:52:00Z,2016,3,6,22,Sunday,0.0,0.0,RPD,2016-04-14T14:43:38.923Z
3,12004,8a212e84-7b53-478a-b225-c212aa25d7fd,,MISCELLANEOUS,81A,Miscellaneous/All Other Non-Offenses,,,,RALEIGH,...,2015-03-24T04:59:00Z,2015,3,24,0,Tuesday,0.0,0.0,RPD,2015-03-25T19:24:28.430Z
4,12005,01614b98-48f5-4374-a561-17c4b29d8857,,MISCELLANEOUS,81A,Miscellaneous/All Other Non-Offenses,,,,RALEIGH,...,2015-12-23T00:57:00Z,2015,12,22,19,Tuesday,0.0,0.0,RPD,2016-01-13T19:29:51.767Z


## Preprocessing

In [8]:
def preprocess_crimedata(df, min_year=2016, missing_threshold=0.25):
    """Clean the raw incidents frame and return a new, reduced frame.

    Steps, in order: drop fully duplicated rows, upper-case ``city``, keep rows
    with ``reported_year >= min_year``, drop the identifier/location columns
    listed below, drop any remaining column whose missing fraction is at least
    ``missing_threshold``, and replace ``reported_date`` with its calendar date.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw incidents frame; must contain the columns referenced below.
    min_year : int, default 2016
        Earliest ``reported_year`` to keep.
    missing_threshold : float, default 0.25
        Columns with a missing-value fraction at or above this are dropped.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame.

    Raises
    ------
    KeyError
        If one of the expected columns is absent.
    """
    # drop_duplicates() without inplace returns a new frame, leaving the caller's object untouched
    cleaned = df.drop_duplicates()

    # Normalize city spelling so 'Raleigh', 'raleigh' and 'RALEIGH' collapse to one value
    cleaned = cleaned.assign(city=cleaned['city'].str.upper())

    cleaned = cleaned[cleaned['reported_year'] >= min_year]

    # Drop irrelevant columns
    cleaned = cleaned.drop(columns=['GlobalID', 'case_number', 'updated_date', 'agency',
                                    'city_of_incident', 'reported_block_address', 'district',
                                    'latitude', 'longitude', 'reported_hour'])

    # Drop columns that are too sparse; measured after the year filter
    missing_value_columns = cleaned.columns[cleaned.isna().mean() >= missing_threshold]
    print(f'remove columns with more than {missing_threshold:.0%} missing values:', missing_value_columns)
    cleaned = cleaned.drop(columns=missing_value_columns)

    # NOTE(review): reported_date is parsed from a 'Z'-suffixed timestamp, so the calendar date
    # taken here can differ from reported_year/month/day (the sample output pairs
    # 2017-01-15T03:28:00Z with reported_day 14) — confirm which day the daily aggregation should use.
    cleaned['reported_date'] = pd.to_datetime(cleaned['reported_date']).dt.date

    return cleaned

In [9]:
# Preprocess a copy so crime_df_raw itself is not modified by preprocess_crimedata
crimedatadf = preprocess_crimedata(crime_df_raw.copy())
crimedatadf.head()

remove columns with more than 25% missing values: Index(['crime_type'], dtype='object')


Unnamed: 0,OBJECTID,crime_category,crime_code,crime_description,city,reported_date,reported_year,reported_month,reported_day,reported_dayofwk
0,12001,MISCELLANEOUS,81H,Miscellaneous/Missing Person (18 & over),RALEIGH,2017-01-15,2017,1,14,Saturday
1,12002,MISCELLANEOUS,81A,Miscellaneous/All Other Non-Offenses,RALEIGH,2017-07-29,2017,7,29,Saturday
2,12003,MISCELLANEOUS,81F,Miscellaneous/Mental Commitment,RALEIGH,2016-03-07,2016,3,6,Sunday
5,12006,MISCELLANEOUS,81A,Miscellaneous/All Other Non-Offenses,RALEIGH,2016-07-12,2016,7,12,Tuesday
6,12007,MISCELLANEOUS,81K,Miscellaneous/Overdose Death,RALEIGH,2017-04-10,2017,4,10,Monday


In [10]:
# Sanity-check the preprocessed frame: years kept, city values, and category/code frequencies
print(crimedatadf['reported_year'].unique())
print(crimedatadf['city'].unique())
print('total cities:',crimedatadf['city'].nunique())
print('crime categories:',crimedatadf['crime_category'].value_counts())
print('crime codes:',crimedatadf['crime_code'].value_counts())
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# display() only adds a stray "None" to the output
crimedatadf.info()

[2017 2016 2018 2019 2020 2021]
['RALEIGH' 'GARNER' 'WAKE FOREST' 'MAITLAND' 'FUQUAY VARINA' 'MORRISVILLE'
 'KNIGHTDALE' 'CARY' 'HOLLY SPRINGS' 'ZEBULON' 'WENDELL' 'WILMINGTON'
 'COLUMBIA' 'MELBOURNE' 'SALEMBURG' 'BENTON HARBOR, MI' 'GOSHEN' nan
 'ROLESVILLE' 'APEX' '`' 'CLAYTON' 'SPARTA' 'DURHAM']
total cities: 23
crime categories: MISCELLANEOUS                     68556
ASSAULT                           36628
LARCENY                           29565
DRUGS                             22376
FRAUD                             18543
VANDALISM                         18270
LARCENY FROM MV                   18086
ALL OTHER OFFENSES                13161
DRUG VIOLATIONS                    9043
BURGLARY/RESIDENTIAL               7951
WEAPONS VIOLATION                  7907
HUMANE                             7329
TRAFFIC                            6572
MV THEFT                           5660
SEX OFFENSES                       3650
ROBBERY                            3312
DISORDERLY CONDUCT       

None

In [11]:
# Persist the preprocessed frame to CSV without the index column
crimedatadf.to_csv('preprocessed_crimedata.csv', index=False)

In [12]:
def crime_cat_dummies(df):
    """Return ``df`` with ``crime_category`` expanded into one indicator column per category."""
    encoded = pd.get_dummies(df, columns=['crime_category'])
    return encoded

In [15]:
def _most_frequent_value(values):
    """Return the most frequent non-null value of a Series, or NaN when there is none."""
    modes = values.mode()
    # mode() is sorted, so ties resolve to the smallest value; it is empty for an all-null group
    return modes.iloc[0] if not modes.empty else np.nan


def getAggregatedCrimeDf(crime_df):
    """Aggregate incidents to one row per ``reported_date``.

    Parameters
    ----------
    crime_df : pandas.DataFrame
        Incident-level frame with ``reported_date`` and ``crime_code`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``reported_date``, ``crime_count`` (number of incidents that day)
        and ``frequent_crime_code`` (most common ``crime_code`` that day).
    """
    # Group the frame that was passed in, not a notebook-level global, so the
    # function works on any input and does not depend on kernel state
    by_date = crime_df.groupby('reported_date')

    crime_count = by_date.size().rename('crime_count')
    frequent_code = by_date['crime_code'].agg(_most_frequent_value).rename('frequent_crime_code')

    # Both series come from the same grouping, so they share an identical date index
    return pd.concat([crime_count, frequent_code], axis=1).reset_index()

# Build the daily feature table from the preprocessed incidents and preview it
aggcrime = getAggregatedCrimeDf(crimedatadf)
display(aggcrime.head())
# DataFrame.info() prints its summary itself and returns None; print() around it
# only adds a stray "None" line
aggcrime.info()

# Persist the daily features, then report how many distinct dates they cover
aggcrime.to_csv('feature_crimedata.csv', index=False)
print(aggcrime['reported_date'].nunique())

Unnamed: 0,reported_date,crime_count,frequent_crime_code
0,2016-01-01,140,25E
1,2016-01-02,138,81A
2,2016-01-03,143,25E
3,2016-01-04,117,35H
4,2016-01-05,110,81A


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2143 entries, 0 to 2142
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   reported_date        2143 non-null   object
 1   crime_count          2143 non-null   int64 
 2   frequent_crime_code  2143 non-null   object
dtypes: int64(1), object(2)
memory usage: 67.0+ KB
None
2143
