## Imports

In [1]:
# import pandas, numpy, matplotlib, seaborn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing the requests library
import requests

import urllib.request
from pathlib import Path
import os

## Fetch Crime Data from WakeGov (daily police incidents as a proxy)

In [2]:
# Dataset page: https://data-wake.opendata.arcgis.com/datasets/ral::raleigh-police-incidents-nibrs/explore?location=35.786000%2C-78.643000%2C4.74&showTable=true
def getCrimeDataDf(forceFetch=False):
    """Return the Raleigh police incidents dataset as a DataFrame.

    The data is cached in ``crimedata.csv`` in the current working directory.
    When that file exists and ``forceFetch`` is false, it is read back from
    disk; otherwise the GeoJSON export is downloaded, flattened to one row per
    feature (its ``properties`` mapping), written to the cache file and
    returned.

    Parameters
    ----------
    forceFetch : bool, default False
        Download the data even when a cached CSV is already present.

    Returns
    -------
    pandas.DataFrame
        One row per incident.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. The cache file is left
        untouched in that case.
    requests.Timeout
        If the server does not respond within the configured timeout.

    Notes
    -----
    A freshly fetched frame is built directly from the JSON payload, while the
    cached path goes through a CSV round-trip; with pandas' default parsing,
    empty strings in the payload come back as NaN when read from the cache.
    """
    filename = Path() / 'crimedata.csv'

    # Reuse the cached file unless the caller explicitly asked for a download.
    if filename.exists() and not forceFetch:
        print('Using pre-fetched crime data...')
        df = pd.read_csv(filename)
        print('crime df shape:', df.shape)
        return df

    print('Fetching police incidents data...')

    endpoint = 'https://opendata.arcgis.com/datasets/24c0b37fa9bb4e16ba8bcaa7e806c615_0.geojson'

    # (connect, read) timeouts: never hang forever on a dead connection, but
    # leave the server plenty of time to stream the large export.
    r = requests.get(url=endpoint, timeout=(10, 300))
    # Fail loudly on 4xx/5xx instead of parsing an error page and
    # overwriting a good cache with garbage.
    r.raise_for_status()

    # Each GeoJSON feature carries the tabular attributes under 'properties'.
    rows = [feature['properties'] for feature in r.json()['features']]
    df = pd.DataFrame(rows)
    print('crime df shape:', df.shape)
    df.to_csv(filename, index=False)
    print('Done')
    return df

In [3]:
# forceFetch=True bypasses any cached crimedata.csv and re-downloads the dataset;
# pass forceFetch=False to reuse the cached file instead.
crime_df_raw = getCrimeDataDf(forceFetch=True)
display(crime_df_raw.head())

Fetching police incidents data...
crime df shape: (371051, 21)
Done


Unnamed: 0,OBJECTID,GlobalID,case_number,crime_category,crime_code,crime_description,crime_type,reported_block_address,city_of_incident,city,...,reported_date,reported_year,reported_month,reported_day,reported_hour,reported_dayofwk,latitude,longitude,agency,updated_date
0,12001,9cdee08d-11c8-4789-864b-6965a1b2e620,,MISCELLANEOUS,81H,Miscellaneous/Missing Person (18 & over),,,,RALEIGH,...,2017-01-15T03:28:00Z,2017,1,14,22,Saturday,0.0,0.0,RPD,2017-01-19T20:11:28.950Z
1,12002,6f6731f4-dd64-44c7-895c-555de2703c8a,,MISCELLANEOUS,81A,Miscellaneous/All Other Non-Offenses,,,,RALEIGH,...,2017-07-29T12:35:00Z,2017,7,29,8,Saturday,0.0,0.0,RPD,2017-08-01T14:06:50.553Z
2,12003,f0fd0e92-448e-4ca8-86c9-e6594564318b,,MISCELLANEOUS,81F,Miscellaneous/Mental Commitment,,,,RALEIGH,...,2016-03-07T03:52:00Z,2016,3,6,22,Sunday,0.0,0.0,RPD,2016-04-14T14:43:38.923Z
3,12004,8a212e84-7b53-478a-b225-c212aa25d7fd,,MISCELLANEOUS,81A,Miscellaneous/All Other Non-Offenses,,,,RALEIGH,...,2015-03-24T04:59:00Z,2015,3,24,0,Tuesday,0.0,0.0,RPD,2015-03-25T19:24:28.430Z
4,12005,01614b98-48f5-4374-a561-17c4b29d8857,,MISCELLANEOUS,81A,Miscellaneous/All Other Non-Offenses,,,,RALEIGH,...,2015-12-23T00:57:00Z,2015,12,22,19,Tuesday,0.0,0.0,RPD,2016-01-13T19:29:51.767Z


In [4]:
# Column dtypes and non-null counts of the raw frame.
crime_df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371051 entries, 0 to 371050
Data columns (total 21 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   OBJECTID                371051 non-null  int64  
 1   GlobalID                371051 non-null  object 
 2   case_number             371051 non-null  object 
 3   crime_category          371051 non-null  object 
 4   crime_code              371051 non-null  object 
 5   crime_description       371051 non-null  object 
 6   crime_type              371051 non-null  object 
 7   reported_block_address  371051 non-null  object 
 8   city_of_incident        371051 non-null  object 
 9   city                    371051 non-null  object 
 10  district                371051 non-null  object 
 11  reported_date           371051 non-null  object 
 12  reported_year           371051 non-null  int64  
 13  reported_month          371051 non-null  int64  
 14  reported_day        

In [5]:
# Number of distinct values in each column of the raw frame.
crime_df_raw.nunique()

OBJECTID                  371051
GlobalID                  371051
case_number               243802
crime_category                33
crime_code                   127
crime_description            131
crime_type                     4
reported_block_address     33345
city_of_incident              15
city                          42
district                       7
reported_date             318780
reported_year                  8
reported_month                12
reported_day                  31
reported_hour                 24
reported_dayofwk               7
latitude                   65501
longitude                  65503
agency                         1
updated_date              334855
dtype: int64

In [6]:
# Columns in which at least 25% of the values are missing.
print(crime_df_raw.columns[crime_df_raw.isna().mean() >= 0.25])
# Distinct raw city labels, to see how consistently they are written.
print(crime_df_raw['city'].unique())

Index([], dtype='object')
['RALEIGH' 'GARNER' 'WAKE FOREST' 'Raleigh'
 'Miscellaneous/Missing Person (16-over)' 'raleigh' 'Garner' 'MORRISVILLE'
 'Cary' 'MAITLAND' 'APEX' 'wake forest' 'garner' 'FUQUAY VARINA'
 'HOLLY SPRINGS' 'KNIGHTDALE' 'CARY' 'Fuquay Varina' 'ZEBULON' 'WENDELL'
 'Wilmington' 'COLUMBIA' 'MELBOURNE' 'SALEMBURG' 'BENTON HARBOR, MI'
 'GOSHEN' '' 'Knightdale' 'Wendell' 'Rolesville' 'Wake Forest' 'Durham'
 'cary' 'Youngsville' 'Morrisville' 'durham' 'Holly Springs' '`' 'Zebulon'
 'Clayton' 'SPARTA' 'DURHAM']


In [7]:
# Preview the first rows of the raw frame.
crime_df_raw.head()

Unnamed: 0,OBJECTID,GlobalID,case_number,crime_category,crime_code,crime_description,crime_type,reported_block_address,city_of_incident,city,...,reported_date,reported_year,reported_month,reported_day,reported_hour,reported_dayofwk,latitude,longitude,agency,updated_date
0,12001,9cdee08d-11c8-4789-864b-6965a1b2e620,,MISCELLANEOUS,81H,Miscellaneous/Missing Person (18 & over),,,,RALEIGH,...,2017-01-15T03:28:00Z,2017,1,14,22,Saturday,0.0,0.0,RPD,2017-01-19T20:11:28.950Z
1,12002,6f6731f4-dd64-44c7-895c-555de2703c8a,,MISCELLANEOUS,81A,Miscellaneous/All Other Non-Offenses,,,,RALEIGH,...,2017-07-29T12:35:00Z,2017,7,29,8,Saturday,0.0,0.0,RPD,2017-08-01T14:06:50.553Z
2,12003,f0fd0e92-448e-4ca8-86c9-e6594564318b,,MISCELLANEOUS,81F,Miscellaneous/Mental Commitment,,,,RALEIGH,...,2016-03-07T03:52:00Z,2016,3,6,22,Sunday,0.0,0.0,RPD,2016-04-14T14:43:38.923Z
3,12004,8a212e84-7b53-478a-b225-c212aa25d7fd,,MISCELLANEOUS,81A,Miscellaneous/All Other Non-Offenses,,,,RALEIGH,...,2015-03-24T04:59:00Z,2015,3,24,0,Tuesday,0.0,0.0,RPD,2015-03-25T19:24:28.430Z
4,12005,01614b98-48f5-4374-a561-17c4b29d8857,,MISCELLANEOUS,81A,Miscellaneous/All Other Non-Offenses,,,,RALEIGH,...,2015-12-23T00:57:00Z,2015,12,22,19,Tuesday,0.0,0.0,RPD,2016-01-13T19:29:51.767Z


## Preprocessing

In [8]:
def preprocess_crimedata(df, min_year=2018, missing_threshold=0.25):
    """Clean the raw police incidents frame for analysis.

    Steps, in order: drop exact duplicate rows, drop columns not used by the
    analysis, upper-case the ``city`` labels, keep only rows reported in
    ``min_year`` or later, and drop columns whose share of missing values is
    at least ``missing_threshold``.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw incidents frame. It must contain every column listed in
        ``irrelevant_columns`` below as well as ``city`` and ``reported_year``.
    min_year : int, default 2018
        Earliest ``reported_year`` to keep (inclusive).
    missing_threshold : float, default 0.25
        Columns whose fraction of missing values is greater than or equal to
        this value are removed.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame; it keeps the original index labels of the
        surviving rows.

    Raises
    ------
    KeyError
        If one of the expected columns is absent from ``df``.
    """
    irrelevant_columns = ['GlobalID', 'case_number', 'updated_date', 'agency',
                          'city_of_incident', 'reported_block_address', 'district',
                          'latitude', 'longitude', 'reported_hour', 'reported_date']

    # Non-inplace drop_duplicates: the caller's raw frame must stay untouched
    # so that re-running earlier notebook cells still shows the same data.
    cleaned = df.drop_duplicates().drop(irrelevant_columns, axis=1)

    # Normalize cities so that 'Raleigh', 'raleigh' and 'RALEIGH' collapse.
    cleaned['city'] = cleaned['city'].str.upper()

    cleaned = cleaned[cleaned['reported_year'] >= min_year]

    # Drop columns whose missing-value share reaches the threshold.
    missing_value_columns = cleaned.columns[cleaned.isna().mean() >= missing_threshold]
    print(f'remove columns with more than {missing_threshold:.0%} missing values:', missing_value_columns)
    return cleaned.drop(missing_value_columns, axis=1)

In [9]:
# Run the preprocessing step on the raw frame and preview the result.
crimedatadf = preprocess_crimedata(crime_df_raw)
crimedatadf.head()

remove columns with more than 25% missing values: Index([], dtype='object')


Unnamed: 0,OBJECTID,crime_category,crime_code,crime_description,crime_type,city,reported_year,reported_month,reported_day,reported_dayofwk
35,12042,MISCELLANEOUS,81A,Miscellaneous/All Other Non-Offenses,,RALEIGH,2018,2,4,Sunday
105,12126,MISCELLANEOUS,81A,Miscellaneous/All Other Non-Offenses,,RALEIGH,2018,2,27,Tuesday
109,12130,MISCELLANEOUS,81B,Miscellaneous/Deceased Person,,RALEIGH,2018,2,14,Wednesday
119,12142,MISCELLANEOUS,81E,Miscellaneous/Lost Property,,RALEIGH,2018,1,2,Tuesday
174,12203,MISCELLANEOUS,81A,Miscellaneous/All Other Non-Offenses,,RALEIGH,2018,1,10,Wednesday


In [10]:
# Sanity-check the preprocessed frame: remaining years, crime types and cities.
print(crimedatadf['reported_year'].unique())
print(crimedatadf['crime_type'].unique())
print(crimedatadf['city'].unique())
print('total cities:',crimedatadf['city'].nunique())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only appends a stray "None" to the cell output.
crimedatadf.info()

[2018 2019 2020 2021]
['' 'CRIMES AGAINST PERSONS' 'CRIMES AGAINST PROPERTY'
 'CRIMES AGAINST SOCIETY']
['RALEIGH' 'CARY' 'WENDELL' 'KNIGHTDALE' 'WILMINGTON' 'WAKE FOREST'
 'COLUMBIA' 'MELBOURNE' 'SALEMBURG' 'BENTON HARBOR, MI' 'GOSHEN' ''
 'GARNER' 'ROLESVILLE' 'FUQUAY VARINA' 'MORRISVILLE' 'HOLLY SPRINGS' '`'
 'ZEBULON' 'CLAYTON' 'SPARTA' 'DURHAM']
total cities: 22
<class 'pandas.core.frame.DataFrame'>
Int64Index: 189202 entries, 35 to 371050
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   OBJECTID           189202 non-null  int64 
 1   crime_category     189202 non-null  object
 2   crime_code         189202 non-null  object
 3   crime_description  189202 non-null  object
 4   crime_type         189202 non-null  object
 5   city               189202 non-null  object
 6   reported_year      189202 non-null  int64 
 7   reported_month     189202 non-null  int64 
 8   reported_day       189202 non-nul

None

In [11]:
# Persist the preprocessed frame to CSV, leaving out the index column.
crimedatadf.to_csv('preprocessed_crimedata.csv', index=False)