In [5]:
# import pandas, numpy, matplotlib, seaborn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing the requests library
import requests

import urllib.request
from pathlib import Path
import os

## Fetch Crime Data from WakeGov (daily police incidents as a proxy)

In [8]:
def getCrimeDataDf(forceFetch=False):
    """Return the police-incident data as a DataFrame, caching it on disk.

    Parameters
    ----------
    forceFetch : bool, default False
        When False and ``crimedata.csv`` already exists in the current
        working directory, the cached CSV is read and no request is made.
        When True the data is always downloaded and the cache overwritten.

    Returns
    -------
    pandas.DataFrame
        On a fresh download: one row per GeoJSON feature, built from each
        feature's ``properties`` mapping. On a cache hit: the contents of
        the cached CSV.

    Raises
    ------
    requests.HTTPError
        If the endpoint responds with a 4xx/5xx status.
    requests.Timeout
        If the server stops responding for longer than the timeout.
    """
    # Single source of truth for the cache location (read and write).
    filename = Path() / 'crimedata.csv'

    # Reuse the cached file unless the caller explicitly asks for a refresh.
    if os.path.exists(filename) and not forceFetch:
        print('Using pre-fetched crime data')
        df = pd.read_csv(filename)
        print('crime df shape:', df.shape)
        return df

    print('Fetching police incidents data...')

    endpoint = 'https://opendata.arcgis.com/datasets/693811eb361f4da286891eca1fae5943_0.geojson'

    # A timeout keeps the notebook from hanging on a stalled connection, and
    # raise_for_status() turns an HTTP error page into a clear exception
    # instead of an obscure JSON/KeyError failure further down.
    r = requests.get(url=endpoint, timeout=60)
    r.raise_for_status()

    # Each GeoJSON feature carries its tabular attributes under 'properties'.
    rows = [feature['properties'] for feature in r.json()['features']]
    df = pd.DataFrame(rows)
    print('crime df shape:', df.shape)
    df.to_csv(filename, index=False)
    print('Done')
    return df

In [10]:
# Load the incident data; with forceFetch=False the cached CSV is reused when present.
crime_df_raw = getCrimeDataDf(forceFetch=False)
display(crime_df_raw.head())

Using pre-fetched crime data
crime df shape: (1341, 25)


Unnamed: 0,OBJECTID,GlobalID,CreationDate,Creator,EditDate,Editor,case_number,crime_category,crime_code,crime_description,...,reported_date,reported_year,reported_month,reported_day,reported_hour,reported_dayofwk,latitude,longitude,agency,updated_date
0,529022,c330e7f9-d92e-494e-ac86-5a7668506b52,2020-02-18T20:14:01.341Z,RaleighGIS,2020-02-18T20:14:01.341Z,RaleighGIS,P20009234,LARCENY,35D,Larceny/Theft from Building,...,2020-02-17T05:15:00Z,2020,2,17,0,Monday,35.874504,-78.622925,RPD,2020-02-17T22:02:04.983Z
1,529023,842c43c3-9411-4d1e-b4d0-c6f8de560892,2020-02-18T20:14:01.341Z,RaleighGIS,2020-02-18T20:14:01.341Z,RaleighGIS,P20009234,FRAUD,56B,Fraud/Credit Card-ATM Fraud,...,2020-02-17T05:15:00Z,2020,2,17,0,Monday,35.874504,-78.622925,RPD,2020-02-17T22:02:04.983Z
2,529035,aa32c9dd-318e-4a7c-906c-3fceec8a0f7e,2020-02-18T17:58:16.023Z,RaleighGIS,2020-02-18T17:58:16.023Z,RaleighGIS,P20009235,ALL OTHER OFFENSES,80A,All Other/All Other Offenses,...,2020-02-17T05:48:00Z,2020,2,17,0,Monday,35.778107,-78.6342,RPD,2020-02-17T16:38:21.820Z
3,529038,2ef2112c-3cad-4ddc-a0ed-c9ca3a609288,2020-02-18T17:58:16.023Z,RaleighGIS,2020-02-18T17:58:16.023Z,RaleighGIS,,MISCELLANEOUS,81F,Miscellaneous/Mental Commitment,...,2020-02-17T06:35:00Z,2020,2,17,1,Monday,0.0,0.0,RPD,2020-02-17T16:22:49.540Z
4,529046,ac21533f-7ece-4adc-ac07-48a506f27c4f,2020-02-18T17:58:16.023Z,RaleighGIS,2020-02-18T17:58:16.023Z,RaleighGIS,,MISCELLANEOUS,81C,Miscellaneous/Found Property,...,2020-02-17T08:11:00Z,2020,2,17,3,Monday,0.0,0.0,RPD,2020-02-17T16:39:12.820Z


In [12]:
# Column dtypes and non-null counts of the raw frame.
crime_df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1341 entries, 0 to 1340
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   OBJECTID                1341 non-null   int64  
 1   GlobalID                1341 non-null   object 
 2   CreationDate            1341 non-null   object 
 3   Creator                 1341 non-null   object 
 4   EditDate                1341 non-null   object 
 5   Editor                  1341 non-null   object 
 6   case_number             875 non-null    object 
 7   crime_category          1341 non-null   object 
 8   crime_code              1341 non-null   object 
 9   crime_description       1341 non-null   object 
 10  crime_type              591 non-null    object 
 11  reported_block_address  873 non-null    object 
 12  city_of_incident        875 non-null    object 
 13  city                    1341 non-null   object 
 14  district                1341 non-null   

In [14]:
# Number of distinct values in each column of the raw frame.
crime_df_raw.nunique()

OBJECTID                  1341
GlobalID                  1341
CreationDate               106
Creator                      1
EditDate                   106
Editor                       1
case_number                821
crime_category              25
crime_code                  69
crime_description           70
crime_type                   3
reported_block_address     682
city_of_incident             2
city                         1
district                     7
reported_date             1256
reported_year                2
reported_month              11
reported_day                30
reported_hour               24
reported_dayofwk             7
latitude                   756
longitude                  756
agency                       1
updated_date              1283
dtype: int64

In [16]:
# Columns in which at least 25% of the values are missing.
crime_df_raw.columns[crime_df_raw.isna().mean() >= 0.25]

Index(['case_number', 'crime_type', 'reported_block_address',
       'city_of_incident'],
      dtype='object')

## Preprocessing

In [41]:
def preprocess_crimedata(df, missing_threshold=0.25):
    """Return a cleaned copy of the raw crime DataFrame.

    Steps, in order:
      1. drop fully duplicated rows,
      2. drop a fixed list of columns that are not used downstream,
      3. drop every remaining column whose fraction of missing values is
         at or above ``missing_threshold``.

    The input frame is never modified, so the cell calling this function can
    be re-run safely and earlier displays of the raw frame stay accurate.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw crime data; must contain all of the columns dropped in step 2.
    missing_threshold : float, default 0.25
        Fraction of missing values (0-1) at or above which a column is dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame with the duplicates and columns above removed.

    Raises
    ------
    KeyError
        If any of the fixed columns from step 2 is absent from ``df``.
    """
    # Non in-place on purpose: inplace=True would silently alter the caller's frame.
    df = df.drop_duplicates()

    # Drop irrelevant columns
    df = df.drop(['CreationDate', 'city_of_incident', 'EditDate', 'GlobalID',
                  'Creator', 'Editor', 'case_number', 'reported_hour', 'updated_date', 'agency', 'city'], axis=1)

    # Drop columns whose share of missing values reaches the threshold (>=).
    missing_value_columns = df.columns[df.isna().mean() >= missing_threshold]
    # Message wording is kept identical to the original for the default threshold.
    print(f'Dropping columns with more than {missing_threshold:.0%} missing values:', missing_value_columns)
    df = df.drop(missing_value_columns, axis=1)

    return df

In [42]:
# Run the preprocessing step on the raw frame and preview the result.
crimedatadf = preprocess_crimedata(crime_df_raw)
crimedatadf.head()

Dropping columns with more than 25% missing values: Index(['crime_type', 'reported_block_address'], dtype='object')


Unnamed: 0,OBJECTID,crime_category,crime_code,crime_description,district,reported_date,reported_year,reported_month,reported_day,reported_dayofwk,latitude,longitude
0,529022,LARCENY,35D,Larceny/Theft from Building,North,2020-02-17T05:15:00Z,2020,2,17,Monday,35.874504,-78.622925
1,529023,FRAUD,56B,Fraud/Credit Card-ATM Fraud,North,2020-02-17T05:15:00Z,2020,2,17,Monday,35.874504,-78.622925
2,529035,ALL OTHER OFFENSES,80A,All Other/All Other Offenses,Downtown,2020-02-17T05:48:00Z,2020,2,17,Monday,35.778107,-78.6342
3,529038,MISCELLANEOUS,81F,Miscellaneous/Mental Commitment,Northwest,2020-02-17T06:35:00Z,2020,2,17,Monday,0.0,0.0
4,529046,MISCELLANEOUS,81C,Miscellaneous/Found Property,Downtown,2020-02-17T08:11:00Z,2020,2,17,Monday,0.0,0.0


In [43]:
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in display() only adds a stray "None".
crimedatadf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1341 entries, 0 to 1340
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   OBJECTID           1341 non-null   int64  
 1   crime_category     1341 non-null   object 
 2   crime_code         1341 non-null   object 
 3   crime_description  1341 non-null   object 
 4   district           1341 non-null   object 
 5   reported_date      1341 non-null   object 
 6   reported_year      1341 non-null   int64  
 7   reported_month     1341 non-null   int64  
 8   reported_day       1341 non-null   int64  
 9   reported_dayofwk   1341 non-null   object 
 10  latitude           1341 non-null   float64
 11  longitude          1341 non-null   float64
dtypes: float64(2), int64(4), object(6)
memory usage: 136.2+ KB


None

In [44]:
# Sanity check: list any remaining columns with at least 25% missing values.
crimedatadf.columns[crimedatadf.isna().mean() >= 0.25]

Index([], dtype='object')