<b>Create Sidebar Navigation - best viewed in full screen</b>

In [1]:
%%javascript
$('<div id="toc"></div>').css({position: 'fixed', top: '120px', left: 0}).appendTo(document.body);
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js');

<IPython.core.display.Javascript object>

# Read In Data

In [2]:
# Import dependencies
import pandas as pd
import re
import numpy as np

In [3]:
# CSV files downloaded from:
# https://www.wake.gov/departments-government/city-county-bureau-identification-ccbi/criminal-arrest-records
# The 2022 records come as two files: part 1 covers 01/01/2022 to 06/30/2022,
# part 2 covers 07/01/2022 to 12/31/2022.
arrest_columns = ["Name", "Date of Arrest", "Arrest Location", "Arresting Agency", "Charge"]

# Read each half, keeping only the columns listed above, and replace every
# missing value with the placeholder string "OTHER".
original_df_pt1 = pd.read_csv(
    "../resources/original/wake_county_crime_data_2022_01-01_06-30.csv",
    usecols=arrest_columns,
).fillna("OTHER")
original_df_pt2 = pd.read_csv(
    "../resources/original/wake_county_crime_data_2022_07-01_12-31.csv",
    usecols=arrest_columns,
).fillna("OTHER")

# Stack the two halves into one dataframe with a fresh 0..n-1 row index
original_df = pd.concat([original_df_pt1, original_df_pt2], axis=0, ignore_index=True)
original_df.head()

Unnamed: 0,Name,Date of Arrest,Arrest Location,Arresting Agency,Charge
0,"BROTHERS,GLEN DEAN",06-30-2022 11:57:00,"401 WAKE CHAPEL RD FUQUAY-VARINA, NC",FUQUAY POLICE DEPARTMENT,FAIL REPRT NEW ADDRESS-SEX OFF
1,"WADE,JERRELL KUTROI",06-30-2022 22:40:00,"I-540/BUFFALOE RD KNIGHTDALE, NC",STATE HIGHWAY PATROL,DRIVING WHILE IMPAIRED
2,"SMITH,STEVEN CARMOS",06-30-2022 23:00:00,"KNIGHTDALE BLVD/HODGE RD CARY, NC",CARY POLICE DEPARTMENT,ASSAULT ON A FEMALE
3,"HAMMOND,JASON BRADLEY",06-30-2022 21:45:00,"2100 GLASCOCK ST RALEIGH, NC",RALEIGH POLICE DEPARTMENT,FELONY POSSESSION OF COCAINE
4,"STEWART,MITCHELL TYSON",06-30-2022 22:45:00,"KNIGHTDALE BLVD/HODGES RD CARY, NC",CARY POLICE DEPARTMENT,ASSAULT ON A FEMALE


# Extract City of Arrest Location

In [4]:
# Create a "City" column from the free-text arrest location so arrests can be grouped by city.
# The pattern anchors on the trailing ", NC" and captures the text just before it. As the
# displayed rows show, one-word cities come back with the preceding street suffix attached
# (e.g. "RD RALEIGH", "ST RALEIGH"); these raw values are normalised to plain city names by
# the misspelling step further down the notebook.
# expand=False returns a Series, which is what a single-column assignment needs.
original_df["City"] = original_df["Arrest Location"].str.extract(r'\D+\s(\S+\s\D+),\sNC\Z', expand=False)

# Drop rows where there is no city to be found. Restricting dropna to the "City" column keeps
# this step from also discarding rows that happen to have a missing value in some other column.
# .copy() makes the result an independent frame so the later in-place .loc assignments on
# "City" do not write to a filtered slice (which can raise pandas' SettingWithCopyWarning).
original_df = original_df.dropna(subset=["City"]).copy()

original_df

Unnamed: 0,Name,Date of Arrest,Arrest Location,Arresting Agency,Charge,City
0,"BROTHERS,GLEN DEAN",06-30-2022 11:57:00,"401 WAKE CHAPEL RD FUQUAY-VARINA, NC",FUQUAY POLICE DEPARTMENT,FAIL REPRT NEW ADDRESS-SEX OFF,RD FUQUAY-VARINA
1,"WADE,JERRELL KUTROI",06-30-2022 22:40:00,"I-540/BUFFALOE RD KNIGHTDALE, NC",STATE HIGHWAY PATROL,DRIVING WHILE IMPAIRED,RD KNIGHTDALE
2,"SMITH,STEVEN CARMOS",06-30-2022 23:00:00,"KNIGHTDALE BLVD/HODGE RD CARY, NC",CARY POLICE DEPARTMENT,ASSAULT ON A FEMALE,RD CARY
3,"HAMMOND,JASON BRADLEY",06-30-2022 21:45:00,"2100 GLASCOCK ST RALEIGH, NC",RALEIGH POLICE DEPARTMENT,FELONY POSSESSION OF COCAINE,ST RALEIGH
4,"STEWART,MITCHELL TYSON",06-30-2022 22:45:00,"KNIGHTDALE BLVD/HODGES RD CARY, NC",CARY POLICE DEPARTMENT,ASSAULT ON A FEMALE,RD CARY
...,...,...,...,...,...,...
35971,"PEACE,TIMOTHY MAC",07-01-2022 01:00:00,"4510 CAPITAL BLVD RALEIGH, NC",RALEIGH POLICE DEPARTMENT,OBTAIN PROPERTY FALSE PRETENSE,BLVD RALEIGH
35972,"PEACE,TIMOTHY MAC",07-01-2022 01:00:00,"4510 CAPITAL BLVD RALEIGH, NC",RALEIGH POLICE DEPARTMENT,POSS STOLEN GOODS/PROP (M),BLVD RALEIGH
35973,"REN,XIAOYU",07-01-2022 01:00:00,"2 BROUGHTON DR RALEIGH, NC",NC STATE UNIVERSITY POLICE DEP,FIRST DEG TRESP ENTER/REMAIN,DR RALEIGH
35974,"LINCOLN,REBECCA LOUISE",07-01-2022 02:00:00,"3400 WAKE FOREST RD RALEIGH, NC",RALEIGH POLICE DEPARTMENT,SECOND DEGREE TRESPASS,RD RALEIGH


## Find Misspelled Cities

In [5]:
# List every extracted "City" value that is not already an exact, correctly spelled city name,
# together with how many rows carry it.
expected_cities = [
    "ANGIER", "APEX", "CARY", "CLAYTON", "CREEDMOOR", "DURHAM", "FUQUAY-VARINA",
    "GARNER", "HOLLY SPRINGS", "KNIGHTDALE", "MORRISVILLE", "NEW HILL", "RALEIGH",
    "ROLESVILLE", "WAKE FOREST", "WENDELL", "WILLOW SPRING", "YOUNGSVILLE", "ZEBULON",
]

# value -> number of occurrences, turned into a two-column table
counts = original_df["City"].value_counts().to_dict()
city_counts = pd.DataFrame(list(counts.items()), columns=["City", "Count"])

# Show only the values that still need attention
is_expected = city_counts["City"].isin(expected_cities)
city_counts[~is_expected]

Unnamed: 0,City,Count
0,RD RALEIGH,7846
1,ST RALEIGH,4784
2,AVE RALEIGH,2961
3,DR RALEIGH,2389
4,BLVD RALEIGH,1505
...,...,...
1204,VIEW CARY,1
1205,TRL WENDELL,1
1206,HWY RALEIGH,1
1207,RUN CARY,1


## Account for Valid Misspellings

In [6]:
# Collapse misspellings and suffix-prefixed values of the 19 listed municipalities onto one
# canonical name each. Every rule is (regex searched for anywhere in "City", canonical name).
# The rules are applied strictly in the order listed, and each rule is evaluated against the
# column as left by the previous rules, so the order must not be changed.
# NOTE(review): the patterns are unanchored substrings, so a short fragment such as 'HILL',
# 'TON' or 'CAR' also matches any other place name containing it -- confirm that no
# out-of-area locations are being folded into these municipalities.
city_rules = [
    ('ANG|IER|EIR', 'ANGIER'),
    ('APE|EX', 'APEX'),
    ('CAR|CRA', 'CARY'),
    ('CLAY|TON', 'CLAYTON'),
    ('MOOR$', 'CREEDMOOR'),
    ('DUR|HAM', 'DURHAM'),
    ('FUQ|VAR', 'FUQUAY VARINA'),
    ('GAR', 'GARNER'),
    ('HOL', 'HOLLY SPRINGS'),
    ('KNIG|DALE', 'KNIGHTDALE'),
    ('MORIS|MORRIS|MOORI|MORRS', 'MORRISVILLE'),
    ('NEW|HILL', 'NEW HILL'),
    ('RAL|RLA|RLE|RA;E', 'RALEIGH'),
    ('ROL', 'ROLESVILLE'),
    ('WAKE|FOREST|FORST', 'WAKE FOREST'),
    ('WEND|DEL', 'WENDELL'),
    ('WILL', 'WILLOW SPRING'),
    ('YOUNG', 'YOUNGSVILLE'),
    ('ZEB|ZEU', 'ZEBULON'),
]
for fragment, canonical_name in city_rules:
    matches = original_df["City"].str.contains(fragment)
    original_df.loc[matches, 'City'] = canonical_name

# Re-tabulate the column and show whatever did not end up as one of the canonical names,
# to check whether any variant was missed.
canonical_cities = [name for _, name in city_rules]
counts = original_df["City"].value_counts().to_dict()
city_counts = pd.DataFrame(list(counts.items()), columns=["City", "Count"])
is_canonical = city_counts["City"].isin(canonical_cities)
city_counts[~is_canonical]

Unnamed: 0,City,Count
17,MOUNT OLIVE,8
18,DR L,7
19,161 EFLAND,7
20,4 HENDERSON,7
22,RD WILSON,5
23,RD FAYETTEVILLE,5
25,DR SANFORD,4
26,ST GOLDSBORO,3
27,DR GREENVILLE,3
28,RD SHANNON,2


## Remove Crimes Not in Wake County

In [7]:
# Keep only the rows whose normalised city is one of the listed municipalities;
# everything else is treated as not charged in Wake County and removed.
wake_county_cities = [
    "ANGIER", "APEX", "CARY", "CLAYTON", "CREEDMOOR", "DURHAM", "FUQUAY VARINA",
    "GARNER", "HOLLY SPRINGS", "KNIGHTDALE", "MORRISVILLE", "NEW HILL", "RALEIGH",
    "ROLESVILLE", "WAKE FOREST", "WENDELL", "WILLOW SPRING", "YOUNGSVILLE", "ZEBULON",
]
in_wake_county = original_df["City"].isin(wake_county_cities)
original_df = original_df[in_wake_county]

# Number of remaining rows per city, largest first
original_df["City"].value_counts()

RALEIGH          23995
CARY              1765
GARNER            1678
WAKE FOREST       1421
HOLLY SPRINGS      994
FUQUAY VARINA      966
APEX               915
KNIGHTDALE         625
MORRISVILLE        532
ZEBULON            477
WENDELL            383
ROLESVILLE         217
WILLOW SPRING      111
DURHAM              68
CLAYTON             43
NEW HILL            36
CREEDMOOR           18
ANGIER               5
YOUNGSVILLE          4
Name: City, dtype: int64

# Create Categories

## Personal Violence Crimes

In [8]:
# Filter crimes - personal violence crimes (assault, battery, identity theft, trespassing, etc.)
# The lookahead keeps every charge that contains at least one of the listed fragments.
# NOTE(review): the fragments match anywhere in the charge text, so short ones such as 'ID'
# or 'ASS' also hit inside longer words (e.g. "ACCIDENT", "PASSING") -- confirm against the
# actual charge vocabulary.
results_df = original_df[original_df["Charge"].str.contains('^(?=.*ABUSE|.*ASS|.*AWDW|.*BATT|.*EXPLOIT|.*HUMAN|.*ID|.*TRESP).*$')]

# Per city: number of matching charges and number of distinct names charged
personal_df = results_df.groupby("City").agg(
    Personal_Crime_Count=("Charge", "count"),
    Personal_Name_Count=("Name", "nunique"),
)

# Give every city present in the arrest data a row, filling cities with no matching charge
# with zeros. Deriving the list from the data (instead of hard-coding which city is missing)
# keeps this correct if the input files change, and never overwrites real counts.
all_cities = pd.Index(sorted(original_df["City"].unique()), name="City")
personal_df = personal_df.reindex(all_cities, fill_value=0)

personal_df

Unnamed: 0_level_0,Personal_Crime_Count,Personal_Name_Count
City,Unnamed: 1_level_1,Unnamed: 2_level_1
ANGIER,0,0
APEX,144,112
CARY,404,319
CLAYTON,11,6
CREEDMOOR,1,1
DURHAM,18,9
FUQUAY VARINA,154,116
GARNER,259,202
HOLLY SPRINGS,111,88
KNIGHTDALE,109,83


## Property Crimes

In [9]:
# Filter crimes - property crimes (theft, larceny, etc.)
# The lookahead requires at least one of the listed fragments; the trailing negative
# lookahead rejects any charge that contains "ID" anywhere in its text.
# NOTE(review): fragments match anywhere in the charge text (e.g. 'ENTER' inside a longer
# word) -- confirm against the actual charge vocabulary.
results_df = original_df[original_df["Charge"].str.contains('^(?=.*BREAK|.*ENTER|.*LAR|.*PROP|.*ROBB|.*STOL|.*STLN|.*THEFT)(?:(?!ID).)*$')]

# Per city: number of matching charges and number of distinct names charged
property_df = results_df.groupby("City").agg(
    Property_Crime_Count=("Charge", "count"),
    Property_Name_Count=("Name", "nunique"),
)

# Give every city present in the arrest data a row, filling cities with no matching charge
# with zeros. Deriving the list from the data (instead of hard-coding which city is missing)
# keeps this correct if the input files change, and never overwrites real counts.
all_cities = pd.Index(sorted(original_df["City"].unique()), name="City")
property_df = property_df.reindex(all_cities, fill_value=0)

property_df

Unnamed: 0_level_0,Property_Crime_Count,Property_Name_Count
City,Unnamed: 1_level_1,Unnamed: 2_level_1
ANGIER,2,1
APEX,215,130
CARY,293,187
CLAYTON,9,4
CREEDMOOR,13,1
DURHAM,23,8
FUQUAY VARINA,139,87
GARNER,441,286
HOLLY SPRINGS,205,110
KNIGHTDALE,143,95


## Drug Crimes

In [10]:
# Filter crimes - drug-based crimes (possession, trafficking, etc.)
# The lookahead requires at least one of the listed fragments; the trailing negative
# lookahead rejects any charge containing FRAUD, FIREARM, ID, STOLEN or STLN, which keeps
# non-drug "POSS ..." charges (e.g. possession of stolen goods) out of this category.
# NOTE(review): 'ID' is excluded as a bare substring, so any drug charge whose text happens
# to contain those two letters is dropped as well -- confirm against the charge vocabulary.
results_df = original_df[original_df["Charge"].str.contains('^(?=.*DRUG|.*MARIJ|.*MARJ|.*POSS|.*PWISD)(?:(?!FRAUD|FIREARM|ID|STOLEN|STLN).)*$')]

# Per city: number of matching charges and number of distinct names charged
drug_df = results_df.groupby("City").agg(
    Drug_Crime_Count=("Charge", "count"),
    Drug_Name_Count=("Name", "nunique"),
)

# Give every city present in the arrest data a row, filling cities with no matching charge
# with zeros. Deriving the list from the data (instead of hard-coding which cities are
# missing) keeps this correct if the input files change, and never overwrites real counts.
all_cities = pd.Index(sorted(original_df["City"].unique()), name="City")
drug_df = drug_df.reindex(all_cities, fill_value=0)

drug_df

Unnamed: 0_level_0,Drug_Crime_Count,Drug_Name_Count
City,Unnamed: 1_level_1,Unnamed: 2_level_1
ANGIER,0,0
APEX,146,77
CARY,239,145
CLAYTON,7,4
CREEDMOOR,1,1
DURHAM,6,4
FUQUAY VARINA,223,126
GARNER,285,171
HOLLY SPRINGS,272,133
KNIGHTDALE,95,54


## Total Crimes

In [11]:
# Overall totals per city, across all charge types:
#   Total_Crime_Count - number of charge rows
#   Total_Name_Count  - number of distinct names charged
total_df = original_df.groupby("City").agg(
    Total_Crime_Count=("Charge", "count"),
    Total_Name_Count=("Name", "nunique"),
)

total_df

Unnamed: 0_level_0,Total_Crime_Count,Total_Name_Count
City,Unnamed: 1_level_1,Unnamed: 2_level_1
ANGIER,5,4
APEX,915,434
CARY,1765,895
CLAYTON,43,17
CREEDMOOR,18,1
DURHAM,68,22
FUQUAY VARINA,966,421
GARNER,1678,851
HOLLY SPRINGS,994,448
KNIGHTDALE,625,326


# Combine Categories

In [12]:
# Place the personal, property, drug and total tables side by side (aligned on city),
# cast every count to int, and move the city index back into an ordinary column.
category_frames = [personal_df, property_df, drug_df, total_df]
crime_count_df = pd.concat(category_frames, axis=1).astype(int).reset_index()
crime_count_df

Unnamed: 0,City,Personal_Crime_Count,Personal_Name_Count,Property_Crime_Count,Property_Name_Count,Drug_Crime_Count,Drug_Name_Count,Total_Crime_Count,Total_Name_Count
0,ANGIER,0,0,2,1,0,0,5,4
1,APEX,144,112,215,130,146,77,915,434
2,CARY,404,319,293,187,239,145,1765,895
3,CLAYTON,11,6,9,4,7,4,43,17
4,CREEDMOOR,1,1,13,1,1,1,18,1
5,DURHAM,18,9,23,8,6,4,68,22
6,FUQUAY VARINA,154,116,139,87,223,126,966,421
7,GARNER,259,202,441,286,285,171,1678,851
8,HOLLY SPRINGS,111,88,205,110,272,133,994,448
9,KNIGHTDALE,109,83,143,95,95,54,625,326


# Export to CSV

In [13]:
# Write the combined per-city counts to the cleaned-data folder; the row index is left out
# because the city is already an ordinary column.
clean_csv_path = "../resources/clean/crime_count_data_clean.csv"
crime_count_df.to_csv(clean_csv_path, index=False)