In [1]:
# Import dependencies
import pandas as pd
import re
import numpy as np

In [2]:
# CSV files downloaded from:
# https://www.wake.gov/departments-government/city-county-bureau-identification-ccbi/criminal-arrest-records
# Part 1 contains records from 01/01/2022 to 06/30/2022; part 2 contains records from 07/01/2022 to 12/31/2022
original_df_pt1 = pd.read_csv("resources/original/wake_county_crime_data_2022_01-01_06-30.csv", 
                         usecols=["Name", "Date of Arrest", "Arrest Location", "Arresting Agency", 
                                  "Charge"])
original_df_pt1 = original_df_pt1.fillna("OTHER")
original_df_pt2 = pd.read_csv("resources/original/wake_county_crime_data_2022_07-01_12-31.csv", 
                         usecols=["Name", "Date of Arrest", "Arrest Location", "Arresting Agency", 
                                  "Charge"])
original_df_pt2 = original_df_pt2.fillna("OTHER")

# Combine into one dataframe
original_df = pd.concat([original_df_pt1, original_df_pt2], axis=0)
original_df = original_df.reset_index(drop=True)
original_df.head()

Unnamed: 0,Name,Date of Arrest,Arrest Location,Arresting Agency,Charge
0,"BROTHERS,GLEN DEAN",06-30-2022 11:57:00,"401 WAKE CHAPEL RD FUQUAY-VARINA, NC",FUQUAY POLICE DEPARTMENT,FAIL REPRT NEW ADDRESS-SEX OFF
1,"WADE,JERRELL KUTROI",06-30-2022 22:40:00,"I-540/BUFFALOE RD KNIGHTDALE, NC",STATE HIGHWAY PATROL,DRIVING WHILE IMPAIRED
2,"SMITH,STEVEN CARMOS",06-30-2022 23:00:00,"KNIGHTDALE BLVD/HODGE RD CARY, NC",CARY POLICE DEPARTMENT,ASSAULT ON A FEMALE
3,"HAMMOND,JASON BRADLEY",06-30-2022 21:45:00,"2100 GLASCOCK ST RALEIGH, NC",RALEIGH POLICE DEPARTMENT,FELONY POSSESSION OF COCAINE
4,"STEWART,MITCHELL TYSON",06-30-2022 22:45:00,"KNIGHTDALE BLVD/HODGES RD CARY, NC",CARY POLICE DEPARTMENT,ASSAULT ON A FEMALE


In [3]:
# Create city column for easier grouping based on the arrest location
original_df["CITY"] = original_df["Arrest Location"].str.extract(r'\D+\s(\D+),\sNC\Z', expand=True)

# Drop rows where there is no city to be found
original_df = original_df.dropna()

original_df

Unnamed: 0,Name,Date of Arrest,Arrest Location,Arresting Agency,Charge,CITY
0,"BROTHERS,GLEN DEAN",06-30-2022 11:57:00,"401 WAKE CHAPEL RD FUQUAY-VARINA, NC",FUQUAY POLICE DEPARTMENT,FAIL REPRT NEW ADDRESS-SEX OFF,FUQUAY-VARINA
1,"WADE,JERRELL KUTROI",06-30-2022 22:40:00,"I-540/BUFFALOE RD KNIGHTDALE, NC",STATE HIGHWAY PATROL,DRIVING WHILE IMPAIRED,KNIGHTDALE
2,"SMITH,STEVEN CARMOS",06-30-2022 23:00:00,"KNIGHTDALE BLVD/HODGE RD CARY, NC",CARY POLICE DEPARTMENT,ASSAULT ON A FEMALE,CARY
3,"HAMMOND,JASON BRADLEY",06-30-2022 21:45:00,"2100 GLASCOCK ST RALEIGH, NC",RALEIGH POLICE DEPARTMENT,FELONY POSSESSION OF COCAINE,RALEIGH
4,"STEWART,MITCHELL TYSON",06-30-2022 22:45:00,"KNIGHTDALE BLVD/HODGES RD CARY, NC",CARY POLICE DEPARTMENT,ASSAULT ON A FEMALE,CARY
...,...,...,...,...,...,...
35971,"PEACE,TIMOTHY MAC",07-01-2022 01:00:00,"4510 CAPITAL BLVD RALEIGH, NC",RALEIGH POLICE DEPARTMENT,OBTAIN PROPERTY FALSE PRETENSE,RALEIGH
35972,"PEACE,TIMOTHY MAC",07-01-2022 01:00:00,"4510 CAPITAL BLVD RALEIGH, NC",RALEIGH POLICE DEPARTMENT,POSS STOLEN GOODS/PROP (M),RALEIGH
35973,"REN,XIAOYU",07-01-2022 01:00:00,"2 BROUGHTON DR RALEIGH, NC",NC STATE UNIVERSITY POLICE DEP,FIRST DEG TRESP ENTER/REMAIN,RALEIGH
35974,"LINCOLN,REBECCA LOUISE",07-01-2022 02:00:00,"3400 WAKE FOREST RD RALEIGH, NC",RALEIGH POLICE DEPARTMENT,SECOND DEGREE TRESPASS,RALEIGH


In [4]:
# View misspelled cities
counts = original_df["CITY"].value_counts().to_dict()
city_counts = pd.DataFrame(data = list(counts.items()), columns=["CITY", "COUNT"])
city_counts[~city_counts["CITY"].isin(["ANGIER", "APEX", "CARY", "CLAYTON", "CREEDMOOR", "DURHAM", "FUQUAY-VARINA", 
                                         "GARNER", "HOLLY SPRINGS", "KNIGHTDALE", "MORRISVILLE", "NEW HILL", "RALEIGH", 
                                         "ROLESVILLE", "WAKE FOREST", "WENDELL", "WILLOW SPRING","YOUNGSVILLE", "ZEBULON"])]

Unnamed: 0,CITY,COUNT
3,FOREST,1423
4,SPRINGS,1047
6,VARINA,702
13,SPRING,54
16,HILL,17
17,RALIEGH,15
18,APEX`,12
19,OLIVE,8
20,FAYETTEVILLE,7
21,L,7


In [5]:
# Account for the misspellings for the 19 municipalities in Wake County (others will be removed)
original_df.loc[original_df["CITY"].str.contains('APEX'), 'city'] = 'APEX'
original_df.loc[original_df["CITY"].str.contains('VARINA|FUQUAY'), 'city'] = 'FUQUAY-VARINA'
original_df.loc[original_df["CITY"].str.contains('SPRINGS|HOLLYSPRINGS|SRPINGS|SPIRINGS'), 'city'] = 'HOLLY SPRINGS'
original_df.loc[original_df["CITY"].str.contains('KNIG'), 'city'] = 'KNIGHTDALE'
original_df.loc[original_df["CITY"].str.contains('MORIS|MORRIS|MOOR'), 'city'] = 'MORRISVILLE'
original_df.loc[original_df["CITY"].str.contains('HILL'), 'city'] = 'NEW HILL'
original_df.loc[original_df["CITY"].str.contains('RAL|RLA|RLE'), 'city'] = 'RALEIGH'
original_df.loc[original_df["CITY"].str.contains('FOREST|FORST'), 'city'] = 'WAKE FOREST'
original_df.loc[original_df["CITY"].str.contains('SPRING$'), 'city'] = 'WILLOW SPRING'
original_df.loc[original_df["CITY"].str.contains('ZEU'), 'city'] = 'ZEBULON'

# View values
original_df["CITY"].value_counts()

RALEIGH    23094
CARY        1662
GARNER      1585
FOREST      1423
SPRINGS     1047
           ...  
SALEM          1
INDIANA        1
BRAGG          1
LAKE           1
EDWARDS        1
Name: CITY, Length: 71, dtype: int64

In [6]:
# Remove crimes not charged in Wake County
original_df = original_df[original_df["CITY"].isin(["ANGIER", "APEX", "CARY", "CLAYTON", "CREEDMOOR", "DURHAM", "FUQUAY-VARINA", 
                                         "GARNER", "HOLLY SPRINGS", "KNIGHTDALE", "MORRISVILLE", "NEW HILL", "RALEIGH", 
                                         "ROLESVILLE", "WAKE FOREST", "WENDELL", "WILLOW SPRING","YOUNGSVILLE", "ZEBULON"])]
original_df.head()

Unnamed: 0,Name,Date of Arrest,Arrest Location,Arresting Agency,Charge,CITY,city
0,"BROTHERS,GLEN DEAN",06-30-2022 11:57:00,"401 WAKE CHAPEL RD FUQUAY-VARINA, NC",FUQUAY POLICE DEPARTMENT,FAIL REPRT NEW ADDRESS-SEX OFF,FUQUAY-VARINA,FUQUAY-VARINA
1,"WADE,JERRELL KUTROI",06-30-2022 22:40:00,"I-540/BUFFALOE RD KNIGHTDALE, NC",STATE HIGHWAY PATROL,DRIVING WHILE IMPAIRED,KNIGHTDALE,KNIGHTDALE
2,"SMITH,STEVEN CARMOS",06-30-2022 23:00:00,"KNIGHTDALE BLVD/HODGE RD CARY, NC",CARY POLICE DEPARTMENT,ASSAULT ON A FEMALE,CARY,
3,"HAMMOND,JASON BRADLEY",06-30-2022 21:45:00,"2100 GLASCOCK ST RALEIGH, NC",RALEIGH POLICE DEPARTMENT,FELONY POSSESSION OF COCAINE,RALEIGH,RALEIGH
4,"STEWART,MITCHELL TYSON",06-30-2022 22:45:00,"KNIGHTDALE BLVD/HODGES RD CARY, NC",CARY POLICE DEPARTMENT,ASSAULT ON A FEMALE,CARY,


In [7]:
# Filter crimes - personal violence crimes (assault, battery, etc.)
results_df = pd.DataFrame(original_df[original_df["Charge"].str.contains('ASSAULT|BATTERY|ABUSE|HUMAN|EXPLOIT|TRESP')])

# Add results to series
charge_series = pd.Series(results_df.groupby(["CITY"])["Charge"].count(), name="PERSONAL_CHARGE_COUNT")
name_series = pd.Series(results_df.groupby(["CITY"])["Name"].nunique(), name="PERSONAL_NAME_COUNT")

# Combine created series into a dataframe
personal_df = pd.concat([charge_series, name_series], axis=1)

# Add row for Angier for easier readability later
personal_df.loc["ANGIER"] = 0, 0
personal_df = personal_df.sort_index()

personal_df

Unnamed: 0_level_0,PERSONAL_CHARGE_COUNT,PERSONAL_NAME_COUNT
CITY,Unnamed: 1_level_1,Unnamed: 2_level_1
ANGIER,0,0
APEX,109,88
CARY,332,270
CLAYTON,1,1
DURHAM,12,6
FUQUAY-VARINA,35,27
GARNER,195,160
KNIGHTDALE,87,67
MORRISVILLE,72,61
RALEIGH,3589,2416


In [8]:
# Filter crimes - property crimes (theft, larceny, etc.)
results_df = pd.DataFrame(original_df[original_df["Charge"].str.contains('LAR|PROP|STOLEN|THEFT|ROBB|BREAK|ENTER')])

# Add results to series
charge_series = pd.Series(results_df.groupby(["CITY"])["Charge"].count(), name="PROPERTY_CHARGE_COUNT")
name_series = pd.Series(results_df.groupby(["CITY"])["Name"].nunique(), name="PROPERTY_NAME_COUNT")

# Combine created series into a dataframe
property_df = pd.concat([charge_series, name_series], axis=1)

# Add row for Youngsville for easier readability later
property_df.loc["YOUNGSVILLE"] = 0, 0
property_df = property_df.sort_index()

property_df

Unnamed: 0_level_0,PROPERTY_CHARGE_COUNT,PROPERTY_NAME_COUNT
CITY,Unnamed: 1_level_1,Unnamed: 2_level_1
ANGIER,2,1
APEX,207,125
CARY,291,185
CLAYTON,12,4
DURHAM,13,6
FUQUAY-VARINA,45,35
GARNER,450,290
KNIGHTDALE,147,94
MORRISVILLE,154,74
RALEIGH,5024,2605


In [9]:
# Filter crimes - drug-based crimes (possession, trafficking, etc.)
results_df = pd.DataFrame(original_df[original_df["Charge"].str.contains('MARIJ|MARJ|POSSESS|DRUG')])

# Add results to series
charge_series = pd.Series(results_df.groupby(["CITY"])["Charge"].count(), name="DRUG_CHARGE_COUNT")
name_series = pd.Series(results_df.groupby(["CITY"])["Name"].nunique(), name="DRUG_NAME_COUNT")

# Combine created series into a dataframe
drug_df = pd.concat([charge_series, name_series], axis=1)

# Add row for Angier and Youngsville for easier readability later
drug_df.loc["ANGIER"] = 0, 0
drug_df.loc["YOUNGSVILLE"] = 0, 0
drug_df = drug_df.sort_index()

drug_df

Unnamed: 0_level_0,DRUG_CHARGE_COUNT,DRUG_NAME_COUNT
CITY,Unnamed: 1_level_1,Unnamed: 2_level_1
ANGIER,0,0
APEX,137,70
CARY,235,133
CLAYTON,3,2
DURHAM,2,2
FUQUAY-VARINA,47,29
GARNER,306,177
KNIGHTDALE,101,59
MORRISVILLE,82,49
RALEIGH,4028,2057


In [10]:
# Combine the three dataframes
crime_count_df = pd.concat([personal_df, property_df, drug_df], axis=1)
crime_count_df = crime_count_df.astype(int)
crime_count_df = crime_count_df.reset_index()
crime_count_df

Unnamed: 0,CITY,PERSONAL_CHARGE_COUNT,PERSONAL_NAME_COUNT,PROPERTY_CHARGE_COUNT,PROPERTY_NAME_COUNT,DRUG_CHARGE_COUNT,DRUG_NAME_COUNT
0,ANGIER,0,0,2,1,0,0
1,APEX,109,88,207,125,137,70
2,CARY,332,270,291,185,235,133
3,CLAYTON,1,1,12,4,3,2
4,DURHAM,12,6,13,6,2,2
5,FUQUAY-VARINA,35,27,45,35,47,29
6,GARNER,195,160,450,290,306,177
7,KNIGHTDALE,87,67,147,94,101,59
8,MORRISVILLE,72,61,154,74,82,49
9,RALEIGH,3589,2416,5024,2605,4028,2057


In [11]:
# Add columns for total crime and total name counts
crime_count_df["TOTAL_CHARGE_COUNT"] = crime_count_df["PERSONAL_CHARGE_COUNT"] + crime_count_df["PROPERTY_CHARGE_COUNT"] + crime_count_df["DRUG_CHARGE_COUNT"]
crime_count_df["TOTAL_NAME_COUNT"] = crime_count_df["PERSONAL_NAME_COUNT"] + crime_count_df["PROPERTY_NAME_COUNT"] + crime_count_df["DRUG_NAME_COUNT"]
crime_count_df

Unnamed: 0,CITY,PERSONAL_CHARGE_COUNT,PERSONAL_NAME_COUNT,PROPERTY_CHARGE_COUNT,PROPERTY_NAME_COUNT,DRUG_CHARGE_COUNT,DRUG_NAME_COUNT,TOTAL_CHARGE_COUNT,TOTAL_NAME_COUNT
0,ANGIER,0,0,2,1,0,0,2,1
1,APEX,109,88,207,125,137,70,453,283
2,CARY,332,270,291,185,235,133,858,588
3,CLAYTON,1,1,12,4,3,2,16,7
4,DURHAM,12,6,13,6,2,2,27,14
5,FUQUAY-VARINA,35,27,45,35,47,29,127,91
6,GARNER,195,160,450,290,306,177,951,627
7,KNIGHTDALE,87,67,147,94,101,59,335,220
8,MORRISVILLE,72,61,154,74,82,49,308,184
9,RALEIGH,3589,2416,5024,2605,4028,2057,12641,7078


In [12]:
# Export to csv file
crime_count_df.to_csv('resources/clean/crime_count_data_clean.csv', index=False)