In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

### Crime: 
The crime metric is based on the crime rate and the arrest rate. We calculated the crime rate by dividing the total number of crimes (2005-present) by the population (crime rate = total crimes / population), and the arrest rate by dividing the total number of arrests by the population (arrest rate = arrests / population). Note: in both datasets, neighborhoods with 300 or fewer people are removed as outliers. The arrests dataset only covers arrests from 2016 to the present.


In [None]:
# Download the incident records and the neighborhood population table from WPRDC.
crimeData = pd.read_csv(
    "https://data.wprdc.org/datastore/dump/044f2016-1dfd-4ab0-bc1e-065da05fca2e"
)
Popuplation = pd.read_csv(
    "https://data.wprdc.org/dataset/5b18c198-474c-4723-b735-cc5220ad43cc/resource/82f29015-6905-4b1c-8300-afe9bb2231b3/download/total-population.csv"
)

# Only the neighborhood name and its total population estimate are needed later.
PopuplationA = Popuplation[["Neighborhood", "Estimate; Total"]]

# Count the incident records per neighborhood and lay the result out as a
# two-column table: "Neighborhood" first, then "Crime number".
crimeNumber = (
    crimeData["INCIDENTNEIGHBORHOOD"]
    .value_counts()
    .rename_axis("Neighborhood")
    .reset_index(name="Crime number")
)

# Number the rows starting from 1 instead of 0.
crimeNumber.index = range(1, len(crimeNumber) + 1)


In [None]:
# Join the incident counts with the population table on the neighborhood name.
crimeRate = crimeNumber.merge(PopuplationA, on="Neighborhood")

# Crime rate = recorded incidents divided by the population estimate.
crimeRate["Crime rate"] = crimeRate["Crime number"].div(crimeRate["Estimate; Total"])

# Fix the column order, keep only neighborhoods whose population is above 300,
# and order the rows from the lowest to the highest crime rate.
crimeRate = crimeRate[["Neighborhood", "Crime number", "Estimate; Total", "Crime rate"]]
crimeRate = crimeRate.loc[crimeRate["Estimate; Total"] > 300]
crimeRate = crimeRate.sort_values(by="Crime rate")

# Give the population column a shorter name.
crimeRate = crimeRate.rename(columns={"Estimate; Total": "Population"})

# Rank-based score: the row with the lowest crime rate gets 100 and every
# following row gets one point less than the row before it.
crimeRate["Score"] = [100 - position for position in range(len(crimeRate))]
crimeRate.head(10)


In [None]:
# Bar chart of the crime rate per neighborhood, with bars in index order.
# Labels and tick styling are set on the current figure before seaborn draws.

plt.figure(figsize=(25, 10))
plt.title("Crime Rate of each neighborhood", size=22)
plt.ylabel("Crime Rate", size=22)
plt.xlabel("Neighborhood", size=22)
plt.xticks(rotation=90, size=15)
g = sns.barplot(
    data=crimeRate.sort_index(),
    x="Neighborhood",
    y="Crime rate",
    palette="Paired",
)

In [None]:

# Download the arrest records from WPRDC.
arrest = pd.read_csv(
    "https://data.wprdc.org/datastore/dump/e03a89dd-134a-4ee8-a2bd-62c40aeebc6f"
)

# Count the arrest records per neighborhood and lay the result out as a
# two-column table ("Neighborhood", "Arrests") with rows numbered from 1.
arrests = (
    arrest["INCIDENTNEIGHBORHOOD"]
    .value_counts()
    .rename_axis("Neighborhood")
    .reset_index(name="Arrests")
)
arrests.index = range(1, len(arrests) + 1)

# Join the arrest counts with the population table on the neighborhood name.
arrestRate = arrests.merge(PopuplationA, on="Neighborhood")

# Arrest rate = recorded arrests divided by the population estimate.
arrestRate["Arrest rate"] = arrestRate["Arrests"].div(arrestRate["Estimate; Total"])

# Keep only neighborhoods whose population is above 300 and order the rows
# from the lowest to the highest arrest rate.
arrestRate = arrestRate.loc[arrestRate["Estimate; Total"] > 300]
arrestRate = arrestRate.sort_values(by="Arrest rate")

# Give the population column a shorter name.
arrestRate = arrestRate.rename(columns={"Estimate; Total": "Population"})

# Rank-based score: the row with the lowest arrest rate gets 100 and every
# following row gets one point less than the row before it.
arrestRate["Score"] = [100 - position for position in range(len(arrestRate))]
arrestRate.head(10)

In [None]:
# Bar chart of the arrest rate per neighborhood, with bars in index order.
# Labels and tick styling are set on the current figure before seaborn draws.

plt.figure(figsize=(25, 10))
plt.title("Arrest Rate of each neighborhood", size=22)
plt.ylabel("Arrest Rate", size=22)
plt.xlabel("Neighborhood", size=22)
plt.xticks(rotation=90, size=15)
g = sns.barplot(
    data=arrestRate.sort_index(),
    x="Neighborhood",
    y="Arrest rate",
    palette="Paired",
)

In [None]:
# Join the crime table and the arrest table on the neighborhood name. Columns
# present in both ("Population", "Score") come back with an _x suffix for the
# crime table and an _y suffix for the arrest table.
final = crimeRate.sort_index().merge(arrestRate.sort_index(), on="Neighborhood")

# The final score is the plain average of the two rank-based scores.
final["Final Score"] = final["Score_x"].add(final["Score_y"]).div(2)

# Keep the reporting columns, put the highest final score first, and give the
# two score columns descriptive names.
sortedFinal = (
    final[
        [
            "Neighborhood",
            "Population_x",
            "Crime rate",
            "Arrest rate",
            "Score_x",
            "Score_y",
            "Final Score",
        ]
    ]
    .sort_values(by="Final Score", ascending=False)
    .rename(columns={"Score_x": "Score(Crime rate)", "Score_y": "Score(Arrest rate)"})
)
sortedFinal.head(20)


In [None]:
# Table handed over to the final notebook: neighborhood name and final score only.

safetyRanking = sortedFinal.loc[:, ["Neighborhood", "Final Score"]]
safetyRanking