In [None]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
import folium
from folium.plugins import HeatMap

# Import data
crimedata = pd.read_csv('../input/crimes-in-boston/crime.csv', encoding='latin-1')

# Keep only incidents recorded in 2016 and 2017
crime0 = crimedata.loc[crimedata['YEAR'].isin([2016, 2017])]

# Keep only rows whose UCR_PART is 'Part One' and remove unused columns.
# The explicit .copy() makes `crime` an independent DataFrame rather than a
# slice of `crime0`, so later column assignments on it are unambiguous and do
# not raise SettingWithCopyWarning.
crime = (
    crime0.loc[crime0['UCR_PART'] == 'Part One']
    .drop(columns=['INCIDENT_NUMBER', 'OFFENSE_CODE', 'UCR_PART', 'Location'])
    .copy()
)

# Peek
crime.head()

In [None]:
# Column-level cleaning. Everything is computed on new Series and attached with
# .assign(), which returns a new DataFrame; nothing is modified in place through
# attribute access (e.g. crime.SHOOTING.fillna(..., inplace=True)), because
# that pattern may operate on a temporary copy and silently do nothing.
crime = crime.assign(
    # Convert OCCURRED_ON_DATE to datetime
    OCCURRED_ON_DATE=pd.to_datetime(crime['OCCURRED_ON_DATE']),
    # Fill in NaNs in SHOOTING column
    SHOOTING=crime['SHOOTING'].fillna('N'),
    # Convert DAY_OF_WEEK to an ordered category (Monday first)
    DAY_OF_WEEK=pd.Categorical(
        crime['DAY_OF_WEEK'],
        categories=['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                    'Friday', 'Saturday', 'Sunday'],
        ordered=True),
    # Replace -1 values in Lat/Long with NaN. np.nan is used instead of None:
    # with value=None some pandas versions forward-fill the previous row's
    # value rather than inserting a missing value.
    Lat=crime['Lat'].replace(-1, np.nan),
    Long=crime['Long'].replace(-1, np.nan),
)

# Rename columns to something easier to type (the all-caps are annoying!)
rename = {'OFFENSE_CODE_GROUP': 'Group',
          'OFFENSE_DESCRIPTION': 'Description',
          'DISTRICT': 'District',
          'REPORTING_AREA': 'Area',
          'SHOOTING': 'Shooting',
          'OCCURRED_ON_DATE': 'Date',
          'YEAR': 'Year',
          'MONTH': 'Month',
          'DAY_OF_WEEK': 'Day',
          'HOUR': 'Hour',
          'STREET': 'Street'}
# index=str also converts the row labels to strings, as before.
crime = crime.rename(index=str, columns=rename)

# Check
crime.head()

In [None]:
# Report how many rows (incidents) remain after filtering
print('There are {} incidents.'.format(crime.shape[0]))

In [None]:
# Sanity check: dimensions of the cleaned frame as (rows, columns)
crime.shape

In [None]:
# Number of missing values in each column.
# isnull() gives a boolean frame; sum() counts its True cells. (count() would
# instead count the non-null entries of that boolean frame, i.e. return the
# total row count for every column regardless of missing data.)
crime.isnull().sum()

Below, I will create a plot that shows the most common kinds of UCR Part One crimes (the most serious crimes) in the city of Boston.

In [None]:
# Horizontal count plot of offense groups, ordered from most to least frequent
sns.catplot(
    data=crime,
    y='Group',
    kind='count',
    order=crime['Group'].value_counts().index,
    height=8,
    aspect=1.5,
)

Larceny is the most common sort of serious crime, and homicide is the least common. Now, let's see what time of day these crimes happen most often.

In [None]:
# Number of incidents in `crime` for each hour of the day
sns.catplot(data=crime, x='Hour', kind='count', color='red', height=8.27, aspect=3)

# Enlarge tick and axis-label fonts so they stay readable on the wide figure
plt.xticks(size=30)
plt.yticks(size=30)
plt.xlabel('Hour', fontsize=40)
plt.ylabel('Count', fontsize=40)

Looks like afternoon and evening are the times of day when serious crime occurs most often.

Now let's look closer at the most abundant crime in Boston, Larceny. We'll see when most larcenies happen.

In [None]:
# Offense groups to pull out into their own frames
array, array2 = ['Larceny'], ['Homicide']

# Rows of `crime` whose Group is Larceny / Homicide respectively
larceny = crime[crime['Group'].isin(array)]
homicide = crime[crime['Group'].isin(array2)]

In [None]:
# Number of larceny incidents for each hour of the day
sns.catplot(data=larceny, x='Hour', kind='count', color='red', height=8.27, aspect=3)

# Enlarge tick and axis-label fonts so they stay readable on the wide figure
plt.xticks(size=30)
plt.yticks(size=30)
plt.xlabel('Hour', fontsize=40)
plt.ylabel('Count', fontsize=40)

This plot shows that larcenies are even more highly concentrated in the afternoon and evening hours than the average of all crimes combined. There is a strange rise in larcenies around midnight.

Now let's see on which day of the week serious crimes are committed most often.

In [None]:
# Non-null count of every column per weekday.
# 'Day' is an ordered categorical; observed=False is passed explicitly so every
# weekday category gets a row and the result does not depend on pandas'
# changing default for categorical groupers.
crime.groupby('Day', observed=False).count()

In [None]:
# Number of incidents in `crime` for each day of the week
sns.catplot(data=crime, x='Day', kind='count', height=10, aspect=3)

# Large fonts for the wide figure; the x-axis title is intentionally left blank
plt.xticks(size=30)
plt.yticks(size=30)
plt.xlabel('')
plt.ylabel('Count', fontsize=40)

As we can see there is not a big difference in the amount of crime from day to day. But there is an obvious peak around Friday and a bit of a dip in crime on Sunday.

In [None]:
# Non-null count of every column per weekday, larcenies only.
# 'Day' is an ordered categorical; observed=False is passed explicitly so every
# weekday category gets a row and the result does not depend on pandas'
# changing default for categorical groupers.
larceny.groupby('Day', observed=False).count()

In [None]:
# Number of larceny incidents for each day of the week
sns.catplot(data=larceny, x='Day', kind='count', height=10, aspect=3)

# Large fonts for the wide figure; the x-axis title is intentionally left blank
plt.xticks(size=30)
plt.yticks(size=30)
plt.xlabel('')
plt.ylabel('Count', fontsize=40)

In [None]:
# Number of incidents in `crime` per calendar month
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# order=1..12 reserves one bar position per calendar month, even for a month
# with no rows, so position i always corresponds to months[i] below.
sns.catplot(x='Month', kind='count', height=8, aspect=3, color='gray',
            order=list(range(1, 13)), data=crime)
plt.xticks(np.arange(12), months, size=30)
plt.yticks(size=30)
plt.xlabel('')
plt.ylabel('Count', fontsize=40)

It's interesting that the hottest month of the summer and the coolest month of the winter contain the most crimes. Now let's see how the number of larcenies per month compares to the average of all crime.

In [None]:
# Number of larceny incidents per calendar month
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# order=1..12 reserves one bar position per calendar month, even for a month
# with no rows, so position i always corresponds to months[i] below.
sns.catplot(x='Month', kind='count', height=8, aspect=3, color='gray',
            order=list(range(1, 13)), data=larceny)
plt.xticks(np.arange(12), months, size=30)
plt.yticks(size=30)
plt.xlabel('')
plt.ylabel('Count', fontsize=40)

It's about the same.

Let's see in which month homicides happen the most often.

In [None]:
# Number of homicide incidents per calendar month
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# order=1..12 reserves one bar position per calendar month. This matters most
# for a small subset like this one: without it, a month with no rows would be
# dropped from the axis and every later bar would get the wrong month label.
sns.catplot(x='Month', kind='count', height=8, aspect=3, color='gray',
            order=list(range(1, 13)), data=homicide)
plt.xticks(np.arange(12), months, size=30)
plt.yticks(size=30)
plt.xlabel('')
plt.ylabel('Count', fontsize=40)

August still falls into the pattern of having the most crime, but January does not. Interesting, I wonder what causes this difference.

Now that we have looked at time it's time to look at place. Next, I will discover where in Boston these crimes most often take place.

In [None]:
# Scatter of incident coordinates for all rows in `crime`.
# Longitude goes on x and latitude on y so the plot has map orientation
# (east to the right, north at the top); very low alpha makes dense areas
# show up darker.
ax = sns.scatterplot(data=crime, x='Long', y='Lat', alpha=0.01)
ax.set(xlabel='Longitude', ylabel='Latitude')

Now let's look at our most common crime, larceny.

In [None]:
# Scatter of incident coordinates for larcenies only.
# Longitude goes on x and latitude on y so the plot has map orientation
# (east to the right, north at the top); very low alpha makes dense areas
# show up darker.
ax = sns.scatterplot(data=larceny, x='Long', y='Lat', alpha=0.01)
ax.set(xlabel='Longitude', ylabel='Latitude')

It looks like most cases are concentrated to the east side of Boston. Though our crimes appear to be centered in the downtown area, there are little pockets of crime scattered around the rest of the city. Now we'll look at each district of the city.

In [None]:
# Scatter of incident coordinates, coloured by police district.
# Longitude goes on x and latitude on y so the plot has map orientation
# (east to the right, north at the top).
ax = sns.scatterplot(data=crime, x='Long', y='Lat', hue='District', alpha=0.01)
ax.set(xlabel='Longitude', ylabel='Latitude')
# Place the legend outside the axes, anchored just right of the top-right corner
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)

Below, I will make a map on which will be plotted all the crimes across Boston.

In [None]:
# Scatter of incident coordinates, coloured by offense group.
# Longitude goes on x and latitude on y so the plot has map orientation
# (east to the right, north at the top).
ax = sns.scatterplot(data=crime, x='Long', y='Lat', hue='Group', alpha=0.01)
ax.set(xlabel='Longitude', ylabel='Latitude')
# Place the legend outside the axes, anchored just right of the top-right corner
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)

Let's use a different sort of map which can show us a more fine-grained view of the city, street by street.

In [None]:
# Create basic Folium crime map
crime_heatmap = folium.Map(location=[42.3125, -71.0875],
                           tiles="OpenStreetMap",
                           zoom_start=11)

# Add data for heatmap: one [Lat, Long] pair for every row of `crime` that has
# both coordinates. All years present in `crime` are included; to restrict the
# map to a single year, filter `crime` on Year before selecting the columns.
# Built as one chained expression so no intermediate step is silently
# overwritten, and converted in bulk instead of looping over iterrows().
data_heatmap = crime[['Lat', 'Long']].dropna().astype(float).values.tolist()
HeatMap(data_heatmap, radius=10).add_to(crime_heatmap)

# Plot
crime_heatmap

In [None]:
# Create basic Folium crime map
crime_map = folium.Map(location=[42.3125, -71.0875],
                       tiles="OpenStreetMap",
                       zoom_start=11)

# Add data for heatmap: one [Lat, Long] pair for every row of `larceny` that
# has both coordinates. All years present in `larceny` are included; to
# restrict the map to a single year, filter on Year before selecting columns.
# Built as one chained expression so no intermediate step is silently
# overwritten, and converted in bulk instead of looping over iterrows().
data_heatmap = larceny[['Lat', 'Long']].dropna().astype(float).values.tolist()
HeatMap(data_heatmap, radius=10).add_to(crime_map)

# Plot
crime_map

Below, I will create a heatmap for homicides in Boston.

In [None]:
# Create basic Folium crime map
crime_map = folium.Map(location=[42.3125, -71.0875],
                       tiles="OpenStreetMap",
                       zoom_start=11)

# Add data for heatmap: one [Lat, Long] pair for every row of `homicide` that
# has both coordinates. All years present in `homicide` are included; to
# restrict the map to a single year, filter on Year before selecting columns.
# Built as one chained expression so no intermediate step is silently
# overwritten, and converted in bulk instead of looping over iterrows().
data_heatmap = homicide[['Lat', 'Long']].dropna().astype(float).values.tolist()
HeatMap(data_heatmap, radius=10).add_to(crime_map)

# Plot
crime_map

This map shows that most murders (which are recorded in our data) are centered around Dorchester; there seem to be very few murders recorded downtown. Since downtown is where a majority of other crimes happen, this is very odd.