# Data import and overview
Some common loading operations: parsing dates and setting an ordered category for the days of the week. Then a quick check of the data with `info()` and `describe()`.

In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator


sns.set()

import warnings
warnings.filterwarnings("ignore")

# Read both CSVs as latin-1; the incident timestamp is parsed while loading.
codes = pd.read_csv(
    '/kaggle/input/crimes-in-boston/offense_codes.csv',
    encoding='latin-1',
)
crimes = pd.read_csv(
    '/kaggle/input/crimes-in-boston/crime.csv',
    encoding='latin-1',
    parse_dates=['OCCURRED_ON_DATE'],
)

# Offense codes are stored as plain integers.
codes = codes.astype({'CODE': 'int'})

# Weekdays become an ordered categorical (Monday first) so that grouped output
# follows the calendar week; HOUR is taken from the parsed timestamp.
crimes = crimes.assign(
    DAY_OF_WEEK=lambda df: df['DAY_OF_WEEK'].astype(
        pd.CategoricalDtype(
            ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
            ordered=True,
        )
    ),
    HOUR=lambda df: df['OCCURRED_ON_DATE'].dt.hour,
)

In [None]:
# Print a concise summary of the crimes frame (columns, dtypes, non-null counts).
crimes.info()

In [None]:
# Descriptive statistics for the columns that describe() selects by default.
crimes.describe()

In [None]:
# Peek at the first two rows.
crimes.head(2)

# Serious offenses

According to a quick Google search, the UCR (Uniform Crime Reporting) Part categorizes crimes according to their gravity. Part One offenses are the most serious ones (please correct me if I'm mistaken...).

Let's see which ones occur most often.
- I have refactored the plots into functions, so that I can use them later on...

In [None]:
def offense_groups_countplot(data, title, xlabel, ylabel, size):
    """Show a horizontal bar chart of incident counts per offense code group.

    Bars are ordered from the most to the least frequent group.

    Args:
        data: DataFrame with an 'OFFENSE_CODE_GROUP' column.
        title: Figure title.
        xlabel: Label for the count axis.
        ylabel: Label for the offense-group axis.
        size: (width, height) of the figure in inches.
    """
    groups = data['OFFENSE_CODE_GROUP']
    ranked_groups = groups.value_counts().index

    fig, ax = plt.subplots(figsize=size)
    sns.countplot(y=groups, palette='viridis', order=ranked_groups, ax=ax)

    # Counts are whole numbers, so only integer ticks are allowed.
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax.set_title(title, fontsize=16, fontweight='bold', pad=10)
    ax.set_xlabel(xlabel, weight='bold')
    ax.set_ylabel(ylabel, weight='bold')

    fig.tight_layout()
    plt.show()

# Keep only the incidents whose UCR_PART is 'Part One' and chart their offense groups.
serious = crimes.loc[crimes['UCR_PART'].eq('Part One')]
offense_groups_countplot(
    data=serious,
    title='Most serious offenses',
    xlabel='# of crimes',
    ylabel='Offense Group',
    size=(12,5),
)

# Time of Day
Let's check at what time it is safest to wander around...

First we need to pivot the data in order to build a matrix for the heatmap, only then we can plot it.

In [None]:
# Weekday x hour matrix of incident counts (rows: DAY_OF_WEEK, columns: HOUR).
crimes_dow_hour = pd.pivot_table(
    crimes,
    values='INCIDENT_NUMBER',
    index='DAY_OF_WEEK',
    columns='HOUR',
    aggfunc='count',
)

def heatmap_plot(data, title):
    """Render a 2-D table as a 'BuPu' heatmap with the column ticks on top.

    Args:
        data: Matrix-like table (e.g. a pivoted DataFrame) to draw.
        title: Figure title.
    """
    fig, ax = plt.subplots(figsize=(10,3))
    sns.heatmap(data, cmap='BuPu', ax=ax)

    # Column ticks go above the grid; the axis names are blanked out.
    ax.xaxis.set_ticks_position('top')
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(title, fontsize=16, fontweight='bold', pad=10)

    fig.tight_layout()
    plt.show()

# Heatmap of all incidents by weekday and hour.
heatmap_plot(data=crimes_dow_hour, title='Hourly Crimes')

It seems working hours are the worst time on weekdays, especially between 4pm and 7pm, while 1am to 8am is the safest period. Interestingly enough, while 11pm to midnight and 1am onwards are relatively safe, the hour from midnight to 1am seems to be particularly dangerous.

On weekends, though, there tend to be more crimes happening until late at night, as more people tend to stay up.


# Shootings
Let's dig further into which crimes tend to be more delicate to respond to.
I guess those would be the ones involving shootings.

In [None]:
# Frequency of each value in SHOOTING (value_counts() leaves out missing values by default).
crimes['SHOOTING'].value_counts()

In [None]:
# Incidents with any non-missing SHOOTING value are treated as shootings.
shootings = crimes.loc[crimes['SHOOTING'].notna()]
shootings.head()

In [None]:
# Offense groups among the shooting incidents, most frequent first.
offense_groups_countplot(
    data=shootings,
    title='Crimes related to shootings',
    xlabel='# of crimes',
    ylabel='Offense Group',
    size=(16,8),
)

In [None]:
# Weekday x hour matrix of shooting counts, drawn as a heatmap.
shootings_dow = pd.pivot_table(
    shootings,
    values='INCIDENT_NUMBER',
    index='DAY_OF_WEEK',
    columns='HOUR',
    aggfunc='count',
)
heatmap_plot(data=shootings_dow, title='Hourly Shootings')

It seems that shootings are less likely in the early morning. There are more shootings after midday, but there doesn't seem to be a very clear pattern, except for the particularly high shooting rate on the night from Friday to Saturday.

People are indeed more reckless when they drink too much... Let's see which Offense Groups are involved in the shootings that happen in the first hour of Saturday (midnight to 1am).

In [None]:
# Shootings recorded on Saturday during hour 0 (midnight to 1am).
shootings_sat = shootings.loc[
    (shootings['DAY_OF_WEEK'] == 'Saturday') & (shootings['HOUR'] == 0)
]
offense_groups_countplot(
    data=shootings_sat,
    title="Saturday's Midnight Crimes",
    xlabel='# of crimes committed',
    ylabel='Offense Group',
    size=(12,5),
)

In [None]:
# Share of all shooting rows per district (rows without a district stay in the denominator).
district_share = shootings['DISTRICT'].value_counts() / len(shootings)
display(district_share)

# Combined share of the three districts with the most shootings.
print(district_share.iloc[:3].sum())

The three districts on top account for 74% of the shootings.

# Weather
I downloaded the weather data from the U.S. climate data website: https://www.usclimatedata.com/

> weather = [pd.read_html(f'https://www.usclimatedata.com/climate/boston/massachusetts/united-states/usma0046/{ano}/{mes}#history', attrs={'class': 'daily_climate_table'})[1] for ano in range(2015,2019) for mes in range (1,13)]

In order to get smoother data and compare trends, I'm setting a rolling mean of 30 days.

In [None]:
# Daily weather observations, indexed by the parsed 'Day' column.
weather = pd.read_csv('../input/boston-weather-20152018-use-with-crimes-data/boston-weather.csv', parse_dates=['Day'], index_col='Day')

# Align to one row per calendar day across the span of the crime data.
# The endpoints are normalized to midnight: date_range() inherits the time of
# day of its start, so a first incident at e.g. 13:45 would otherwise produce
# timestamps that match no weather row and leave the whole frame NaN.
weather = weather.reindex(
    pd.date_range(
        start=crimes['OCCURRED_ON_DATE'].min().normalize(),
        end=crimes['OCCURRED_ON_DATE'].max().normalize(),
    )
)

# Keep only the daily high as numbers; unparseable entries and days missing
# from the file become NaN and are filled by linear interpolation.
weather = pd.to_numeric(weather['High(°F)'], errors='coerce').interpolate()

In [None]:
# Rolling-window length (in rows of the daily series) used to smooth crimes and temperature.
window = 30

In [None]:
# Incidents per calendar day on a real DatetimeIndex. dt.normalize() keeps the
# datetime64 dtype, so the reindex below matches by timestamp instead of
# relying on implicit promotion of an object index of datetime.date values.
crimes_daily = (
    crimes.groupby(crimes['OCCURRED_ON_DATE'].dt.normalize())['INCIDENT_NUMBER']
    .count()
    # One row per calendar day; a day absent from the data had zero recorded
    # incidents, so it counts as 0 rather than voiding every window it touches.
    .reindex(
        pd.date_range(
            start=crimes['OCCURRED_ON_DATE'].min().normalize(),
            end=crimes['OCCURRED_ON_DATE'].max().normalize(),
        ),
        fill_value=0,
    )
    .rolling(window)
    .mean()
    .dropna()
)

# Temperature smoothed with the same window and restricted to the same days.
w = weather.rolling(window).mean().reindex(crimes_daily.index)
print(f'Pearson Correlation Coefficient: {np.corrcoef(crimes_daily, w)[1,0] * 100:.2f}%')

In [None]:
# Smoothed daily crimes (left axis) against smoothed temperature (right axis).
fig, ax = plt.subplots(figsize=(20,4))
ax.set_title('Crimes vs. Temperature (°F)', fontsize=16, weight='bold', pad=20)
sns.lineplot(data=crimes_daily, ax=ax)
ax.set_ylabel('# of Crimes', rotation=90, weight='bold')

# Style the tick labels without calling set_xticklabels(): that would pin the
# current label strings to an axis whose tick positions are still chosen
# dynamically, so the dates could end up blank or misplaced.
ax.tick_params(axis='x', labelrotation=45)
plt.setp(ax.get_xticklabels(), fontweight='bold')

# Second y-axis sharing the same x-axis; labelled explicitly on ax2 rather
# than through pyplot's implicit "current axes".
ax2 = ax.twinx()
sns.lineplot(data=w, ax=ax2, color='orange')
ax2.xaxis.set_ticks_position('none')
ax2.yaxis.set_ticks_position('right')
ax2.set_ylabel('Temperature', rotation=-90, weight='bold', labelpad=18)
ax2.grid(False)

fig.tight_layout()
plt.show()

Temperature and Crime Rate definitely go together.

It seems convenient to see which Offense Groups are most related to the temperature.

In [None]:
def corr_weather(group, temps=None, window=30):
    """Correlate a group's smoothed daily incident count with smoothed temperature.

    Both series are averaged over the same rolling window of calendar days,
    so the two sides of the correlation are comparable.

    Parameters
    ----------
    group : pandas.DataFrame
        Incidents with a datetime 'OCCURRED_ON_DATE' column and an
        'INCIDENT_NUMBER' column.
    temps : pandas.Series, optional
        Daily temperature indexed by day. Defaults to the module-level
        ``weather`` series.
    window : int, default 30
        Rolling-window length in days, applied to both series.

    Returns
    -------
    float
        Pearson correlation coefficient, or NaN when fewer than two days have
        both a smoothed count and a smoothed temperature.
    """
    if temps is None:
        temps = weather

    # Count incidents per day on a DatetimeIndex (timestamps truncated to midnight).
    per_day = group.groupby(group['OCCURRED_ON_DATE'].dt.normalize())['INCIDENT_NUMBER'].count()
    if per_day.empty:
        return np.nan

    # Insert the days without incidents as zeros so the window spans calendar
    # days rather than "the last `window` days on which something happened".
    per_day = per_day.reindex(
        pd.date_range(per_day.index.min(), per_day.index.max()), fill_value=0
    )

    crimes_smooth = per_day.rolling(window).mean()
    temps_smooth = temps.rolling(window).mean().reindex(crimes_smooth.index)

    # Keep only the days where both smoothed values exist.
    paired = pd.DataFrame({'crimes': crimes_smooth, 'temp': temps_smooth}).dropna()
    if len(paired) < 2:
        return np.nan
    return paired['crimes'].corr(paired['temp'])

# Temperature correlation for every offense group, strongest positive first;
# groups whose correlation could not be computed (NaN) are discarded.
crime_weather_per_offense = (
    crimes.groupby('OFFENSE_CODE_GROUP')
    .apply(corr_weather)
    .sort_values(ascending=False)
    .dropna()
)

# Most positively and most negatively correlated groups.
display(crime_weather_per_offense.head(), crime_weather_per_offense.tail())

In [None]:
import geopandas as gpd
from shapely.geometry import Point
# Neighborhood polygons; the 'Name' column is used as the neighborhood key below.
geo = gpd.read_file(r'../input/boston-neighborhoods-geojson/Boston_Neighborhoods.geojson')

# Keep only rows with Lat > 40 and Long < -69 (rows with missing coordinates
# fail both comparisons and are dropped too), then build one shapely Point
# per row as (Long, Lat), i.e. x = longitude, y = latitude.
# NOTE(review): no CRS is set on serious_gdf here — confirm it matches geo's.
serious = serious[(serious['Lat']>40) & (serious['Long']<-69)]
serious_gdf = gpd.GeoDataFrame(serious)
serious_gdf['geometry'] =  gpd.GeoSeries(serious_gdf.apply(lambda s: Point((s['Long'], s['Lat'])), axis=1))

# Spatial join of incidents to the polygon they fall within; how='right'
# keeps every neighborhood row from geo.
# NOTE(review): newer geopandas releases appear to spell `op=` as `predicate=` — confirm for the installed version.
intersects_gdf = gpd.sjoin(serious_gdf, geo, op='within', how='right')
display(intersects_gdf.head(2))

# Count non-null INCIDENT_NUMBER values per neighborhood name, then merge the
# polygons back in (inner join on the name index) for plotting.
crimes_per_nb = gpd.GeoDataFrame(intersects_gdf.groupby('Name')['INCIDENT_NUMBER'].count().to_frame('Crimes') \
                                    .merge(geo.set_index('Name'), left_index=True, right_index=True, how='inner')) \
                                    .reset_index()
# Label anchor: coordinates of each polygon's representative point, first as a
# list of tuples and then unwrapped to the single (x, y) tuple.
crimes_per_nb['coords'] = crimes_per_nb['geometry'].apply(lambda x: x.representative_point().coords[:])
crimes_per_nb['coords'] = [coords[0] for coords in crimes_per_nb['coords']]
display(crimes_per_nb.head())

In [None]:
import matplotlib.patheffects as pe
# Static choropleth of serious crimes per neighborhood, with each name drawn
# at the polygon's label point.
sns.set_style('dark')
ax = crimes_per_nb.plot(column='Crimes', cmap='viridis', legend=True, figsize=(20,10))

# White outline keeps the labels readable on dark polygons.
label_outline = [pe.withStroke(linewidth=2, foreground="white")]
for name, xy in zip(crimes_per_nb['Name'], crimes_per_nb['coords']):
    # The label is passed positionally: the keyword was renamed from `s` to
    # `text` in matplotlib 3.3 and `s=` is rejected by later releases.
    # Annotating through `ax` targets the map axes explicitly rather than
    # whatever pyplot considers current after the legend was added.
    ax.annotate(name, xy=xy,
                horizontalalignment='center', color='midnightblue', path_effects=label_outline,
                weight='bold', fontsize=8, rotation=30)
ax.axis('off')
plt.tight_layout()
plt.show()

In [None]:
import folium
# Interactive map centred on the centroid of the 'Dorchester' polygon.
center = geo.set_index('Name').loc['Dorchester'].geometry.centroid
fb = folium.Map(location=[center.y, center.x],
                zoom_start=13)

# Map.choropleth() is deprecated in favour of the Choropleth class, which
# takes the same arguments and is attached to the map with add_to().
folium.Choropleth(
    geo_data=r'../input/boston-neighborhoods-geojson/Boston_Neighborhoods.geojson',
    name='geometry',
    fill_color='BuPu',
    fill_opacity=.75,
    line_opacity=.2,
    data=crimes_per_nb,
    columns=['Name', 'Crimes'],
    # Each GeoJSON feature is matched to a data row through its 'Name' property.
    key_on='feature.properties.Name',
).add_to(fb)
display(fb)