In [None]:
import numpy as np
import pandas as pd

# Load the crime records; OCCURRED_ON_DATE is parsed into datetimes at read time
# so the later time-based groupings can use it directly.
df = pd.read_csv(
    '../input/crimes-in-boston/crime.csv',
    parse_dates=['OCCURRED_ON_DATE'],
)
df.head()

# Data Preprocessing

In [None]:
# Normalise the offense group labels with the vectorised string accessor.
# Unlike a per-row lambda calling .casefold(), this leaves missing values as
# NaN instead of raising AttributeError on a float.
df['OFFENSE_CODE_GROUP'] = df['OFFENSE_CODE_GROUP'].str.casefold()

I've normalized offense code groups as lowercase characters.

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Fill missing DISTRICT / UCR_PART values with each column's most frequent value.
# The result is assigned back to the frame: calling fillna(inplace=True) on
# df[col] is a chained in-place operation that recent pandas warns about and
# that does not update df when Copy-on-Write is enabled.
df['DISTRICT'] = df['DISTRICT'].fillna(df['DISTRICT'].mode()[0])
df['UCR_PART'] = df['UCR_PART'].fillna(df['UCR_PART'].mode()[0])

In [None]:
# Treat empty / whitespace-only cells as missing (NaN), then discard the
# SHOOTING column.
df = (
    df.replace(r'^\s*$', np.nan, regex=True)
      .drop(columns=['SHOOTING'])
)

There are about 20 thousand nulls in the Lat and Long columns, but the Location column is fully populated. I will extract the latitude and longitude from Location using string operations; these values will be used for the spherical (map) drawing later.

In [None]:
# Split Location ("lat, long", optionally wrapped in parentheses) into two
# numeric columns. Stripping the parentheses and converting with to_numeric
# yields floats rather than text fragments; anything unparsable becomes NaN.
# Assigning by column name (instead of concat + rename) also keeps the cell
# idempotent: re-running it overwrites the columns instead of duplicating them.
coords = (
    df['Location']
    .str.strip('()')
    .str.split(', ', n=1, expand=True)
    .reindex(columns=[0, 1])  # guarantee both parts exist even if no row splits
    .apply(pd.to_numeric, errors='coerce')
)
df['Latitude'] = coords[0]
df['Longitude'] = coords[1]

Credit to EdChum's (https://stackoverflow.com/users/704848/edchum) code that includes adding the split columns back to the dataframe.

In [None]:
# The raw coordinate columns are no longer needed once Latitude/Longitude exist.
df = df.drop(['Lat', 'Long', 'Location'], axis='columns')

In [None]:
# Re-check the per-column missing-value counts after the cleaning steps.
df.isna().sum()

In [None]:
# Relative frequencies: the ten most common offense groups, the single rarest
# group, the most common districts, and the ten most common reporting areas.
offense_share = df['OFFENSE_CODE_GROUP'].value_counts(normalize=True)
print(offense_share.nlargest(10))
print(offense_share.nsmallest(1))

district_share = df['DISTRICT'].value_counts(normalize=True)
print(district_share.nlargest(10))

known_areas = df.loc[df['REPORTING_AREA'].notnull(), 'REPORTING_AREA']
print(known_areas.value_counts(normalize=True).nlargest(10))

First observations:

* Motor vehicle accident response is the most common offense group; the rarest is "burglary - no property taken".
* District where the most crime is committed is B2.
* Area where the most crime is reported is "111".

In [None]:
# Rows grouped as the generic 'other' carry no information, so replace that
# label with the row's more specific offense description.
is_other = df['OFFENSE_CODE_GROUP'] == 'other'
df.loc[is_other, 'OFFENSE_CODE_GROUP'] = df.loc[is_other, 'OFFENSE_DESCRIPTION']

# Re-normalise case, since the descriptions just copied in have not been
# casefolded yet. The .str accessor keeps missing values as NaN instead of
# raising the way a per-row lambda would.
df['OFFENSE_CODE_GROUP'] = df['OFFENSE_CODE_GROUP'].str.casefold()

In [None]:
# Alphabetical list of the distinct offense groups.
sorted(df['OFFENSE_CODE_GROUP'].unique())

In [None]:
# How many distinct offense groups remain.
df['OFFENSE_CODE_GROUP'].nunique()

As you can see, 86 types of crime are recorded. As each of them was reported by the police department's segmentation methods, I did not want to touch and distort the information. It can be grouped into smaller numbers, but this results in the loss of information.

In [None]:
# Bar chart of the 25 most frequent offense groups.
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (20, 15)
plt.style.use('dark_background')

top_offenses = df['OFFENSE_CODE_GROUP'].value_counts().nlargest(25).index
# Name the column via x= / data=: a Series passed positionally is taken as
# `data` by current seaborn releases rather than as the x variable.
sns.countplot(x='OFFENSE_CODE_GROUP', data=df, order=top_offenses, palette='gnuplot')

plt.title('Major Crimes in Boston', fontweight = 30, fontsize = 20)
plt.xticks(rotation = 90, fontsize=15)
plt.show()

In [None]:
pip install squarify

In [None]:
# Tree map of the 25 most frequent offense groups (tile area = incident count).
import squarify
top_offenses = df['OFFENSE_CODE_GROUP'].value_counts().head(25)

plt.rcParams['figure.figsize'] = (15, 15)
plt.style.use('fivethirtyeight')

# One colour per tile: a fixed palette of 15 would be recycled across the 25
# tiles, giving unrelated categories the same colour.
tile_colors = plt.cm.magma(np.linspace(0, 1, len(top_offenses)))
squarify.plot(sizes = top_offenses.values, label = top_offenses.index, alpha=.8, color = tile_colors)
plt.title('Tree Map for Top 25 Crimes', fontsize = 20)

plt.axis('off')
plt.show()

In [None]:
# Tree map of incident counts per district (tile area = incident count).
import squarify
district_counts = df['DISTRICT'].value_counts().head(25)

plt.rcParams['figure.figsize'] = (15, 15)
plt.style.use('fivethirtyeight')

# One colour per tile so the palette length always matches the number of
# districts actually plotted.
tile_colors = plt.cm.magma(np.linspace(0, 1, len(district_counts)))
squarify.plot(sizes = district_counts.values, label = district_counts.index, alpha=.8, color = tile_colors)
# The title previously read "Top 25 Crimes" (copied from the offense tree map);
# this chart shows districts.
plt.title('Tree Map of Crimes per District', fontsize = 20)

plt.axis('off')
plt.show()

In [None]:
# Word cloud of the offense descriptions.

from wordcloud import WordCloud

plt.rcParams['figure.figsize'] = (15, 15)
plt.style.use('fast')

# Build the corpus from every description. str(Series) would only give the
# truncated console repr (a few rows plus the "Name: ..., dtype: ..." footer),
# so the cloud would not reflect the data.
description_text = ' '.join(df['OFFENSE_DESCRIPTION'].dropna().astype(str))

wc = WordCloud(background_color = 'orange', width = 1500, height = 1500).generate(description_text)
plt.title('Description of the Crime', fontsize = 20)

plt.imshow(wc)
plt.axis('off')
plt.show()

A word cloud highlights textual data points by sizing each word according to its frequency (or importance).

In [None]:
# Bar chart of the 25 streets with the most recorded incidents.
plt.rcParams['figure.figsize'] = (20, 15)
plt.style.use('dark_background')

top_streets = df['STREET'].value_counts().nlargest(25).index
# Name the column via x= / data=: a Series passed positionally is taken as
# `data` by current seaborn releases rather than as the x variable.
sns.countplot(x='STREET', data=df, order=top_streets, palette = 'ocean')

# The title now matches the 25 bars drawn, and the axis labels match the axes:
# streets run along x, incident counts along y.
plt.title('Top 25 Streets in Crime', fontweight = 50, fontsize = 50)
plt.xlabel('Streets', fontsize=20)
plt.ylabel('Counts', fontsize=20)

plt.xticks(rotation = 90, fontsize=15)
plt.show()

In [None]:
# Rebuild a fresh positional index; because drop is not requested, the previous
# index is kept as a new column.
df = df.reset_index()

In [None]:
# Constant counter column: summing it over any grouping gives the number of rows
# (incidents) in that group.
df = df.assign(incidents=1)

In [None]:
# Step 1: incidents per street per calendar day.
street_day_keys = ['STREET', pd.Grouper(key='OCCURRED_ON_DATE', freq='D')]
x = df.groupby(street_day_keys)['incidents'].sum().reset_index()

# Step 2: mean of those daily totals per street, rounded to a whole number.
y = x.groupby('STREET')['incidents'].mean().round(0).reset_index()

Average daily crimes per street.

In [None]:
# The 15 streets with the highest average daily incident count.
y.sort_values('incidents', ascending=False).iloc[:15]

Observations:
* Washington Street has almost twice the average daily crimes of its closest rival, Blue Hill Avenue. 
* Apart from this, the values for the rest of the streets are not that different.
* But what is special about Washington Street? It is the longest street in Boston, at around 59.5 km, which explains why it appears more crime-intense than the others.

In [None]:
# Incidents per district per calendar day ...
district_day_keys = ['DISTRICT', pd.Grouper(key='OCCURRED_ON_DATE', freq='D')]
x = df.groupby(district_day_keys)['incidents'].sum().reset_index()

# ... averaged per district, rounded, and listed from busiest to quietest.
y = x.groupby('DISTRICT')['incidents'].mean().round(0).reset_index()
y.sort_values('incidents', ascending=False)

12 districts with daily average crime values.

In [None]:
# Incidents per (year, district, calendar day) ...
year_district_day_keys = ['YEAR', 'DISTRICT', pd.Grouper(key='OCCURRED_ON_DATE', freq='D')]
x = df.groupby(year_district_day_keys)['incidents'].sum().reset_index()

# ... averaged per (year, district) and rounded; preview the five highest.
y = x.groupby(['YEAR', 'DISTRICT'])['incidents'].mean().round(0).reset_index()
y.sort_values('incidents', ascending=False).head()

In [None]:
# Grouped bar chart of the per-district daily averages held in `y`,
# with one bar per year inside each district.
plt.rcParams.update({'figure.figsize': (20, 15)})
sns.set_style("whitegrid")

sns.barplot(data=y, x='DISTRICT', y='incidents', hue='YEAR', palette='viridis')

plt.title('Daily Average Crimes Across Years Per District', fontweight=50, fontsize=20)
plt.xlabel('Districts', fontsize=20)
plt.ylabel('Daily Average Crimes', fontsize=20)

# Tick labels and legend share one font size; district names are rotated vertical.
plt.xticks(rotation=90, fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize=15)

plt.show()

Observations:
* Most districts reached their peak in 2017.
* In the C4 district, the average number of cases per day has increased towards 2018.
* A7, D4 districts have similar trends.
* B2 district, which has the highest number of cases, reached its peak point in 2017 and then reached its lowest level.  
* Half of the districts reached their peak in 2017.

## Average Monthly Crime Change

In [None]:
# Incidents per calendar month (tagged with their year) ...
year_month_keys = ['YEAR', pd.Grouper(key='OCCURRED_ON_DATE', freq='M')]
x = df.groupby(year_month_keys)['incidents'].sum().reset_index()

# ... averaged within each year and rounded to a whole number.
y = x.groupby('YEAR')['incidents'].mean().round(0).reset_index()
y.head()

In [None]:
# Fractional change of the average monthly incident count from each row of `y`
# to the next (the first row has no predecessor, so it is NaN).
y['incidents'].pct_change()

Observations:
* The average number of crimes per month increased from 2016 through 2017, then decreased by about 14% (a change of -0.14) in 2018.
* Why did it decrease? The Boston mayoral election was held on November 7, 2017. Remember that organized crime-related violence has important electoral consequences. This is one of many possible reasons why the average monthly crime count decreased in 2018.

## Crimes on each day

In [None]:
import plotly.express as px

# Total incidents for each day of the week, drawn as shares of a pie.
x = df.groupby('DAY_OF_WEEK')['incidents'].agg('sum').reset_index()
fig = px.pie(
    data_frame=x,
    names='DAY_OF_WEEK',
    values='incidents',
    title='Crimes by day of week',
)
fig.show()

Observations:
* There are no major differences in crime counts between the days of the week.
* Crime is most intense on Friday.
* Crime is least intense on Sunday.

## Avg Crimes on each month

In [None]:
# Set MONTH to the abbreviated month name (e.g. 'Jan'). The vectorised
# .dt.strftime replaces a per-row lambda and returns NaN for missing timestamps
# instead of raising.
df['MONTH'] = df['OCCURRED_ON_DATE'].dt.strftime('%b')

# Incidents per calendar month (one row per month of each year) ...
x = df.groupby(['MONTH', pd.Grouper(key='OCCURRED_ON_DATE', freq='M')])['incidents'].sum().reset_index()
# ... averaged across years for each month name.
y = x.groupby('MONTH')['incidents'].mean().reset_index()

fig = px.pie(y, values='incidents', names='MONTH', title='Avg Crimes per Month')
fig.show()

Observations:
* Crime is most intense in July, August, and May.
* Crime is least intense in September, February, and June. 

# Hourly Crime Rate Across Years

In [None]:
# Save the processed frame to the working directory. No options are passed, so
# pandas' defaults apply (the row index is written as the first column).
df.to_csv('crime_processed.csv')

In [None]:
# For each year, the number of hours between its first and last recorded
# incident. `total_hours` keeps the same order as `years`.
years = [2015, 2016, 2017, 2018]
total_hours = []
for year in years:
    occurred = df.loc[df['YEAR'] == year, 'OCCURRED_ON_DATE']
    observed_span = occurred.max() - occurred.min()
    total_hours.append(observed_span.total_seconds() / 3600)

In [None]:
# Display the hours covered by each year, in the order of `years`.
total_hours

In [None]:
# Incidents per year, paired positionally with the hours covered by that year.
# NOTE(review): this relies on `total_hours` being in the same order as the
# sorted YEAR groups.
x = df.groupby('YEAR')['incidents'].sum().reset_index()
x['total_hours'] = total_hours

# Incidents per hour, to one decimal place.
x['hourly_rate'] = x['incidents'].div(x['total_hours']).round(1)
x

Observations:
* As you can see, there are no major differences between the hourly rate over the years. 
* Although the values are close, the highest hourly crime rate was observed in 2017.

# Hourly Crime Rate Across Districts

In [None]:
# For each district, the number of hours between its first and last recorded
# incident. `total_hours` keeps the same order as `districts`.
districts = [
    'D14', 'C11', 'D4', 'B3', 'B2', 'C6',
    'A1', 'E5', 'A7', 'E13', 'E18', 'A15',
]
total_hours = []
for district in districts:
    occurred = df.loc[df['DISTRICT'] == district, 'OCCURRED_ON_DATE']
    observed_span = occurred.max() - occurred.min()
    total_hours.append(observed_span.total_seconds() / 3600)

In [None]:
# Incident totals indexed by district, reordered to follow `districts` so the
# positional `total_hours` list lines up row for row.
x = (
    df.groupby('DISTRICT')['incidents'].sum()
      .to_frame()
      .reindex(districts)
      .assign(total_hours=total_hours)
)

# Incidents per hour, to one decimal place; DISTRICT goes back to a column.
x['hourly_rate'] = x['incidents'].div(x['total_hours']).round(1)
x = x.reset_index()
x

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the hourly incident rate per district, one colour per district.
plt.rcParams.update({'figure.figsize': (18, 7)})
sns.set_style("whitegrid")

sns.barplot(data=x, x='DISTRICT', y='hourly_rate', hue=x['DISTRICT'], palette='viridis')

plt.title('Hourly Crime Rate Per District', fontweight=50, fontsize=20)
plt.xlabel('Districts', fontsize=15)
plt.ylabel('Hourly Rate', fontsize=15)

# Tick labels and legend share one font size; district names are rotated vertical.
plt.xticks(rotation=90, fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize=15)

plt.show()

Observations:
* In district B2, almost 2 crimes are committed per hour. That is more than one and a half times the rate of its closest rivals, C11 and D4.
* A15 is the safest district in Boston, with the lowest hourly crime rate.

In [None]:
# Scale the hourly rate (already rounded to one decimal) to a 24-hour day.
x['daily_rate'] = x['hourly_rate'].mul(24)
x

In [None]:
# Total daily rate across all districts.
sum(x['daily_rate'])

In [None]:
import plotly.express as px

# Each district's share of the total daily rate.
fig = px.pie(
    data_frame=x,
    names='DISTRICT',
    values='daily_rate',
    title='Daily Police Force Distribution Percents per District',
)
fig.show()

Observations:
* I can distribute the police force across the districts based on the table and pie chart above. 
* On a daily basis, I should assign at least 43 police officers to the B2 area, 26 officers each to C11 and D4, and so on. 
* The number of police officers assigned to each district may increase or decrease depending on seasonality, hour of the day, and day of the week. 
* Some days of the week are more intense in crime, some hours are less safe, and some months are more dangerous. I have already demonstrated these trends in previous work.