# Exploration and interactive visualization of crime locations in Denver

In [None]:
# packages

# standard
import numpy as np
import pandas as pd

# plots
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# load the raw CSV into a data frame
df = pd.read_csv(filepath_or_buffer='../input/crime-and-weed/crime_marijuana.csv')

In [None]:
# MJ_RELATION_TYPE has "\r" at the end => clean up first
def clean_up(i_str):
    """Return ``i_str`` with any trailing carriage return removed.

    Only carriage-return characters at the very end of the string are
    stripped, so a value that does not end in one is returned unchanged
    instead of losing its last real character.

    Non-string values (for example a float NaN standing in for a missing
    entry) are passed through untouched rather than raising ``TypeError``.
    """
    if not isinstance(i_str, str):
        return i_str
    return i_str.rstrip('\r')

# run every MJ_RELATION_TYPE entry through clean_up and store the result back
df['MJ_RELATION_TYPE'] = [clean_up(raw_value) for raw_value in df['MJ_RELATION_TYPE']]

In [None]:
# peek at the first five rows
df.head(n=5)

In [None]:
# dimensions of the data frame as a (rows, columns) tuple
df.shape

In [None]:
# print the name of every column as a plain Python list
print(df.columns.tolist())

# Dates

In [None]:
# parse the three date columns into datetime values
for date_col in ('FIRST_OCCURENCE_DATE', 'LAST_OCCURENCE_DATE', 'REPORTDATE'):
    df[date_col] = pd.to_datetime(df[date_col])

#### Compare dates

In [None]:
# numeric-style summary (count, mean, min, quartiles, max) of the first occurrence date
# NOTE(review): `datetime_is_numeric` was removed in pandas 2.0 - drop the
# argument when running on pandas >= 2.0; confirm the target pandas version.
df['FIRST_OCCURENCE_DATE'].describe(datetime_is_numeric=True)

In [None]:
# numeric-style summary (count, mean, min, quartiles, max) of the last occurrence date
# NOTE(review): `datetime_is_numeric` was removed in pandas 2.0 - drop the
# argument when running on pandas >= 2.0; confirm the target pandas version.
df['LAST_OCCURENCE_DATE'].describe(datetime_is_numeric=True)

In [None]:
# numeric-style summary (count, mean, min, quartiles, max) of the report date
# NOTE(review): `datetime_is_numeric` was removed in pandas 2.0 - drop the
# argument when running on pandas >= 2.0; confirm the target pandas version.
df['REPORTDATE'].describe(datetime_is_numeric=True)

In [None]:
# compare the first occurrence date against the report date and the
# last occurrence date, one scatter plot per comparison
for other_date_col in ('REPORTDATE', 'LAST_OCCURENCE_DATE'):
    plt.scatter(df['FIRST_OCCURENCE_DATE'], df[other_date_col])
    plt.xlabel('FIRST_OCCURENCE_DATE')
    plt.ylabel(other_date_col)
    plt.grid()
    plt.show()

In [None]:
# add date differences (in whole days, relative to first occurrence) to data frame
df['LAG_REPORT'] = (df['REPORTDATE'] - df['FIRST_OCCURENCE_DATE']).dt.days

# rows where the difference is missing are treated as zero lag
df['LAG_LAST'] = (df['LAST_OCCURENCE_DATE'] - df['FIRST_OCCURENCE_DATE']).dt.days
df['LAG_LAST'] = df['LAG_LAST'].fillna(0).astype(int)

# and plot them
df['LAG_REPORT'].plot(kind='hist', bins=25)
plt.title('Lag report date / first occurrence date')
plt.grid()
plt.show()

# second histogram shows LAG_LAST so that the data matches its title
df['LAG_LAST'].plot(kind='hist', bins=25)
plt.title('Lag last occurrence date / first occurrence date')
plt.grid()
plt.show()

In [None]:
# rows whose report date lags the first occurrence by more than 50 days
df.loc[df['LAG_REPORT'] > 50]

In [None]:
# rows whose last occurrence lags the first occurrence by more than 50 days
df.loc[df['LAG_LAST'] > 50]

#### We will focus on the first occurrence date in the following

In [None]:
# derive calendar year and month from the first occurrence date
df['YEAR'] = df['FIRST_OCCURENCE_DATE'].dt.year
df['MONTH'] = df['FIRST_OCCURENCE_DATE'].dt.month

# combined "<year>-<month>" label (the month is not zero-padded)
year_label = df['YEAR'].astype(str)
month_label = df['MONTH'].astype(str)
df['YEAR_MONTH'] = year_label + '-' + month_label

# Explore categorical variables

In [None]:
# store the id/code columns as object dtype so they are handled as
# categories rather than as numbers
for id_col in ('DISTRICT_ID', 'PRECINCT_ID', 'OFFENSE_CODE'):
    df[id_col] = df[id_col].astype(object)

In [None]:
# bar chart: number of records per district
district_counts = df['DISTRICT_ID'].value_counts()
plt.figure(figsize=(8, 6))
district_counts.plot.bar()
plt.title('DISTRICT')
plt.grid()
plt.show()

In [None]:
# bar chart: number of records per precinct
precinct_counts = df['PRECINCT_ID'].value_counts()
plt.figure(figsize=(12, 8))
precinct_counts.plot.bar()
plt.title('PRECINCT')
plt.grid()
plt.show()

In [None]:
# bar chart: number of records per offense category
category_counts = df['OFFENSE_CATEGORY_ID'].value_counts()
plt.figure(figsize=(16, 6))
category_counts.plot.bar()
plt.title('OFFENSE_CATEGORY')
plt.grid()
plt.show()

In [None]:
# bar chart: number of records per offense type
offense_type_counts = df['OFFENSE_TYPE_ID'].value_counts()
plt.figure(figsize=(18, 6))
offense_type_counts.plot.bar()
plt.title('OFFENSE_TYPE')
plt.grid()
plt.show()

In [None]:
# bar chart: number of records per marijuana relation type
relation_counts = df['MJ_RELATION_TYPE'].value_counts()
plt.figure(figsize=(8, 6))
relation_counts.plot.bar()
plt.title('MJ_RELATION')
plt.grid()
plt.show()

In [None]:
# bar chart: number of records per neighborhood
neighborhood_counts = df['NEIGHBORHOOD_ID'].value_counts()
plt.figure(figsize=(18, 6))
neighborhood_counts.plot.bar()
plt.title('NEIGHBORHOOD')
plt.grid()
plt.show()

# Time

In [None]:
# bar chart: number of records per year
# sort_index keeps the years in their natural (chronological) order
yearly_counts = df['YEAR'].value_counts().sort_index()
plt.figure(figsize=(10, 6))
yearly_counts.plot.bar()
plt.title('YEAR')
plt.grid()
plt.show()

#### 2020 is obviously incomplete => better to ignore it

In [None]:
# bar chart: number of records per calendar month
# sort_index keeps the months in their natural order
monthly_counts = df['MONTH'].value_counts().sort_index()
plt.figure(figsize=(10, 6))
monthly_counts.plot.bar()
plt.title('MONTH')
plt.grid()
plt.show()

#### More "activity" in the summer months. Not surprising.

In [None]:
# find the busiest year/month combinations
# no sort_index here: the bars should stay ordered by size
year_month_counts = df['YEAR_MONTH'].value_counts()
plt.figure(figsize=(18, 6))
year_month_counts.plot.bar()
plt.title('YEAR_MONTH')
plt.grid()
plt.show()

# Geography

In [None]:
# interactive map of crime locations, colored by district;
# hovering over a point additionally shows the offense type
fig = px.scatter(
    df,
    x='GEO_X',
    y='GEO_Y',
    color='DISTRICT_ID',
    hover_data=['OFFENSE_TYPE_ID'],
    opacity=0.5,
).update_layout(
    title='Crime Map (by District)',
    xaxis_title='GEO_X',
    yaxis_title='GEO_Y',
)
fig.show()

In [None]:
# interactive map of crime locations, colored by offense category;
# hovering over a point additionally shows the offense type
fig = px.scatter(
    df,
    x='GEO_X',
    y='GEO_Y',
    color='OFFENSE_CATEGORY_ID',
    hover_data=['OFFENSE_TYPE_ID'],
    opacity=0.5,
).update_layout(
    title='Crime Map (by Offense Category)',
    xaxis_title='GEO_X',
    yaxis_title='GEO_Y',
)
fig.show()

In [None]:
# interactive map of crime locations, colored by marijuana relation type
# (industry / non-industry)
fig = px.scatter(
    df,
    x='GEO_X',
    y='GEO_Y',
    color='MJ_RELATION_TYPE',
    hover_data=['MJ_RELATION_TYPE'],
    opacity=0.5,
).update_layout(
    title='Crime Map (by MJ_RELATION_TYPE)',
    xaxis_title='GEO_X',
    yaxis_title='GEO_Y',
)
fig.show()

In [None]:
# interactive map of crime locations, colored by year of first occurrence
fig = px.scatter(
    df,
    x='GEO_X',
    y='GEO_Y',
    color='YEAR',
    hover_data=['YEAR'],
    opacity=0.5,
).update_layout(
    title='Crime Map (by Year)',
    xaxis_title='GEO_X',
    yaxis_title='GEO_Y',
)
fig.show()

In [None]:
# interactive map of crime locations, colored by month of first occurrence;
# hovering over a point additionally shows the year
fig = px.scatter(
    df,
    x='GEO_X',
    y='GEO_Y',
    color='MONTH',
    hover_data=['YEAR'],
    opacity=0.5,
).update_layout(
    title='Crime Map (by Month)',
    xaxis_title='GEO_X',
    yaxis_title='GEO_Y',
)
fig.show()