In [None]:
#import necessary libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import datetime
from plotly.subplots import make_subplots

In [None]:
# current version of seaborn generates a bunch of warnings that we'll ignore
import warnings 
warnings.filterwarnings("ignore")

In [None]:
# Path of the Boston crime CSV, relative to the current working directory.
file = r'../input/crimes-in-boston/crime.csv'
# Read it as comma-separated text decoded as latin-1.
crime = pd.read_csv(filepath_or_buffer=file, encoding='latin-1', sep=',')

In [None]:
# Preview the first five rows of the loaded table.
crime.head(n=5)

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
crime.describe(include="all")

In [None]:
# Map the dataset's upper-case headers to shorter title-case names.
# Columns that are not listed here keep their original names.
rename = dict(
    OFFENSE_CODE_GROUP='Group',
    OFFENSE_DESCRIPTION='Description',
    DISTRICT='District',
    REPORTING_AREA='Area',
    SHOOTING='Shooting',
    OCCURRED_ON_DATE='Date',
    YEAR='Year',
    MONTH='Month',
    DAY_OF_WEEK='Day',
    HOUR='Hour',
    STREET='Street',
)
# index=str additionally passes every row label through str(), so the
# index holds string labels after this call.
crime.rename(columns=rename, index=str, inplace=True)

In [None]:
# Replace the 'Date' column with its pd.to_datetime conversion.
crime['Date'] = crime['Date'].pipe(pd.to_datetime)

In [None]:
# Print the column dtypes, the per-column null counts and the frame's
# shape, with a '+-' rule between the sections.
divider = '+-' * 20
print(crime.dtypes, divider, crime.isnull().sum(), divider, crime.shape, sep='\n')

In [None]:
# Fill in NaNs in the Shooting column with 'N'.
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on `crime.Shooting`: that chained form operates on an
# intermediate Series and, under pandas Copy-on-Write, never updates `crime`.
# The warning pandas emits for it would also be hidden by the notebook's
# global warnings filter, so the failure would be silent.
crime['Shooting'] = crime['Shooting'].fillna('N')

## Data Exploration

### Focusing on top crimes

In [None]:
# There are a number of crime types, so tally incidents per offense group
# first; value_counts() orders the tally from most to least frequent, which
# is what the "top crimes" plots below rely on.
crime_count = crime['Group'].value_counts()
values = crime_count.to_numpy()
categories = pd.DataFrame({'Group': crime_count.index, 'values': values})

In [None]:
# Treemap with one tile per offense group, sized by its incident count.
px.treemap(
    data_frame=categories,
    path=['Group'],
    values=categories['values'],
    color_discrete_sequence=px.colors.sequential.RdBu,
    title='Major Crimes in Boston',
    height=700,
)

In [None]:
# Bar chart of the ten most frequent offense groups. `categories` is already
# ordered by descending count, so the first ten rows are the top ten.
Group = categories['Group'].iloc[:10]
Values = categories['values'].iloc[:10]

fig = px.bar(
    data_frame=categories,
    x=Group,
    y=Values,
    color=categories['Group'].iloc[:10],
    title='Top 10 crimes in Boston',
    height=400,
)

# Axis titles and spacing between bars.
fig.update_layout(
    bargroupgap=0.1,
    bargap=0.2,
    yaxis_title_text='Count',
    xaxis_title_text='Crime',
)

fig.show()

- Motor Vehicle Accident Response is the top crime, followed by Larceny, as illustrated above.
- Should we now check the top crimes per year? Maybe this is different; let's check.

## Yearly crime Rate

In [None]:
# Tally incidents per year and store the result as a two-column frame
# ('Year', 'values').
crimes_year = crime['Year'].value_counts()
yearly = pd.DataFrame({'Year': crimes_year.index, 'values': crimes_year.values})

In [None]:
# Pie chart: share of all incidents contributed by each year.
fig = px.pie(
    data_frame=yearly,
    names='Year',
    values='values',
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.show()

- 2015 has the fewest crimes; the count increased in 2016 and 2017 but dropped significantly in 2018. This may be because the data only runs from June 2015 to September 2018, so we need a monthly analysis next.

## Monthly Analysis

In [None]:
# Tally incidents per month value and store the result as a two-column
# frame ('Month', 'values').
crimes_month = crime['Month'].value_counts()
monthly = pd.DataFrame({'Month': crimes_month.index, 'values': crimes_month.values})

In [None]:
# Bar chart of incident counts per month, one colour per month value.
fig = px.bar(
    data_frame=monthly,
    x=monthly['Month'],
    y=monthly['values'],
    color=monthly['Month'],
    title='Monthly Crime breakdown',
    height=400,
)

# Axis titles and spacing between bars.
fig.update_layout(
    bargroupgap=0.1,
    bargap=0.2,
    yaxis_title_text='Count',
    xaxis_title_text='Month',
)

fig.show()

In [None]:
# Tally incidents per day-of-week value and store the result as a
# two-column frame ('Day', 'values').
crimes_day = crime['Day'].value_counts()
daily = pd.DataFrame({'Day': crimes_day.index, 'values': crimes_day.values})

In [None]:
# Bar chart of incident counts per day of the week, one colour per day.
fig = px.bar(
    data_frame=daily,
    x=daily['Day'],
    y=daily['values'],
    color=daily['Day'],
    title='Daily Crime breakdown',
    height=400,
)

# Axis titles and spacing between bars.
fig.update_layout(
    bargroupgap=0.1,
    bargap=0.2,
    yaxis_title_text='Count',
    xaxis_title_text='Day',
)

fig.show()

In [None]:
# Tally incidents per hour value and store the result as a two-column
# frame ('Hour', 'values').
crimes_hour = crime['Hour'].value_counts()
hourly = pd.DataFrame({'Hour': crimes_hour.index, 'values': crimes_hour.values})

In [None]:
# Bar chart of incident counts per hour value, one colour per hour.
# The data_frame must be `hourly` -- the frame that owns the 'Hour' and
# 'values' columns -- not the unrelated `daily` frame. Columns are referenced
# by name so Plotly Express resolves them against that frame.
fig = px.bar(
    hourly,
    x='Hour',
    y='values',
    title='Hourly Crime breakdown',
    color='Hour',
    height=400,
)

# Axis titles and spacing between bars.
fig.update_layout(
    xaxis_title_text='Hour',  # xaxis label
    yaxis_title_text='Count',  # yaxis label
    bargap=0.2,
    bargroupgap=0.1,
)

fig.show()

#### Observations
- Friday has the highest crime count and Sunday the lowest.
- June, July, and August have a higher crime count on average.
- Evenings have the most crimes and early mornings the fewest.