In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

In [None]:
#importing initial libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Silence every warning so the notebook output stays readable.
# NOTE(review): this blanket filter also hides deprecation and
# SettingWithCopy warnings raised by later cells - consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the crime incidents file and preview its first rows.
df1 = pd.read_csv(
    filepath_or_buffer="../input/crimes-in-boston/crime.csv",
    engine='python',
)
df1.head()

In [None]:
# Load the offense codes file and preview its first rows.
df2 = pd.read_csv(
    filepath_or_buffer="../input/crimes-in-boston/offense_codes.csv",
    engine='python',
)
df2.head()

In [None]:
# Compare the total row count with the number of rows that would remain
# after keeping only the first occurrence of each INCIDENT_NUMBER.
print(len(df1))
print(int((~df1.duplicated(subset='INCIDENT_NUMBER', keep='first')).sum()))

In [None]:
# Keep only the first row of each INCIDENT_NUMBER and renumber the index.
first_occurrence = ~df1.duplicated(subset='INCIDENT_NUMBER', keep='first')
df1 = df1.loc[first_occurrence].reset_index(drop=True)

In [None]:
# Print a summary of df1 (columns, dtypes and non-null counts) to spot missing values.
df1.info()

#### Here we have missing values in some columns, such as **DISTRICT, SHOOTING, STREET, Lat, Long**. We may need to drop them, fill them, or handle them in some other way, **according to our needs**, so that we can work with the data efficiently later on!

In [None]:
# Show the rows whose Lat value is missing.
df1.loc[df1['Lat'].isna()]

In [None]:
# Distinct Location values among the rows whose Lat is missing.
df1.loc[df1['Lat'].isna(), 'Location'].unique()
# No values besides (0.00000, 0.00000)

### Initially, I wanted to fill the missing values in the **Lat and Long columns** using the **Location column**, which should hold the same coordinates. Looking at the rows above that have missing Lat or Long values, we can see that Location itself has no missing values in those rows. **However**, Location's values there are **(0.0000, 0.0000), i.e. they consist only of zeroes**. Hence, I suspect these are wrong, unsuitable values and decided not to use them for filling. Coordinates that are all zeroes are not normal, and running analyses on values filled with zeroes could be misleading!

In [None]:
# Compare the total row count with the number of rows that would remain
# after keeping only the first occurrence of each CODE.
print(len(df2))
print(int((~df2.duplicated(subset='CODE', keep='first')).sum()))

In [None]:
# Keep only the first row of each CODE and renumber the index.
first_code_row = ~df2.duplicated(subset='CODE', keep='first')
df2 = df2.loc[first_code_row].reset_index(drop=True)

In [None]:
# Print a summary of df2 (columns, dtypes and non-null counts) after de-duplication.
df2.info()

In [None]:
# importing libraries for Geographic and Interactive visualizations
from folium import Choropleth, Circle, Marker, Map
from folium.plugins import HeatMap, MarkerCluster
import plotly.express as px
import plotly.graph_objects as go
import math

In [None]:
# Restrict to 2015 only: out of interest, and so the map visualizations
# further down do not freeze the kernel.
df1_part = df1[df1['YEAR'] == 2015]

# Count incidents per offense group and keep the ten largest groups.
df1_2015 = (
    df1_part
    .groupby('OFFENSE_CODE_GROUP')['INCIDENT_NUMBER']
    .count()
    .reset_index()
    .sort_values(by='INCIDENT_NUMBER', ascending=False)
    .head(10)
)
df1_2015

In [None]:
# Interactive plotly bar chart: number of incidents for each of the ten
# most frequent offense groups held in df1_2015.
fig = px.bar(
    data_frame=df1_2015,
    x='OFFENSE_CODE_GROUP',
    y='INCIDENT_NUMBER',
    color='INCIDENT_NUMBER',
    hover_data=['OFFENSE_CODE_GROUP', 'INCIDENT_NUMBER'],
    labels={'INCIDENT_NUMBER': 'The Number of Crimes'},
    title='The Number of Crimes over Offense Group',
    width=1000,
    height=500,
)
fig.show()

In [None]:
# Split the incidents into weekend (Saturday/Sunday) and midweek (all other days).
# A single boolean mask is built once and negated for the midweek frame; the
# previous `~(Saturday) | (Sunday)` expression only negated the Saturday test
# (`~` binds tighter than `|`), so Sunday rows were wrongly counted as midweek.
is_weekend = df1['DAY_OF_WEEK'].isin(['Saturday', 'Sunday'])
df1_we = df1[is_weekend]
df1_mw = df1[~is_weekend]

**Grouping by Hour**

In [None]:
# Replace the weekend frame with the number of incidents per hour
# (columns: HOUR, INCIDENT_NUMBER).
df1_we = df1_we.groupby('HOUR', as_index=False)['INCIDENT_NUMBER'].count()
df1_we

In [None]:
# Replace the midweek frame with the number of incidents per hour
# (columns: HOUR, INCIDENT_NUMBER).
df1_mw = df1_mw.groupby('HOUR', as_index=False)['INCIDENT_NUMBER'].count()
df1_mw

In [None]:
# Grouped interactive bar chart: hourly incident counts, weekend vs. midweek.
fig = go.Figure(layout={'height':550})
fig.add_trace(go.Bar(x=df1_we['HOUR'], y=df1_we['INCIDENT_NUMBER'], name='Weekend', marker_color='rgb(55, 83, 109)'))
fig.add_trace(go.Bar(x=df1_mw['HOUR'], y=df1_mw['INCIDENT_NUMBER'], name='Midweek', marker_color='rgb(26, 118, 255)'))
# Axis titles use the nested title.text / title.font form: the flat
# `titlefont` attribute is deprecated and rejected by recent plotly releases.
fig.update_layout(
    title='Number of Offenses in Weekend/Midweek over the Hours',
    xaxis_tickfont_size=14,
    yaxis=dict(title=dict(text='Number of Offenses', font=dict(size=16)), tickfont_size=14),
    xaxis=dict(title=dict(text='Hour', font=dict(size=16)), tickfont_size=14),
    legend=dict(x=0, y=1.0, bgcolor='rgba(255, 255, 255, 0)', bordercolor='rgba(255, 255, 255, 0)'),
    barmode='group', bargap=0.15, bargroupgap=0.1,
)
fig.show()

In [None]:
# 2-D histogram (heat scale) of incident counts by weekday and hour.
x = df1.DAY_OF_WEEK
y = df1.HOUR
fig = go.Figure(go.Histogram2d(x=x,y=y))
# Axis titles use the nested title.text / title.font form: the flat
# `titlefont` attribute is deprecated and rejected by recent plotly releases.
# The weekday axis is pinned to calendar order; otherwise the categories are
# drawn in the order they first appear in the data.
fig.update_layout(
    title='Heat scale on number of Offenses',
    xaxis_tickfont_size=14,
    yaxis=dict(title=dict(text='Hour', font=dict(size=16)), tickfont_size=14),
    xaxis=dict(
        title=dict(text='Week day', font=dict(size=16)),
        tickfont_size=14,
        categoryorder='array',
        categoryarray=['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                       'Friday', 'Saturday', 'Sunday'],
    ),
)
fig.show()

In [None]:
# Marker-cluster map of the incidents in df1_part; rows lacking either
# coordinate are skipped.
m = Map(location=[42.361145, -71.057083], zoom_start=13)
cluster = MarkerCluster()
has_coords = df1_part['Lat'].notna() & df1_part['Long'].notna()
for lat, lon in zip(df1_part.loc[has_coords, 'Lat'], df1_part.loc[has_coords, 'Long']):
    cluster.add_child(Marker([lat, lon]))
m.add_child(cluster)

In [None]:
# Heatmap of crime/offense locations for the incidents in df1_part.
# Rows lacking either coordinate are dropped in a single pass, and the name is
# rebound rather than calling dropna(inplace=True) on a filtered slice of df1
# (which triggers pandas' SettingWithCopyWarning).
df1_part = df1_part.dropna(subset=['Lat', 'Long'])
m = Map(location=[42.361145, -71.057083], zoom_start=13)
HeatMap(data=df1_part[['Lat', 'Long']], radius=10).add_to(m)
m

In [None]:
# The ten streets with the highest number of crimes/offenses.
st = (
    df1.groupby(['STREET'])['INCIDENT_NUMBER']
    .count()
    .reset_index()
    .sort_values('INCIDENT_NUMBER', ascending=False)
    .head(10)
)
st

In [None]:
# Pie chart: share of crimes/offenses for each street in `st`.
# Columns are referenced by name (px looks them up in the data frame), and
# the chart title's spelling of "distribution" is corrected.
fig = px.pie(
    data_frame=st,
    names='STREET',
    values='INCIDENT_NUMBER',
    title='Percentage distribution of Offenses on Streets',
)
fig.show()

In [None]:
# Line plot: number of crimes/offenses for each street in `st`.
# go.Scatter replaces the deprecated go.Line alias; 'lines+markers' is the
# mode plotly picks by default for a short scatter trace like this one.
fig = go.Figure(layout={'height':550})
fig.add_trace(go.Scatter(x=st['STREET'], y=st['INCIDENT_NUMBER'], name='Street',
                         mode='lines+markers', marker_color='rgb(26, 118, 255)'))
# Axis titles use the nested title.text / title.font form: the flat
# `titlefont` attribute is deprecated and rejected by recent plotly releases.
fig.update_layout(
    title='Number of Most Offenses over the Streets',
    xaxis_tickfont_size=14,
    yaxis=dict(title=dict(text='Number of Offenses', font=dict(size=16)), tickfont_size=14),
    xaxis=dict(title=dict(text='Street', font=dict(size=16)), tickfont_size=14),
)
fig.show()

### I needed a dataframe with the number of cases in **each Month and Year**. Running the line below once per year seems the simplest way to get it, but that takes time and would be unprofessional if there were more than 4 years
#### `df1[df1['YEAR']==2015].groupby('MONTH')['INCIDENT_NUMBER'].count().reset_index()`

In [None]:
# List the distinct years present in the data.
df1['YEAR'].unique()

In [None]:
# Build a table with one row per month (1-12) and one column per year holding
# the number of incidents; month/year combinations with no data are 0.
#
# A single groupby + unstack replaces the previous loop of repeated merges:
# merging the same 'INCIDENT_NUMBER' column four times makes pandas generate
# colliding '_x'/'_y' suffixes, which raises MergeError on pandas >= 2.0.
years = [2015, 2016, 2017, 2018]  # or sorted(df1['YEAR'].unique())
g = (
    df1.groupby(['MONTH', 'YEAR'])['INCIDENT_NUMBER']
    .count()
    .unstack('YEAR')
    .reindex(index=np.arange(1, 13, 1), columns=years)
    .fillna(0)
)
# Downstream cells address the year columns by their string names.
g.columns = [str(year) for year in years]
g = g.rename_axis('MONTH').reset_index()
g

In [None]:
# Line plot: number of cases per month, one trace per year column of `g`.
# go.Scatter replaces the deprecated go.Line alias; 'lines+markers' is the
# mode plotly picks by default for a short scatter trace like these.
fig = go.Figure(layout={'height':550})
year_colors = {
    '2015': 'rgb(227, 20, 20)',
    '2016': 'rgb(21, 5, 247)',
    '2017': 'rgb(45, 227, 5)',
    '2018': 'rgb(86, 0, 245)',
}
for year, color in year_colors.items():
    fig.add_trace(go.Scatter(x=g['MONTH'], y=g[year], name=year,
                             mode='lines+markers', marker_color=color))

# Axis titles use the nested title.text / title.font form: the flat
# `titlefont` attribute is deprecated and rejected by recent plotly releases.
# NOTE(review): xaxis_tickfont_size=5 conflicts with the tickfont_size=14 given
# in the xaxis dict below; both are kept as they were - confirm which is intended.
fig.update_layout(
    title='Number of Offenses in some Years over the Months',
    xaxis_tickfont_size=5,
    yaxis=dict(title=dict(text='Number of Offenses', font=dict(size=16)), tickfont_size=14),
    xaxis=dict(title=dict(text='Month', font=dict(size=16)), tickfont_size=14),
    legend=dict(x=0, y=0.5, bgcolor='rgba(255, 255, 255, 0)', bordercolor='rgba(255, 255, 255, 0)'),
    bargap=0.1, bargroupgap=0.1,
)
fig.show()

## **If you appreciated or liked this notebook, please upvote, comment and follow to see our other work**