In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
from pylab import rcParams
import plotly.graph_objects as go
from pandas.api.types import CategoricalDtype
import warnings
import os 
import folium
from folium.plugins import HeatMap
import pandas_profiling
import plotly.express as px
pd.set_option('display.max_rows', None)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import datetime
from plotly.subplots import make_subplots
%matplotlib inline 
rcParams["figure.figsize"] = 20,9
warnings.filterwarnings("ignore")

print(os.listdir("../input/crimes-in-boston"))

**Uploading and Reading the Data Set**

In [None]:
# Load the crime records into `data`.
# NOTE(review): encoding='latin' is passed explicitly — presumably the CSV is
# not valid UTF-8; confirm before changing it.
data = pd.read_csv(
    '../input/crimes-in-boston/crime.csv',
    encoding='latin',
    engine='python',
)

In [None]:
# Replace the district codes in the DISTRICT column with the names listed below.
#
# The result is assigned back to the column instead of calling
# `data.DISTRICT.replace(..., inplace=True)`: that form mutates an intermediate
# Series (chained in-place modification), which pandas warns about and which
# leaves `data` unchanged when Copy-on-Write is enabled.
DISTRICT_NAMES = {
    'A1': 'Downtown',
    'A15': 'Charlestown',
    'A7': 'East Boston',
    'B2': 'Roxbury',
    'B3': 'Mattapan',
    'C6': 'South Boston',
    'C11': 'Dorchester',
    'D4': 'South End',
    'D14': 'Brighton',
    'E5': 'West Roxbury',
    'E13': 'Jamaica Plain',
    'E18': 'Hyde Park',
}
data['DISTRICT'] = data['DISTRICT'].replace(DISTRICT_NAMES)

In [None]:
# Preview the first rows of the frame.
data.head()

In [None]:
# Print the frame's summary (columns, non-null counts, dtypes).
data.info()

In [None]:
# Descriptive statistics, transposed so that each described column is a row.
data.describe().T

**Data Set Duplication Elimination**

In [None]:
# INCIDENT_NUMBER is treated as the data set's primary key: order the rows by
# it, then keep only the first row of every repeated incident number.
# `data` itself is left untouched; the de-duplicated frame is `case_count`.
case_count = (
    data
    .sort_values("INCIDENT_NUMBER")
    .drop_duplicates(subset="INCIDENT_NUMBER")
)

**Finding and Eliminating Missing Values**

In [None]:
# Number of missing (null) values in each column.
data.isnull().sum()

In [None]:
# Heatmap of the boolean null mask of `data` (rows x columns), drawn on an
# explicitly created 13x6 figure; row tick labels are suppressed.
figure, ax = plt.subplots(figsize=(13, 6))
sns.heatmap(data.isnull(), yticklabels='', ax=ax)

In [None]:
def missing_zero_values_table(data):
    """Summarise zero and missing values per column of ``data``.

    Builds one row per column holding the count of values equal to zero, the
    count and percentage of missing values, their combined count and
    percentage, and the column dtype. Only columns with at least one missing
    value are kept, ordered by missing percentage (highest first), with
    numeric values rounded to one decimal.

    A short summary (frame shape and number of columns with missing values)
    is printed as a side effect.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        The summary table described above, indexed by column name.
    """
    n_rows = len(data)
    null_counts = data.isnull().sum()
    zero_counts = (data == 0.00).astype(int).sum(axis=0)

    summary = pd.DataFrame({
        'Zero Values': zero_counts,
        'Missing Values': null_counts,
        '% of Total Values': 100 * null_counts / n_rows,
    })
    summary['Total Zero Missing Values'] = summary['Zero Values'] + summary['Missing Values']
    summary['% Total Zero Missing Values'] = 100 * summary['Total Zero Missing Values'] / n_rows
    summary['Data Type'] = data.dtypes

    # Keep only the columns that actually have missing values.
    has_missing = summary['Missing Values'] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(1)

    n_cols_total = data.shape[1]
    n_cols_missing = summary.shape[0]
    print(
        f"Your selected dataframe has {n_cols_total} columns and {data.shape[0]} Rows.\n"
        f"There are {n_cols_missing} columns that have missing values."
    )
    return summary

# Show the zero/missing-value summary for the crime data.
missing_zero_values_table(data)

Because the number of these crimes is low and the column therefore carries very little information, we can drop the 'SHOOTING' column from our analysis.

In [None]:
# Remove the SHOOTING column.
# Rebinding `data` (instead of dropping in place) and ignoring an already
# missing column keeps this cell idempotent: re-running it no longer raises
# KeyError.
data = data.drop(columns="SHOOTING", errors="ignore")

District, reporting area, street, as well as latitude and longitude, all have some NAs. None of these fields would benefit from imputation, so I'm choosing to exclude them. The dataset is large, and excluding these rows with missing values shouldn't have a significant impact on the reliability of any inference.

1. **Visualisation**

In [None]:
def treemap(categories, title, path, values):
    """Build a Plotly Express treemap from ``categories`` and display it.

    Parameters
    ----------
    categories : data handed to ``px.treemap`` as its first argument.
    title : figure title.
    path : forwarded as the ``path`` argument of ``px.treemap``.
    values : forwarded as the ``values`` argument of ``px.treemap``.

    Returns
    -------
    None. The figure is shown via ``fig.show()``.
    """
    treemap_options = dict(
        path=path,
        values=values,
        height=700,
        title=title,
        color_discrete_sequence=px.colors.sequential.RdBu,
    )
    figure = px.treemap(categories, **treemap_options)
    # Show label, text and value on the tiles of the first trace.
    figure.data[0].textinfo = 'label+text+value'
    figure.show()

In [None]:
def histogram(data, path, color, title, xaxis, yaxis):
    """Draw and display a Plotly Express histogram.

    Parameters
    ----------
    data : data handed to ``px.histogram`` as its first argument.
    path : forwarded as ``x`` (the variable that is binned).
    color : forwarded as ``color`` (grouping of the bars).
    title : figure title.
    xaxis : x-axis title.
    yaxis : y-axis title.

    Returns
    -------
    None. The figure is shown via ``fig.show()``.
    """
    layout_options = {
        'title_text': title,
        'xaxis_title_text': xaxis,
        'yaxis_title_text': yaxis,
        'bargap': 0.2,
        'bargroupgap': 0.1,
    }
    figure = px.histogram(data, x=path, color=color)
    figure.update_layout(**layout_options)
    figure.show()

In [None]:
def bar(categories, x, y, color, title, xlab, ylab):
    """Draw and display a Plotly Express bar chart (height 400).

    Parameters
    ----------
    categories : data handed to ``px.bar`` as its first argument.
    x, y : forwarded as the ``x`` and ``y`` arguments of ``px.bar``.
    color : forwarded as ``color`` (colouring of the bars).
    title : figure title.
    xlab : x-axis title.
    ylab : y-axis title.

    Returns
    -------
    None. The figure is shown via ``fig.show()``.
    """
    layout_options = {
        'title_text': title,
        'xaxis_title_text': xlab,
        'yaxis_title_text': ylab,
        'bargap': 0.2,
        'bargroupgap': 0.1,
    }
    figure = px.bar(categories, x=x, y=y, color=color, height=400)
    figure.update_layout(**layout_options)
    figure.show()

In [None]:
# Incident count per offense group, in value_counts order, as a two-column
# frame (`OFFENSE_CODE_GROUP`, `values`) for the plots below.
Number_crimes = data['OFFENSE_CODE_GROUP'].value_counts()
values = Number_crimes.values
categories = pd.DataFrame({
    "OFFENSE_CODE_GROUP": Number_crimes.index,
    "values": values,
})

In [None]:
def _most_frequent(column):
    """Return the first entry of ``data[column].value_counts()``, i.e. the most frequent value."""
    return data[column].value_counts().index[0]


max_district_crime = _most_frequent('DISTRICT')
max_street_crime = _most_frequent('STREET')
max_year_crime = _most_frequent('YEAR')
max_hour_crime = _most_frequent('HOUR')
max_month_crime = _most_frequent('MONTH')
max_day_crime = _most_frequent('DAY_OF_WEEK')

# Month names indexed from 0, so month number m is found at month[m - 1].
month = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
         'August', 'September', 'October', 'November', 'December']

summary_lines = [
    ('District with higher occurrence of crimes:', max_district_crime),
    ('Street with higher occurrence of crimes:', max_street_crime),
    ('Year with highest crime occurrence:', max_year_crime),
    ('Hour with highest crime occurrence:', max_hour_crime),
    ('Month with highest crime occurrence:', month[max_month_crime - 1], max_month_crime),
    ('Day with highest crime occurrence:', max_day_crime),
]
for label, *details in summary_lines:
    print(label, *details)

2. **Histogram of Major Crime occurrence in accordance with count**

In [None]:
# Histogram of incidents per offense group, coloured by group.
# Plotly titles do not render Markdown, so `**...**` would appear as literal
# asterisks; the HTML <b> tag is used for the bold emphasis instead.
histogram(data, "OFFENSE_CODE_GROUP", "OFFENSE_CODE_GROUP",
          'Major Crimes in Boston (Showing <b>Motor Vehicle Accident Response</b> as highest)',
          'Crime', 'Count')

In [None]:
# Bar chart of the first ten rows of `categories` (the ten most frequent
# offense groups, since `categories` follows value_counts order).
top_ten = categories.head(10)
bar(
    categories,
    top_ten['OFFENSE_CODE_GROUP'],
    top_ten['values'],
    top_ten['OFFENSE_CODE_GROUP'],
    'Top 10 Major Crimes in Boston City',
    'Crime',
    'Count',
)

In [None]:
# Histogram of de-duplicated incidents per year, on a logarithmic count axis.
# `labels` is keyed by column name, so the former {'x': ..., 'y': ...} mapping
# matched nothing and the axes fell back to default titles. The x label is now
# keyed by 'YEAR' (passed as a plain column name rather than a one-item list)
# and the count-axis title is set explicitly on the layout.
fig = px.histogram(case_count, x='YEAR', template='plotly_white',
                   opacity=0.7, log_y=True, labels={'YEAR': 'YEARS'})
fig.update_layout(coloraxis=dict(colorscale='Bluered_r'), showlegend=False,
                  yaxis_title_text='Case Number Count')

fig.show()

In [None]:
# Share of incidents per year as a pie chart.
Number_crimes_year = data['YEAR'].value_counts()
years = pd.DataFrame({
    "YEAR": Number_crimes_year.index,
    "values": Number_crimes_year.values,
})

fig = px.pie(
    years,
    names='YEAR',
    values='values',
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.show()

In [None]:
# Incident count per month, in value_counts order, as a two-column frame
# (`MONTH`, `values`).
Number_crimes_month = data['MONTH'].value_counts()
months = pd.DataFrame({
    "MONTH": Number_crimes_month.index,
    "values": Number_crimes_month.values,
})

In [None]:
# Horizontal bar chart of incidents per month: counts run along x, months along y.
monthly_bars = go.Bar(
    x=months['values'],
    y=months['MONTH'],
    orientation='h',
    marker=dict(color='rgb(200,14,12)'),
)
fig = go.Figure(monthly_bars)
fig.update_layout(
    title_text='Major Crimes in Boston per month',
    xaxis_title_text='Count',
    yaxis_title_text='Month',
    bargap=0.2,
    bargroupgap=0.1,
)
fig.show()

In [None]:
# Incidents per day of the week, coloured by day.
# Passing the column as `y` produces horizontal bars: the days sit on the
# y-axis and the counts on the x-axis, so the axis titles are assigned
# accordingly (they were previously swapped).
fig = px.histogram(data, y="DAY_OF_WEEK", color="DAY_OF_WEEK")
fig.update_layout(
    title_text='Crime count on each day',
    xaxis_title_text='Crimes Count',
    yaxis_title_text='Day',
    bargap=0.2,
    bargroupgap=0.1
)
fig.show()

In [None]:
# Incidents per hour, coloured by hour.
histogram(
    data,
    path="HOUR",
    color="HOUR",
    title='Crime count on each Hour',
    xaxis='Hour',
    yaxis='Count',
)

In [None]:
# Incidents per year, with the bars of each year split (coloured) by month.
histogram(
    data,
    path="YEAR",
    color="MONTH",
    title='Crime count on each year per month',
    xaxis='Year',
    yaxis='Crimes Count',
)

In [None]:
# Incidents per district, bars ordered from most to fewest incidents.
# The column is passed as `x=` because recent seaborn releases accept plot
# variables by keyword only; a positional 'DISTRICT' is bound to `data` and
# collides with the explicit `data=` argument.
fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(
    x='DISTRICT',
    data=data,
    palette='RdYlGn',
    edgecolor=sns.color_palette('Paired', 20),
    order=data['DISTRICT'].value_counts().index,
    ax=ax,
)
plt.xticks(rotation=90)
plt.title('Number Of Crimes Activities By District')
plt.show()

Relationship between Year and District (Number of crimes)

In [None]:
# Line plot of incident counts per year, one line per district.
#
# NOTE(review): the filter keeps only the months ranked 2nd to 11th by incident
# count (positional slice 1:11 of value_counts), i.e. it excludes the busiest
# month. The intent is not evident from the notebook — confirm it is wanted.
# `.iloc` makes the positional nature of that slice explicit.
kept_months = data['MONTH'].value_counts().iloc[1:11].index
G1 = data[data['MONTH'].isin(kept_months)]

year_by_district = pd.crosstab(G1.YEAR, G1.DISTRICT)

# One distinct colour per district: a fixed 6-colour palette is cycled by
# pandas when there are more columns than colours, making lines ambiguous.
district_colors = sns.color_palette('husl', year_by_district.shape[1])

ax = year_by_district.plot(color=district_colors, figsize=(18, 6))
ax.set_title('Relationship between Year and District (Number of crimes)')
fig = ax.get_figure()
plt.show()