# <center> Montreal Crime Data </center>

Ref : https://www.kaggle.com/stevieknox/montreal-crime-data

In [None]:
# Load the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from IPython.display import HTML
import matplotlib.lines as lines
from wordcloud import WordCloud

In [None]:
# Visualization settings: global seaborn theme, colour palette and shared background colour.
sns.set_style(style='white')
# NOTE(review): sns.set() applies its own default style, so the 'white' style
# selected above is probably superseded by the call below -- confirm.
theme_rc = {
    'figure.figsize': (12, 7),
    'axes.facecolor': 'white',
    'axes.grid': True,
    'grid.color': '.9',
    'axes.linewidth': 1.0,
    'grid.linestyle': '-',
}
sns.set(rc=theme_rc, font_scale=1.5)
custom_colors = ["#3498db", "#95a5a6", "#34495e", "#2ecc71", "#e74c3c"]
sns.set_palette(custom_colors)
# Background colour reused by the figures further down.
background_color = '#fbfbfb'


In [None]:
# Load the raw crime records from the CSV file (path is relative to the working directory).
df_input = pd.read_csv('../input/montreal-crime-data/Montreal Crime Data.csv')

In [None]:
# Quick overview of the raw frame: shape, first rows, then dtypes / non-null counts.
print(f"Shape of dataframe : {df_input.shape}\n")
print("Sample data frame:\n")
display(df_input.head())
print("Dataset summary \n")
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in display() -- that would render a stray "None" under the report.
df_input.info()

In [None]:
# Column names of the raw frame, before any clean-up.
df_input.columns

In [None]:
# Feature 'Unnamed: 0' is just a running series of numbers and adds no value to the
# dataset, so it is removed.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: executing it a second time no longer raises KeyError.
df_input = df_input.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Distinct values found in the feature 'count' (used to decide whether the column is informative).
df_input['count'].unique()

In [None]:
# Feature 'count' contains only one value, so it has no significance for the
# analysis and is removed.
# Re-assigning with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable without a KeyError on the second execution.
df_input = df_input.drop(columns='count', errors='ignore')

In [None]:
# Remove the latitude and longitude features; they are not used in this analysis.
# Re-assigning with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable without a KeyError on the second execution.
df_input = df_input.drop(columns=['longitude', 'latitude'], errors='ignore')

In [None]:
# Columns that remain after removing the unnecessary ones.
df_input.columns

# Feature Analysis

In [None]:
# Categorical feature analysis
def analyze_categorical_feature(fld, display_graph=True):
    """Print a summary of a categorical Series and optionally plot its distribution.

    The report shows the first rows, the null count, the unique values (and how
    many there are) and the value-count table.

    Parameters
    ----------
    fld : pandas.Series
        Column to analyse; ``fld.name`` is used in the figure title.
    display_graph : bool, default True
        When truthy, draw a pie chart and a bar chart of the value counts.

    Returns
    -------
    None
        All results are printed / displayed as a side effect.
    """
    # Compute the frequency table once and reuse it for the report and both charts.
    counts = fld.value_counts()
    df = pd.DataFrame({"Value": counts.index, "Count": counts.values})

    print ("Sample data:\n")
    display(fld.head())
    print ("\nNull value count : ", fld.isnull().sum())
    unique_list = fld.unique().tolist()
    print ("\nUnique values: ", unique_list)
    print ("\n Unique values count: ", len(unique_list))
    print ("\nValue counts:\n",    df)

    if display_graph:
        # plt.figure() rather than plt.subplots(): the latter also creates a
        # full-size Axes that would stay underneath the two panels added below.
        fig = plt.figure(figsize=(25, 10), facecolor=background_color)

        # Left panel: share of each value.
        ax_pie = fig.add_subplot(2, 2, 1)
        ax_pie.pie(counts, labels=counts.index, autopct=lambda x: f'{x: .2f}%')

        # Right panel: absolute counts, with vertical tick labels so long names fit.
        ax_bar = fig.add_subplot(2, 2, 2)
        sns.barplot(data=df, x="Value", y="Count", ax=ax_bar)
        ax_bar.set_facecolor(background_color)
        ax_bar.tick_params(axis='x', labelrotation=90)

        fig.suptitle(f"{fld.name} -distribution")

        plt.show()
        plt.close(fig)
    


In [None]:
# Continuous feature analysis
def analyze_continuous_feature(fld):
    """Print summary statistics for a numeric Series and plot its distribution.

    The report shows the first rows, the null count, ``describe()`` and the
    skewness, followed by a histogram and a box plot.

    Parameters
    ----------
    fld : pandas.Series
        Numeric column to analyse; ``fld.name`` is used in the figure title.

    Returns
    -------
    None
        All results are printed / plotted as a side effect.
    """
    print ("Sample data:\n",fld.head())
    print ("\nNull value count : ", fld.isnull().sum())
    print ("\n", fld.describe())
    print (f"\n Skewness : {fld.skew()} \n")

    # plt.figure() rather than plt.subplots(): the latter also creates a
    # full-size Axes that would stay underneath the two panels added below.
    fig = plt.figure(figsize=(25, 10))

    # Drop missing values explicitly so the binning never has to deal with NaN.
    ax_hist = fig.add_subplot(2, 2, 1)
    ax_hist.hist(fld.dropna())

    # Pass the series by keyword: the meaning of the first positional argument
    # of sns.boxplot differs between seaborn releases.
    ax_box = fig.add_subplot(2, 2, 2)
    sns.boxplot(x=fld, ax=ax_box)

    fig.suptitle(f"{fld.name}-distribution")
    plt.show()
    plt.close(fig)
    

### 1. Category

In [None]:
# Summary report and distribution charts for the 'category' column.
analyze_categorical_feature(df_input.category)

### 2. Date, Year

In [None]:
# The date and year features are related, so the year feature alone is used to
# analyse the distribution of records.
analyze_categorical_feature(df_input.year)

The dataset contains records from 2015 to 2021. <br>
From 2015 to 2021, the number of recorded crimes decreased.<br>

### 3. Postal code

In [None]:
# Distinct postal codes present in the dataset.
df_input.postal_code.unique()

In [None]:
# Number of distinct postal codes.
df_input.postal_code.nunique()

### 4. City

In [None]:
# Summary report and distribution charts for the 'city' column.
analyze_categorical_feature(df_input.city)

<p>
Records from 28 different cities are available in the dataset. <br>
67.82% records are from Montreal city.
    </p>

### 5. Neighbourhood

In [None]:
# Summary report and distribution charts for the 'neighbourhood' column.
analyze_categorical_feature(df_input.neighbourhood)

# Feature Relationships

### 1. Cities having theft in common.

In [None]:
# Independent two-column copy, so later filtering never touches df_input.
df_cities = df_input[['category', 'city']].copy()

In [None]:
# Keep only the rows whose category is one of the theft-related categories.
theft_categories = ['Motor vehicle theft', 'Theft in / from a motor vehicle', 'Confirmed Theft']
is_theft = df_cities['category'].isin(theft_categories)
df_cities = df_cities[is_theft]

In [None]:
# Number of records per city in the filtered frame.
cities = df_cities['city'].value_counts()

In [None]:
# Empty mapping that will receive the city -> record count pairs.
cities_counts={}

In [None]:
# Build the city -> count mapping from the value-count Series.
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same
# (label, value) pairs and is available in older releases as well.
cities_counts = dict(cities.items())

In [None]:
# City -> number of theft-category records.
cities_counts

In [None]:
# Word cloud built from the city -> count mapping (counts are passed as word frequencies).
wc=WordCloud(background_color=background_color).generate_from_frequencies(cities_counts)

In [None]:
# Word cloud visualization: cities by number of theft-category records.
fig = plt.figure(figsize=(15, 9), facecolor=background_color)
gs = fig.add_gridspec(1, 1)
gs.update(wspace=0.5, hspace=0.5)
ax0 = fig.add_subplot(gs[0, 0])
ax0.set_facecolor(background_color)
ax0.imshow(wc, interpolation='bilinear')
ax0.axis('off')

# Draw a separation line between the plot and the annotation text.
# Figure.add_artist() registers the line with the figure, which is the supported
# way to add a figure-level artist (rather than mutating fig.lines directly).
l1 = lines.Line2D([0.92, 0.92], [0.1, 0.9], transform=fig.transFigure, color='black', lw=0.2)
fig.add_artist(l1)

# Title and commentary placed to the right of the separation line
# (coordinates are figure fractions). "Ciity" typo in the title fixed.
fig.text(x=0.93,y=0.7,
         s="City names based on crime 'theft' category",
         fontsize=25,fontfamily='serif',color='grey',fontweight='bold')
fig.text(x=0.92,
        y=0.5,
        fontweight='light',
        fontfamily='serif',
        fontsize=22,
        color='grey',
        s='''
        MONTREAL city having highest theft count.
        SENNEVILLE having less theft.
        ''')
fig.tight_layout(pad=0)
fig.patch.set_facecolor(background_color)
plt.show()


### 2. Cities having 'Offenses resulting in death'

In [None]:
# Fresh two-column copy of the input, independent of df_input.
df_cities = df_input[['category', 'city']].copy()

In [None]:
# Keep only the rows of the 'Offenses resulting in death' category.
is_death_offense = df_cities['category'].eq('Offenses resulting in death')
df_cities = df_cities[is_death_offense]

In [None]:
# Empty mapping to be filled with city -> count, plus the per-city record
# counts of the filtered frame.
city_counts = {}
cities = df_cities['city'].value_counts()

In [None]:
# Build the city -> count mapping from the value-count Series.
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same
# (label, value) pairs and is available in older releases as well.
city_counts = dict(cities.items())

In [None]:
# City -> number of 'Offenses resulting in death' records.
city_counts

In [None]:
# Word cloud preparation.
# Use city_counts (the counts built in this section). The previous code passed
# cities_counts, i.e. the theft counts from the earlier section, so the cloud
# showed the wrong data.
wc = WordCloud(background_color=background_color).generate_from_frequencies(city_counts)

In [None]:
# Word cloud visualization: render the word cloud held in `wc` with side commentary.
fig = plt.figure(figsize=(15, 9), facecolor=background_color)
gs = fig.add_gridspec(1, 1)
gs.update(wspace=0.5, hspace=0.5)
ax0 = fig.add_subplot(gs[0, 0])
ax0.set_facecolor(background_color)
ax0.imshow(wc, interpolation='bilinear')
ax0.axis('off')

# Draw a separation line between the plot and the annotation text.
# Figure.add_artist() registers the line with the figure, which is the supported
# way to add a figure-level artist (rather than mutating fig.lines directly).
l1 = lines.Line2D([0.92, 0.92], [0.1, 0.9], transform=fig.transFigure, color='black', lw=0.2)
fig.add_artist(l1)

# Title and commentary placed to the right of the separation line
# (coordinates are figure fractions).
fig.text(x=0.93,y=0.7,
         s="Offenses resulting in death",
         fontsize=25,fontfamily='serif',color='grey',fontweight='bold')
fig.text(x=0.92,
        y=0.5,
        fontweight='light',
        fontfamily='serif',
        fontsize=22,
        color='grey',
        s='''
        MONTREAL city having highest crime records.
        MONTREAL-OUEST, SAINT-LAURENT,ROXBORO
        DOLLARD-DES-ORMEAUX,L'ILE-BIZARD
        cities having very less (1) crime records
        ''')
fig.tight_layout(pad=0)
fig.patch.set_facecolor(background_color)
plt.show()


### 3. Top 3 crime categories per year

In [None]:
# Independent copy restricted to the three columns needed for the yearly summary.
df_data = df_input[['year', 'city', 'category']].copy()

In [None]:
# Count the non-null rows per (year, category) pair; 'city' is the only column
# left after grouping, so it ends up holding the count.
group = df_data.groupby(['year', 'category']).count()

In [None]:
# Turn the (year, category) index levels back into ordinary columns.
df_summary=group.reset_index()

In [None]:
# The aggregated 'city' column holds the number of records, so name it 'count'.
df_summary = df_summary.rename(columns={'city': 'count'})

In [None]:
# Record count per year and category.
df_summary

In [None]:
# Line plot: number of records per year, one line per crime category.
# (The previous comment called this a word cloud, which it is not.)
fig = plt.figure(figsize=(15, 9), facecolor=background_color)
gs = fig.add_gridspec(1, 1)
gs.update(wspace=0.5, hspace=0.5)
ax0 = fig.add_subplot(gs[0, 0])
ax0.set_facecolor(background_color)

sns.lineplot(data=df_summary, x='year', y='count', hue='category', ax=ax0)

# Draw a separation line between the plot and the annotation text.
# Figure.add_artist() registers the line with the figure, which is the supported
# way to add a figure-level artist (rather than mutating fig.lines directly).
l1 = lines.Line2D([0.92, 0.92], [0.1, 0.9], transform=fig.transFigure, color='black', lw=0.2)
fig.add_artist(l1)

# Title and commentary placed to the right of the separation line
# (coordinates are figure fractions).
fig.text(x=0.93,y=0.7,
         s="Crime trends per year",
         fontsize=25,fontfamily='serif',color='grey',fontweight='bold')
fig.text(x=0.92,
        y=0.5,
        fontweight='light',
        fontfamily='serif',
        fontsize=22,
        color='grey',
        s='''
        Home invasion rate is drastically reduced from 2015 to 2021.
        Home invasion,Confirmed theft and Mischief were the top
        3 crime categories per year.        
        ''')
fig.tight_layout(pad=0)
fig.patch.set_facecolor(background_color)
plt.show()
