# A Detailed Crime Analysis in the City of Toronto

## Sarbpreet Ghotra & Sameer Ladha

## DS8007 Final Project


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import folium
from folium.plugins import HeatMap
from folium.plugins import MarkerCluster
import json
from matplotlib.colors import Normalize
import seaborn as sns

### Data Preprocessing

In [None]:
# Location of the Major Crime Indicators CSV export on the author's machine.
filename = "/Users/sameerladha/Documents/School/Masters of Science - Data Science and Analytics/S2/DS8007 - Advanced Data Visualization/Final Project DS8007/Major_Crime_Indicators_Open_Data.csv"
df = pd.read_csv(filename)

# Row count of the raw load; the cleaning step uses it to report how many
# rows were discarded.
prelen = df.shape[0]

# Print the column names, dtypes and non-null counts of the raw data.
df.info()

### Data Cleaning


In [None]:
# Keep only occurrences from 2014 onwards (most of the data is from then on),
# then discard every row that still has a missing value in any column.
df = df.loc[df["OCC_YEAR"] >= 2014].dropna()

postlen = df.shape[0]

# Report how many rows the cleaning removed relative to the raw load.
print("We removed", prelen - postlen, "rows of data")
print("-" * 35)
print()
print()

# Column summary of the cleaned dataset.
df.info()

In [None]:
# Categorical columns whose distinct values are listed below.
columns_of_interest = [
    "PREMISES_TYPE",
    "MCI_CATEGORY",
    "OFFENCE",
    "DIVISION",
    "NEIGHBOURHOOD_158",
]

# For every column: print each distinct value on its own line, followed by
# the number of distinct values.
for column_name in columns_of_interest:
    unique_values = df[column_name].unique()

    print("Unique values in column '{}':".format(column_name))
    for value in unique_values:
        print(value)

    num_unique_values = len(unique_values)
    print("\nNumber of unique values:", num_unique_values)
    print("\n")

### Exploratory Data Analysis

In [None]:
# Tally crimes by the year/month they were reported and the year/month they
# occurred.
report_year_counts = df["REPORT_YEAR"].value_counts().sort_index()
occ_year_counts = df["OCC_YEAR"].value_counts().sort_index()
report_month_counts = df["REPORT_MONTH"].value_counts().sort_index()
occ_month_counts = df["OCC_MONTH"].value_counts().sort_index()

# Shared x-axis for the year plot: every year present in either series.
all_years = sorted(set(report_year_counts.index).union(occ_year_counts.index))

# Month labels are names, so a plain sort would order them alphabetically
# (April, August, ...). Put recognised names in calendar order and append any
# unrecognised labels afterwards (sorted) so that nothing is dropped.
calendar_months = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]
present_months = set(report_month_counts.index).union(occ_month_counts.index)
all_months = [month for month in calendar_months if month in present_months]
all_months += sorted(present_months.difference(calendar_months))

# Align both series of each pair on the shared axis; labels missing from one
# series count as zero.
report_year_counts = report_year_counts.reindex(all_years, fill_value=0)
occ_year_counts = occ_year_counts.reindex(all_years, fill_value=0)
report_month_counts = report_month_counts.reindex(all_months, fill_value=0)
occ_month_counts = occ_month_counts.reindex(all_months, fill_value=0)

plt.figure(figsize=(10, 6))

# Bottom band: crimes counted by reporting year.
plt.fill_between(
    all_years,
    report_year_counts.values,
    label="Reported Year",
    color="lightgreen",
    alpha=0.5,
)
# Top band: crimes counted by occurrence year, stacked on the reported band.
# The lower bound is given explicitly so this band does not paint over the
# one below it.
plt.fill_between(
    all_years,
    report_year_counts.values,
    report_year_counts.values + occ_year_counts.values,
    label="Occurred Year",
    color="skyblue",
    alpha=0.5,
)
plt.title("Number of Crimes by Reported Year and Occurred Year")
plt.xlabel("Year")
plt.ylabel("Number of Crimes")
plt.legend(loc="upper left")

plt.show()

In [None]:
# Stacked area chart of crimes per month, using the month axis and the
# aligned month counts prepared in the previous cell.
plt.figure(figsize=(10, 6))

# Bottom band: crimes counted by the month they were reported.
plt.fill_between(
    all_months,
    report_month_counts.values,
    label="Reported Month",
    color="lightgreen",
    alpha=0.5,
)
# Top band: crimes counted by the month they occurred, stacked on the
# reported band. The lower bound is given explicitly so this band does not
# paint over the one below it.
plt.fill_between(
    all_months,
    report_month_counts.values,
    report_month_counts.values + occ_month_counts.values,
    label="Occurred Month",
    color="skyblue",
    alpha=0.5,
)
plt.title("Number of Crimes by Reported Month and Occurred Month")
plt.xlabel("Month")
plt.ylabel("Number of Crimes")
plt.legend(loc="upper left")

plt.show()

In [None]:
# Number of crimes recorded for each police division.
divisions = df["DIVISION"].value_counts()

# Stem plot with one stem per division.
plt.figure(figsize=(10, 6))
plt.stem(divisions.index, divisions.values)
plt.gca().set(
    title="Number of Crimes by Police Division",
    xlabel="Division",
    ylabel="Number of Crimes",
)
plt.show()

In [None]:
# Crimes per neighbourhood, counted once and reused for both rankings.
neighbourhood_counts = df["NEIGHBOURHOOD_158"].value_counts()
top_10_neighbourhoods = neighbourhood_counts.nlargest(10)
bottom_10_neighbourhoods = neighbourhood_counts.nsmallest(10)

# Two side-by-side horizontal bar charts: highest counts left, lowest right.
fig, axs = plt.subplots(1, 2, figsize=(18, 6))
most_ax, least_ax = axs

most_ax.barh(top_10_neighbourhoods.index, top_10_neighbourhoods.values)
most_ax.set_title("Top 10 Neighbourhoods with the Most Crime")
most_ax.set_xlabel("Number of Crimes")
most_ax.set_ylabel("Neighbourhood")

least_ax.barh(bottom_10_neighbourhoods.index, bottom_10_neighbourhoods.values)
least_ax.set_title("Top 10 Neighbourhoods with the Lowest Crime")
least_ax.set_xlabel("Number of Crimes")

plt.tight_layout()
plt.show()

In [None]:
# Share of crimes per category and per premises type, largest first.
crime_counts = df["MCI_CATEGORY"].value_counts().sort_values(ascending=False)
premises_counts = df["PREMISES_TYPE"].value_counts().sort_values(ascending=False)

fig, axs = plt.subplots(1, 2, figsize=(20, 10))

# One pie per panel: crime types on the left, premises types on the right.
pie_panels = zip(
    axs,
    (crime_counts, premises_counts),
    ("Types of Crimes", "Where Crimes Occur"),
)
for ax, counts, title in pie_panels:
    ax.pie(counts, labels=counts.index, autopct="%1.1f%%", startangle=140)
    ax.set_title(title)

plt.show()

In [None]:
# Centre the map on the median crime location.
city_center_latitude = df["LAT_WGS84"].median()
city_center_longitude = df["LONG_WGS84"].median()

m = folium.Map(location=[city_center_latitude, city_center_longitude], zoom_start=12)

# One [latitude, longitude] pair per crime. Extracting the two columns in a
# single vectorised step avoids building a Series for every row, which is what
# row-by-row iteration would do.
heat_data = df[["LAT_WGS84", "LONG_WGS84"]].to_numpy().tolist()

HeatMap(heat_data).add_to(m)

# Last expression of the cell: renders the map in the notebook.
m

In [None]:
# Load the neighbourhood table that carries the boundary geometry.
dfmap = pd.read_csv(
    "/Users/sameerladha/Documents/School/Masters of Science - Data Science and Analytics/S2/DS8007 - Advanced Data Visualization/Final Project DS8007/Toronto-Police-Service-Data-Visualization/Data/neighbourhood-crime-rates - 4326.csv"
)

# Each geometry cell is text using single quotes; swap them for double quotes
# so the text can be parsed as JSON.
dfmap["geometry"] = dfmap["geometry"].map(
    lambda text: json.loads(text.replace("'", '"'))
)

# Use this command to display crimes for all years
# filtered_crime_df = df

# Use this command to display crimes by year
filtered_crime_df = df.loc[df["OCC_YEAR"] == 2014]

# Map centre: Toronto's latitude and longitude.
toronto_lat, toronto_lng = 43.651070, -79.347015

# Folium map centred on Toronto.
toronto_map = folium.Map(location=[toronto_lat, toronto_lng], zoom_start=11)


# Function to add a neighborhood to the map
def add_neighborhood_to_map(area_name, geometry, map_obj):
    """Draw one neighbourhood boundary on ``map_obj``.

    ``geometry`` is rendered as a GeoJson layer with a blue outline of weight 2
    and a half-transparent grey fill; ``area_name`` becomes the layer tooltip.
    Nothing is returned.
    """
    boundary_style = {
        "color": "blue",
        "weight": 2,
        "fillColor": "grey",
        "fillOpacity": 0.5,
    }

    def style_boundary(_feature):
        # Same style for every feature; a fresh dict is returned on each call.
        return dict(boundary_style)

    layer = folium.GeoJson(
        geometry,
        style_function=style_boundary,
        tooltip=area_name,
    )
    layer.add_to(map_obj)


# Add each neighborhood boundary to the map.
for area_name, geometry in zip(dfmap["AREA_NAME"], dfmap["geometry"]):
    add_neighborhood_to_map(area_name, geometry, toronto_map)

# A single cluster layer holds every crime marker.
marker_cluster = MarkerCluster().add_to(toronto_map)

# Only these columns are needed per marker. Iterating plain tuples avoids
# constructing a Series for every row.
marker_columns = [
    "LAT_WGS84",
    "LONG_WGS84",
    "OCC_DATE",
    "MCI_CATEGORY",
    "PREMISES_TYPE",
]
marker_rows = filtered_crime_df[marker_columns].itertuples(index=False, name=None)

for lat, lng, occ_date, mci_category, premises_label in marker_rows:
    folium.Marker(
        location=[lat, lng],
        popup=f"Date Occurred: {occ_date}, Type: {mci_category}, Premises: {premises_label}",
        icon=None,
    ).add_to(marker_cluster)

# Last expression of the cell: renders the map in the notebook.
toronto_map

In [None]:
# Load the neighbourhood table that carries the boundary geometry.
# NOTE(review): dfmap is not referenced again after this cell's geometry
# parsing — confirm whether the boundaries were meant to be drawn here too.
dfmap = pd.read_csv(
    "/Users/sameerladha/Documents/School/Masters of Science - Data Science and Analytics/S2/DS8007 - Advanced Data Visualization/Final Project DS8007/Toronto-Police-Service-Data-Visualization/Data/neighbourhood-crime-rates - 4326.csv"
)

# Each geometry cell is text using single quotes; swap them for double quotes
# so the text can be parsed as JSON.
dfmap["geometry"] = dfmap["geometry"].map(
    lambda text: json.loads(text.replace("'", '"'))
)

# Use this command to display crimes for all years
# filtered_crime_df = df

# Use this command to display crimes by year
filtered_crime_df = df.loc[df["OCC_YEAR"] == 2014]

# Folium map centred on Toronto.
toronto_map = folium.Map(location=[43.651070, -79.347015], zoom_start=11)

# Distinct crime categories and premises types among the selected crimes;
# one map layer is built per combination.
categories = filtered_crime_df["MCI_CATEGORY"].unique()
premises = filtered_crime_df["PREMISES_TYPE"].unique()

# Create one toggleable marker-cluster layer for each combination of crime
# category and premises type (combinations with no crimes still get a layer).
for category in categories:
    # The category mask is the same for every premises type, so build it once
    # per category instead of once per combination.
    in_category = filtered_crime_df["MCI_CATEGORY"] == category

    for premise in premises:
        # Crimes matching this category/premises combination.
        filtered_df = filtered_crime_df[
            in_category & (filtered_crime_df["PREMISES_TYPE"] == premise)
        ]

        marker_cluster = MarkerCluster(name=f"{category} - {premise}").add_to(
            toronto_map
        )

        # Iterating plain tuples of the needed columns avoids constructing a
        # Series for every row.
        marker_rows = filtered_df[
            ["LAT_WGS84", "LONG_WGS84", "OCC_DATE", "MCI_CATEGORY", "PREMISES_TYPE"]
        ].itertuples(index=False, name=None)

        for lat, lng, occ_date, mci_category, premises_label in marker_rows:
            folium.Marker(
                location=[lat, lng],
                popup=f"Date Occurred: {occ_date}, Type: {mci_category}, Premises: {premises_label}",
                icon=None,
            ).add_to(marker_cluster)

# Add LayerControl to allow toggling
folium.LayerControl().add_to(toronto_map)

# Last expression of the cell: renders the map in the notebook.
toronto_map

In [None]:
# Copy of the data whose day-of-week labels have surrounding whitespace removed.
df1 = df.assign(OCC_DOW=df["OCC_DOW"].str.strip())

# Crimes per weekday, arranged Monday through Sunday.
week_days = [
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]
occ_dow_counts = df1["OCC_DOW"].value_counts().reindex(week_days)

# Colour scale running from the smallest to the largest daily count.
cmap = plt.cm.magma
norm = Normalize(vmin=occ_dow_counts.min(), vmax=occ_dow_counts.max())

plt.figure(figsize=(10, 6))
bars = plt.bar(occ_dow_counts.index, occ_dow_counts.values)

# Shade every bar according to its own count.
for patch, count in zip(bars, occ_dow_counts.values):
    patch.set_color(cmap(norm(count)))

plt.gca().set(
    title="Crime Occurrences by Day of the Week",
    xlabel="Day of the Week",
    ylabel="Number of Occurrences",
)
plt.xticks(rotation=45)
# The y-axis starts at 20000 rather than zero and leaves 10% headroom above
# the tallest bar.
plt.ylim(20000, max(occ_dow_counts.values) * 1.1)

plt.show()

In [None]:
# Rows: report year; columns: crime category; cells: number of crimes
# (0 where a combination does not occur).
yearly_category_sizes = df.groupby(["REPORT_YEAR", "MCI_CATEGORY"]).size()
crime_over_time = yearly_category_sizes.unstack(fill_value=0)

# One line per crime category.
plt.figure(figsize=(14, 8))
for category, yearly_counts in crime_over_time.items():
    plt.plot(yearly_counts.index, yearly_counts, label=category, marker="o")

plt.gca().set(
    title="Crimes Over Time by Type",
    xlabel="Year",
    ylabel="Number of Crimes",
)
# Legend sits outside the axes on the right; tight_layout reserves the
# right-hand 15% of the figure for it.
plt.legend(title="Crime Type", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.grid(True)
plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.show()

In [None]:
# Rows: report year; columns: premises type; cells: number of crimes
# (0 where a combination does not occur).
yearly_premises_sizes = df.groupby(["REPORT_YEAR", "PREMISES_TYPE"]).size()
crimes_by_premises_over_time = yearly_premises_sizes.unstack(fill_value=0)

# One line per premises type.
plt.figure(figsize=(14, 8))
for premises_type, yearly_counts in crimes_by_premises_over_time.items():
    plt.plot(yearly_counts.index, yearly_counts, label=premises_type, marker="o")

plt.gca().set(
    title="Crimes Over Time by Premises Type",
    xlabel="Year",
    ylabel="Number of Crimes",
)
# Legend sits outside the axes on the right; tight_layout reserves the
# right-hand 15% of the figure for it.
plt.legend(title="Premises Type", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.grid(True)
plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.show()

In [None]:
# Rows: report year; columns: police division; cells: number of crimes
# (0 where a combination does not occur).
yearly_division_sizes = df.groupby(["REPORT_YEAR", "DIVISION"]).size()
crimes_by_division_over_time = yearly_division_sizes.unstack(fill_value=0)

# One line per division.
plt.figure(figsize=(14, 8))
for division, yearly_counts in crimes_by_division_over_time.items():
    plt.plot(yearly_counts.index, yearly_counts, label=division, marker=".")

plt.gca().set(
    title="Reported Crimes by Division Over Time",
    xlabel="Year",
    ylabel="Number of Reported Crimes",
)
# Two-column legend outside the axes on the right; tight_layout reserves the
# right-hand 15% of the figure for it.
plt.legend(title="Division", bbox_to_anchor=(1.05, 1), loc="upper left", ncol=2)
plt.grid(True)
plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.show()

In [None]:
# Rows of the heatmap, in weekday order rather than alphabetical order.
weekday_order = [
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]

# Count events for every (day of week, hour) pair. The day labels are stripped
# first because the raw OCC_DOW values can carry extra spaces, and the rows
# are then arranged Monday through Sunday.
offence_heatmap_data = (
    df.assign(OCC_DOW=df["OCC_DOW"].str.strip())
    .pivot_table(
        values="EVENT_UNIQUE_ID",
        index="OCC_DOW",
        columns="OCC_HOUR",
        aggfunc="count",
    )
    .reindex(weekday_order)
)

sns.heatmap(offence_heatmap_data, cmap="YlGnBu")
plt.title("Offences by Day of Week and Hour")
plt.xlabel("Hour of Day")
plt.ylabel("Day of the Week")
plt.show()

In [None]:
# Inclusive (start_hour, end_hour) bounds of each period of the day.
# "Night" wraps past midnight: it runs from 22:00 through 05:59, so its start
# is greater than its end. Each period appears exactly once; a repeated key in
# a dict literal would silently discard the earlier entry.
time_ranges = {
    "Morning": (6, 11),
    "Afternoon": (12, 16),
    "Evening": (17, 21),
    "Night": (22, 5),
}

# Bar colour used for each period.
colors = {
    "Morning": "gold",
    "Afternoon": "orange",
    "Evening": "lightcoral",
    "Night": "dimgray",
}


def assign_time_period(hour):
    """Return the name of the period of the day that contains ``hour``.

    Args:
        hour: Hour of the day, compared against the inclusive bounds in
            ``time_ranges``.

    Returns:
        The matching key of ``time_ranges``. A value that falls inside no
        range (for example NaN or a fractional hour between two ranges) is
        labelled "Night".
    """
    for period, (start, end) in time_ranges.items():
        if start <= end:
            in_range = start <= hour <= end
        else:
            # Range wraps past midnight, e.g. (22, 5) covers 22-23 and 0-5.
            in_range = hour >= start or hour <= end
        if in_range:
            return period
    return "Night"


# Label every crime with the period of the day in which it occurred.
df["Time_Period"] = df["OCC_HOUR"].map(assign_time_period)

# Crimes per period, arranged in chronological order of the day.
ordered_periods = ["Morning", "Afternoon", "Evening", "Night"]
time_period_counts = df["Time_Period"].value_counts().reindex(ordered_periods)

# Bar chart with one bar per period, each in that period's colour.
plt.figure(figsize=(10, 6))
bar_colors = [colors[name] for name in ordered_periods]
time_period_counts.plot.bar(color=bar_colors)
plt.gca().set(
    title="Time of Day Crime Distribution",
    xlabel="Time Period",
    ylabel="Number of Crimes",
)
plt.xticks(rotation=45)
plt.grid(axis="y", linestyle="--")
plt.show()

In [None]:
# Label every crime with the period of the day in which it occurred
# (recomputed here so this cell does not depend on the previous chart's cell).
df["Time_Period"] = df["OCC_HOUR"].apply(assign_time_period)

# Rows: crime category; columns: time period; cells: number of crimes.
crime_period_data = df.pivot_table(
    index="MCI_CATEGORY", columns="Time_Period", aggfunc="size", fill_value=0
)

# Arrange the columns chronologically. reindex (rather than plain column
# selection) keeps the chart working if a period has no crimes at all: the
# missing column is added with zeros instead of raising KeyError.
ordered_periods = ["Morning", "Afternoon", "Evening", "Night"]
crime_period_data = crime_period_data.reindex(columns=ordered_periods, fill_value=0)

# Derive the year range for the title from the data being plotted, so the
# title cannot drift out of date as newer years are added.
first_year = int(df["OCC_YEAR"].min())
last_year = int(df["OCC_YEAR"].max())

crime_period_data.plot(
    kind="bar",
    stacked=True,
    color=[colors[period] for period in ordered_periods],
    figsize=(12, 8),
)

plt.title(f"Crime by Type and Time of Day {first_year}-{last_year}")
plt.xlabel("Crime Type")
plt.ylabel("Number of Crimes")
plt.xticks(rotation=45)
plt.grid(axis="y", linestyle="--")
plt.legend(title="Time Period")
plt.tight_layout()
plt.show()

In [None]:
# Independent copy of the crimes that occurred in 2023.
df_2023 = df[df["OCC_YEAR"] == 2023].copy()

months_order = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

# Replace the column outright so it really becomes an ordered categorical.
# Assigning through .loc[:, col] writes into the existing column in place on
# recent pandas versions, which keeps the old dtype and therefore the
# alphabetical month order.
df_2023["OCC_MONTH"] = pd.Categorical(
    df_2023["OCC_MONTH"], categories=months_order, ordered=True
)

# Rows: month (calendar order, observed months only); columns: crime category;
# cells: number of occurrences (0 where a combination does not occur).
grouped_data = (
    df_2023.groupby(["OCC_MONTH", "MCI_CATEGORY"], observed=True)
    .size()
    .unstack(fill_value=0)
)

# Plain string labels for the x-axis, in the same (calendar) order as the rows.
month_labels = grouped_data.index.astype(str)

plt.figure(figsize=(12, 8))
for column in grouped_data.columns:
    plt.plot(month_labels, grouped_data[column], label=column)

plt.xlabel("Month")
plt.ylabel("Number of Crime Occurrences")
plt.title("Occurrences by Category in 2023")
plt.xticks(rotation=45)
plt.legend(title="Category", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.grid(True)
plt.tight_layout()
plt.show()