In [1]:
import glob
import os
import pandas as pd
import altair as alt
import calendar
import datetime
from altair import pipe, limit_rows, to_values

In [2]:
# # Set the directory where the CSV files are located
# dir_path = "."

# # Get all the CSV files in the directory
# all_files = glob.glob(os.path.join(dir_path, "*.csv"))

# # Combine the CSV files into a single DataFrame
# df = pd.concat((pd.read_csv(f) for f in all_files))

# # Save the combined DataFrame to a new CSV file
# df.to_csv("combined.csv", index=False)

# Crime Analysis

By: Benjamin Chang, Sandra Radic, and Justin Kung

## Abstract

The project includes analyzing the Vancouver Crime dataset, which contains information on crime types, location, date, and time, as well as demographic data from the Vancouver 2016 census dataset.

## Introduction

We made this project so Sandra can walk home without crying.
Tasks include analyzing temporal and spatial patterns in crime activity, exploring crime type distribution, visualizing crime activity on a map, and comparing crime rates across different neighbourhoods and crime types. 
**TODO: Input the specific viz's we choose for each task**

In [3]:
# Read in the data
url = 'https://raw.githubusercontent.com/sgskung/crime320/main/combined.csv'
df = pd.read_csv(url)

In [4]:
# Creating necessary variables
df['DATE'] = pd.to_datetime(df[['YEAR', 'MONTH', 'DAY']])

In [5]:
# Change the max_rows limit of altair because our dataset is 55,000 rows.
t = lambda data: pipe(data, limit_rows(max_rows=60000), to_values)
alt.data_transformers.register('custom', t)
alt.data_transformers.enable('custom')

DataTransformerRegistry.enable('custom')

## Preliminary Data Analysis

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55398 entries, 0 to 55397
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   TYPE           55398 non-null  object        
 1   YEAR           55398 non-null  int64         
 2   MONTH          55398 non-null  int64         
 3   DAY            55398 non-null  int64         
 4   HOUR           55398 non-null  int64         
 5   MINUTE         55398 non-null  int64         
 6   HUNDRED_BLOCK  55398 non-null  object        
 7   NEIGHBOURHOOD  55398 non-null  object        
 8   X              55398 non-null  float64       
 9   Y              55398 non-null  float64       
 10  DATE           55398 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(5), object(3)
memory usage: 4.6+ MB


In [7]:
# Total number of rows
print("Total number of rows:", len(df))

# Unique values in TYPE column
print("Unique crime types:", df['TYPE'].nunique())
print(df['TYPE'].unique())

Total number of rows: 55398
Unique crime types: 11
['Theft from Vehicle' 'Theft of Bicycle' 'Theft of Vehicle'
 'Vehicle Collision or Pedestrian Struck (with Fatality)'
 'Vehicle Collision or Pedestrian Struck (with Injury)'
 'Break and Enter Commercial' 'Break and Enter Residential/Other'
 'Homicide' 'Mischief' 'Offence Against a Person' 'Other Theft']


In [8]:
# Count of crimes by neighbourhood
print(df.groupby('NEIGHBOURHOOD').size().sort_values(ascending=False))

# Number of crimes in Downtown neighbourhood
print("Number of crimes in Downtown:", len(df[df['NEIGHBOURHOOD'] == 'Downtown']))

NEIGHBOURHOOD
Central Business District    30086
Strathcona                    8090
Renfrew-Collingwood           6570
Fairview                      5387
Hastings-Sunrise              3411
Victoria-Fraserview           1854
dtype: int64
Number of crimes in Downtown: 0


In [9]:
# Count of crimes by year
print(df.groupby('YEAR').size().sort_values(ascending=False))

# Number of crimes in 2016
print("Number of crimes in 2016:", len(df[df['YEAR'] == 2016]))

YEAR
2020    19683
2022    18807
2021    16908
dtype: int64
Number of crimes in 2016: 0


In [12]:
# Count of crimes by hour
print(df.groupby('HOUR').size().sort_values(ascending=False))

# Most frequent hour for crimes
print("Most frequent hour for crimes:", df['HOUR'].mode().values[0])

HOUR
0     9643
18    2993
17    2982
16    2710
12    2665
19    2640
15    2544
20    2495
14    2414
22    2271
13    2209
21    2166
11    2003
10    1943
23    1883
9     1809
8     1633
1     1447
7     1275
2     1239
3     1171
4     1138
5     1078
6     1047
dtype: int64
Most frequent hour for crimes: 0


## Exploratory Data Analysis
Lowkey some of these could be viz's themselves and we just use the preliminary analysis as our EDA.

In [13]:
## Crime counts by neighborhood

# Group the data by neighborhood and count the number of crimes in each neighborhood
crime_counts = df.groupby('NEIGHBOURHOOD').size().reset_index(name='counts')

# Create a bar chart showing the crime counts by neighborhood
chart = alt.Chart(crime_counts).mark_bar().encode(
    x='counts',
    y=alt.Y('NEIGHBOURHOOD:N', sort='-x'),
)

chart

  for col_name, dtype in df.dtypes.iteritems():


In [14]:
## Crime counts by neighborhood

# Group the data by neighborhood and count the number of crimes in each neighborhood
crime_counts = df.groupby('TYPE').size().reset_index(name='counts')

# Create a bar chart showing the crime counts by neighborhood
chart = alt.Chart(crime_counts).mark_bar().encode(
    x='counts',
    y=alt.Y('TYPE:N', sort='-x'),
)

chart

  for col_name, dtype in df.dtypes.iteritems():


In [15]:
## Crime count by day of the week
df['WEEKDAY'] = df.apply(lambda x: calendar.day_name[datetime.date(x['YEAR'], x['MONTH'], x['DAY']).weekday()], axis=1)

weekday_count = df.groupby('WEEKDAY')['WEEKDAY'].count().reset_index(name='count')
bars = alt.Chart(weekday_count).mark_bar().encode(
    x=alt.X('count', title='Crime Count'),
    y=alt.Y('WEEKDAY', sort=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], title='Day of the Week')
).properties(title='Crime count by day of the week')
bars


  for col_name, dtype in df.dtypes.iteritems():


In [16]:
## Crime trends over time

# Create a new column 'year-month' by combining the YEAR and MONTH columns
df['year-month'] = pd.to_datetime(df[['YEAR', 'MONTH']].assign(DAY=1))

# Create a dataframe of crime counts by year-month
month_counts = df.groupby('year-month')['TYPE'].count().reset_index(name='COUNT')

# Create a line chart of crime counts by year-month
alt.Chart(month_counts).mark_line().encode(
    x='year-month:T',
    y='COUNT:Q'
)

  for col_name, dtype in df.dtypes.iteritems():


In [17]:
## Crime types by neighborhood

# Group the data by neighborhood and crime type, and count the number of crimes in each category
crime_types = df.groupby(['NEIGHBOURHOOD', 'TYPE']).size().reset_index(name='counts')

# Create a stacked bar chart showing the crime types by neighborhood
chart = alt.Chart(crime_types).mark_bar().encode(
    x='counts',
    y=alt.Y('NEIGHBOURHOOD:N', sort='-x'),
    color='TYPE:N'
)

chart

  for col_name, dtype in df.dtypes.iteritems():


In [18]:
# Hourly Crime Rate

hourly_count = df.groupby('HOUR')['HOUR'].count().reset_index(name='count')
line = alt.Chart(hourly_count).mark_line().encode(
    x=alt.X('HOUR', title='Hour'),
    y=alt.Y('count', title='Crime Count')
).properties(title='Hourly crime count')

line

  for col_name, dtype in df.dtypes.iteritems():


In [19]:
## Correlation between crime and location - this is broken right now lol

# Task Abstraction

Task 1: Temporal Analysis - Identify patterns in crime over time

Visualize the temporal trends in crime in Vancouver, focusing on monthly, weekly, and hourly patterns to identify the times of day and days of the week when crimes are most likely to occur and investigate if crime rates have increased or decreased over the years.

**Attributes: Year, Month, Day, Hour, Minute**

Task 2: Spatial Analysis - Explore geospatial distribution of crime in Vancouver

Visualize the spatial distribution of crime in Vancouver, focusing on the neighbourhoods where the crimes occur most frequently to identify hotspots of criminal activity and investigate if certain types of crimes are more prevalent in specific neighbourhoods.

**Attributes: X, Y, Neighbourhood**

Task 3: Crime Type Analysis - Analyze types of crimes committed in Vancouver

Visualize the types of crimes committed in Vancouver and analyzing the frequency and distribution of each type of crime to identify the most common types of crimes in the city and investigate if certain types of crimes are more prevalent in certain neighbourhoods or at certain times.

**Attributes: Type, Neighbourhood, Year, Month**

Task 4: Demographic Analysis - Explore relationship between crime and demographic data

Visualize the relationship between crime and demographic census data in Vancouver, focusing on how different demographic factors (e.g., age, gender, income) relate to crime rates in different neighbourhoods to identify whether certain demographic factors are associated with higher or lower crime rates in particular areas, and to explore whether these relationships hold across different types of crimes.

**Attributes: Type, Neighbourhood, Age, Gender, Income**

Task 5: Crime Comparison - Compare crime rates across different neighbourhoods and crime types

Compare crime rates across different neighbourhoods and crime types in Vancouver, allowing users to identify which neighbourhoods have higher or lower crime rates compared to others, and which types of crimes are most common in different areas. The aim is to provide a way of exploring crime data that highlights spatial and temporal differences in crime activity across the city - **TODO: this one can include multiple views of the data, and interactivity**

**Attributes: Type, Neighbourhood, Year, Month, Hour, Day**