In [1]:
import altair as alt
import pandas as pd
from vega_datasets import data
# import vega
import geopandas as gpd

In [2]:
path = 'datasets/crimedata.csv'
crime_data = pd.read_csv(path)
crime_data.head()

Unnamed: 0.1,Unnamed: 0,Neighbourhood,Total - Age groups and average age of the population - 100% data,0 to 14 years...3,0 to 4 years...4,5 to 9 years...5,10 to 14 years...6,15 to 64 years...7,15 to 19 years...8,20 to 24 years...9,...,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,X,Y,Population density,Average cost of house in neighbour,Average income
0,1,Arbutus-Ridge,15295.0,2015.0,455.0,685.0,880.0,9805.0,1230.0,1165.0,...,,,,,,,,,,
1,2,Downtown,62030.0,4000.0,2080.0,1105.0,810.0,51275.0,1180.0,4050.0,...,,,,,,,,,,
2,3,Dunbar-Southlands,21425.0,3545.0,675.0,1225.0,1650.0,14215.0,1800.0,1740.0,...,11.0,15.0,14.0,30.0,29XX W 31ST AVE,487516.1816,5454623.638,,,
3,4,Dunbar-Southlands,21425.0,3545.0,675.0,1225.0,1650.0,14215.0,1800.0,1740.0,...,7.0,11.0,21.0,0.0,29XX W 31ST AVE,487579.6067,5454613.684,,,
4,5,Dunbar-Southlands,21425.0,3545.0,675.0,1225.0,1650.0,14215.0,1800.0,1740.0,...,4.0,25.0,21.0,54.0,29XX W 33RD AVE,487585.2638,5454405.082,,,


## View 1 - Trend of Crime linked to Crime by Month (Uni-Directional Linking)

### Task(s) Addressed:
1. What is the trend of total crime over time by year? (Trend)
2. What is the Distribution of crime each Year by Month? (Characterize Distribution)


In [3]:
#Creating the desired dataframe
# Discard year 2023 as it does not have complete data (current year)
df = crime_data[crime_data['YEAR'] < 2023]
#Combine different TYPEs into a couple similar types
df_ym = df.groupby(['YEAR', 'MONTH']).count()
df_ym = df_ym.rename(columns={"Unnamed: 0": "Count",})
df_ym = pd.DataFrame(df_ym.to_records())
# Add a new column with the month names
df_ym["MONTH"] = df_ym['MONTH'].apply(lambda x: pd.Timestamp(year=2000, month=int(x), day=1).strftime('%b'))
df_ym.head()

Unnamed: 0,YEAR,MONTH,Count,Neighbourhood,Total - Age groups and average age of the population - 100% data,0 to 14 years...3,0 to 4 years...4,5 to 9 years...5,10 to 14 years...6,15 to 64 years...7,...,TYPE,DAY,HOUR,MINUTE,HUNDRED_BLOCK,X,Y,Population density,Average cost of house in neighbour,Average income
0,2003.0,Jan,5043,5042,3740,3740,3740,3740,3740,3740,...,5043,5043,5043,5043,5043,5042,5042,0,0,0
1,2003.0,Feb,4250,4250,3283,3283,3283,3283,3283,3283,...,4250,4250,4250,4250,4250,4250,4250,0,0,0
2,2003.0,Mar,4665,4658,3583,3583,3583,3583,3583,3583,...,4665,4665,4665,4665,4665,4658,4658,0,0,0
3,2003.0,Apr,4895,4894,3714,3714,3714,3714,3714,3714,...,4895,4895,4895,4895,4895,4894,4894,0,0,0
4,2003.0,May,5439,5431,4057,4057,4057,4057,4057,4057,...,5439,5439,5439,5439,5438,5431,5431,0,0,0


In [4]:
# Selector
selector = alt.selection_single(fields=['YEAR'])

# Create the Base Graph
base = alt.Chart(df_ym).properties(
    width=350,
    height=300,
).add_selection(selector)

# Create the Line Chart
lines = base.mark_line().encode(
    x=alt.X('YEAR:O', title="Year"),
    y=alt.Y('sum(Count):Q', title="Number of Crimes"),
    opacity=alt.condition(selector, alt.value(1), alt.value(0.1)),
    tooltip=[
        alt.Tooltip('YEAR'),
        alt.Tooltip('sum(Count)')
    ]
)
lines = lines.mark_line(point=True,color='orange').encode(
    opacity=alt.condition(selector, alt.value(1), alt.value(0.25))
).add_selection(selector)

# Create the Bar Chart
bar = base.mark_bar(opacity=1, thickness=100).encode(
    x=alt.X('MONTH:O', axis=alt.Axis(labelAngle=-30), title="Month", sort="-y"),
    y=alt.Y('sum(Count)', title="Crime by Month"),
    tooltip=[
        alt.Tooltip('MONTH:O', title="Month"),
        alt.Tooltip('sum(Count)', title='Incedents of Crime'),
        #alt.Tooltip('YEAR', title='Year')
    ]

).transform_filter(  # Add this method to filter the data based on the selected year
    selector
)

complete = (lines | bar).properties(
    title="Trend of Crime in Vancouver Linked to Crime by Month"
).configure_point(
    size=75
)
complete


## View 2 -  Scatter Plot of Mean Age and Total Population by Neighbourhood linked to a Stacked Bar Chart of Distribution of Type of Crime by Neighborhoods Near Downtown (Bi-directional Linking)


### Task(s) Addressed:
1. What is the frequency and distribution of the types of crime that are observed in neighbourhoods in downtown Vancouver? 
2. How do the average ages of the population and total populations of each of the neighbourhoods in downtown Vancouver compare, and are they related to the types of crime that are observed in these neighbourhoods?


In [5]:
# Used https://altair-viz.github.io/gallery/scatter_with_layered_histogram.html as a resource
# Discard year 2023 as it does not have complete data (current year)
df = crime_data[crime_data['YEAR'] < 2023]

#Combine different TYPEs into a couple similar types
df = df.replace('Break and Enter Commercial','Break And Enter')
df = df.replace('Break and Enter Residential/Other','Break And Enter')
df = df.replace('Homicide','Offence Against a Person')
df = df.replace('Other Theft','Other Theft')
df = df.replace('Theft from Vehicle','Vehicle Related Theft')
df = df.replace('Theft of Vehicle','Vehicle Related Theft')
df = df.replace('Theft of Bicycle','Vehicle Related Theft')
df = df.replace('Vehicle Collision or Pedestrian Struck (with Fatality)','Traffic Accident')
df = df.replace('Vehicle Collision or Pedestrian Struck (with Injury)','Traffic Accident')

# Look at subsection of neighbourhoods in Downtown
neighbourhoods = ['Strathcona', 'Grandview-Woodland', 'Hastings-Sunrise', 'Mount Pleasant', 'Fairview']

df2 = df.loc[df['Neighbourhood'].isin(neighbourhoods)] 

In [6]:
# Second Vis:

# Select Neighbourhood, Mean Age, Total Population, and Type of Crime
df = df2.groupby(['Neighbourhood', 'TYPE', 'Average age of the population',
                   "Total - Age groups and average age of the population - 100% data"]).count()
df = df.rename(columns={"Unnamed: 0": "Count",})
df = pd.DataFrame(df.to_records())

df.head()

# Selector
selector = alt.selection_single(fields = ['Neighbourhood'])

#Create the Base Graph 
base = alt.Chart(df).properties(
    width=300,
    height=250,
).add_selection(selector)

# Create the ScatterPlot
points = base.mark_circle(size = 200).encode(
    x=alt.X('Average age of the population', scale = alt.Scale(domain = [30, 55]), title = "Average Age of the Population"),
    y=alt.Y('Total - Age groups and average age of the population - 100% data',
            scale = alt.Scale(domain = [10000, 40000]), title = "Total Population"),
    color='Neighbourhood:N',
    opacity=alt.condition(selector, alt.value(1), alt.value(0.1)),
    tooltip = [
        alt.Tooltip('Average age of the population'), 
        alt.Tooltip('Neighbourhood:N'), 
        alt.Tooltip('Total - Age groups and average age of the population - 100% data:N', 
                    title = 'Total Population in the Neighbourhood')
    ]   
)

#Create the Bar Chart
bar = base.mark_bar(opacity=1, thickness=100).encode(
    x=alt.X('TYPE:N', axis=alt.Axis(labelAngle=-30), title = "Type of Crime"),
    y=alt.Y('sum(Count)', title = "Incidents of Crime"),
    color= 'Neighbourhood:N',
    opacity=alt.condition(selector, alt.value(1), alt.value(0.25)),
    tooltip = [
        alt.Tooltip('TYPE', title = "Type of Crime"), 
        alt.Tooltip('Neighbourhood:N'), 
        alt.Tooltip('sum(Count)', 
                    title = 'Incedents of Crime')
    ]   
)

# Combine the two visualizations 
complete = (points | bar).properties(
    title = "Age to Population Scatterplot Linked to Crime For Each Type Stacked Bar Chart by Neighbourhood (Downtown area)"
)
complete

## View 3 - Distribution of Crime in Vancouver

### Task(s) Addressed:
1. What are the neighbourhoods that have historically had the most amount of crime over the period of the dataset? 
2. How does the total crime of a neighbourhood in the dataset compare with the total crime of neighbourhoods geographically near it and far from it?

In [7]:
import altair as alt
from vega_datasets import data
import requests
import json

vancouver_url = 'https://opendata.vancouver.ca/explore/dataset/local-area-boundary/download/?format=geojson&timezone=America/Los_Angeles'

In [8]:
df = crime_data['Neighbourhood'].value_counts()
df = df.to_frame()
df = df.rename(columns={"Neighbourhood": "count"})
df = df.rename_axis('Neighbourhood').reset_index()
gdf = gpd.read_file(vancouver_url)

gdf = gdf.rename(columns={'name': 'Neighbourhood'})
gdf = gdf.merge(df, on='Neighbourhood')
neighbours = gdf['Neighbourhood'].unique() # get unique field values

selectNeighbourhood = alt.selection_single(
    name='Select', # name the selection 'Select'
    fields=['Neighbourhood'], # limit selection to the Major_Genre field
    init={'Neighbourhood': neighbours[0]}, # use first genre entry as initial value
    bind=alt.binding_select(options=neighbours) # bind to a menu of unique genre values
)
gdf_projected = gdf.to_crs("EPSG:32610")  # You can replace EPSG:32610 with an appropriate EPSG code for your area
gdf_projected["centroid"] = gdf_projected["geometry"].centroid
gdf_projected["centroid"] = gdf_projected["centroid"].to_crs(gdf.crs)
gdf["centroid_lng"] = gdf_projected["centroid"].apply(lambda point: point.x)
gdf["centroid_lat"] = gdf_projected["centroid"].apply(lambda point: point.y)


data  = alt.InlineData(values = gdf.__geo_interface__, #geopandas to geojson
                       # root object type is "FeatureCollection" but we need its features
                       format = alt.DataFormat(property='features',type='json')) 
data


base = alt.Chart(data).mark_geoshape(
    stroke='black',
    strokeWidth=1
).add_selection(
    selectNeighbourhood
).encode(
    color=alt.Color("properties.count:Q", title='Incedents of Crime'),
    tooltip=[
        alt.Tooltip('properties.Neighbourhood:N', title='Neighbourhood'),
        alt.Tooltip('properties.count:Q', title='Incedents of Crime')
    ],
    opacity=alt.condition(selectNeighbourhood, alt.value(0.75), alt.value(0.25), legend=None)
).transform_calculate(
    Neighbourhood='datum.properties.Neighbourhood'
).project(
    type='identity', reflectY=True
)


text_chart = alt.Chart(data).mark_text(
    align='center',
    baseline='middle',
    fontSize=10,
    fontWeight="bold",
    dy=-8  # Adjust the y-offset of the text labels if necessary
).encode(
    longitude='properties.centroid_lng:Q',
    latitude='properties.centroid_lat:Q',
    text='properties.mapid:N',  # Use the 'mapid' column for text
    tooltip=[
        alt.Tooltip('properties.Neighbourhood:N', title='Neighbourhood'),
        alt.Tooltip('properties.count:Q', title='Incedents of Crime')
    ]
)
map_with_mapid = base + text_chart
map_with_mapid = map_with_mapid.properties(
    height=500,
    width=600,
    title="Distribution of Total Crime in Vancouver by Neighbourhood"
)
map_with_mapid

## View 4 - Trend of Crime Data by Type of Crime 

### Task(s) Addressed:
1. How has the crime rate by crime type in Vancouver changed over time? (Trend)

In [9]:
# Used https://altair-viz.github.io/gallery/multiline_tooltip.html as a resource
# Discard year 2023 as it is not over
df = crime_data[crime_data['YEAR'] < 2023]

#Combine different TYPEs into a couple similar types
df = df.replace('Break and Enter Commercial','Break And Enter')
df = df.replace('Break and Enter Residential/Other','Break And Enter')
df = df.replace('Homicide','Offence Against a Person')
df = df.replace('Other Theft','Other Theft')
df = df.replace('Theft from Vehicle','Vehicle Related Theft')
df = df.replace('Theft of Vehicle','Vehicle Related Theft')
df = df.replace('Theft of Bicycle','Vehicle Related Theft')
df = df.replace('Vehicle Collision or Pedestrian Struck (with Fatality)','Traffic Accident')
df = df.replace('Vehicle Collision or Pedestrian Struck (with Injury)','Traffic Accident')
#Change Year Column to Temporal
df['YEAR'] = pd.to_datetime(df.YEAR, format='%Y')



In [10]:
df

Unnamed: 0.1,Unnamed: 0,Neighbourhood,Total - Age groups and average age of the population - 100% data,0 to 14 years...3,0 to 4 years...4,5 to 9 years...5,10 to 14 years...6,15 to 64 years...7,15 to 19 years...8,20 to 24 years...9,...,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,X,Y,Population density,Average cost of house in neighbour,Average income
2,3,Dunbar-Southlands,21425.0,3545.0,675.0,1225.0,1650.0,14215.0,1800.0,1740.0,...,11.0,15.0,14.0,30.0,29XX W 31ST AVE,487516.1816,5454623.638,,,
3,4,Dunbar-Southlands,21425.0,3545.0,675.0,1225.0,1650.0,14215.0,1800.0,1740.0,...,7.0,11.0,21.0,0.0,29XX W 31ST AVE,487579.6067,5454613.684,,,
4,5,Dunbar-Southlands,21425.0,3545.0,675.0,1225.0,1650.0,14215.0,1800.0,1740.0,...,4.0,25.0,21.0,54.0,29XX W 33RD AVE,487585.2638,5454405.082,,,
5,6,Dunbar-Southlands,21425.0,3545.0,675.0,1225.0,1650.0,14215.0,1800.0,1740.0,...,9.0,8.0,7.0,0.0,29XX W 33RD AVE,487585.2638,5454405.082,,,
6,7,Dunbar-Southlands,21425.0,3545.0,675.0,1225.0,1650.0,14215.0,1800.0,1740.0,...,12.0,2.0,7.0,54.0,29XX W 38TH AVE,487435.4586,5453876.477,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853943,853944,Arbutus Ridge,,,,,,,,,...,11.0,9.0,10.0,23.0,YEW ST / KING EDWARD AVE,488652.0000,5455342.000,,,
853944,853945,Arbutus Ridge,,,,,,,,,...,9.0,24.0,12.0,31.0,YEW ST / NANTON AVE,488739.0000,5454999.000,,,
853945,853946,Arbutus Ridge,,,,,,,,,...,8.0,31.0,20.0,1.0,YEW ST / W 33RD AVE,488673.0000,5454386.000,,,
853946,853947,Arbutus Ridge,,,,,,,,,...,1.0,23.0,18.0,16.0,YEW ST / W 37TH AVE,488470.0000,5453964.000,,,


In [11]:
# Find Neighbourhood Count
df = df.groupby(['YEAR', 'TYPE']).count()
df = df.rename(columns={"Unnamed: 0": "Count",})
df = pd.DataFrame(df.to_records())

df

Unnamed: 0,YEAR,TYPE,Count,Neighbourhood,Total - Age groups and average age of the population - 100% data,0 to 14 years...3,0 to 4 years...4,5 to 9 years...5,10 to 14 years...6,15 to 64 years...7,...,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,X,Y,Population density,Average cost of house in neighbour,Average income
0,2003-01-01,Break And Enter,10081,10081,8710,8710,8710,8710,8710,8710,...,10081,10081,10081,10081,10081,10081,10081,0,0,0
1,2003-01-01,Mischief,6387,6387,4955,4955,4955,4955,4955,4955,...,6387,6387,6387,6387,6387,6387,6387,0,0,0
2,2003-01-01,Offence Against a Person,3531,3529,2654,2654,2654,2654,2654,2654,...,3531,3531,3531,3531,3531,3531,3531,0,0,0
3,2003-01-01,Other Theft,11426,11426,7969,7969,7969,7969,7969,7969,...,11426,11426,11426,11426,11424,11426,11426,0,0,0
4,2003-01-01,Traffic Accident,1881,1849,1568,1568,1568,1568,1568,1568,...,1881,1881,1881,1881,1881,1849,1849,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2022-01-01,Mischief,5604,5604,3354,3354,3354,3354,3354,3354,...,5604,5604,5604,5604,5604,5604,5604,0,0,0
116,2022-01-01,Offence Against a Person,3884,3882,2424,2424,2424,2424,2424,2424,...,3884,3884,3884,3884,3884,3884,3884,0,0,0
117,2022-01-01,Other Theft,10731,10731,7272,7272,7272,7272,7272,7272,...,10731,10731,10731,10731,10731,10731,10731,0,0,0
118,2022-01-01,Traffic Accident,1031,1028,839,839,839,839,839,839,...,1031,1031,1031,1031,1031,1030,1030,0,0,0


In [12]:
# Create a selection that chooses the nearest point & selects based on x-value
nearest = alt.selection_single(nearest=True, on='mouseover',
                        fields=['YEAR'])

# # The basic line
line = alt.Chart(df).mark_line().encode(
    alt.X('YEAR:T', title = 'Year'),
    y='Count',
    color='TYPE:N'
)


# Transparent selectors across the chart. This is what tells us the x-value of the cursor
selectors = alt.Chart(df).mark_point().encode(
    x='YEAR:T',
    opacity=alt.value(0)

)

selectors_near = selectors.encode(
    opacity=alt.condition(nearest, alt.value(0.5), alt.value(0.3))
).add_selection(nearest)

# Draw points on the line, and highlight based on selection
points = line.mark_circle().encode(
    opacity=alt.condition(nearest, alt.value(1), alt.value(0))
).add_selection(nearest)

# Draw text labels near the points, and highlight based on selection
text = line.mark_text(align='left', dx=5, dy=-5).encode(
    text=alt.condition(nearest, 'Count:Q', alt.value(' '))
)

# Draw a rule at the location of the selection
rules = alt.Chart(df).mark_rule(color='gray').encode(
    x='YEAR:T',
).transform_filter(
    nearest
)

# Put the five layers into a chart and bind the data
layers = alt.layer(
    line, selectors, points, text, rules
).properties(
    width=600, height=300,
    title = "Total Crime in Vancouver Over Time by Crime Type"
)

layers