In [None]:
import pandas as pd
import altair as alt # Enables the use of VegaLite v5 schema
import geopandas as gpd # For beat boundaries in the maps
import numpy as np
alt.data_transformers.enable("vegafusion") # Enable large datasets

#### Downcasting Data Types

This will reduce the `DataFrame`'s memory usage.

In [None]:
# Columns parsed as datetimes while reading the CSV.
dt_cols = ['crash_date', 'date_police_notified']

# Numeric columns, grouped by the narrow dtype each one is read as.
# NOTE(review): unsigned integer dtypes cannot represent missing values, so this
# presumably relies on data.csv having no blanks in these columns -- confirm.
uint8_cols = [
    'posted_speed_limit',
    'num_units',
    'injuries_total',
    'injuries_fatal',
    'injuries_incapacitating',
    'injuries_non_incapacitating',
    'injuries_reported_not_evident',
    'injuries_no_indication',
    'injuries_unknown',
    'crash_hour',
    'crash_day_of_week',
    'crash_month',
]
uint32_cols = ['lane_cnt', 'street_no']
float32_cols = ['latitude', 'longitude']

# Column name -> dtype mapping handed to read_csv.
dtypes = {
    **dict.fromkeys(uint8_cols, 'uint8'),
    **dict.fromkeys(uint32_cols, 'uint32'),
    **dict.fromkeys(float32_cols, 'float32'),
    'beat_of_occurrence': 'uint16',
}

# Load the dataset
file_path = 'data.csv'
df = pd.read_csv(file_path, parse_dates=dt_cols, dtype=dtypes)

### Single View Visualizations (6)

#### 1. Calendar Heatmap

This visualization corresponds to our first goal.

Due to limitations, the ax… *(TODO: this sentence is cut off mid-word — finish the explanation or remove it.)*

In [None]:
# Plain-string date and time-of-day parts of each crash timestamp.
df['crash_date_only'] = df['crash_date'].dt.date.astype(str)
df['crash_time'] = df['crash_date'].dt.time.astype(str)

# Move every timestamp back to the Sunday that starts its week
# (dayofweek is Monday=0, so (dayofweek + 1) % 7 is the number of days since Sunday).
df['adjusted_date'] = df['crash_date'] - pd.to_timedelta(
    (df['crash_date'].dt.dayofweek + 1) % 7,
    unit='d',
)

# 1-based week index of that Sunday within its own year, zero-padded to two characters.
# A date whose preceding Sunday lies in the previous calendar year gets that
# previous year's week number.
df['crash_week'] = (
    ((df['adjusted_date'].dt.dayofyear + 6) // 7)
    .astype(str)
    .str.zfill(2)
)

# Subset used by the 2023-only charts.
df_23 = df.loc[df['crash_date'].dt.year == 2023]

In [None]:
# Calendar heatmap: one cell per (week, weekday), coloured by the number of crashes.
alt.Chart(df_23).mark_rect().transform_calculate(
    # crash_day_of_week is used as a 1-based index into the list: 1 -> 'Sun', ..., 7 -> 'Sat'.
    day_name="['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'][datum.crash_day_of_week - 1]"
).encode(
    x=alt.X('crash_week:O', title='Week'),
    # Without an explicit sort a nominal axis is ordered alphabetically
    # (Fri, Mon, Sat, ...); pin the rows to calendar order instead.
    y=alt.Y(
        'day_name:N',
        title='Day of Week',
        sort=['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'],
    ),
    color=alt.Color('count():Q', scale=alt.Scale(scheme='greens'), title=None),
    tooltip=[
        alt.Tooltip('crash_date_only:N', title='Date'),
        alt.Tooltip('count():Q', title='Crashes')
    ]
).properties(
    title='2023 Daily Crashes in the Greater Chicago Area'
)

#### 2. Stacked Area Chart

In [None]:
# Keep only crashes dated after 2014.
df_new = df.loc[df['crash_date'].dt.year > 2014]

# Number of crashes for every (day, most severe injury) pair that occurs.
injury_counts = (
    df_new
    .groupby(['crash_date_only', 'most_severe_injury'])
    .size()
    .reset_index(name='count')
)

# Go wide and back to long so that pairs with no crashes appear with a count of 0
# instead of being absent.
pivot_injury_counts = injury_counts.pivot(
    index='crash_date_only',
    columns='most_severe_injury',
    values='count',
).fillna(0)

long_format = pd.melt(
    pivot_injury_counts.reset_index(),
    id_vars='crash_date_only',
    var_name='most_severe_injury',
    value_name='count',
)

In [None]:
# Stacked area chart of daily crash counts, one band per injury severity.
(
    alt.Chart(long_format)
    .mark_area(opacity=0.7)
    .encode(
        x=alt.X('crash_date_only:T').title('Crash Date').axis(format='%Y %b'),
        y=alt.Y('count:Q').title('Number of Crashes'),
        color=(
            alt.Color('most_severe_injury:N')
            .title('Most Severe Injury')
            .legend(orient="top")
            .scale(scheme='redyellowgreen')
        ),
        tooltip=[
            alt.Tooltip('crash_date_only:T').title('Crash Date'),
            alt.Tooltip('count:Q').title('Total Crashes'),
            alt.Tooltip('most_severe_injury:N').title('Most Severe Injury'),
        ],
    )
    .properties(
        width=700,
        height=200,
        title="Distribution of Injury Severity Over Time",
    )
)

#### 3. Double-Time Bar Graph

In [None]:
# Label each crash as day (hours 6..17 inclusive) or night (everything else).
# Vectorized instead of a per-row Python lambda.
df['time_category'] = np.where(
    df['crash_hour'].between(6, 17),
    '6AM - 5PM',
    '6PM - 5AM',
)

# Display order of the 12-hour clock positions, starting at 6.
hour_order = [6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5]

# 24-hour value -> 12-hour clock position (0 and 12 both become 12). Cast to int64
# first so the values match the integer categories regardless of the narrow
# storage dtype of crash_hour.
df['crash_hour_12'] = pd.Categorical(
    (df['crash_hour'].astype('int64') % 12).replace(0, 12),
    categories=hour_order,
    ordered=True,
)

# Crash count per (clock position, day/night) pair. observed=False is spelled out
# because the key is categorical: it keeps the result independent of the pandas
# default for `observed` and avoids the deprecation warning about that default.
hourly_counts_12h = (
    df
    .groupby(['crash_hour_12', 'time_category'], observed=False)
    .size()
    .reset_index(name='count')
)

In [None]:
def _count_labels(source, keep, align, dx, text_color):
    """Build a text layer showing `count` for the rows of `source` that pass `keep`.

    `align` and `dx` position the label relative to the bar end; `text_color`
    is the constant label colour.
    """
    return source.transform_filter(
        keep
    ).mark_text(
        size=8,
        align=align,
        dx=dx,
    ).encode(
        x='signed_count:Q',
        y=alt.Y('crash_hour_12:O', sort=hour_order),
        text='count:Q',
        color=alt.value(text_color),
    )


# Night-time counts are negated so the two periods extend in opposite directions
# from zero.
base = alt.Chart(hourly_counts_12h).properties(width=400).transform_calculate(
    signed_count="datum.time_category == '6PM - 5AM' ? -datum.count : datum.count"
)

bar = base.mark_bar().encode(
    # The axis labels show absolute values so the negated side still reads as a count.
    x=alt.X('signed_count:Q').title('Number of Crashes').axis(labelExpr='abs(datum.value)'),
    y=alt.Y('crash_hour_12:O').title('Hour').sort(hour_order),
    color=alt.Color(
        'time_category:N',
        legend=alt.Legend(orient='top', title=None),
        scale=alt.Scale(range=["#FFD700", "#001F3F"]),
        title=None,
    ),
)

# Labels for the positive side and for the zero-or-negative side respectively.
text_positive = _count_labels(base, alt.datum.signed_count > 0, 'right', -4, 'black')
text_negative = _count_labels(base, alt.datum.signed_count <= 0, 'left', 4, 'white')

# Layer the bar and both text charts
alt.layer(bar, text_positive, text_negative).properties(
    title="Cumulative Hourly Crashes: Day vs. Night"
)


#### 4. Grouped Bar Chart

In [None]:
# Condition columns; one chart is produced per entry.
conditions = [
    'device_condition',
    'weather_condition',
    'lighting_condition',
    'roadway_surface_cond',
    'road_defect',
]
# Injury-count columns that are summed within each condition value.
injury_types = [
    'injuries_fatal',
    'injuries_incapacitating',
    'injuries_non_incapacitating',
    'injuries_reported_not_evident',
    'injuries_no_indication',
    'injuries_unknown',
]

def aggregate_condition_data(df, condition, injury_types):
    """Sum the injury columns per value of `condition` and return them in long form.

    Args:
        df: DataFrame containing `condition` and every column in `injury_types`.
        condition: Name of the column to group by.
        injury_types: List of numeric column names to sum within each group.

    Returns:
        DataFrame with columns `condition`, 'InjuryType' and 'Count', holding one
        row per (condition value, injury column) pair whose summed count is
        greater than zero.
    """
    totals = df.groupby(condition)[injury_types].sum().reset_index()
    tidy = totals.melt(
        id_vars=[condition],
        value_vars=injury_types,
        var_name='InjuryType',
        value_name='Count',
    )
    # Pairs with a zero total are dropped.
    return tidy.loc[tidy['Count'] > 0]

# One bar chart per condition column, coloured by injury type.
charts = []
for condition in conditions:
    data = aggregate_condition_data(df, condition, injury_types)
    chart = alt.Chart(data).transform_calculate(
        # Strip the 9-character 'injuries_' prefix so the legend and tooltip show
        # 'fatal', 'incapacitating', ... instead of raw column names.
        InjuryLabel="slice(datum.InjuryType, 9)"
    ).mark_bar(width=500).encode(
        x=alt.X('Count:Q', title="Number of Injuries"),
        y=alt.Y(f"{condition}:N"),
        # Use the calculated label; it was previously computed but never encoded.
        color=alt.Color('InjuryLabel:N', legend=alt.Legend(orient="top"), title="Type of Injury"),
        tooltip=[
            alt.Tooltip(f"{condition}:N", title=condition),
            alt.Tooltip('InjuryLabel:N', title="Type"),
            alt.Tooltip('Count:Q', title="Count")
        ]
    ).properties(
        title=f"Distribution of Injuries by {condition.replace('_', ' ').title()}"
    )
    charts.append(chart)

In [None]:
# Stack the per-condition charts vertically; as the cell's last expression the result is displayed.
alt.vconcat(*charts)

#### 5. Heatmap

In [None]:
def is_valid(value):
    """Return True when `value` is neither missing nor the literal string 'Unknown'."""
    if pd.isna(value):
        return False
    return value != 'Unknown'

# Keep crashes whose primary AND secondary cause both pass is_valid.
prim_ok = df['prim_contributory_cause'].map(is_valid)
sec_ok = df['sec_contributory_cause'].map(is_valid)
filtered_data = df.loc[prim_ok & sec_ok]

def top_ten_values(column):
    """Return up to ten of the most frequent values in `column`, most frequent first.

    Args:
        column: pandas Series to count values in.

    Returns:
        List of values in the order produced by `value_counts()`.
    """
    frequencies = column.value_counts()
    return frequencies.index[:10].tolist()

# Ten most frequent primary and secondary causes among the filtered crashes.
top_prim, top_sec = (
    top_ten_values(filtered_data[cause_col])
    for cause_col in ('prim_contributory_cause', 'sec_contributory_cause')
)

# Co-occurrence counts for every primary/secondary pair in the filtered crashes.
contingency_table = pd.crosstab(
    filtered_data['prim_contributory_cause'],
    filtered_data['sec_contributory_cause'],
)

# Restrict to the top causes, keeping the frequency order of each list.
top_contingency_table = contingency_table.loc[top_prim, top_sec]

# Long form: one row per (primary, secondary) pair with its count.
heatmap_df = pd.melt(
    top_contingency_table.reset_index(),
    id_vars='prim_contributory_cause',
    var_name='sec_contributory_cause',
    value_name='count',
)

In [None]:
# Heatmap of co-occurrence counts between the top primary and secondary causes.
(
    alt.Chart(heatmap_df, title="Co-occurrence of Top Contributory Causes in Accidents")
    .mark_rect(tooltip=True)
    .encode(
        x=alt.X('prim_contributory_cause:N'),
        y=alt.Y('sec_contributory_cause:N'),
        color=alt.Color('count:Q').scale(scheme='redyellowblue').title('Count'),
    )
)

#### 6. Scatter Plot

In [None]:
# Only crashes with at least one recorded injury.
filtered_df = df.loc[df['injuries_total'] > 0]

# Per (speed limit, damage) group: mean fatal injuries per crash and total injuries.
# Named aggregation yields the final column names directly.
aggregated_df = (
    filtered_df
    .groupby(['posted_speed_limit', 'damage'])
    .agg(
        average_fatal_injuries=('injuries_fatal', 'mean'),
        total_injuries=('injuries_total', 'sum'),
    )
    .reset_index()
)

In [None]:
# Scatter plot: speed limit vs. mean fatal injuries, sized by total injuries and
# coloured by damage level.
(
    alt.Chart(aggregated_df)
    .mark_point(filled=True, tooltip=True)
    .encode(
        x=alt.X('posted_speed_limit:Q').title("Posted Speed Limit"),
        y=alt.Y('average_fatal_injuries:Q').title("Average Fatal Injuries"),
        size=(
            alt.Size('total_injuries:Q')
            .scale(type='sqrt', range=[50, 1000])
            .title("Injuries (sqrt scale)")
        ),
        color=(
            alt.Color('damage:N')
            .scale(scheme='redyellowgreen', reverse=True)
            .title("Damage Level")
        ),
    )
    .properties(
        width=500,
        height=300,
        title="Fatal and Total Injuries by Speed Limit and Damage Level",
    )
)

### Linked Visualizations (4)

#### 1. Geospatial Map and Bar Chart

In [None]:
# One row per (speed limit, beat) pair.
crashes_agg = (
    df
    .groupby(['posted_speed_limit', 'beat_of_occurrence'])
    .agg(
        # Count the number of crashes
        crash_count=pd.NamedAgg(column='posted_speed_limit', aggfunc='size'),
        # Sum the total injuries
        total_injuries=pd.NamedAgg(column='injuries_total', aggfunc='sum'),
    )
    .reset_index()
)

In [None]:
# Beat boundary polygons.
gdf = gpd.read_file('Boundaries Police Beats.geojson')
# Cast beat_num to integer so it can be matched against beat_of_occurrence.
gdf['beat_num'] = gdf['beat_num'].astype(int)

# Inner join: only beats present in both the boundaries and the crash aggregates
# are kept, with one row per (beat, speed limit) pair.
merged_gdf = gdf.merge(
    crashes_agg,
    how='inner',
    left_on='beat_num',
    right_on='beat_of_occurrence',
)

In [None]:
# Point selections: a clicked beat on the map and a clicked speed-limit bar.
click_beat = alt.selection_point(fields=['beat_of_occurrence'])
click_bar = alt.selection_point(fields=['posted_speed_limit'])

# Map of beats: beats matching the beat selection are coloured by total_count,
# the others are drawn light gray; the rows are filtered by the bar selection.
choropleth_chart = (
    alt.Chart(
        merged_gdf,
        width=300,
        height=500,
        title="Choropleth Map of Crashes by Beat",
    )
    .mark_geoshape(stroke="white", strokeWidth=1)
    .encode(
        color=alt.condition(
            click_beat,
            'total_count:Q',
            alt.value('lightgray'),
            legend=alt.Legend(title="Crash Count", orient='bottom'),
        ),
        tooltip=[
            alt.Tooltip('beat_of_occurrence:N', title='Beat Number'),
            alt.Tooltip('total_count:Q', title='Crash Count'),
            alt.Tooltip('posted_speed_limit:O', title="Posted Speed Limit"),
        ],
    )
    .add_params(click_beat)
    .transform_filter(click_bar)
)

# total_count is computed after the speed-limit filter: the sum of crash_count
# over the remaining rows of each beat.
updated_choropleth = choropleth_chart.transform_joinaggregate(
    groupby=['beat_of_occurrence'],
    total_count='sum(crash_count)',
)

In [None]:
# Total injuries per speed limit; bars matching the bar selection are blue, the
# others gray, and the rows are filtered by the beat selection.
bar_chart = (
    alt.Chart(
        crashes_agg,
        width=500,
        height=400,
        title="Bar Chart of Total Injuries by Speed Limit",
    )
    .mark_bar()
    .encode(
        x=alt.X('posted_speed_limit:O').title("Posted Speed Limit"),
        y=alt.Y('sum(total_injuries):Q').title("Total Injuries").scale(type='symlog'),
        color=alt.condition(
            click_bar,
            alt.value('steelblue'),
            alt.value('lightgray'),
        ),
        tooltip=[
            alt.Tooltip('posted_speed_limit:O').title("Posted Speed Limit"),
            alt.Tooltip('sum(total_injuries):Q').title("Total Injuries"),
        ],
    )
    .add_params(click_bar)
    .transform_filter(click_beat)
)

# Drop (speed limit, beat) rows that recorded no injuries.
updated_bar = bar_chart.transform_filter(alt.datum.total_injuries > 0)

In [None]:
# Map and bar chart side by side (`|` is Altair's horizontal-concatenation operator).
updated_choropleth | updated_bar

#### 2. Heatmap and Bubble Chart

In [None]:
# Heatmap of 2023 crash counts by primary cause and most severe injury.
heatmap = (
    alt.Chart(df_23, width=700, height=300)
    .mark_rect()
    .encode(
        x=alt.X('prim_contributory_cause:N'),
        y=alt.Y('most_severe_injury:N'),
        color=alt.Color('count():Q').scale(scheme='yelloworangered'),
    )
)

# Same grid drawn as bubbles, with the count encoded by both colour and size.
bubble_chart = (
    alt.Chart(df_23, width=700, height=300)
    .mark_circle()
    .encode(
        x=alt.X('prim_contributory_cause:N'),
        y=alt.Y('most_severe_injury:N'),
        color=alt.Color('count():Q').scale(scheme='yelloworangered'),
        size=alt.Size('count():Q').scale(range=[0, 1000]),
        tooltip=['prim_contributory_cause', 'most_severe_injury', 'count()'],
    )
)

# A single point selection on (cause, injury) is attached to both charts and
# also used to filter both.
shared_selection = alt.selection_point(fields=['prim_contributory_cause', 'most_severe_injury'])

interactive_heatmap = heatmap.add_params(shared_selection).transform_filter(shared_selection)

interactive_bubble_chart = bubble_chart.add_params(shared_selection).transform_filter(shared_selection)

# Heatmap above the bubble chart (`&` is Altair's vertical-concatenation operator).
interactive_heatmap & interactive_bubble_chart

#### 3. Parallel Coordinates Plot and Point Chart with Jittering

In [None]:
# September rows of the 2023 subset.
df_september = df_23.loc[df_23['crash_date'].dt.month == 9]

# Independent frame with both axes rescaled: hour divided by 24 and the weekday's
# category code divided by 6.
df_copy = df_september.assign(
    normalized_crash_hour=lambda frame: frame['crash_hour'] / 24,
    crash_day_of_week_code=lambda frame: pd.Categorical(frame['crash_day_of_week']).codes / 6,
)

# Long form: one row per (crash, measure).
long_parallel = pd.melt(
    df_copy,
    id_vars=['crash_hour', 'crash_day_of_week'],
    value_vars=['normalized_crash_hour', 'crash_day_of_week_code'],
    var_name='Measure',
    value_name='Value',
)

# Crashes sharing the same (hour, weekday) pair get the same line id.
long_parallel['index'] = long_parallel.groupby(['crash_hour', 'crash_day_of_week']).ngroup()


# Hovering highlights the nearest line of the parallel-coordinates plot.
parcoords_select = alt.selection_point(fields=['index'], on='mouseover', nearest=True)

# One line per `index` value; the selected line is drawn at 0.9 opacity, the rest at 0.2.
parallel_coordinates = alt.Chart(long_parallel).mark_line().encode(
    x='Measure:N',
    y='Value:Q',
    color=alt.Color('index:N', legend=None),
    opacity=alt.condition(parcoords_select, alt.value(0.9), alt.value(0.2))
).add_params(parcoords_select).properties(
    width=600,
    height=300
)

# NOTE(review): scatter_select is attached to the scatter plot below, but no
# encoding or filter in this cell reads it -- confirm whether a highlight was intended.
scatter_select = alt.selection_point(fields=['crash_hour', 'crash_day_of_week'], on='mouseover', nearest=True)

# Horizontal jitter so crashes in the same hour do not overplot. A seeded generator
# keeps the figure identical across Restart & Run All; the unseeded global RNG
# produced a different scatter on every run.
jitter_rng = np.random.default_rng(42)
df_copy['jitter'] = jitter_rng.uniform(-0.3, 0.3, size=len(df_copy))
jittered_scatter = alt.Chart(df_copy).mark_point(opacity=0.7).encode(
    x=alt.X('jittered_crash_hour:Q', title='Crash Hour (jittered)'),
    y=alt.Y('crash_day_of_week:N', title='Crash Day of Week'),
    color=alt.Color('crash_hour:N', scale=alt.Scale(scheme='category10')),
    tooltip=['crash_hour:N', 'crash_day_of_week:N']
).transform_calculate(
    jittered_crash_hour='datum.crash_hour + datum.jitter'
).add_params(scatter_select)

alt.vconcat(parallel_coordinates, jittered_scatter)

#### 4. Donut Chart and Line Chart

In [None]:
# 2023 crashes per (day, weather condition).
daily_crash_counts = (
    df_23
    .groupby(['crash_date_only', 'weather_condition'])
    .size()
    .reset_index(name='crash_count')
)

# Clicking a donut slice selects a weather condition; brushing the line chart
# selects an x-range.
weather_select = alt.selection_point(fields=['weather_condition'], name="weather")
brush = alt.selection_interval(encodings=['x'])

# Donut: share of crashes per weather condition, filtered by the brush.
weather_donut = (
    alt.Chart(daily_crash_counts, width=150, height=150, title="Weather")
    .mark_arc(innerRadius=50)
    .encode(
        theta=alt.Theta('sum(crash_count):Q'),
        color=alt.Color('weather_condition:N', legend=None),
        tooltip=['weather_condition:N', 'sum(crash_count):Q'],
    )
    .add_params(weather_select)
    .transform_filter(brush)
)

# Line chart: daily crashes, filtered by the weather selection.
line_chart = (
    alt.Chart(daily_crash_counts, width=700, height=150, title="Daily Crashes in 2023")
    .mark_line()
    .encode(
        x=alt.X('crash_date_only:T').title('Date'),
        y=alt.Y('crash_count:Q').title('Number of Crashes'),
        color=alt.condition(weather_select, 'weather_condition:N', alt.value('lightgray')),
        tooltip=[
            alt.Tooltip('crash_date_only:T').title('Date'),
            alt.Tooltip('crash_count:Q').title('Number of Crashes'),
        ],
    )
    .transform_filter(weather_select)
    .add_params(brush)
)

# Donut and line chart side by side (`|` is Altair's horizontal-concatenation operator).
weather_donut | line_chart

### Spatial Visualization (1)

#### 1. Bubble Map

In [None]:
# Side length of one grid cell, in the units of the latitude/longitude columns.
bin_size = 0.01

df_23_copy = df_23.copy()

# Snap each coordinate down to the lower edge of its grid cell.
df_23_copy['lat_bin'] = np.floor(df_23_copy['latitude'] / bin_size) * bin_size
df_23_copy['lon_bin'] = np.floor(df_23_copy['longitude'] / bin_size) * bin_size

# Crashes per grid cell.
binned_crash_counts = (
    df_23_copy
    .groupby(['lat_bin', 'lon_bin'])
    .size()
    .reset_index(name='crash_count')
)

# Place each circle at the centre of its cell rather than on the lower edge.
binned_crash_counts['lat_center'] = binned_crash_counts['lat_bin'] + bin_size / 2
binned_crash_counts['lon_center'] = binned_crash_counts['lon_bin'] + bin_size / 2

# One circle per grid cell, coloured by the number of crashes in it.
bubble_map = (
    alt.Chart(binned_crash_counts, width=300, height=500)
    .mark_circle()
    .encode(
        longitude='lon_center:Q',
        latitude='lat_center:Q',
        color=(
            alt.Color('crash_count:Q')
            .title('Number of Crashes')
            .scale(scheme='redyellowgreen', reverse=True)
        ),
        tooltip=['lon_center:Q', 'lat_center:Q', 'crash_count:Q'],
    )
)

# Beat boundaries as a gray backdrop, passed to the chart as inline GeoJSON text.
gdf_json = gdf.to_json()

geoshape_layer = (
    alt.Chart(alt.Data(values=gdf_json), width=300, height=500)
    .mark_geoshape(fill='lightgray', stroke='black')
)

# Boundaries underneath, circles on top (`+` is Altair's layering operator).
geoshape_layer + bubble_map
