# Wrocław Public Transport

In this notebook I analyze the performance of Wrocław's public transport.

## Data Loading and Cleaning

In [1]:
import numpy as np
import math
import polars as pl
import requests

from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, LinearColorMapper
from bokeh.palettes import Inferno256
from bokeh.plotting import figure, show
from bokeh.tile_providers import OSM, get_provider
from bokeh.io import output_notebook

# Render Bokeh figures inline in the notebook.
output_notebook()

# Plotting resources shared by the cells below:
# a colour scale built from the 256-step Inferno palette, and OSM background tiles.
color_mapper = LinearColorMapper(palette=Inferno256)
tile_provider = get_provider(OSM)

In [2]:
# Raw position records; the following cells clean and enrich this frame.
POSITIONS_CSV = "positions.csv"
positions = pl.read_csv(POSITIONS_CSV)

In [3]:
# Clean the raw positions and derive the columns used by the rest of the notebook:
#   - drop rows whose coordinates fall outside the valid latitude/longitude range,
#   - parse the timestamp and shift it by a fixed hour offset,
#   - add radians ("x_rad", "y_rad") and Web Mercator coordinates ("x_mercator", "y_mercator"),
#   - add "time_diff_s", the seconds elapsed since the previous row.
#
# Column "x" is treated as latitude and "y" as longitude (degrees): "x" is range-checked
# against +/-90, "y" against +/-180, and the Mercator formulas below use them that way.

# https://wiki.openstreetmap.org/wiki/Mercator#Python
MERCATOR_RADIUS = 6378137

# Hours added to the parsed timestamps.
# NOTE(review): presumably the local-time offset from UTC; a fixed value ignores DST - confirm.
UTC_OFFSET_HOURS = 2

positions = positions.filter(
    (pl.col("x") > -90) & (pl.col("x") < 90) & (pl.col("y") > -180) & (pl.col("y") < 180)
).sort(
    # The row-to-row differences below rely on this ordering.
    ["k", "timestamp"]
).with_columns([
    pl.lit(UTC_OFFSET_HOURS).alias("offset")
]).with_columns([
    # necessary workaround with offset as polars does not support time zones
    ((pl.col("timestamp")+"Z").str.strptime(pl.Datetime, fmt="%+", strict=False) + pl.duration(hours="offset")).alias("timestamp"),
    np.deg2rad(pl.col("x")).alias("x_rad"),
    np.deg2rad(pl.col("y")).alias("y_rad"),
]).with_columns([
    # Difference to the previous row regardless of "k"; the value on the first row of each
    # "k" therefore spans two different groups and has to be discarded downstream.
    # Uses the public `.dt` accessor instead of reaching into `pl.internals`.
    (pl.col("timestamp") - pl.col("timestamp").shift(1)).dt.seconds().alias("time_diff_s"),
    ((pl.col("x_rad")/2 + math.pi/4).tan().log() * MERCATOR_RADIUS).alias("y_mercator"),
    (pl.col("y_rad") * MERCATOR_RADIUS).alias("x_mercator"),
])

In [4]:
# Haversine distance (km) between each row and the previous row.
# https://stackoverflow.com/questions/365826/calculate-distance-between-2-gps-coordinates
#
# The column is added with `with_columns` rather than by item assignment on the frame, so
# the cell does not mutate `positions` in place and needs no temporary frame.
# Like "time_diff_s", the value is relative to the previous row regardless of "k".

# Sphere radius used for the distance, in km.
EARTH_RADIUS_KM = 6373

lat = pl.col("x_rad")
lon = pl.col("y_rad")
prev_lat = lat.shift(1)
prev_lon = lon.shift(1)

# Haversine term: a = sin^2(d_lat / 2) + sin^2(d_lon / 2) * cos(lat_prev) * cos(lat)
haversine_a = (
    ((lat - prev_lat) / 2).sin().pow(2)
    + ((lon - prev_lon) / 2).sin().pow(2) * prev_lat.cos() * lat.cos()
)

positions = positions.with_columns([
    # distance = 2 * R * atan(sqrt(a) / sqrt(1 - a))
    ((haversine_a.sqrt() / (1 - haversine_a).sqrt()).arctan() * 2 * EARTH_RADIUS_KM).alias("distance_km")
])

In [5]:
# "time_diff_s" and "distance_km" were computed against the previous row of the whole
# frame, so on the first row of every "k" they compare two different rides.
# Null those values out, derive the speed, and keep only rides that actually moved.

MIN_TOP_SPEED_KM_H = 5  # a ride must exceed this speed at least once to be kept

starts_ride = pl.col("k").is_first()

positions = positions.with_columns([
    pl.when(starts_ride).then(None).otherwise(pl.col(column)).alias(column)
    for column in ("time_diff_s", "distance_km")
]).with_columns([
    # km per second -> km per hour
    (pl.col("distance_km") / pl.col("time_diff_s") * 3600).alias("speed_km_h")
]).filter(
    pl.col("speed_km_h").max().over("k") > MIN_TOP_SPEED_KM_H
)

In [6]:
# Sometimes vehicles have GPS active when they finished their drive and stand idle.
# This needs to be cleaned in order to have correct data about drive times.
#
# "cutoff" is, per ride "k", the running count of observations faster than CUTOFF_KM_H.
# Rows with cutoff == 0 precede the first such observation; rows with cutoff == max are
# the last such observation and everything after it. Both groups are dropped.

MIN_OBS = 40       # minimum number of remaining observations for a ride to be kept
CUTOFF_KM_H = 5    # speed above which an observation counts as "moving"

moving_count = (pl.col("speed_km_h") > CUTOFF_KM_H).cumsum().fill_null("backward").over("k")

positions = positions.with_columns([
    moving_count.alias("cutoff"),
]).filter(
    # Explicit strict comparisons instead of `is_between`: its default bound inclusivity
    # differs between polars releases, and with inclusive bounds this filter would keep
    # every row and the trimming would silently stop working.
    (pl.col("cutoff") > 0) & (pl.col("cutoff") < pl.col("cutoff").max().over("k"))
).filter(
    pl.col("k").count().over("k") >= MIN_OBS
)

In [7]:
# One row per ride "k": its line name, the timestamp of its first observation, and its
# average speed (total distance over total time, converted to km/h).
ride_avg_speed = pl.col("distance_km").sum() / pl.col("time_diff_s").sum() * 3600

rides = (
    positions
    .groupby("k")
    .agg([
        pl.col("name").first().alias("name"),
        pl.col("timestamp").first().alias("start_time"),
        ride_avg_speed.alias("speed_km_h_avg"),
    ])
    .sort("start_time")
)

## Sample Tram Ride

In [8]:
# TODO replace example with new data - sample ride
# TODO another map with speed by position, e.g. coloured by
#      (pl.col("speed_km_h")+1).log().alias("speed_map_scale")
# https://products.aspose.app/gis/transformation/lat-long-to-mercator

# Ride shown on the map and the map extent (Web Mercator coordinates).
SAMPLE_RIDE_ID = 19709056
MAP_X_RANGE = (1875733.4, 1914695.2)
MAP_Y_RANGE = (6621293.7, 6674532.7)

sample_ride = positions.filter(pl.col("k") == SAMPLE_RIDE_ID)

ride_points = sample_ride.select([
    pl.col("x_mercator"),
    pl.col("y_mercator"),
    pl.col("timestamp"),
])
source = ColumnDataSource(data=ride_points.to_dict(as_series=False))

p = figure(
    title="Tram 9",
    x_range=MAP_X_RANGE,
    y_range=MAP_Y_RANGE,
    x_axis_type="mercator",
    y_axis_type="mercator",
)
p.add_tile(tile_provider)

# Colour every point by its timestamp so the direction of travel is visible.
colour_by_time = {"field": "timestamp", "transform": color_mapper}
p.circle(
    source=source,
    x="x_mercator",
    y="y_mercator",
    size=10,
    fill_color=colour_by_time,
    fill_alpha=0.8,
)
show(p)


## Fastest Lines

In [9]:
# Per line: number of rides and the median of the rides' average speeds,
# sorted from slowest to fastest so the fastest lines end up at the top of the plot.
line_stats = rides.groupby("name").agg([
    pl.col("k").n_unique().alias("rides"),
    pl.col("speed_km_h_avg").median().alias("speed_km_h_avg_median"),
]).sort("speed_km_h_avg_median")

source = ColumnDataSource(
    data=line_stats.to_dict(as_series=False)
)

# Dot plot: one row per line, dot position = median ride speed.
# The title and axis labels describe the data so the figure can be read on its own.
p = figure(
    title="Median ride speed by line",
    tools="",
    toolbar_location=None,
    y_range=line_stats["name"].to_list(),
    x_range=[0, 50],
    height=1800,
    x_axis_label="Median of average ride speed [km/h]",
    y_axis_label="Line",
)
p.segment(source=source, x0=0, y0="name", x1="speed_km_h_avg_median", y1="name", line_width=2, line_color="green")
p.circle(source=source, x="speed_km_h_avg_median", y="name", size=15, fill_color="orange", line_color="green", line_width=3)

# TODO font sizes
show(p)

## Performance by Time of Day

In [10]:
# Bucket rides into 30-minute windows of their start time and compute, per window,
# the number of rides and the median of the rides' average speeds.
rides_by_time = (
    rides
    .groupby_dynamic(index_column="start_time", every="30m")
    .agg([
        pl.median("speed_km_h_avg").alias("median_avg_speed"),
        pl.count("k").alias("rides"),
    ])
    .sort("start_time")
)


def _window_line_plot(title, y_label, column):
    """Return a line figure of `rides_by_time[column]` against the window start time."""
    fig = figure(title=title, x_axis_type="datetime", x_axis_label="Time", y_axis_label=y_label)
    fig.line(rides_by_time["start_time"], rides_by_time[column], line_width=2)
    return fig


p1 = _window_line_plot("Rides by time of the day", "Rides", "rides")
p2 = _window_line_plot("Speed by time of the day", "Ride speed", "median_avg_speed")

# TODO style and make it two rows
show(gridplot([p1, p2], ncols=2, width=400, height=600))