In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os
import matplotlib.pyplot as plt
import imageio.v2 as imageio
import osmnx as ox

In [2]:
# Load the TfL bus stop points, express them in EPSG:4326, and discard
# rows with no geometry as well as the unused DATE_UPDATED column.
bus_stops = (
    gpd.read_file('data/raw/tfl/Bus_Stops.geojson')
    .to_crs(epsg=4326)
    .dropna(subset=['geometry'])
    .drop(columns=['DATE_UPDATED'])
)
bus_stops.head()

Unnamed: 0,OBJECTID,STOP_NAME,STOP_CODE,COUNTDOWN_SIGN,NAPTAN_ATCO,ROAD_NAME,POINT_LETTER,ROUTES,LIVE_BUS_ARRIVAL,OS_EASTING,OS_NORTHING,POSTCODE,geometry
0,627,Northumberland Park Bus Stand,BP4641,,,NORTHUMBERLAND PARK,,,https://tfl.gov.uk/bus/stop//,534761,191137,N17 0LB,POINT (-0.05554 51.60296)
1,780,Hyde Park Corner,BP4661,,,PICCADILLY,,,https://tfl.gov.uk/bus/stop//,528623,179941,W1J 9DZ,POINT (-0.14820 51.50378)
2,2401,Hampton Court Station,BP5447,,,HAMPTON COURT STATION FORECOURT,AP,,https://tfl.gov.uk/bus/stop//,515372,168407,KT8 9AE,POINT (-0.34279 51.40297)
3,2951,Birkbeck,TRS162,,9400ZZCRBIR1,HARRINGTON ROAD - BIRKBECK,,T2,https://tfl.gov.uk/bus/stop/9400ZZCRBIR1/,535302,168966,SE20 7YA,POINT (-0.05623 51.40359)
4,3016,Avenue Road,TRS164,,9400ZZCRAVE1,AVENUE ROAD,,T2,https://tfl.gov.uk/bus/stop/9400ZZCRAVE1/,535744,169323,BR3 4SB,POINT (-0.04974 51.40669)


In [3]:
# Load the bus boarding/alighting sample table.
file = 'data/raw/tfl/busto_outputs/busto-sample.CSV'
bus_entex = pd.read_csv(file)

# Attach stop locations from bus_stops via the stop code.
# The lookup is reduced to one row per STOP_CODE first: a code that occurred
# more than once in bus_stops would otherwise duplicate the matching rows in
# the left merge and inflate every passenger sum computed downstream.
stop_geometry = bus_stops[['STOP_CODE', 'geometry']].drop_duplicates(subset='STOP_CODE')
bus_entex = bus_entex.merge(
    stop_geometry, left_on='STOPCODE', right_on='STOP_CODE', how='left'
)
# Left join: rows whose STOPCODE has no match keep a missing geometry.
bus_entex = gpd.GeoDataFrame(bus_entex, geometry='geometry', crs='epsg:4326')

FILTER WITHIN LONDON   

In [4]:
# Fetch the Greater London boundary from OSM (via osmnx) in EPSG:4326.
greater_london = ox.geocode_to_gdf('Greater London, UK').to_crs(epsg=4326)

# Keep only the records whose stop geometry intersects the first (only
# expected) boundary geometry returned by the geocoder.
inside_london = bus_entex.geometry.intersects(greater_london.geometry.iloc[0])
bus_entex = bus_entex.loc[inside_london]

FILTER BUS DATA BY DAY TYPE (SATURDAY ONLY)

In [5]:
# Keep only the records whose DAY_TYPE is 'Saturday'.
is_saturday = bus_entex['DAY_TYPE'].eq('Saturday')
bus_entex = bus_entex.loc[is_saturday]
bus_entex.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 877 entries, 0 to 921
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   YEAR          877 non-null    int64   
 1   DAY_TYPE      877 non-null    object  
 2   TIMEBAND      877 non-null    int64   
 3   ROUTE         877 non-null    int64   
 4   DIRECTION     877 non-null    int64   
 5   STOPCODE      877 non-null    object  
 6   STOPNAME      877 non-null    object  
 7   STOPSEQUENCE  877 non-null    int64   
 8   AllBoardings  877 non-null    float64 
 9   AllAlighted   877 non-null    float64 
 10  load          877 non-null    float64 
 11  Capacity      877 non-null    float64 
 12  Seats         877 non-null    float64 
 13  V/C           877 non-null    float64 
 14  STOP_CODE     877 non-null    object  
 15  geometry      877 non-null    geometry
dtypes: float64(6), geometry(1), int64(5), object(4)
memory usage: 116.5+ KB


FUNCTION TO PLOT SINGLE ROUTE

In [6]:
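# NOTE: everything in this cell is commented out, so plot_route is not defined.
# If it is re-enabled, be aware that it converts bus_entex['ROUTE'] to str on
# the shared frame, which affects every later cell that uses ROUTE.
# # define function to plot according to route_id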
# def plot_route(route_id):
    
#     route_id = str(route_id)    #convert route_id to string
#     bus_entex['ROUTE'] = bus_entex['ROUTE'].astype(str)
#     df_plot = bus_entex[bus_entex.ROUTE==route_id] # filter data for the route_id

#     # import london boroughs
#     boroughs = gpd.read_file(os.path.join('data','shapefile','statistical-gis-boundaries-london','ESRI','London_Borough_Excluding_MHW.shp'))
#     boroughs = boroughs.to_crs(epsg=4326) # change crs to 4326                   

#     # plot
#     # get minmax values for colorbar, min of both n_boarders and n_alighters
#     vmin = df_plot[['AllBoardings','AllAlighted']].min().min()
#     vmax = df_plot[['AllBoardings','AllAlighted']].max().max()

#     # cmap
#     cmap = plt.cm.viridis
    
#     fig,ax = plt.subplots(2,1,figsize=(15,20))

#     df_plot.plot(ax=ax[0], cmap=cmap, column='AllBoardings',markersize=df_plot['AllBoardings']/10, vmin=vmin, vmax=vmax,label='STOPNAME')
#     boroughs.boundary.plot(ax=ax[0], color='grey', linewidth=0.3)
#     ax[0].set_title('Route {} - Boarders'.format(route_id))
#     ax[0].set_axis_off()

#     df_plot.plot(ax=ax[1], cmap=cmap, column='AllAlighted', markersize=df_plot['AllAlighted']/10, vmin=vmin, vmax=vmax,label='STOPNAME')
#     boroughs.boundary.plot(ax=ax[1], color='grey', linewidth=0.3)
#     ax[1].set_title('Route {} - Alighters'.format(route_id))
#     ax[1].set_axis_off()

#     # shared legend for both axes
#     sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=vmin, vmax=vmax))
#     cbar = plt.colorbar(sm, ax=ax, orientation='vertical', pad=0.01, aspect=50)
#     cbar.set_label('Number of passengers')

#     plt.show()
    
#     return

In [7]:
# plot_route(1)

Split by time band

In [8]:
# Reshape to one row per stop/route/geometry, with one column of summed
# alightings (AllAlighted) per TIMEBAND value.
stop_route_keys = ['STOPCODE', 'STOPNAME', 'ROUTE', 'geometry']
bus_entex_pivot = bus_entex.pivot_table(
    index=stop_route_keys,
    columns='TIMEBAND',
    values=['AllAlighted'],
    aggfunc='sum',
)
# Drop the outer 'AllAlighted' level so the columns are just the timeband codes.
bus_entex_pivot.columns = bus_entex_pivot.columns.droplevel(0)
bus_entex_pivot = bus_entex_pivot.reset_index()

# Row-wise total over every column that follows the key columns.
flow_columns = bus_entex_pivot.columns[len(stop_route_keys):]
bus_entex_pivot['Total'] = bus_entex_pivot[flow_columns].sum(axis=1)
bus_entex_pivot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 874 entries, 0 to 873
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   STOPCODE  874 non-null    object  
 1   STOPNAME  874 non-null    object  
 2   ROUTE     874 non-null    int64   
 3   geometry  874 non-null    geometry
 4   8         874 non-null    float64 
 5   Total     874 non-null    float64 
dtypes: float64(2), geometry(1), int64(1), object(2)
memory usage: 41.1+ KB


In [9]:
# Replace the numeric TIMEBAND column codes with readable period labels.
#
# Several codes share a label (8 and 1 -> 'Early', 2 and 3 -> 'AM Peak'), so a
# plain rename would produce duplicate column names as soon as both codes are
# present in the data. Columns that map to the same label are summed into a
# single column instead.

timeband_dict = {
    8: 'Early',
    1: 'Early',
    2: 'AM Peak',
    3: 'AM Peak',
    4: 'Midday',
    5: 'PM Peak',
    6: 'Evening',
    7: 'Late',
}

# Columns still carrying a numeric timeband code (empty if this cell already ran).
timeband_cols = [col for col in bus_entex_pivot.columns if col in timeband_dict]

if timeband_cols:
    # Target column order: original positions, first occurrence of each label wins.
    ordered_cols = list(
        dict.fromkeys(timeband_dict.get(col, col) for col in bus_entex_pivot.columns)
    )
    band_totals = (
        bus_entex_pivot[timeband_cols]
        .T.groupby(timeband_dict, sort=False)
        .sum(min_count=1)  # stay NaN when every contributing band is NaN, as a rename would
        .T
    )
    bus_entex_pivot = pd.concat(
        [bus_entex_pivot.drop(columns=timeband_cols), band_totals], axis=1
    )[ordered_cols]

In [10]:
# Promote the pivot table to a GeoDataFrame (geometry declared as EPSG:4326)
# and reproject it to EPSG:27700.
bus_entex_pivot = gpd.GeoDataFrame(
    bus_entex_pivot, geometry='geometry', crs='epsg:4326'
).to_crs(epsg=27700)

EXPORT

In [11]:
# Write the flows aggregated by stop and route to a GeoPackage file.
bus_entex_pivot.to_file(
    filename='data/cleaned/flow_bus_byStopRoutes_.gpkg',
    driver='GPKG',
)

In [12]:
# Sum the flows per stop (STOPCODE, STOPNAME) across all routes.
#
# The columns to sum are taken from the frame itself rather than hard-coded:
# every column other than the stop keys, ROUTE and geometry is treated as a
# flow column. This keeps all time bands present in the data and does not
# fail when a particular band (e.g. 'Early') is absent.
stop_keys = ['STOPCODE', 'STOPNAME']
non_flow_cols = (*stop_keys, 'ROUTE', 'geometry')
agg_spec = {
    col: 'sum' for col in bus_entex_pivot.columns if col not in non_flow_cols
}
agg_spec['geometry'] = 'first'  # geometry cannot be summed; keep the first one per stop

flow_bus_byStop = bus_entex_pivot.groupby(stop_keys).agg(agg_spec).reset_index()
# Reuse the CRS of the source frame so the label always matches the coordinates.
flow_bus_byStop = gpd.GeoDataFrame(
    flow_bus_byStop, geometry='geometry', crs=bus_entex_pivot.crs
)
flow_bus_byStop.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 868 entries, 0 to 867
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   STOPCODE  868 non-null    object  
 1   STOPNAME  868 non-null    object  
 2   Early     868 non-null    float64 
 3   Total     868 non-null    float64 
 4   geometry  868 non-null    geometry
dtypes: float64(2), geometry(1), object(2)
memory usage: 34.0+ KB


In [13]:
# Write the flows aggregated by stop to a GeoPackage file.
flow_bus_byStop.to_file(
    filename='data/cleaned/flow_bus_byStop_.gpkg',
    driver='GPKG',
)