In [1]:
# Set the working directory to the project root (presumably so that the local
# `src` package and the relative `./csv/` paths used below resolve).
import os

# The original author's checkout remains the default; set the TFL_PROJECT_ROOT
# environment variable to run this notebook from a different machine or location.
PROJECT_ROOT = os.environ.get(
    "TFL_PROJECT_ROOT",
    "C:/Users/Ryan/Documents/GitHub/transport-for-london",
)
os.chdir(PROJECT_ROOT)

import pandas as pd
from src.data import api_tools


First up, we'll explore the locations of Transport for London (TfL) bike stations around the city.

We will:
1. Initiate an API connection to the TfL Rest API
2. Extract locations from the BikePoint endpoint
3. Save the extracted data (which is returned as JSON) as a CSV

In [2]:
endpoint = "BikePoint"
filepath = "./csv/"
filename = "tfl_response.csv"

# We have built a custom API class to handle requests
tfl_api = api_tools.TfLApiClient()
# Request the configured endpoint instead of repeating the literal, so changing
# `endpoint` above actually changes what is fetched
response = tfl_api.make_request(endpoint)

# We convert the returned JSON to a pandas dataframe and save
response_df = pd.DataFrame(response)
# Create the output directory first so the save does not fail when it is missing
os.makedirs(filepath, exist_ok=True)
# index=False: the row index carries no information, so keep it out of the CSV
response_df.to_csv(os.path.join(filepath, filename), index=False)


2024-01-01 17:41:45 [INFO]: API request successful
2024-01-01 17:41:45 [INFO]: Returned 797 rows


Let's take a look at the returned data...

In [3]:
# Show the dtype pandas assigned to each column of the raw response
response_df.dtypes

$type                    object
id                       object
url                      object
commonName               object
placeType                object
additionalProperties     object
children                 object
childrenUrls             object
lat                     float64
lon                     float64
dtype: object

Here we can see there is supplementary information for each bike station held within the 'additionalProperties' column, which may be worth exploring further...

In [4]:
# Peek at the supplementary properties attached to the first bike station
sample_props = response_df.loc[0]['additionalProperties']

# List the property names on their own first...
property_keys = []
for entry in sample_props:
    property_keys.append(entry['key'])
print(property_keys)

# ...then show the key / value / last-modified triple for each property
pd.DataFrame(sample_props)[['key', 'value', 'modified']]

['TerminalName', 'Installed', 'Locked', 'InstallDate', 'RemovalDate', 'Temporary', 'NbBikes', 'NbEmptyDocks', 'NbDocks', 'NbStandardBikes', 'NbEBikes']


Unnamed: 0,key,value,modified
0,TerminalName,001023,2024-01-01T15:37:30.333Z
1,Installed,true,2024-01-01T15:37:30.333Z
2,Locked,false,2024-01-01T15:37:30.333Z
3,InstallDate,1278947280000,2024-01-01T15:37:30.333Z
4,RemovalDate,,2024-01-01T15:37:30.333Z
5,Temporary,false,2024-01-01T15:37:30.333Z
6,NbBikes,15,2024-01-01T15:37:30.333Z
7,NbEmptyDocks,3,2024-01-01T15:37:30.333Z
8,NbDocks,19,2024-01-01T15:37:30.333Z
9,NbStandardBikes,14,2024-01-01T15:37:30.333Z


'NbBikes', 'NbEmptyDocks', 'NbDocks' seem like they may be useful so we will include these columns in the top level data.

One way would be to run a lambda function over the whole df, extracting the data of interest, but this can get messy. Instead, we will simply create a new df from the desired columns plus the bike station id; in other words, we transform the data from its raw state to a usable state.



In [5]:

additional_cols = ['NbBikes', 'NbEmptyDocks', 'NbDocks']

# Flatten every station in the raw response into one record:
#   - id / lat / lon are copied straight from the top level
#   - any additional property whose key is listed in `additional_cols` is added
#     under that key, so editing the list above is enough to change the columns
new_data = [
    {
        'id': station['id'],
        'lat': station['lat'],
        'lon': station['lon'],
        **{
            entry['key']: entry['value']
            for entry in station['additionalProperties']
            if entry['key'] in additional_cols
        },
    }
    for station in response
]

# We'll call this our bike stations df to work with from here on
bike_stations_df = pd.DataFrame(new_data)

Interestingly, at least one station claims to have 0 docks in the data. We will remove such stations from the dataset, as a bike station without any bike docks seems nonsensical; however, these may be special cases worth looking into later.

In [8]:
# Stations reporting zero docks.
# NOTE(review): the property values look like they arrive as strings (the later
# cleaning cell casts them with astype(int)), in which case comparing the raw
# column to the integer 0 matches nothing on a fresh Restart & Run All.
# Comparing numerically works whether or not the cast has already happened.
bike_stations_df[pd.to_numeric(bike_stations_df['NbDocks']) == 0]

Unnamed: 0,id,lat,lon,NbBikes,NbEmptyDocks,NbDocks
773,BikePoints_240,51.505459,-0.105692,0,0,0


In [9]:
# A little data cleaning and housekeeping
# Cast the extracted property columns to integers and the station id to string
bike_stations_df[additional_cols] = bike_stations_df[additional_cols].astype(int)
bike_stations_df['id'] = bike_stations_df['id'].astype(str)
# Keep only stations with at least one dock. drop=True discards the pre-filter
# row labels instead of inserting them into clean_df as a stray 'index' column.
clean_df = bike_stations_df.query('NbDocks > 0').reset_index(drop=True)


One more handy feature might be the ratio of bikes available to the total number of docks at each station.  
We can compute this directly from the two columns and add it to the dataframe as a new feature.

In [10]:
# NbBikes divided by NbDocks, rounded to 2 decimal places. Computed column-wise
# rather than with a row-by-row apply; NbDocks > 0 was enforced when clean_df
# was built, so the division cannot hit zero.
clean_df['occupancy_ratio'] = (clean_df['NbBikes'] / clean_df['NbDocks']).round(2)

# And finally let's save this modified data to CSV
# index=False keeps the positional row index out of the file, matching the raw CSV
clean_df.to_csv('./csv/bike_point_clean.csv', index=False)

Now we have the ID and location of every available bike station, plus some useful properties.  
We can visualise these to better understand their distribution

In [15]:
import plotly.express as px

# One marker per station, placed by lat/lon and coloured by occupancy_ratio.
# range_color pins the colour scale to 0-1 regardless of the values present.
fig = px.scatter_mapbox(
    clean_df,
    lat='lat',
    lon='lon',
    hover_name='id',
    hover_data=['NbDocks', 'NbBikes', 'NbEmptyDocks', 'occupancy_ratio'],
    color='occupancy_ratio',
    mapbox_style='carto-darkmatter',
    color_continuous_scale=px.colors.diverging.balance_r,
    range_color=[0, 1],
    zoom=10.5,
)

# Title (spelling corrected) and margins set in a single layout update
fig.update_layout(
    title='London Bike Station Occupancies',
    margin=dict(l=50, r=50, b=50, t=55),
)

# Caption placed just below the plot area
fig.add_annotation(
    text='Fig: Data retrieved from api.tfl.gov.uk, showing ratio of currently occupied and empty docks of all bike stations in the TFL dataset.',
    showarrow=False,
    x=0, y=-0.1)


fig.show()

Doing some visual exploration, we can see the patterns of vacant vs full docks at each bike station. Depending on when the API is queried, data may reflect different patterns of users travelling to and from the central city and outer suburbs of London.

Some possible inferences:

- There were fewer bikes available at inner city bike stations when the API was queried
- The system requires some "slack", with more docking spaces available than bikes
- There may be a daily pattern to bike station occupancy levels
- The Southeast is underserved relative to other regions


In [111]:
# Kepler.gl is my geo-data exploration tool of choice; however, plotly provides a solid in-notebook experience for rendering
# Uncomment the below to use Kepler

# import keplergl

# from keplergl import KeplerGl
# map_data = clean_df[['id', 'lat', 'lon', 'occupancy_ratio'] + additional_cols]
# tfl_bike_map = KeplerGl()
# tfl_bike_map.add_data(data=map_data, name='tfl_bike_stations')

# tfl_bike_map
