# Fetching the journeys data for analysis

Download the CSV data from TfL, and save in a GeoPandas geodataframe.


In [1]:
# Converts journeys into geometry

import os
import csv
import json
import urllib.request
from requests import get

import pandas as pd
import geopandas as gpd
import rasterio
import rasterio.plot
import matplotlib.pyplot as plt

from shapely.geometry import LineString

In [2]:
# set paths
DL_path = "data/cycles/DL_data"
points_path = "data/cycles/points"
points_fn = "BikePoints.geojson"
journeys_path = "data/cycles/journeys"

# set source URL and filename
source_url = "https://cycling.data.tfl.gov.uk/usage-stats/"
source_fn = [
    "346JourneyDataExtract28Nov2022-04Dec2022.csv",
    "347JourneyDataExtract05Dec2022-11Dec2022.csv",
    "348JourneyDataExtract12Dec2022-18Dec2022.csv",
    "349JourneyDataExtract19Dec2022-25Dec2022.csv",
    "350JourneyDataExtract26Dec2022-01Jan2023.csv",
    "351JourneyDataExtract02Jan2023-08Jan2023.csv",
    "352JourneyDataExtract09Jan2023-15Jan2023.csv",
    "353JourneyDataExtract16Jan2023-22Jan2023.csv",
    "354JourneyDataExtract23Jan2023-29Jan2023.csv",
    "355JourneyDataExtract30Jan2023-05Feb2023.csv",
    "356JourneyDataExtract06Feb2023-12Feb2023.csv",
    "357JourneyDataExtract13Feb2023-19Feb2023.csv",
    "358JourneyDataExtract20Feb2023-26Feb2023.csv",
    "359JourneyDataExtract27Feb2023-05Mar2023.csv",
    "360JourneyDataExtract06Mar2023-12Mar2023.csv",
    "361JourneyDataExtract13Mar2023-19Mar2023.csv",
    "362JourneyDataExtract20Mar2023-26Mar2023.csv",
    "363JourneyDataExtract27Mar2023-02Apr2023.csv",
    "364JourneyDataExtract03Apr2023-09Apr2023.csv",
    "365JourneyDataExtract10Apr2023-16Apr2023.csv",
    "366JourneyDataExtract17Apr2023-23Apr2023.csv",
    "367JourneyDataExtract24Apr2023-30Apr2023.csv",
    "368JourneyDataExtract01May2023-07May2023.csv",
    "369JourneyDataExtract08May2023-14May2023.csv",
    "370JourneyDataExtract15May2023-21May2023.csv",
    "371JourneyDataExtract22May2023-28May2023.csv",
    "372JourneyDataExtract29May2023-04Jun2023.csv",
    "373JourneyDataExtract05Jun2023-11Jun2023.csv",
    "374JourneyDataExtract12Jun2023-18Jun2023.csv",
    "375JourneyDataExtract19Jun2023-30Jun2023.csv",
    "376JourneyDataExtract01Jul2023-14Jul2023.csv",
    "377JourneyDataExtract15Jul2023-31Jul2023.csv",
    "378JourneyDataExtract01Aug2023-14Aug2023.csv",
    "378JourneyDataExtract15Aug2023-31Aug2023.csv",
    "379JourneyDataExtract01Sep2023-14Sep2023.csv",
    "380JourneyDataExtract15Sep2023-30Sep2023.csv",
    "381JourneyDataExtract01Oct2023-14Oct2023.csv",
    "382JourneyDataExtract15Oct2023-31Oct2023.csv",
    "383JourneyDataExtract01Nov2023-14Nov2023.csv",
    "384JourneyDataExtract15Nov2023-30Nov2023.csv"
]

In [3]:
# download journeys

# creates saving directory if does not exist
if not os.path.exists(journeys_path):
    os.makedirs(journeys_path)

for fn in source_fn:
    # check if local exists
    if not os.path.exists(os.path.join(journeys_path, fn)):
        source = source_url + fn
        print(f"Downloading   : {fn}")
        with open(os.path.join(journeys_path, fn), "wb") as file:
            response = get(source)
            file.write(response.content)
    else:
        print(f"Found in local: {fn}")

print('Done')

Found in local: 346JourneyDataExtract28Nov2022-04Dec2022.csv
Found in local: 347JourneyDataExtract05Dec2022-11Dec2022.csv
Found in local: 348JourneyDataExtract12Dec2022-18Dec2022.csv
Found in local: 349JourneyDataExtract19Dec2022-25Dec2022.csv
Found in local: 350JourneyDataExtract26Dec2022-01Jan2023.csv
Found in local: 351JourneyDataExtract02Jan2023-08Jan2023.csv
Found in local: 352JourneyDataExtract09Jan2023-15Jan2023.csv
Found in local: 353JourneyDataExtract16Jan2023-22Jan2023.csv
Found in local: 354JourneyDataExtract23Jan2023-29Jan2023.csv
Found in local: 355JourneyDataExtract30Jan2023-05Feb2023.csv
Found in local: 356JourneyDataExtract06Feb2023-12Feb2023.csv
Found in local: 357JourneyDataExtract13Feb2023-19Feb2023.csv
Found in local: 358JourneyDataExtract20Feb2023-26Feb2023.csv
Found in local: 359JourneyDataExtract27Feb2023-05Mar2023.csv
Found in local: 360JourneyDataExtract06Mar2023-12Mar2023.csv
Found in local: 361JourneyDataExtract13Mar2023-19Mar2023.csv
Found in local: 362Journ

## Create dataframe from journeys

Using the above dataset, a dataframe of journeys are created in this script below.


In [4]:
%%time

# test flag, set 1 for testing
test_flag = 0


journeys_df = []
# load data
# keeping it as a list to avoid memory crashes in further analysis
# merging will come at the very end
for idx, fn in enumerate(source_fn):
    journeys_df.append(pd.read_csv(os.path.join(journeys_path, fn), low_memory = False))
    if test_flag == 1:
        break


CPU times: total: 12.8 s
Wall time: 19.5 s


## Join with height data

In [5]:
# set path
points_path = 'data/cycles/points'
points_parquet_fn = 'docking_stations.geoparquet'
journeys_df_fn = 'journeys.parquet'

# load points gdf
points_gdf = gpd.read_parquet(os.path.join(points_path, points_parquet_fn))


In [6]:
# merge the points data to the journeys data

# add prefix to data points
start_points = points_gdf.add_prefix('start_')
end_points = points_gdf.add_prefix('end_')

# create new list
journeys_df_merged = []

for df in journeys_df:     
    # merge the start point data
    temp_df = df.merge(start_points, left_on = 'Start station', right_on = 'start_name')
    # merge the end point data
    temp_df = temp_df.merge(end_points, left_on = 'End station', right_on = 'end_name')

    journeys_df_merged.append(temp_df)

# delete the unnneccesary df
del(start_points)
del(end_points)

In [7]:
# extract only the required columns
columns = ['Number', 'Start date', 'Start station number', 'start_location', 'start_zone', 
           'start_LSOA11CD', 'start_LSOA11NM', 'start_MSOA11CD', 'start_MSOA11NM', 'start_cc_zone', 'start_height', 'start_geometry',
           'End date', 'End station number', 'end_location', 'end_zone', 
           'end_LSOA11CD', 'end_LSOA11NM', 'end_MSOA11CD', 'end_MSOA11NM', 'end_cc_zone', 'end_height', 'end_geometry',
           'Bike number', 'Bike model', 'Total duration (ms)']

for df in journeys_df_merged:
    df = df[columns].copy()
    # clean dates
    dates = ['Start date', 'End date']
    for d in dates:
        df[d] = pd.to_datetime(df[d], format = '%Y-%m-%d %H:%M')
    
    # make bike model into categorical data
    # this should only have 2 types: 'CLASSIC' and 'PBSC_EBIKE'
    df['Bike model'] = df['Bike model'].astype('category')

In [8]:
# make one large df from the list of df
df_concat = pd.concat(journeys_df_merged, ignore_index = True)

# delete the unnneccesary df
del(journeys_df_merged)

In [9]:
# calculate the height difference
df_concat['height_diff'] = df_concat.end_height - df_concat.start_height

# save the distance of journey as a column
# define as GeoSeries
start_geoseries = gpd.GeoSeries(df_concat.start_geometry, crs = 'EPSG:27700')
end_geoseries = gpd.GeoSeries(df_concat.end_geometry, crs = 'EPSG:27700')

df_concat['distance'] = start_geoseries.distance(end_geoseries, align = False)

In [10]:
df_concat.height_diff.describe()

count    8.491612e+06
mean    -2.200772e-01
std      9.166256e+00
min     -4.387600e+01
25%     -4.118500e+00
50%      0.000000e+00
75%      3.757999e+00
max      4.387600e+01
Name: height_diff, dtype: float64

In [11]:
df_concat.head(10)

Unnamed: 0,Number,Start date,Start station number,Start station,End date,End station number,End station,Bike number,Bike model,Total duration,...,end_zone,end_height,end_LSOA11CD,end_LSOA11NM,end_MSOA11CD,end_MSOA11NM,end_cc_zone,end_geometry,height_diff,distance
0,127293434,2022-12-04 23:59,1194,"Westminster University, Marylebone",2022-12-05 00:08,300079,"London Street, Paddington",24140,CLASSIC,9m 5s,...,Paddington,26.312,E01033594,Westminster 015G,E02000974,Westminster 015,False,POINT (526821.250 181157.755),-1.193249,1540.854577
1,127256557,2022-12-02 20:06,1085,"Old Quebec Street, Marylebone",2022-12-02 20:11,300079,"London Street, Paddington",51668,CLASSIC,5m 17s,...,Paddington,26.312,E01033594,Westminster 015G,E02000974,Westminster 015,False,POINT (526821.250 181157.755),-0.800749,1073.865608
2,127222561,2022-12-01 15:48,1085,"Old Quebec Street, Marylebone",2022-12-01 15:54,300079,"London Street, Paddington",57546,CLASSIC,5m 24s,...,Paddington,26.312,E01033594,Westminster 015G,E02000974,Westminster 015,False,POINT (526821.250 181157.755),-0.800749,1073.865608
3,127198699,2022-11-30 18:05,1085,"Old Quebec Street, Marylebone",2022-11-30 18:11,300079,"London Street, Paddington",52833,CLASSIC,6m 37s,...,Paddington,26.312,E01033594,Westminster 015G,E02000974,Westminster 015,False,POINT (526821.250 181157.755),-0.800749,1073.865608
4,127193452,2022-11-30 15:44,1085,"Old Quebec Street, Marylebone",2022-11-30 15:49,300079,"London Street, Paddington",55552,CLASSIC,5m 18s,...,Paddington,26.312,E01033594,Westminster 015G,E02000974,Westminster 015,False,POINT (526821.250 181157.755),-0.800749,1073.865608
5,127268831,2022-12-03 14:15,200096,"Green Park Station, Mayfair",2022-12-03 14:36,300079,"London Street, Paddington",14273,CLASSIC,21m 42s,...,Paddington,26.312,E01033594,Westminster 015G,E02000974,Westminster 015,False,POINT (526821.250 181157.755),7.563251,2342.151738
6,127268836,2022-12-03 14:15,200096,"Green Park Station, Mayfair",2022-12-03 14:36,300079,"London Street, Paddington",53406,CLASSIC,21m 35s,...,Paddington,26.312,E01033594,Westminster 015G,E02000974,Westminster 015,False,POINT (526821.250 181157.755),7.563251,2342.151738
7,127222344,2022-12-01 15:39,200096,"Green Park Station, Mayfair",2022-12-01 15:51,300079,"London Street, Paddington",52680,CLASSIC,11m 55s,...,Paddington,26.312,E01033594,Westminster 015G,E02000974,Westminster 015,False,POINT (526821.250 181157.755),7.563251,2342.151738
8,127200688,2022-11-30 18:45,200096,"Green Park Station, Mayfair",2022-11-30 18:59,300079,"London Street, Paddington",50109,CLASSIC,14m 44s,...,Paddington,26.312,E01033594,Westminster 015G,E02000974,Westminster 015,False,POINT (526821.250 181157.755),7.563251,2342.151738
9,127168974,2022-11-29 17:46,200096,"Green Park Station, Mayfair",2022-11-29 18:00,300079,"London Street, Paddington",23321,CLASSIC,13m 37s,...,Paddington,26.312,E01033594,Westminster 015G,E02000974,Westminster 015,False,POINT (526821.250 181157.755),7.563251,2342.151738


In [14]:
# create geodataframe
df_concat['geometry'] = df_concat.start_geometry

df_concat_gdf = gpd.GeoDataFrame(df_concat, geometry = df_concat.geometry, crs = 'EPSG:27700')

In [16]:
# convert dates into datetime format
dates = ['Start date', 'End date']
for c in dates:
    df_concat_gdf[c] = pd.to_datetime(df_concat_gdf[c], format = '%Y-%m-%d %H:%M')

# convert bike model into categories
df_concat_gdf['Bike model'] = df_concat_gdf['Bike model'].astype('category')

# convert number of ports into numeric
numbers = ['start_ports', 'end_ports']
for c in numbers:
    df_concat_gdf[c] = df_concat_gdf[c].astype('int')


In [17]:
# save file

df_concat_fn = 'journeys_df.parquet'
df_concat_gdf.drop(['Start station number', 'End station number'], axis = 1).to_parquet(os.path.join(journeys_path, df_concat_fn))