## Converting JSON files into Geopandas

TfL cycling data into geopandas dataframe.

### Cycling docks

Changing the locations into geopandas dataframe.


In [1]:
# import libraries
import os
import csv
import urllib.request
import pandas as pd
import geopandas as gpd
import json
from shapely.geometry import LineString

# set paths and load data

DL_path = "../Data/Cycles/DL_Data"
points_path = "../Data/Cycles/Points"
points_fn = "BikePoints.geojson"
journeys_path = "../Data/Cycles/Journeys"

In [2]:
# download json file from TfL Unified API if not already in local
source_url = "https://api.tfl.gov.uk/BikePoint"

if os.path.exists(os.path.join(DL_path, "BikePoint.json")):
    print("Loading file from local")
    docks = json.load(open(os.path.join(DL_path, "BikePoint.json")))
else:
    print(f"Download from {source_url}") 
    with urllib.request.urlopen(source_url) as source:
        docks = json.load(source)
    # save to local
    docks_DL_data = json.dumps(docks)
    # creates saving directory if does not exist
    if not os.path.exists(DL_path):
        os.makedirs(DL_path)
    # save json file
    with open(os.path.join(DL_path, "BikePoint.json"), "w") as f:
        f.write(docks_DL_data)

Loading file from local


In [3]:
docks_df = pd.json_normalize(docks)

# split location and zone
name = ['location', 'zone']
docks_df[name] = docks_df['commonName'].str.rsplit(pat = ',', n = 1, expand = True)

# clear spaces before and after commas
for n in name:
    docks_df[n] = docks_df[n].str.replace(r'^( +)|( +)$', r'', regex = True)

# fix zone name
replaces = {
    "Kings Cross": "King's Cross",
    "Parsons Green": "Parson's Green",
    "St Lukes": "St. Luke's",
    "St Pauls": "St. Paul's",
    "St.John's Wood": "St. John's Wood",
}
docks_df['zone'] = docks_df.zone.replace(replaces)

docks_slice = docks_df[['location', 'zone', 'lat', 'lon']]
docks_gdf = gpd.GeoDataFrame(docks_slice, geometry = gpd.points_from_xy(docks_slice.lon, docks_slice.lat), crs = 'EPSG:4326')
del(docks_slice)

# get centroid for each zone and store in GeoDataFrame
summary_df = docks_df.groupby('zone')[['lat', 'lon']].mean().reset_index()
zones_gdf = gpd.GeoDataFrame(summary_df, geometry = gpd.points_from_xy(summary_df.lon, summary_df.lat), crs = 'EPSG:4326')
del(summary_df)

In [4]:
# convert into BNG

docks_gdf = docks_gdf.to_crs('EPSG:27700')
zones_gdf = zones_gdf.to_crs('EPSG:27700')

zones_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 117 entries, 0 to 116
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   zone      117 non-null    object  
 1   lat       117 non-null    float64 
 2   lon       117 non-null    float64 
 3   geometry  117 non-null    geometry
dtypes: float64(2), geometry(1), object(1)
memory usage: 3.8+ KB


### Journey data

taking the journey data, summarizing by origin-destination by zone, and converting into geopandas geodataframe.

In [5]:
# download and open csv files
# source: https://cycling.data.tfl.gov.uk/usage-stats/374JourneyDataExtract12Jun2023-18Jun2023.csv
journeys_source = "https://cycling.data.tfl.gov.uk/usage-stats/374JourneyDataExtract12Jun2023-18Jun2023.csv"
journeys_DL_fn = "374JourneyDataExtract12Jun2023-18Jun2023.csv"

# load csv file
df = pd.read_csv(os.path.join(DL_path, journeys_DL_fn))

print(f"The data has {df.shape[0]:,} rows.")

The data has 207,960 rows.


In [6]:
# clean data

# make Bike model into categorical data
df['Bike model'] = df['Bike model'].astype('category')

# split location and area
df[['Start location', 'Start area']] = df['Start station'].str.rsplit(pat = ',', n = 1, expand = True)
df[['End location', 'End area']] = df['End station'].str.rsplit(pat = ',', n = 1, expand = True)

# clear spaces before and after commas
names = ['Start location', 'Start area', 'End location', 'End area']
for n in names:
    df[n] = df[n].str.replace(r'^( +)|( +)$', r'', regex = True)

# clean the names of zones
areas = ['Start area', 'End area']
replaces = {
    "Kings Cross": "King's Cross",
    "Parsons Green": "Parson's Green",
    "St Lukes": "St. Luke's",
    "St Pauls": "St. Paul's",
    "St.John's Wood": "St. John's Wood",
}

for a in areas:
    df[a] = df[a].replace(replaces)

# replace dates
dates = ['Start date', 'End date']
for d in dates:
    df[d] = pd.to_datetime(df[d], infer_datetime_format = True)

  df[d] = pd.to_datetime(df[d], infer_datetime_format = True)
  df[d] = pd.to_datetime(df[d], infer_datetime_format = True)


In [7]:
# summarize by area
byarea_df = df.groupby(['Start area', 'End area'])['Total duration (ms)'].agg(Trips = 'count', Ave_ms = 'mean').reset_index()

# separate into points and lines
diff_df = byarea_df[byarea_df['Start area'] != byarea_df['End area']]
same_df = byarea_df[byarea_df['Start area'] == byarea_df['End area']]

zones = ['Start area', 'End area']

# get geometry
for index, row in diff_df.iterrows():
    start = zones_gdf.loc[zones_gdf.zone == row['Start area'], 'geometry'].iloc[0]
    end = zones_gdf.loc[zones_gdf.zone == row['End area'], 'geometry'].iloc[0]
    diff_df.loc[index, 'geometry'] = LineString([start, end])

for index, row in same_df.iterrows():
    start = zones_gdf.loc[zones_gdf.zone == row['Start area'], 'geometry'].iloc[0]
    same_df.loc[index, 'geometry'] = start


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diff_df.loc[index, 'geometry'] = LineString([start, end])
  diff_df.loc[index, 'geometry'] = LineString([start, end])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  same_df.loc[index, 'geometry'] = start
  same_df.loc[index, 'geometry'] = start


In [8]:
# merge opposite directions

df2 = df.copy().dropna()

for index, row in df2.iterrows():
    df2.loc[index, 'zone1'] = max(row['Start area'], row['End area'])
    df2.loc[index, 'zone2'] = min(row['Start area'], row['End area'])

  df2.loc[index, 'zone1'] = max(row['Start area'], row['End area'])
  df2.loc[index, 'zone2'] = min(row['Start area'], row['End area'])


In [9]:
byarea_df2 = df2.groupby(['zone1', 'zone2'])['Total duration (ms)'].agg(Trips = 'count', Ave_ms = 'mean').reset_index()

diff_df2 = byarea_df2[byarea_df2.zone1 != byarea_df2.zone2]
same_df2 = byarea_df2[byarea_df2.zone1 == byarea_df2.zone2]

# get geometry
for index, row in diff_df2.iterrows():
    start = zones_gdf.loc[zones_gdf.zone == row['zone1'], 'geometry'].iloc[0]
    end = zones_gdf.loc[zones_gdf.zone == row['zone2'], 'geometry'].iloc[0]
    diff_df2.loc[index, 'geometry'] = LineString([start, end])

for index, row in same_df2.iterrows():
    start = zones_gdf.loc[zones_gdf.zone == row['zone1'], 'geometry'].iloc[0]
    same_df2.loc[index, 'geometry'] = start

# turn into geodataframe
diff_gpd2 = gpd.GeoDataFrame(diff_df2, crs = 'EPSG:27700')
same_gpd2 = gpd.GeoDataFrame(same_df2, crs = 'EPSG:27700')

diff_gpd2.to_file(os.path.join(journeys_path, "inter_zone.gpkg"))
same_gpd2.to_file(os.path.join(journeys_path, "intra_zone.gpkg"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diff_df2.loc[index, 'geometry'] = LineString([start, end])
  diff_df2.loc[index, 'geometry'] = LineString([start, end])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  same_df2.loc[index, 'geometry'] = start
  same_df2.loc[index, 'geometry'] = start
