# Data Wrangling of Santander Cycles Open Data

For the #30DayMapChallenge I have used the Open Data for the Santander Cycles bike share scheme in London.
Here, I have done some data manipulation on the data that I acquired from Transport for London Open Data.

For this challenge, I did not use the real-time data from the API, but focused on the locations of the bike points and some journey data.

## Fetching the Bike Points

The data for bike points is available from the [TfL Unified API](https://api.tfl.gov.uk/bikepoint).
I have converted this file into a GeoJSON file for analysis.

The data directory is not pushed to GitHub.
The cell below downloads the raw data into `DL_path` and saves the converted points into the directory given by `points_path`.

In [1]:
# Converts Bike Points into GeoJSON file
#
# Loads the TfL BikePoint listing (from a local copy when present, otherwise
# from the Unified API, caching the download), turns every bike point into a
# GeoJSON Point feature and writes the resulting FeatureCollection to disk.
import os
import json
import urllib.request

# set paths
DL_path = "../Data/Cycles/DL_Data"
points_path = "../Data/Cycles/Points"
points_fn = "BikePoints.geojson"

# import json file from TfL Unified API if not already in local
source_url = "https://api.tfl.gov.uk/BikePoint"
raw_docks_file = os.path.join(DL_path, "BikePoint.json")

if os.path.exists(raw_docks_file):
    print("Loading file from local")
    # context manager so the file handle is closed after parsing
    with open(raw_docks_file, encoding="utf-8") as f:
        docks = json.load(f)
else:
    print(f"Download from {source_url}") 
    with urllib.request.urlopen(source_url) as source:
        docks = json.load(source)
    # creates saving directory if does not exist
    os.makedirs(DL_path, exist_ok=True)
    # save json file so later runs can load it from local
    with open(raw_docks_file, "w", encoding="utf-8") as f:
        json.dump(docks, f)

# create output file
output = {
    "type": "FeatureCollection",
    "features": []
}

# add docks data
for d in docks:
    # Index the key/value property list once per dock. Looking the values up
    # with .get() yields None for a missing key instead of silently reusing
    # the value left over from the previous dock.
    props = {p["key"]: p["value"] for p in d["additionalProperties"]}
    terminal_id = props.get("TerminalName")
    n_docks = props.get("NbDocks")

    lat = d["lat"]
    lon = d["lon"]
    long_name = d["commonName"]

    # Separate area and specific port name
    # Some have spaces before the comma, some do not
    # "Location , Area" and "Location, Area" both exist
    comma_spaced = long_name.rfind(" , ")
    comma_unspaced = long_name.rfind(", ")
    if comma_spaced != -1:
        loc = long_name[:comma_spaced]
        area = long_name[comma_spaced + 3:]
    elif comma_unspaced != -1:
        loc = long_name[:comma_unspaced]
        area = long_name[comma_unspaced + 2:]
    else:
        # no separator found: leave both parts empty
        loc = ""
        area = ""

    # GeoJSON features (GeoJSON positions are ordered longitude, latitude)
    port = {
        "type": "Feature",
        "geometry": {
            "type": "Point",
            "coordinates": [lon, lat]
        },
        "properties": {
            "id": terminal_id,
            "name": long_name,
            "location": loc,
            "area": area,
            "ports": n_docks,
        },
    }
    output["features"].append(port)

# create output
docks_geojson = json.dumps(output)

# creates saving directory if does not exist
os.makedirs(points_path, exist_ok=True)

# save json file
with open(os.path.join(points_path, points_fn), "w", encoding="utf-8") as f:
    f.write(docks_geojson)
print("done")
    


Loading file from local
done


## Fetching the Journeys for analysis

Now, I have taken the journey data from TfL open data as well. Here, I took the first 1,000 lines from the latest data available, just so that I can plot them on the map.

In [8]:
# Converts journeys into geometry
#
# Reads a TfL journey extract (downloading and caching it when it is not
# available locally), matches each journey's start and end station names
# against the bike points GeoJSON saved on disk, and writes one GeoJSON
# feature per journey.

import os
import csv
import json
import urllib.request

# number of journeys to stop at
# NOTE: 0 never matches the running counter, so 0 means "convert every row"
stop_count = 0

DL_path = "../Data/Cycles/DL_Data"
points_path = "../Data/Cycles/Points"
points_fn = "BikePoints.geojson"
journeys_path = "../Data/Cycles/Journeys"
journeys_fn = "journeys_" + str(stop_count) + ".geojson"

# load docks
with open(os.path.join(points_path, points_fn), encoding="utf-8") as f:
    docks_geojson = json.load(f)

# Station name -> [lon, lat], built once from the file loaded above so the
# cell does not depend on variables left behind by another cell and each
# journey needs only two dictionary lookups.
station_coords = {
    ds["properties"]["name"]: ds["geometry"]["coordinates"]
    for ds in docks_geojson["features"]
}

# output file ("FeatureCollection" is case-sensitive in GeoJSON)
journeys = {
    "type": "FeatureCollection",
    "features": []
}

# download and open csv files
# source: https://cycling.data.tfl.gov.uk/usage-stats/374JourneyDataExtract12Jun2023-18Jun2023.csv
journeys_source = "https://cycling.data.tfl.gov.uk/usage-stats/374JourneyDataExtract12Jun2023-18Jun2023.csv"
journeys_DL_fn = "374JourneyDataExtract12Jun2023-18Jun2023.csv"
journeys_DL_file = os.path.join(DL_path, journeys_DL_fn)

# check and download if not existent
if os.path.exists(journeys_DL_file):
    print("Load from local")
else:
    print("Downloading file")
    with urllib.request.urlopen(journeys_source) as response:
        journeys_DLdata = response.read()

    # creates saving directory if does not exist
    os.makedirs(DL_path, exist_ok=True)
    # store the bytes exactly as served; they are decoded as UTF-8 on read
    with open(journeys_DL_file, "wb") as f:
        f.write(journeys_DLdata)

# load csv file (newline="" is what the csv module expects for file objects)
with open(journeys_DL_file, encoding="utf-8", newline="") as csv_file:
    csv_reader = csv.DictReader(csv_file)
    count = 0
    for row in csv_reader:
        # get variables
        rental_id = int(row["Number"])
        duration = int(row["Total duration (ms)"])
        bike_id = int(row["Bike number"])
        bike_model = str(row["Bike model"])
        start_id = str(row["Start station number"])
        start_st = str(row["Start station"])
        # A blank end station becomes None rather than keeping whatever the
        # previous row happened to contain.
        end_id = row["End station number"] or None
        end_st = row["End station"] or None

        # get coordinates for start / end (None when the name is unknown)
        start_coord = station_coords.get(start_st)
        end_coord = station_coords.get(end_st)

        # A LineString needs two real positions; when either end cannot be
        # located the feature is written with a null geometry, which GeoJSON
        # allows for unlocated features.
        if start_coord is not None and end_coord is not None:
            geometry = {
                "type": "LineString",
                "coordinates": [start_coord, end_coord]
            }
        else:
            geometry = None

        # outputs
        journey = {
            "type": "Feature",
            "geometry": geometry,
            "properties": {
                "rental_id": rental_id,
                "bike_id": bike_id,
                "type": bike_model,
                "start_id": start_id,
                "start_name": start_st,
                "end_id": end_id,
                "end_name": end_st,
                "duration": duration
            }
        }

        journeys["features"].append(journey)

        # stop at maximum
        count += 1
        if count == stop_count:
            break


journeys_geojson = json.dumps(journeys)

# creates saving directory if does not exist
os.makedirs(journeys_path, exist_ok=True)

# save file
with open(os.path.join(journeys_path, journeys_fn), "w", encoding="utf-8") as f:
    f.write(journeys_geojson)
print("done")
    



Load from local
done
