In [1]:
from random import choice, randint
from datetime import datetime, timedelta
import re

try:
    import pandas as pd
except ImportError:
    raise ImportError("This script requires pandas to work.")

In [4]:
# Parameters of the generated data set.
# How many "BK-n" bike ids and "ST-n" station ids are created.
num_bikes, num_stations = 50, 10
# Bound used for the random hour offsets, and bound used for the random
# number of rows written per bike.
max_hours, journeys = 5, 5
# CSV file the generated data is written to and later read back from.
filename = "test_data.csv"

In [3]:
# Generate test data
#
# For every bike, write between 1 and `journeys` CSV rows to `filename`.
# Each row has the form
#     station,bike,start_time,end_time
# where both timestamps are formatted with `time_format`. Consecutive rows
# of the same bike are separated by a random gap and each row gets a
# freshly chosen random station.
stations = ["ST-%s" % n for n in range(1, num_stations + 1)]
bikes = ["BK-%s" % n for n in range(1, num_bikes + 1)]
time_format = "%Y%m%dT%H:%M:%S"


def random_duration():
    """Return a random timedelta of whole minutes, shorter than max_hours hours."""
    # randint() includes BOTH endpoints, so the minute bound must be 59;
    # randint(0, 60) would occasionally add a full extra hour.
    return timedelta(
        hours=randint(0, max_hours - 1),
        minutes=randint(0, 59)
    )


with open(filename, 'w') as wr:
    for bike in bikes:
        current_station = choice(stations)
        start_time = datetime.now()
        # randint() is inclusive on both ends (unlike range()), so the upper
        # bound is `journeys` itself, not `journeys + 1`.
        for _ in range(randint(1, journeys)):
            end_time = start_time + random_duration()
            wr.write("%s,%s,%s,%s\n" % (
                current_station, bike,
                start_time.strftime(time_format),
                end_time.strftime(time_format)
            ))
            # The next row starts after another random gap, at a new station.
            start_time = end_time + random_duration()
            current_station = choice(stations)
# Parenthesised single-argument form works on both Python 2 and Python 3.
print("Generated data and stored it in %s" % filename)

Generated data and stored it in test_data.csv


In [25]:
# Load the generated CSV. The file has no header row, so the column names
# are supplied here; the columns at positions 2 and 3 (arrival, departure)
# are parsed as dates.
bike_data = pd.read_csv(
    filename,
    sep=",",
    names=['station_id', 'bike_id', 'arrival', 'departure'],
    parse_dates=[2, 3]
)
# Sort bike data and store it in a dataframe (bds = bike data sorted).
# sort_values(by=...) is the supported replacement for sort_index(by=...),
# which has been removed from pandas. The result is a new frame;
# bike_data itself is left untouched.
# NOTE: bike_id is sorted as a string, so "BK-10" sorts before "BK-2".
# That is fine here because the sort only needs to keep each bike's rows
# together and in time order.
bds = bike_data.sort_values(by=['bike_id', 'arrival', 'departure'])
bds.head(10)

Unnamed: 0,station_id,bike_id,arrival,departure
0,ST-2,BK-1,2015-09-14 21:53:31,2015-09-14 23:12:31
1,ST-1,BK-1,2015-09-15 01:18:31,2015-09-15 02:13:31
2,ST-5,BK-1,2015-09-15 03:26:31,2015-09-15 03:46:31
19,ST-3,BK-10,2015-09-14 21:53:31,2015-09-14 22:05:31
20,ST-5,BK-10,2015-09-15 00:05:31,2015-09-15 00:09:31
3,ST-4,BK-2,2015-09-14 21:53:31,2015-09-14 22:57:31
4,ST-1,BK-3,2015-09-14 21:53:31,2015-09-15 00:25:31
5,ST-3,BK-4,2015-09-14 21:53:31,2015-09-15 00:09:31
6,ST-1,BK-4,2015-09-15 02:56:31,2015-09-15 03:58:31
7,ST-5,BK-4,2015-09-15 04:48:31,2015-09-15 07:41:31


In [26]:
# Build a copy of "arrival" and "bike_id" moved up by one row, so that
# every row can see the arrival time and bike id of the row below it.
shifted_rows = bds[["arrival", "bike_id"]].shift(-1)
bds["next_arrival"] = shifted_rows["arrival"]
bds["next_bike_id"] = shifted_rows["bike_id"]

# Boolean column: True when the row below belongs to the same bike_id
# (the last row has no row below it and therefore compares as False).
bds["bike_ids_match"] = bds["bike_id"].eq(bds["next_bike_id"])

# Journey time is the gap between this row's departure and the
# arrival recorded in the next row.
bds["journey_time"] = bds["next_arrival"].sub(bds["departure"])

# Keep only the rows whose following row is for the same bike; only
# those describe a complete departure -> next arrival journey.
same_bike_mask = bds["bike_ids_match"]
complete_journeys = bds.loc[same_bike_mask]
complete_journeys.head(10)

Unnamed: 0,station_id,bike_id,arrival,departure,next_arrival,next_bike_id,bike_ids_match,journey_time
0,ST-2,BK-1,2015-09-14 21:53:31,2015-09-14 23:12:31,2015-09-15 01:18:31,BK-1,True,02:06:00
1,ST-1,BK-1,2015-09-15 01:18:31,2015-09-15 02:13:31,2015-09-15 03:26:31,BK-1,True,01:13:00
19,ST-3,BK-10,2015-09-14 21:53:31,2015-09-14 22:05:31,2015-09-15 00:05:31,BK-10,True,02:00:00
5,ST-3,BK-4,2015-09-14 21:53:31,2015-09-15 00:09:31,2015-09-15 02:56:31,BK-4,True,02:47:00
6,ST-1,BK-4,2015-09-15 02:56:31,2015-09-15 03:58:31,2015-09-15 04:48:31,BK-4,True,00:50:00
7,ST-5,BK-4,2015-09-15 04:48:31,2015-09-15 07:41:31,2015-09-15 09:46:31,BK-4,True,02:05:00
11,ST-3,BK-7,2015-09-14 21:53:31,2015-09-15 00:15:31,2015-09-15 02:55:31,BK-7,True,02:40:00
12,ST-5,BK-7,2015-09-15 02:55:31,2015-09-15 04:27:31,2015-09-15 06:44:31,BK-7,True,02:17:00
14,ST-3,BK-8,2015-09-14 21:53:31,2015-09-14 23:41:31,2015-09-15 01:55:31,BK-8,True,02:14:00
15,ST-2,BK-8,2015-09-15 01:55:31,2015-09-15 02:15:31,2015-09-15 02:48:31,BK-8,True,00:33:00


In [27]:
# Print the mean journey time
mean_journey = complete_journeys.journey_time.mean()
# pd.isnull() recognises both NaT and NaN by value. The previous membership
# test against [pd.NaT, pd.np.NaN] relied on object identity and on the
# `pd.np` alias, which is no longer available in pandas.
if pd.isnull(mean_journey):
    print("No complete bike journeys found in the dataset.")
else:
    # Derive hh:mm:ss from the total number of seconds instead of parsing
    # str(mean_journey): whole days are folded into the hour count rather
    # than being dropped, and every field is zero-padded. Fractional
    # seconds are truncated.
    whole_seconds = int(mean_journey.total_seconds())
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    print("Mean journey time (hh:mm:ss) : %02d:%02d:%02d" % (
        hours, minutes, seconds
    ))

Mean journey time (hh:mm:ss) : 01:54:38
