In [1]:
import pandas as pd
import geopy.distance
import json
import datetime

### Bike

In [2]:
data = pd.read_csv('data/unprocessed/bikes_2018_q1.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,duration_sec,start_time,end_time,start_station_id,start_station_latitude,start_station_longitude,end_station_id,end_station_latitude,end_station_longitude
0,0,75284,2018-01-31 22:52:35.2390,2018-02-01 19:47:19.8240,120,37.76142,-122.426435,285,37.783521,-122.431158
1,1,85422,2018-01-31 16:13:34.3510,2018-02-01 15:57:17.3100,15,37.795392,-122.394203,15,37.795392,-122.394203
2,2,71576,2018-01-31 14:23:55.8890,2018-02-01 10:16:52.1160,304,37.348759,-121.894798,296,37.325998,-121.87712
3,3,61076,2018-01-31 14:53:23.5620,2018-02-01 07:51:20.5000,75,37.773793,-122.421239,47,37.780955,-122.399749
4,4,39966,2018-01-31 19:52:24.6670,2018-02-01 06:58:31.0530,74,37.776435,-122.426244,19,37.788975,-122.403452


In [4]:
data.shape

(312902, 10)

Filtering out return trips - this is redundant, will be done more precisely later

In [5]:
data = data[data['start_station_id'] != data['end_station_id']]

In [6]:
data.shape

(305111, 10)

Filtering out trips starting / ending outside SF

In [7]:
station_data = pd.read_csv('data/sf_stations.csv')

In [8]:
station_ids = station_data.id.unique()

In [9]:
data = data[data['start_station_id'].isin(station_ids) & data['end_station_id'].isin(station_ids)]

In [10]:
data.shape

(228569, 10)

Remove very slow (possibly having breaks) and return trips

In [11]:
start_lat = 'start_station_latitude'
start_lon = 'start_station_longitude'
end_lat = 'end_station_latitude'
end_lon = 'end_station_longitude'

def get_row_distance(row):
    return geopy.distance.distance((row[start_lat], row[start_lon]), (row[end_lat], row[end_lon])).km

In [12]:
data['distance'] = data.apply(get_row_distance, axis=1)

In [13]:
data['speed'] = data['distance'] / data['duration_sec'] * 3600

In [14]:
data = data[data['distance'] > 0.3]

In [15]:
data = data[data['speed'] > 1]

In [16]:
data.shape

(0, 12)

Add zone ids

In [17]:
with open('data/station2zone.json', 'r') as f:
    zones = json.load(f)

In [18]:
data['start_zone'] = data.apply(lambda x: zones[str(x.start_station_id)], axis=1)

ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series

In [None]:
data['end_zone'] = data.apply(lambda x: zones[str(x.end_station_id)], axis=1)

Proper dates

In [None]:
data['start_date'] = data.apply(lambda x: datetime.datetime.strptime(x.start_time, '%Y-%m-%d %H:%M:%S.%f'), axis=1)

In [None]:
data['hour_of_day'] = data.start_date.dt.hour

In [None]:
data['day_of_week'] = data.start_date.dt.dayofweek

In [None]:
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

In [None]:
data['day_of_week'] = data.apply(lambda x: days[x.day_of_week], axis=1)

Extract csvs

In [None]:
cols = list(data.columns)
cols[1] = 'duration'
cols[4] = 'start_station'
cols[5] = 'start_station_lat'
cols[6] = 'start_station_lon'
cols[7] = 'end_station'
data.columns = cols

In [None]:
station_cols = ['start_station', 'end_station', 'duration']
station_dow_cols = ['day_of_week'] + station_cols
station_hod_cols = ['hour_of_day'] + station_cols
zone_cols = ['start_zone', 'end_zone', 'duration']
common_cols = ['start_zone', 'end_zone', 'start_station', 'end_station', 'duration']

In [None]:
bike_stations_hod = data[station_hod_cols].groupby(station_hod_cols[:-1]).mean()

In [None]:
bike_stations_hod.head()

In [None]:
bike_stations_hod.to_csv('data/bike_stations_hod.csv', index=None)

In [None]:
hod = data[['hour_of_day'] + common_cols].groupby(['hour_of_day'] + common_cols[0:2])['duration'].mean()

In [None]:
hod.head(5)

In [None]:
hod.to_csv('data/bike_hod.csv', index=None)

In [None]:
dow = data[['day_of_week'] + common_cols].groupby(['day_of_week'] + common_cols[0:2]).mean()
dow['duration'] = dow['duration'].astype('int')

In [None]:
dow.head(5)

In [None]:
dow.to_csv('data/bike_dow.csv')

### Uber

In [None]:
uber_dow = pd.read_csv('data/unprocessed/uber_2018_q1_day_of_week.csv')

In [None]:
uber_dow.head()

In [None]:
uber_dow.shape

In [None]:
with open('data/sf_zones.json', 'r') as f:
    zone_data = json.load(f)

In [None]:
target_zones = [zone['properties']['id'] for zone in zone_data['features']]

In [None]:
uber_dow = uber_dow[uber_dow['sourceid'].isin(target_zones) & uber_dow['dstid'].isin(target_zones)]

In [None]:
uber_dow.shape

In [None]:
uber_dow['day_of_week'] = uber_dow.apply(lambda x: days[int(x.dow) - 1], axis=1)

In [None]:
uber_dow = uber_dow[['sourceid', 'dstid', 'mean_travel_time', 'day_of_week']]

In [None]:
uber_dow.columns = ['start_zone', 'end_zone', 'duration', 'day_of_week']
uber_dow.duration = uber_dow.duration.astype('int')

In [None]:
uber_dow.head()

In [None]:
uber_dow.to_csv('data/uber_dow.csv', index=None)

And now hour of the day

In [None]:
uber_hod = pd.read_csv('data/unprocessed/uber_2018_q1_hour_of_day.csv')

In [None]:
uber_hod = uber_hod[uber_hod['sourceid'].isin(target_zones) & uber_hod['dstid'].isin(target_zones)]

In [None]:
uber_hod = uber_hod[['sourceid', 'dstid', 'mean_travel_time', 'hod']]

In [None]:
uber_hod.columns = ['start_zone', 'end_zone', 'duration', 'hour_of_day']
uber_hod.duration = uber_hod.duration.astype('int')
uber_hod.head()

In [None]:
uber_hod.to_csv('data/uber_hod.csv', index=None)