# Load package

In [1]:
import pandas as pd
import numpy as np
import datetime
import sqlite3
import matplotlib.pyplot as plt
%matplotlib inline 

# Load data from database

In [103]:
# path = 'didi-vehicle-repositioning-strategy/data/'
path = 'data/NYC_2013/'
conn = sqlite3.connect(path + 'trip_data.db')
cursor = conn.cursor()

In [105]:
cursor.execute("""
CREATE TEMP TABLE subtable AS 
SELECT hack_license, datetime(pickup_datetime) AS pickup_datetime, datetime(dropoff_datetime) AS dropoff_datetime
FROM trip
""")
res = cursor.fetchall()

In [106]:
cursor.execute("""
CREATE TEMP TABLE next_trip AS
SELECT hack_license, dropoff_datetime_now AS end_datetime, MIN(pickup_datetime_next) AS start_datetime_next, 
(strftime('%s',pickup_datetime_next)-strftime('%s',dropoff_datetime_now))/3600 AS delta
FROM (
SELECT a.hack_license, a.dropoff_datetime AS dropoff_datetime_now, b.pickup_datetime AS pickup_datetime_next
FROM subtable a
LEFT JOIN subtable b
ON a.hack_license = b.hack_license
WHERE a.dropoff_datetime <= b.pickup_datetime
)
GROUP BY hack_license, dropoff_datetime_now
HAVING delta >= 5
ORDER BY hack_license
""")
res = cursor.fetchall()

OperationalError: database or disk is full

In [101]:
cursor.execute("""
SELECT *, LAG(end_datetime ,-1, 0) 
OVER (
    PARTITION BY hack_license
    ORDER BY hack_license, start_datetime_next) AS end_datetime_next
FROM
next_trip
""")
res = cursor.fetchall()
res

[(2013000002,
  '2013-06-01 04:44:54',
  '2013-06-01 22:37:57',
  17,
  '2013-06-02 05:40:59'),
 (2013000002, '2013-06-02 05:40:59', '2013-06-02 22:29:11', 16, 0),
 (2013000003,
  '2013-06-01 05:15:00',
  '2013-06-01 17:44:00',
  12,
  '2013-06-02 05:15:00'),
 (2013000003, '2013-06-02 05:15:00', '2013-06-02 18:12:00', 12, 0),
 (2013000006, '2013-06-02 04:52:00', '2013-06-02 17:35:00', 12, 0),
 (2013000008, '2013-06-01 21:39:00', '2013-06-02 12:40:00', 15, 0),
 (2013000009, '2013-06-01 01:30:41', '2013-06-02 08:58:59', 31, 0),
 (2013000012,
  '2013-06-01 00:38:00',
  '2013-06-01 16:29:00',
  15,
  '2013-06-02 01:41:00'),
 (2013000012, '2013-06-02 01:41:00', '2013-06-02 15:54:00', 14, 0),
 (2013000013, '2013-06-01 04:52:00', '2013-06-01 18:02:00', 13, 0),
 (2013000014, '2013-06-01 04:09:00', '2013-06-01 20:52:00', 16, 0),
 (2013000015,
  '2013-06-01 03:41:00',
  '2013-06-01 17:59:00',
  14,
  '2013-06-02 04:36:00'),
 (2013000015, '2013-06-02 04:36:00', '2013-06-02 17:21:00', 12, 0),
 (20

In [27]:
conn.close()

## Previous 

In [None]:
# path = 'didi-vehicle-repositioning-strategy/data/'
path = 'data/NYC_2013/'
trip_2013 = pd.read_csv(path + 'FOIL2013/trip_data_1.csv')
trip_2013.head()

In [None]:
# calculate delta
trip_2013['delta'] = (trip_2013['pickup_datetime_next'] - trip_2013['dropoff_datetime_now']) / pd.Timedelta(hours=1)
trip_2013.head()

In [None]:
# calculate the start and end working time of drivers
# if delta > k hours, dropoff time = end working time, next trip's pickup time = start working time
k = 5
pick_drop = trip_2013[trip_2013.delta > k]
# find the end working time of next shift; working hours = end working time - start working time
pick_drop = pick_drop.merge(pick_drop[['hack_license','dropoff_datetime_now']], on='hack_license', suffixes=[None,'_end'])
# find all the futre trips whose dropoff time < start working time
pick_drop = pick_drop[pick_drop['pickup_datetime_next'] <= pick_drop['dropoff_datetime_now_end']]
# find the last trip of a shift
pick_drop = pick_drop.groupby(['hack_license',
                               'dropoff_datetime_now','pickup_datetime_next',
                               'delta']).dropoff_datetime_now_end.min().reset_index()
# calculate working hours
pick_drop['work_hour'] = (pick_drop['dropoff_datetime_now_end'] - pick_drop['pickup_datetime_next']) / pd.Timedelta(hours=1)
pick_drop.head()

In [None]:
# extract hour, date, weekday(boolean) from datetime columns
pick_drop['dropoff_hour_now'] = pick_drop.dropoff_datetime_now.dt.hour
pick_drop['pickup_hour_next'] = pick_drop.pickup_datetime_next.dt.hour
pick_drop['dropoff_day_now'] = pick_drop.dropoff_datetime_now.dt.date
pick_drop['pickup_day_next'] = pick_drop.pickup_datetime_next.dt.date
pick_drop['weekday'] = (pick_drop.loc[:,'dropoff_datetime_now'].dt.dayofweek // 5 == 1).astype(int)
pick_drop.head()

In [None]:
# calculate percentage of pickup and dropoff in each hour, averaged across date
pickup_per = pick_drop.groupby(['pickup_day_next','weekday','pickup_hour_next'])['pickup_hour_next'].count()
# percentage = # pickup in each hour / # pickup in the day
pickup_per = pickup_per.groupby(level=0).apply(lambda x: x / x.sum() * 100) 
pickup_per = pd.DataFrame(pickup_per).rename(columns={'pickup_hour_next':'pickup_percent'}).reset_index().\
                                        rename(columns={'pickup_day_next':'date',
                                                        'pickup_hour_next':'hour'})

dropoff_per = pick_drop.groupby(['dropoff_day_now','weekday','dropoff_hour_now'])['dropoff_hour_now'].count()
dropoff_per = dropoff_per.groupby(level=0).apply(lambda x: x / x.sum() * 100) 
dropoff_per = pd.DataFrame(dropoff_per).rename(columns={'dropoff_hour_now':'dropoff_percent'}).reset_index().\
                                        rename(columns={'dropoff_day_now':'date',
                                                        'dropoff_hour_now':'hour'})
pick_drop_per = pickup_per.merge(dropoff_per)
pick_drop_per.head()

In [None]:
# Frequency of Pick-up and Drop-off Time
pick_drop_per = pick_drop_per.groupby(['hour'])['pickup_percent','dropoff_percent'].mean().\
                                reset_index().set_index('hour')

fig,ax = plt.subplots(figsize = (8,5))
pick_drop_per.plot(kind='bar',ax=ax);
ax.set_xlabel('Hour');
ax.set_ylabel('Percentage');
ax.set_title('Frequency of Pick-up and Drop-off Time (k={})'.format(k));
ax.legend();

In [None]:
# Frequency of Pick-up and Drop-off Time (weekday and weekend)
pick_drop_per = pick_drop_per.groupby(['hour','weekday'])['pickup_percent','dropoff_percent'].mean().\
                                reset_index().set_index('hour')

fig,ax = plt.subplots(1,2,figsize = (15,5))
pick_drop_per[pick_drop_per.weekday == 0].drop('weekday',axis=1).plot(kind='bar',ax=ax[0]);
pick_drop_per[pick_drop_per.weekday == 1].drop('weekday',axis=1).plot(kind='bar',ax=ax[1]);
ax[0].set_xlabel('Hour');
ax[0].set_ylabel('Percentage');
ax[0].set_title('Frequency of Pick-up and Drop-off Time (weekday, k={})'.format(k));
ax[0].legend();

ax[1].set_xlabel('Hour');
ax[1].set_ylabel('Percentage');
ax[1].set_title('Frequency of Pick-up and Drop-off Time (weekend, k={})'.format(k));
ax[1].legend();

In [None]:
# plt.boxplot(pick_drop.work_hour);
n = pick_drop[pick_drop.work_hour>=24].shape[0] / pick_drop.shape[0] * 100
print('drop {n:.2f}% records'.format(n=n))
pick_drop = pick_drop[pick_drop.work_hour<24]

In [None]:
fig,ax = plt.subplots(figsize = (8,5))
ax.hist(pick_drop.loc[:,'work_hour']);
ax.set_xlabel('Work time (hour)');
ax.set_ylabel('Count');
ax.set_title('Distribution of Work time (k={})'.format(k));

In [None]:
fig,ax = plt.subplots(1,2,figsize = (15,5))
ax[0].hist(pick_drop.loc[pick_drop.weekday == 1,'work_hour']);
ax[1].hist(pick_drop.loc[pick_drop.weekday == 0,'work_hour']);
ax[0].set_xlabel('Work time (hour)');
ax[0].set_ylabel('Count');
ax[0].set_title('Distribution of Work time (weekday, k={})'.format(k));
ax[1].set_xlabel('Work time (hour)');
ax[1].set_ylabel('Count');
ax[1].set_title('Distribution of Work time (weekend, k={})'.format(k));