In [1]:
import pandas as pd
import numpy as np
import datetime
import sqlite3
import matplotlib.pyplot as plt
%matplotlib inline 
from random import sample

In [3]:
!pwd

/Users/panchanok/Desktop/didi-vehicle-repositioning-strategy/notebooks


### Slide `pickup_taxizone_id` column up by 1 within key (`hack_license`, `medallion`)

In [4]:
# path = '../data/'
# conn = sqlite3.connect(path + 'trip_data.db')

# q_license = """
# SELECT distinct(hack_license) as UNIQ_LIC
# FROM trip
# """

# unique_license = pd.read_sql(q_license, conn)

# LIC_SAMPLE_NUM = 10
# sampled_drivers = sample(unique_license.UNIQ_LIC.tolist(), LIC_SAMPLE_NUM)

# # find the pickup and dropoff time of next trip
# q_trip = """
# SELECT dropoff_datetime, dropoff_taxizone_id, 
#         lag(pickup_taxizone_id, -1, NULL) over (partition by hack_license, medallion order by pickup_datetime) as pickup_taxizone_id_next,
#         lag(pickup_datetime, -1, NULL) over (partition by hack_license order by pickup_datetime) as pickup_datetime_next
#     FROM trip
#     WHERE hack_license in {0}
# ORDER BY hack_license ASC, medallion ASC, pickup_datetime ASC, dropoff_datetime ASC
# """

# trip_df = pd.read_sql(q_trip.format(tuple(sampled_drivers)), conn)
# conn.close()

In [None]:
path = '../data/'
conn = sqlite3.connect(path + 'trip_data.db')

# find the pickup and dropoff time of next trip
q_trip = """
SELECT dropoff_datetime, dropoff_taxizone_id, 
        lag(pickup_taxizone_id, -1, NULL) over (partition by hack_license, medallion order by pickup_datetime) as pickup_taxizone_id_next,
        lag(pickup_datetime, -1, NULL) over (partition by hack_license order by pickup_datetime) as pickup_datetime_next
    FROM trip
ORDER BY hack_license ASC, medallion ASC, pickup_datetime ASC, dropoff_datetime ASC
LIMIT 10
"""

trip_df = pd.read_sql(q_trip, conn)
conn.close()

### Drop rows where `pickup_taxizone_id_next` is NA

In [6]:
k = 5

In [7]:
selected_trip_df = trip_df.dropna(subset = ['pickup_taxizone_id_next'])
print('#records: {}'.format(selected_trip_df.shape[0]))

#records: 4776


## Keep only records whose nex pickup zone == current dropoff zone

In [8]:
selected_trip_df = selected_trip_df.loc[selected_trip_df['dropoff_taxizone_id'] == 
                                        selected_trip_df['pickup_taxizone_id_next']]
print('#records: {}'.format(selected_trip_df.shape[0]))

#records: 1797


In [9]:
selected_trip_df.head()

Unnamed: 0,dropoff_datetime,dropoff_taxizone_id,pickup_taxizone_id_next,pickup_datetime_next
2,2013-06-01 18:48:17,43.0,43.0,2013-06-01 18:49:29
3,2013-06-01 19:06:19,246.0,246.0,2013-06-01 19:14:07
4,2013-06-01 19:24:16,239.0,239.0,2013-06-01 19:30:05
6,2013-06-01 19:48:25,229.0,229.0,2013-06-01 19:52:58
11,2013-06-01 22:26:00,211.0,211.0,2013-06-01 22:37:35


## Compute cruise time

In [10]:
selected_trip_df['pickup_datetime_next'] = pd.to_datetime(selected_trip_df['pickup_datetime_next'])
selected_trip_df['dropoff_datetime'] = pd.to_datetime(selected_trip_df['dropoff_datetime'])

selected_trip_df['cruise_time'] = (selected_trip_df['pickup_datetime_next'] - selected_trip_df['dropoff_datetime'])/np.timedelta64(1, 'm')
selected_trip_df.reset_index(inplace=True)

In [11]:
print('Cruise Time less than 0 minutes: {}'.format(selected_trip_df.loc[(selected_trip_df['cruise_time'] < 0)].shape[0]))
print('Cruise Time greater than {} minutes: {}'.format(k*60, selected_trip_df.loc[(selected_trip_df['cruise_time'] <= k*60)].shape[0]))
print('These records are dropped')

Cruise Time less than 0 minutes: 0
Cruise Time greater than 300 minutes: 1788
These records are dropped


In [12]:
selected_trip_df.shape

(1797, 6)

In [13]:
selected_trip_df.head()

Unnamed: 0,index,dropoff_datetime,dropoff_taxizone_id,pickup_taxizone_id_next,pickup_datetime_next,cruise_time
0,2,2013-06-01 18:48:17,43.0,43.0,2013-06-01 18:49:29,1.2
1,3,2013-06-01 19:06:19,246.0,246.0,2013-06-01 19:14:07,7.8
2,4,2013-06-01 19:24:16,239.0,239.0,2013-06-01 19:30:05,5.816667
3,6,2013-06-01 19:48:25,229.0,229.0,2013-06-01 19:52:58,4.55
4,11,2013-06-01 22:26:00,211.0,211.0,2013-06-01 22:37:35,11.583333


## Remove outliers/mistakes

In [14]:
selected_trip_df

Unnamed: 0,index,dropoff_datetime,dropoff_taxizone_id,pickup_taxizone_id_next,pickup_datetime_next,cruise_time
0,2,2013-06-01 18:48:17,43.0,43.0,2013-06-01 18:49:29,1.200000
1,3,2013-06-01 19:06:19,246.0,246.0,2013-06-01 19:14:07,7.800000
2,4,2013-06-01 19:24:16,239.0,239.0,2013-06-01 19:30:05,5.816667
3,6,2013-06-01 19:48:25,229.0,229.0,2013-06-01 19:52:58,4.550000
4,11,2013-06-01 22:26:00,211.0,211.0,2013-06-01 22:37:35,11.583333
...,...,...,...,...,...,...
1792,4805,2013-06-26 20:06:00,113.0,113.0,2013-06-26 20:11:00,5.000000
1793,4810,2013-06-26 22:19:00,236.0,236.0,2013-06-26 22:22:00,3.000000
1794,4813,2013-06-26 23:09:00,249.0,249.0,2013-06-26 23:13:00,4.000000
1795,4814,2013-06-26 23:23:00,79.0,79.0,2013-06-26 23:25:00,2.000000


In [15]:
selected_trip_df = selected_trip_df.loc[(selected_trip_df['cruise_time'] <= k*60) &
                                        (selected_trip_df['cruise_time'] > 0)]



## Round DO time to 15 minute and 1 hour interval

In [16]:
selected_trip_df_15m = selected_trip_df.copy()
selected_trip_df_1h = selected_trip_df.copy()

In [20]:
selected_trip_df_15m['dropoff_datetime_interval'] = selected_trip_df_15m['dropoff_datetime'].dt.round('15min').dt.time
selected_trip_df_1h['dropoff_datetime_interval'] = selected_trip_df_1h['dropoff_datetime'].dt.round('1h').dt.time

In [22]:
selected_trip_df_15m['dropoff_datetime'].dt.round('15min').dt.time


0       18:45:00
1       19:00:00
2       19:30:00
3       19:45:00
4       22:30:00
          ...   
1792    20:00:00
1793    22:15:00
1794    23:15:00
1795    23:30:00
1796    01:00:00
Name: dropoff_datetime, Length: 1765, dtype: object

In [None]:
np.array(['2013-06-26 20:06:00'])

## [aside] Coversion table 

In [4]:
time_index_15m = []
time_index_1h = []
time_interval = []
m_index = 0
h_index = 0
for h in range(24):
    for q in range(4):
        time_interval.append(datetime.time(h, q*15))
        time_index_1h.append(h_index)
        time_index_15m.append(m_index)
        m_index += 1
    h_index += 1
       

In [5]:
interval_index_table = pd.DataFrame({'interval': time_interval, 'index_15m': time_index_15m, 'index_1h': time_index_1h})
interval_index_table

Unnamed: 0,interval,index_15m,index_1h
0,00:00:00,0,0
1,00:15:00,1,0
2,00:30:00,2,0
3,00:45:00,3,0
4,01:00:00,4,1
...,...,...,...
91,22:45:00,91,22
92,23:00:00,92,23
93,23:15:00,93,23
94,23:30:00,94,23


In [6]:
interval_index_table.to_csv('../data/interval_index_table.csv', index=False)

In [23]:
conversion_15m = dict(zip(interval_index_table.interval, interval_index_table.index_15m))
conversion_1h = dict(zip(interval_index_table.interval, interval_index_table.index_1h))

## convert interval to index

In [27]:
selected_trip_df_15m['dropoff_datetime_index'] = [conversion_15m[t] for t in selected_trip_df_15m['dropoff_datetime_interval']]
selected_trip_df_1h['dropoff_datetime_index'] = [conversion_1h[t] for t in selected_trip_df_1h['dropoff_datetime_interval']]

In [29]:
selected_trip_df_15m.head()

Unnamed: 0,index,dropoff_datetime,dropoff_taxizone_id,pickup_taxizone_id_next,pickup_datetime_next,cruise_time,dropoff_datetime_interval,dropoff_datetime_index
0,3,2013-06-24 20:28:00,141.0,141.0,2013-06-24 20:30:00,2.0,20:30:00,82
1,4,2013-06-24 20:35:00,263.0,263.0,2013-06-24 20:37:00,2.0,20:30:00,82
2,5,2013-06-24 20:49:00,239.0,239.0,2013-06-24 20:52:00,3.0,20:45:00,83
3,8,2013-06-26 17:35:00,233.0,233.0,2013-06-26 17:36:00,1.0,17:30:00,70
4,12,2013-06-26 18:50:00,114.0,114.0,2013-06-26 18:53:00,3.0,18:45:00,75


In [30]:
selected_trip_df_1h.head()

Unnamed: 0,index,dropoff_datetime,dropoff_taxizone_id,pickup_taxizone_id_next,pickup_datetime_next,cruise_time,dropoff_datetime_interval,dropoff_datetime_index
0,3,2013-06-24 20:28:00,141.0,141.0,2013-06-24 20:30:00,2.0,20:00:00,20
1,4,2013-06-24 20:35:00,263.0,263.0,2013-06-24 20:37:00,2.0,21:00:00,21
2,5,2013-06-24 20:49:00,239.0,239.0,2013-06-24 20:52:00,3.0,21:00:00,21
3,8,2013-06-26 17:35:00,233.0,233.0,2013-06-26 17:36:00,1.0,18:00:00,18
4,12,2013-06-26 18:50:00,114.0,114.0,2013-06-26 18:53:00,3.0,19:00:00,19


In [49]:
cruise_time_15m = selected_trip_df_15m.groupby(['dropoff_datetime_index', 'dropoff_datetime_interval', 'dropoff_taxizone_id'], as_index=False)['cruise_time'].agg(['mean', 'median', 'count']).reset_index()
cruise_time_15m.rename(columns = {'dropoff_taxizone_id': 'taxizone_id',
                                  'mean': 'avg_cruise_time',
                                  'median': 'med_cruise_time'}, inplace=True)


In [63]:
cruise_time_15m = cruise_time_15m.pivot_table(index=['dropoff_datetime_index', 'dropoff_datetime_interval'], 
                           columns= 'taxizone_id', fill_value=penalty)
cruise_time_15m = cruise_time_15m.stack().reset_index()

In [64]:
cruise_time_15m

Unnamed: 0,dropoff_datetime_index,dropoff_datetime_interval,taxizone_id,avg_cruise_time,count,med_cruise_time
0,0,00:00:00,1.0,1000.000000,1000,1000.000000
1,0,00:00:00,2.0,0.875000,2,0.875000
2,0,00:00:00,3.0,5.000000,1,5.000000
3,0,00:00:00,4.0,2.413224,426,2.000000
4,0,00:00:00,5.0,1000.000000,1000,1000.000000
...,...,...,...,...,...,...
24667,95,23:45:00,259.0,1000.000000,1000,1000.000000
24668,95,23:45:00,260.0,4.302976,28,2.258333
24669,95,23:45:00,261.0,2.527178,241,2.000000
24670,95,23:45:00,262.0,3.396136,427,2.183333


In [60]:
cruise_time_1h = selected_trip_df_1h.groupby(['dropoff_datetime_index', 'dropoff_datetime_interval', 'dropoff_taxizone_id'], as_index=False)['cruise_time'].agg(['mean', 'median', 'count']).reset_index()
cruise_time_1h.rename(columns = {'dropoff_taxizone_id': 'taxizone_id',
                                  'mean': 'avg_cruise_time',
                                  'median': 'med_cruise_time'}, inplace=True)

In [61]:
penalty = 1000
cruise_time_1h = cruise_time_1h.pivot_table(index=['dropoff_datetime_index', 'dropoff_datetime_interval'], 
                           columns= 'taxizone_id', fill_value=penalty)
cruise_time_1h = cruise_time_1h.stack().reset_index()

In [62]:
cruise_time_1h

Unnamed: 0,dropoff_datetime_index,dropoff_datetime_interval,taxizone_id,avg_cruise_time,count,med_cruise_time
0,0,00:00:00,1.0,1000.000000,1000,1000.000000
1,0,00:00:00,2.0,1.454167,4,1.475000
2,0,00:00:00,3.0,3.466667,2,3.466667
3,0,00:00:00,4.0,2.564357,1840,2.000000
4,0,00:00:00,5.0,1000.000000,1000,1000.000000
...,...,...,...,...,...,...
6163,23,23:00:00,259.0,3.708333,2,3.708333
6164,23,23:00:00,260.0,5.523000,100,3.000000
6165,23,23:00:00,261.0,2.720651,1050,2.000000
6166,23,23:00:00,262.0,3.426985,2066,2.300000


## Export files

In [65]:
cruise_time_15m.to_csv(path + 'cruise_time_15m.csv', index=False)
cruise_time_1h.to_csv(path + 'cruise_time_1h.csv', index=False)

## ====

In [51]:
cruise_time_15m.sort_values('count')

Unnamed: 0,dropoff_datetime_index,dropoff_datetime_interval,taxizone_id,avg_cruise_time,med_cruise_time,count
7515,47,11:45:00,190.0,1.000000,1.000000,1
7899,50,12:30:00,63.0,1.400000,1.400000,1
7892,50,12:30:00,47.0,7.466667,7.466667,1
7886,50,12:30:00,38.0,1.000000,1.000000,1
7884,50,12:30:00,36.0,4.716667,4.716667,1
...,...,...,...,...,...,...
14037,90,22:30:00,79.0,3.930376,2.000000,4807
14207,91,22:45:00,79.0,3.944961,2.083333,4842
14904,95,23:45:00,79.0,3.496469,2.000000,4852
14557,93,23:15:00,79.0,3.484715,2.000000,4909


In [None]:
##mean
plt.bar(cruise_time['dropoff_taxizone_id'], cruise_time['avg_cruise_time'])

In [None]:
##median
plt.bar(cruise_time['dropoff_taxizone_id'], cruise_time['med_cruise_time'])

In [None]:
##zones with more than 10 mins median cruising time
cruise_time.loc[cruise_time['med_cruise_time'] > 10]

In [None]:
##zones with few than 10 record counts
cruise_time.loc[cruise_time['count'] < 20]