<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

# Drop two unneeded columns

In [1]:
import pandas as pd
import numpy as np
# from tqdm.auto import tqdm

In [2]:
# tqdm.pandas(desc='df apply')

In [3]:
DROP_FLAG = False
if DROP_FLAG:
    data = pd.read_feather('data/all_raw.feather')
    data.drop(['level_0', 'index'], axis=1, inplace=True)

In [4]:
data = pd.read_feather('data/all.feather')

In [5]:
data.head()

Unnamed: 0,doy,dropoffX,dropoffY,duration,fare_amount,mow,payment_type,pickupX,pickupY,tip_amount,woy,year
0,182,36,57,17.0,15.5,2881,2,37,68,0.0,27,2015
1,182,36,42,12.0,18.0,2885,1,25,54,0.0,27,2015
2,182,42,58,3.0,5.0,2880,1,44,55,0.03,27,2015
3,182,40,60,5.0,5.5,2880,2,39,58,0.0,27,2015
4,182,59,54,10.0,10.5,2880,2,51,55,0.0,27,2015


In [6]:
# data[(data.year == 2016) & (data.doy == 366)]
# data[(data.year == 2016) & (data.doy == 365)]
# data[(data.year == 2016) & (data.doy == 364)]
# data[(data.year == 2016) & (data.doy == 363)]
# data[(data.year == 2016) & (data.woy == 53)]
# data[(data.year == 2016) & (data.doy == 1)]

# data[(data.year == 2015) & (data.doy == 365)].woy.value_counts()

# data[(data.year == 2016) & (data.doy == 1)].woy.value_counts()

# data[(data.year == 2016) & (data.doy == 2)].woy.value_counts()

# data[(data.year == 2016) & (data.doy == 3)].woy.value_counts()

# data[(data.year == 2016) & (data.doy == 4)].woy.value_counts()

In [7]:
# # Change wrong woy of 52 in 2017 to 1

# data.loc[(data.year == 2017) & (data.woy == 52), "woy"] = 1


# Count the time of trips at each hour (should not be used)

**This is prone to simpson's paradox. Not recommended.** If there is a time constrain, this can be used as an approximation.

In [8]:
mod = data.mow % (60*24)
mod.rename('mod', inplace=True);
dow = data.mow // (60*24)
dow.rename('dow', inplace=True);
hod = mod // 60
hod.rename('hod', inplace=True);

In [9]:
time_df = pd.DataFrame({'dow': dow, 'hod': hod, 'woy': data.woy, 'year': data.year})

In [10]:
h_count = time_df.hod.value_counts()

In [11]:
# Count the amount of trips in 12 hrs with each hour as starting time
start_h_count = []
for i in h_count.index:
    index = np.arange(i, i + 12) % 24
    shift_total = h_count.loc[index].sum()
    start_h_count.append([i, shift_total])

In [12]:
sorted(start_h_count, key=lambda x:x[1], reverse=True)

[[12, 179482350],
 [11, 179108383],
 [13, 176280104],
 [10, 176234397],
 [9, 173004428],
 [14, 170308786],
 [8, 169392362],
 [7, 162128440],
 [15, 161647026],
 [16, 151750103],
 [6, 150817952],
 [17, 142023144],
 [5, 138739433],
 [18, 129944625],
 [4, 129012474],
 [3, 119115551],
 [19, 118634137],
 [20, 111370215],
 [2, 110453791],
 [21, 107758149],
 [22, 104528180],
 [1, 104482473],
 [23, 101654194],
 [0, 101280227]]

In [13]:
# # Or use the following.
# window_len = 12
# hrs_in_day = 24

# pd.concat(
#     [
#         h_count.sort_index(),
#         h_count.sort_index(),
#     ]
# ).rolling(
#     window_len
# ).sum(
# ).shift(
#     - window_len + 1
# ).iloc[
#     : hrs_in_day
# ].sort_values(
#     ascending=False
# )

**Above shows 12 (noon) is the best hour to start.**

# Get best starting time.

## (assume starting the same time everyday)

In [14]:
days_since_2015 = data.doy + (data.year - 2015) * 365

# Add 1 day for 2017 ones as 2016 has 366 days.
days_since_2015.loc[data.year == 2017] += 1

days_since_2015.rename('days_since_2015', inplace=True);

In [15]:
day_df = pd.concat([days_since_2015, hod], axis=1)

In [16]:
d_h_count = day_df.groupby(['days_since_2015', 'hod']).size()
d_h_count = d_h_count.to_frame(name='counts')

In [17]:
d_h_count.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,counts
days_since_2015,hod,Unnamed: 2_level_1
182,0,15064
182,1,8638
182,2,5210
182,3,3489
182,4,3382


In [18]:
shift_window = 12
rolling_sum = d_h_count['counts'].rolling(shift_window).sum().to_frame()
rolling_sum = rolling_sum.shift(periods=- shift_window + 1)

In [19]:
rolling_sum.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,counts
days_since_2015,hod,Unnamed: 2_level_1
182,0,153321.0
182,1,158377.0
182,2,169932.0
182,3,185039.0
182,4,201020.0


In [20]:
best_hour = []
# Note the [: -1]. As hour counts for the last day in the dataset is
# incomplete, therfore, the last day is discarded.
for i in rolling_sum.index.levels[0][: -1]:
    best_hour.append(rolling_sum.loc[i].idxmax().iloc[0])

In [21]:
from collections import Counter

In [22]:
best_hour = sorted(Counter(best_hour).items(), key=lambda x: x[1], reverse=True)

In [23]:
best_hour

[(11, 248),
 (12, 187),
 (14, 107),
 (10, 77),
 (13, 74),
 (8, 18),
 (15, 8),
 (9, 5),
 (0, 3),
 (16, 2),
 (23, 1)]

In [24]:
for i in best_hour:
    print(f'Starting time: {i[0]}. Number of times being best: {i[1]}')

Starting time: 11. Number of times being best: 248
Starting time: 12. Number of times being best: 187
Starting time: 14. Number of times being best: 107
Starting time: 10. Number of times being best: 77
Starting time: 13. Number of times being best: 74
Starting time: 8. Number of times being best: 18
Starting time: 15. Number of times being best: 8
Starting time: 9. Number of times being best: 5
Starting time: 0. Number of times being best: 3
Starting time: 16. Number of times being best: 2
Starting time: 23. Number of times being best: 1


# Find optimum starting time for each day of week

In [25]:
dow_h_df = pd.concat([days_since_2015, dow, hod], axis=1)

In [26]:
dow_h_count = dow_h_df.groupby(['days_since_2015', 'dow', 'hod']).size()

In [27]:
dow_h_count = dow_h_count.to_frame(name='counts')

In [28]:
with_dow_rolling_sum = dow_h_count['counts'].rolling(12).sum().to_frame()
with_dow_rolling_sum = with_dow_rolling_sum.shift(periods=-11)

In [29]:
def get_best_hour(rolling_sum):
    best_hour = []
    for i in set(rolling_sum.index.get_level_values(0)):
        best_hour.append(rolling_sum.loc[i].idxmax().iloc[0])
    return sorted(Counter(best_hour).items(), key=lambda x: x[1], reverse=True)

In [30]:
best_hour_dow = []
for i in set(with_dow_rolling_sum.index.levels[1]):
    # Get instances with i day of week.
    best_hour_dow.append([i, get_best_hour(with_dow_rolling_sum.xs(i, level='dow'))])

In [31]:
best_hour_dow

[[0, [(11, 82), (8, 12), (10, 6), (9, 2), (13, 1), (12, 1)]],
 [1, [(11, 81), (12, 14), (8, 4), (9, 3), (23, 1), (10, 1)]],
 [2, [(12, 54), (11, 49), (8, 2)]],
 [3, [(12, 93), (11, 6), (13, 5), (16, 1)]],
 [4, [(14, 64), (13, 27), (12, 8), (15, 4), (11, 1), (0, 1)]],
 [5,
  [(14, 43), (13, 40), (12, 12), (15, 4), (10, 2), (16, 1), (11, 1), (0, 1)]],
 [6, [(10, 68), (11, 28), (12, 6), (13, 1), (0, 1)]]]

In [32]:
for i in best_hour_dow:
    print(f'Day of week: {i[0]}. Starting hour: {i[1][0][0]}')

Day of week: 0. Starting hour: 11
Day of week: 1. Starting hour: 11
Day of week: 2. Starting hour: 12
Day of week: 3. Starting hour: 12
Day of week: 4. Starting hour: 14
Day of week: 5. Starting hour: 14
Day of week: 6. Starting hour: 10
