# 10. Dates and times

## Exercise 39 - Short, medium, and long taxi rides

In [519]:
# read in CSV, specify columns, parse date columns
july_19 = pd.read_csv(root_path + 'nyc_taxi_2019-07.csv',
                     usecols=['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'total_amount'],
                     parse_dates = ['tpep_pickup_datetime', 'tpep_dropoff_datetime',])

july_19.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount
0,2019-07-01 00:51:04,2019-07-01 00:51:33,1.0,0.0,4.94
1,2019-07-01 00:46:04,2019-07-01 01:05:46,1.0,4.16,20.3
2,2019-07-01 00:25:09,2019-07-01 01:00:56,1.0,18.8,70.67
3,2019-07-01 00:33:32,2019-07-01 01:15:27,1.0,18.46,66.36
4,2019-07-01 00:00:55,2019-07-01 00:13:05,0.0,1.7,15.3


In [520]:
# create "trip_time", amount of time each taxi ride took as timedelta
july_19['trip_time'] = july_19['tpep_dropoff_datetime'] - july_19['tpep_pickup_datetime']

july_19.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount,trip_time
0,2019-07-01 00:51:04,2019-07-01 00:51:33,1.0,0.0,4.94,0 days 00:00:29
1,2019-07-01 00:46:04,2019-07-01 01:05:46,1.0,4.16,20.3,0 days 00:19:42
2,2019-07-01 00:25:09,2019-07-01 01:00:56,1.0,18.8,70.67,0 days 00:35:47
3,2019-07-01 00:33:32,2019-07-01 01:15:27,1.0,18.46,66.36,0 days 00:41:55
4,2019-07-01 00:00:55,2019-07-01 00:13:05,0.0,1.7,15.3,0 days 00:12:10


In [521]:
# number and % of rides taking less than 1 minute
july_19.loc[july_19['trip_time'] < '1 minute', 'trip_time'].count()

70212

In [522]:
# % less than 1 minute
july_19.loc[july_19['trip_time'] < '1 minute', 'trip_time'].count() * 100 / len(july_19)

1.1126361022936828

In [523]:
# average fare for such short trips
july_19.loc[july_19['trip_time'] <= '1 minute', 'total_amount'].mean()

30.245995073198202

In [524]:
# number and percentage of rides taking more than 10 hours
july_19.loc[july_19['trip_time'] >= '10 hours', 'trip_time'].count()

16698

In [525]:
# % more than 10 hours
july_19.loc[july_19['trip_time'] >= '10 hours', 'trip_time'].count() * 100 / len(july_19)

0.2646100045020782

In [526]:
# create new column, "trip_time_group" with categories
# short (< 10 minutes)
# medium (>= 10 minutes and 1 hour)
# long (> 1 hours)

short_seconds = 10 * 60
medium_seconds = 60 * 60

def trip_category(s):
    if s.seconds < short_seconds:
        return 'short'
    elif s.seconds >= short_seconds and s.seconds < medium_seconds:
        return 'medium'
    else:
        return 'long'

july_19['trip_time_group'] = july_19['trip_time'].apply(trip_category)
july_19.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount,trip_time,trip_time_group
0,2019-07-01 00:51:04,2019-07-01 00:51:33,1.0,0.0,4.94,0 days 00:00:29,short
1,2019-07-01 00:46:04,2019-07-01 01:05:46,1.0,4.16,20.3,0 days 00:19:42,medium
2,2019-07-01 00:25:09,2019-07-01 01:00:56,1.0,18.8,70.67,0 days 00:35:47,medium
3,2019-07-01 00:33:32,2019-07-01 01:15:27,1.0,18.46,66.36,0 days 00:41:55,medium
4,2019-07-01 00:00:55,2019-07-01 00:13:05,0.0,1.7,15.3,0 days 00:12:10,medium


In [527]:
# proportion of rides in each group
july_19['trip_time_group'].value_counts(normalize=True)

trip_time_group
medium   0.55
short    0.43
long     0.01
Name: proportion, dtype: float64

In [528]:
# average num passengers for each group
july_19.groupby('trip_time_group')['passenger_count'].mean()

trip_time_group
long     1.70
medium   1.59
short    1.55
Name: passenger_count, dtype: float64

### Exercise 39b

In [530]:
# how many trips are not from july 2019?
july_19.loc[(july_19['tpep_pickup_datetime'] < '2019-07-01') | (july_19['tpep_pickup_datetime'] > '2019-07-31 23:59'), :]

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount,trip_time,trip_time_group
184,2019-06-30 14:54:49,2019-06-30 15:04:50,1.00,1.71,13.30,0 days 00:10:01,medium
185,2019-06-30 15:19:34,2019-06-30 15:37:32,1.00,7.05,26.30,0 days 00:17:58,medium
206,2019-06-30 23:41:12,2019-06-30 23:48:54,1.00,1.00,10.30,0 days 00:07:42,short
274,2019-06-30 23:52:06,2019-07-01 00:26:02,1.00,10.83,39.90,0 days 00:33:56,medium
421,2019-06-30 23:56:48,2019-07-01 00:03:34,1.00,1.57,10.30,0 days 00:06:46,short
...,...,...,...,...,...,...,...
6276049,2019-07-31 23:59:22,2019-08-01 00:04:29,4.00,0.82,9.30,0 days 00:05:07,short
6276069,2019-08-01 00:04:50,2019-08-01 00:13:48,1.00,2.46,12.80,0 days 00:08:58,short
6276128,2019-07-31 23:59:22,2019-08-01 00:05:27,1.00,0.80,9.80,0 days 00:06:05,short
6276258,2019-07-31 23:59:05,2019-08-01 00:07:34,1.00,1.90,14.75,0 days 00:08:29,short


In [531]:
# mean trip time for each number of passengers
july_19.groupby('passenger_count')['trip_time'].mean()

passenger_count
0.00   0 days 00:14:18.929810752
1.00   0 days 00:17:46.148103924
2.00   0 days 00:18:34.024342704
3.00   0 days 00:19:02.079604271
4.00   0 days 00:20:10.057290100
5.00   0 days 00:22:29.870464324
6.00   0 days 00:20:54.109564300
7.00   0 days 00:16:38.206896551
8.00      0 days 00:11:00.500000
9.00      0 days 00:49:16.125000
Name: trip_time, dtype: timedelta64[ns]

In [532]:
# load taxi data from july 2019 and 2020, for each year, and then for each num passengers, what was mean amount paid?
july_19 = pd.read_csv(root_path + 'nyc_taxi_2019-07.csv',
                     usecols=['passenger_count','total_amount'])

july_19['year'] = 2019

july_20 = pd.read_csv(root_path + 'nyc_taxi_2020-07.csv',
                     usecols=['passenger_count','total_amount'])

july_20['year'] = 2020

comparison_df = pd.concat([july_19, july_20])

comparison_df.groupby(['year', 'passenger_count'])['total_amount'].mean()

year  passenger_count
2019  0.00              18.98
      1.00              19.28
      2.00              20.10
      3.00              20.21
      4.00              21.06
      5.00              19.42
      6.00              19.39
      7.00              70.08
      8.00              74.76
      9.00              93.51
2020  0.00              16.54
      1.00              16.86
      2.00              17.19
      3.00              17.10
      4.00              17.96
      5.00              16.73
      6.00              16.81
      7.00              22.46
      8.00              10.30
      9.00              11.76
Name: total_amount, dtype: float64

## Exercise 40 - Writing dates, reading dates

In [534]:
# load in CSV, specify columns, parse dates
july_19 = pd.read_csv(root_path + 'nyc_taxi_2019-07.csv',
                      usecols=['tpep_pickup_datetime', 'passenger_count', 'trip_distance', 'total_amount'],
                      parse_dates=['tpep_pickup_datetime'])
july_19.head()

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,total_amount
0,2019-07-01 00:51:04,1.0,0.0,4.94
1,2019-07-01 00:46:04,1.0,4.16,20.3
2,2019-07-01 00:25:09,1.0,18.8,70.67
3,2019-07-01 00:33:32,1.0,18.46,66.36
4,2019-07-01 00:00:55,0.0,1.7,15.3


In [535]:
# convert to CSV, specify columns, tab-separator, and specified date format
july_19.to_csv('new_taxi.csv',
               columns=['tpep_pickup_datetime', 'passenger_count', 'trip_distance', 'total_amount'],
               sep='\t',
               date_format='%d/%m/%Y %Hh:%Mm:%Ss')

In [536]:
# read back in the CSV with an odd date format
weird_dates = pd.read_csv('new_taxi.csv',
                          sep='\t',
                          usecols=['tpep_pickup_datetime', 'passenger_count', 'trip_distance', 'total_amount'],
                          parse_dates=['tpep_pickup_datetime'],
                          date_format = '%d/%m/%Y %Hh:%Mm:%Ss')
weird_dates.head()

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,total_amount
0,2019-07-01 00:51:04,1.0,0.0,4.94
1,2019-07-01 00:46:04,1.0,4.16,20.3
2,2019-07-01 00:25:09,1.0,18.8,70.67
3,2019-07-01 00:33:32,1.0,18.46,66.36
4,2019-07-01 00:00:55,0.0,1.7,15.3


In [537]:
# display data types
weird_dates.dtypes

tpep_pickup_datetime    datetime64[ns]
passenger_count                float64
trip_distance                  float64
total_amount                   float64
dtype: object

### Exercise 40b

In [539]:
# export "tpep_pickup_datetime" as Unix time

july_19['unix_time'] = (july_19['tpep_pickup_datetime'] - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')

july_19['unix_time']

0          1561942264
1          1561941964
2          1561940709
3          1561941212
4          1561939255
              ...    
6310414    1564418093
6310415    1564416477
6310416    1564416091
6310417    1564419480
6310418    1564416960
Name: unix_time, Length: 6310419, dtype: int64

In [540]:
# convert to CSV
july_19.to_csv('unix_taxi.csv',
               columns=['unix_time'])

In [541]:
# read back in CSV
unix_df = pd.read_csv('unix_taxi.csv',
                      usecols=['unix_time'])
unix_df

Unnamed: 0,unix_time
0,1561942264
1,1561941964
2,1561940709
3,1561941212
4,1561939255
...,...
6310414,1564418093
6310415,1564416477
6310416,1564416091
6310417,1564419480


In [542]:
# convert back to datetime column
pd.to_datetime(unix_df['unix_time'], unit='s', origin='unix')

0         2019-07-01 00:51:04
1         2019-07-01 00:46:04
2         2019-07-01 00:25:09
3         2019-07-01 00:33:32
4         2019-07-01 00:00:55
                  ...        
6310414   2019-07-29 16:34:53
6310415   2019-07-29 16:07:57
6310416   2019-07-29 16:01:31
6310417   2019-07-29 16:58:00
6310418   2019-07-29 16:16:00
Name: unix_time, Length: 6310419, dtype: datetime64[ns]

In [543]:
# how long to read in read_csv vs in separate to_datetime step?

In [544]:
# it appears read_csv cannot read unix and author does not have a solution either

"""
t0 = time.time()

unix_df = pd.read_csv('unix_taxi.csv',
                      usecols=['unix_time'],
                      parse_dates=['unix_time'])

t1 = time.time()
total_n = t1-t0
total_n
"""

"\nt0 = time.time()\n\nunix_df = pd.read_csv('unix_taxi.csv',\n                      usecols=['unix_time'],\n                      parse_dates=['unix_time'])\n\nt1 = time.time()\ntotal_n = t1-t0\ntotal_n\n"

## Exercise 41 - Oil prices

In [546]:
# read in CSV, parse dates, set 'Date' as index
oil = pd.read_csv(root_path + 'wti-daily.csv',
                  parse_dates=['Date'],
                  index_col=['Date'])
oil.head()

Unnamed: 0_level_0,Price
Date,Unnamed: 1_level_1
1986-01-02,25.56
1986-01-03,26.0
1986-01-06,26.53
1986-01-07,25.85
1986-01-08,25.87


In [547]:
# average price of barrel of oil in June 1992?
oil.loc['1992-06'].mean()

Price   22.38
dtype: float64

In [548]:
# average price of barrel of oil in all of 1987?
oil.loc['1987'].mean()

Price   19.20
dtype: float64

In [549]:
# average price from September 2003 through July 2014
oil.loc['2003-09':'2014-07'].mean()

Price   76.46
dtype: float64

In [550]:
# price of oil at end of each quarter in data set
oil.loc[oil.index.is_quarter_end]

Unnamed: 0_level_0,Price
Date,Unnamed: 1_level_1
1986-03-31,10.25
1986-06-30,12.80
1986-09-30,14.70
1986-12-31,17.93
1987-03-31,18.82
...,...
2020-09-30,40.05
2020-12-31,48.35
2021-03-31,59.19
2021-06-30,73.52


In [551]:
# for each year, show average price
oil.resample('A').mean()

Unnamed: 0_level_0,Price
Date,Unnamed: 1_level_1
1986-12-31,15.05
1987-12-31,19.2
1988-12-31,15.97
1989-12-31,19.64
1990-12-31,24.53
1991-12-31,21.54
1992-12-31,20.58
1993-12-31,18.43
1994-12-31,17.2
1995-12-31,18.43


In [552]:
# on which date were oil prices the highest?
oil.loc[oil['Price'] == oil['Price'].max()]

Unnamed: 0_level_0,Price
Date,Unnamed: 1_level_1
2008-07-03,145.31


In [553]:
# sort values by 'Price', top 5 highest prices
oil.sort_values('Price', ascending=False).head()

Unnamed: 0_level_0,Price
Date,Unnamed: 1_level_1
2008-07-03,145.31
2008-07-14,145.16
2008-07-11,144.96
2008-07-02,143.74
2008-07-10,141.47


In [554]:
# on which date were oil prices the lowest?
oil.loc[oil['Price'] == oil['Price'].min()]

Unnamed: 0_level_0,Price
Date,Unnamed: 1_level_1
2020-04-20,-36.98


In [555]:
# sort by 'Price' ascending, 5 lowest prices
oil.sort_values('Price', ascending=True).head()

Unnamed: 0_level_0,Price
Date,Unnamed: 1_level_1
2020-04-20,-36.98
2020-04-21,8.91
1986-03-31,10.25
1998-12-10,10.82
1986-07-25,10.83


### Exercise 41b

In [557]:
# use resample to find, for each quarter, mean and SD in price
oil.resample('Q').agg(['mean','std'])

Unnamed: 0_level_0,Price,Price
Unnamed: 0_level_1,mean,std
Date,Unnamed: 1_level_2,Unnamed: 2_level_2
1986-03-31,17.22,4.86
1986-06-30,13.87,1.35
1986-09-30,13.81,1.81
1986-12-31,15.41,0.82
1987-03-31,18.25,0.66
...,...,...
2020-12-31,42.52,3.84
2021-03-31,58.09,4.93
2021-06-30,66.19,4.40
2021-09-30,70.58,3.08


In [558]:
# in which quarter was biggest increase in mean price from previous quarter?
oil.resample('Q').mean().diff().sort_values('Price', ascending=False)

Unnamed: 0_level_0,Price
Date,Unnamed: 1_level_1
2008-06-30,25.92
2009-06-30,16.47
2021-03-31,15.57
2007-12-31,15.36
2020-09-30,12.93
...,...
2020-06-30,-17.37
2014-12-31,-24.66
2015-03-31,-24.73
2008-12-31,-59.61


In [559]:
# what was the biggest percentage increase in oil prices across quarters?
oil.resample('Q').mean().pct_change().sort_values('Price', ascending=False)

Unnamed: 0_level_0,Price
Date,Unnamed: 1_level_1
1990-09-30,0.48
2020-09-30,0.46
2009-06-30,0.38
2021-03-31,0.37
2016-06-30,0.36
...,...
1991-03-31,-0.32
2015-03-31,-0.34
2020-06-30,-0.38
2008-12-31,-0.50


## Exercise 42 - Best tippers

In [561]:
# read in CSV, specify columns, parse dates
jan_19 = pd.read_csv(root_path + 'nyc_taxi_2019-01.csv',
                     usecols=['tpep_pickup_datetime',
                              'passenger_count',
                              'trip_distance',
                              'fare_amount',
                              'extra',
                              'mta_tax',
                              'tip_amount',
                              'tolls_amount',
                              'improvement_surcharge',
                              'total_amount',
                              'congestion_surcharge'],
                     parse_dates=['tpep_pickup_datetime'])

jan_19.head()

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,2019-01-01 00:46:40,1,1.5,7.0,0.5,0.5,1.65,0.0,0.3,9.95,
1,2019-01-01 00:59:47,1,2.6,14.0,0.5,0.5,1.0,0.0,0.3,16.3,
2,2018-12-21 13:48:30,3,0.0,4.5,0.5,0.5,0.0,0.0,0.3,5.8,
3,2018-11-28 15:52:25,5,0.0,3.5,0.5,0.5,0.0,0.0,0.3,7.55,
4,2018-11-28 15:56:57,5,0.0,52.0,0.0,0.5,0.0,0.0,0.3,55.55,


In [562]:
# ibid for july instead
jul_19 = pd.read_csv(root_path + 'nyc_taxi_2019-07.csv',
                     usecols=['tpep_pickup_datetime',
                              'passenger_count',
                              'trip_distance',
                              'fare_amount',
                              'extra',
                              'mta_tax',
                              'tip_amount',
                              'tolls_amount',
                              'improvement_surcharge',
                              'total_amount',
                              'congestion_surcharge'],
                     parse_dates=['tpep_pickup_datetime'])

jul_19.head()

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,2019-07-01 00:51:04,1.0,0.0,2.5,0.5,0.5,1.14,0.0,0.3,4.94,0.0
1,2019-07-01 00:46:04,1.0,4.16,16.5,0.5,0.5,0.0,0.0,0.3,20.3,2.5
2,2019-07-01 00:25:09,1.0,18.8,52.0,0.0,0.5,11.75,6.12,0.3,70.67,0.0
3,2019-07-01 00:33:32,1.0,18.46,52.0,0.0,0.5,11.06,0.0,0.3,66.36,2.5
4,2019-07-01 00:00:55,0.0,1.7,9.5,3.0,0.5,2.0,0.0,0.3,15.3,2.5


In [563]:
# combine dataframes
taxi_tips = pd.concat([jan_19, jul_19])

In [564]:
# fill NA with 0
taxi_tips = taxi_tips.fillna('0.00')
taxi_tips.dtypes

tpep_pickup_datetime     datetime64[ns]
passenger_count                  object
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge             object
dtype: object

In [565]:
# change dtypes to float
taxi_tips['passenger_count'] = taxi_tips['passenger_count'].astype(float)
taxi_tips['congestion_surcharge'] = taxi_tips['congestion_surcharge'].astype(float)
taxi_tips.dtypes

tpep_pickup_datetime     datetime64[ns]
passenger_count                 float64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
dtype: object

In [566]:
# create column, 'pre_tip_amount' with all payments except total_amount and tip_amount
taxi_tips['pre_tip_amount'] = taxi_tips['fare_amount'] + taxi_tips['extra'] + taxi_tips['mta_tax'] + taxi_tips['tolls_amount'] + taxi_tips['improvement_surcharge'] + taxi_tips['congestion_surcharge']

taxi_tips['pre_tip_amount']

0          8.30
1         15.30
2          5.80
3          4.80
4         52.80
           ... 
6310414   29.00
6310415   54.43
6310416   65.40
6310417   43.00
6310418   22.15
Name: pre_tip_amount, Length: 13978211, dtype: float64

In [567]:
# (total amount - tip amount) should match
taxi_tips['total_amount'] - taxi_tips['tip_amount']

0          8.30
1         15.30
2          5.80
3          7.55
4         55.55
           ... 
6310414   29.00
6310415   54.43
6310416   65.40
6310417   43.00
6310418   22.15
Length: 13978211, dtype: float64

In [568]:
# create new column, 'tip_percentage', showing percentage of pre_tip_amount that the tip was
taxi_tips['tip_percentage'] = taxi_tips['tip_amount'] / taxi_tips['pre_tip_amount']
taxi_tips.head()

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,pre_tip_amount,tip_percentage
0,2019-01-01 00:46:40,1.0,1.5,7.0,0.5,0.5,1.65,0.0,0.3,9.95,0.0,8.3,0.2
1,2019-01-01 00:59:47,1.0,2.6,14.0,0.5,0.5,1.0,0.0,0.3,16.3,0.0,15.3,0.07
2,2018-12-21 13:48:30,3.0,0.0,4.5,0.5,0.5,0.0,0.0,0.3,5.8,0.0,5.8,0.0
3,2018-11-28 15:52:25,5.0,0.0,3.5,0.5,0.5,0.0,0.0,0.3,7.55,0.0,4.8,0.0
4,2018-11-28 15:56:57,5.0,0.0,52.0,0.0,0.5,0.0,0.0,0.3,55.55,0.0,52.8,0.0


In [569]:
# mean tip percentage across all tips in data set?
taxi_tips['tip_percentage'].mean()

0.13003974566357937

In [570]:
# how many times did people tip more than the pretip amount?
len(taxi_tips.loc[taxi_tips['tip_amount'] > taxi_tips['pre_tip_amount'], 'tip_amount'])

28232

In [571]:
# this is correct, as sometimes "pre_tip_amount" was negative and "tip_amount" was 0
(taxi_tips['tip_percentage'] > 1).value_counts()

tip_percentage
False    13970379
True         7832
Name: count, dtype: int64

In [572]:
# which day of the week do people tip the greatest percentage of fare, on average?

# set index
taxi_tips = taxi_tips.set_index('tpep_pickup_datetime')

In [573]:
# reset index in place
taxi_tips.reset_index(inplace=True)

In [574]:
# perform groupby and calculations
taxi_tips.groupby(taxi_tips['tpep_pickup_datetime'].dt.weekday)['tip_percentage'].mean().sort_values(ascending=False)

tpep_pickup_datetime
3   0.13
2   0.13
1   0.13
4   0.13
0   0.13
6   0.13
5   0.13
Name: tip_percentage, dtype: float64

In [575]:
# 3 = Thursday

In [576]:
# at which hour do people tip the greatest percentage?
taxi_tips.groupby(taxi_tips['tpep_pickup_datetime'].dt.hour)['tip_percentage'].mean().sort_values(ascending=False)

tpep_pickup_datetime
22   0.14
20   0.14
21   0.14
8    0.14
19   0.14
23   0.13
18   0.13
9    0.13
7    0.13
0    0.13
2    0.13
1    0.13
17   0.13
10   0.13
11   0.13
16   0.12
13   0.12
12   0.12
14   0.12
15   0.12
3    0.12
6    0.12
4    0.12
5    0.11
Name: tip_percentage, dtype: float64

In [577]:
# set index
taxi_tips = taxi_tips.set_index('tpep_pickup_datetime')

In [578]:
# reset index in place
taxi_tips.reset_index(inplace=True)

In [579]:
# do people typically tip more in January or July?
jan_mean = taxi_tips.loc[taxi_tips['tpep_pickup_datetime'].dt.month == 1, 'tip_percentage'].mean()

# january mean
jan_mean

0.13701104824387902

In [580]:
jul_mean = taxi_tips.loc[taxi_tips['tpep_pickup_datetime'].dt.month == 7, 'tip_percentage'].mean()

# july mean
jul_mean

0.12157035841573742

In [581]:
# can also group by month to get stats of all
taxi_tips.groupby(taxi_tips['tpep_pickup_datetime'].dt.month)['tip_percentage'].mean().sort_values(ascending=False)

tpep_pickup_datetime
5    0.20
8    0.16
3    0.15
9    0.14
1    0.14
2    0.13
7    0.12
12   0.11
6    0.11
10   0.10
4    0.07
11   0.05
Name: tip_percentage, dtype: float64

In [582]:
# set index
taxi_tips = taxi_tips.set_index('tpep_pickup_datetime')

In [583]:
# what was 1-day period when people tipped the greatest percentage on average?
taxi_tips.resample('D')['tip_percentage'].mean().sort_values(ascending=False).head()

tpep_pickup_datetime
2019-02-13   0.36
2019-02-25   0.25
2019-08-20   0.24
2019-11-27   0.20
2019-08-15   0.20
Name: tip_percentage, dtype: float64

### Exercise 42b

In [585]:
# 32% of riders don't tip. of those who do, what % do they tip, on average
taxi_tips.loc[taxi_tips['tip_amount'] > 0, 'tip_percentage'].mean()

0.19146519965282618

In [586]:
# how many rides in the data set are from outside the correct dates?
len(taxi_tips) - len(taxi_tips.loc['2019-01']) - len(taxi_tips.loc['2019-07'])

816

In [587]:
# looking at dates in jan and july, what week did passengers tip the greatest percentage?
jan_correct = taxi_tips.loc['2019-01']
jul_correct = taxi_tips.loc['2019-07']

taxi_correct = pd.concat([jan_correct, jul_correct])
taxi_correct.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 13977395 entries, 2019-01-01 00:46:40 to 2019-07-29 16:16:00
Data columns (total 12 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   passenger_count        float64
 1   trip_distance          float64
 2   fare_amount            float64
 3   extra                  float64
 4   mta_tax                float64
 5   tip_amount             float64
 6   tolls_amount           float64
 7   improvement_surcharge  float64
 8   total_amount           float64
 9   congestion_surcharge   float64
 10  pre_tip_amount         float64
 11  tip_percentage         float64
dtypes: float64(12)
memory usage: 1.4 GB


In [588]:
taxi_correct.resample('1W')['tip_percentage'].mean().sort_values(ascending=False).head()

tpep_pickup_datetime
2019-02-03   0.14
2019-01-27   0.14
2019-01-20   0.14
2019-01-13   0.14
2019-01-06   0.13
Name: tip_percentage, dtype: float64

## Chapter 10 notes
- Setting Datetime index makes searching much easier, permits use of df.resample('1D').mean() and useful attributes such as df.index.is_quarter_end
- NaN values can cause some errors in even basic addition, returning NaN as output
- Remember dt accessor when extracting date parts from datetime, df['column'].dt.weekday