In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Read the July 2019 NYC taxi trips into a data frame, keeping only the
# `tpep_pickup_datetime`, `tpep_dropoff_datetime`, `passenger_count`,
# `trip_distance`, and `total_amount` columns. The two timestamp columns
# are listed in `parse_dates` so they load as `datetime` columns.

filename = '../data/nyc_taxi_2019-07.csv'

df = pd.read_csv(
    filename,
    usecols=[
        'tpep_pickup_datetime',
        'tpep_dropoff_datetime',
        'trip_distance',
        'passenger_count',
        'total_amount',
    ],
    parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
)

# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount
0,2019-07-01 00:51:04,2019-07-01 00:51:33,1.0,0.0,4.94
1,2019-07-01 00:46:04,2019-07-01 01:05:46,1.0,4.16,20.3
2,2019-07-01 00:25:09,2019-07-01 01:00:56,1.0,18.8,70.67
3,2019-07-01 00:33:32,2019-07-01 01:15:27,1.0,18.46,66.36
4,2019-07-01 00:00:55,2019-07-01 00:13:05,0.0,1.7,15.3


In [3]:
# Add a `trip_time` column holding each ride's duration:
# dropoff timestamp minus pickup timestamp.
df['trip_time'] = df['tpep_dropoff_datetime'].sub(df['tpep_pickup_datetime'])
df['trip_time'].head()

0   0 days 00:00:29
1   0 days 00:19:42
2   0 days 00:35:47
3   0 days 00:41:55
4   0 days 00:12:10
Name: trip_time, dtype: timedelta64[ns]

In [4]:
# How many rides lasted less than 1 minute?
# Select the `trip_time` values whose duration is below the threshold
# in a single `.loc` lookup, then count them.
df.loc[df['trip_time'] < '1 minute', 'trip_time'].count()

70212

In [5]:
# What share of rides (as a percentage) lasted less than 1 minute?
# Numerator: rides under the threshold; denominator: all non-null `trip_time` values.
(
    df.loc[df['trip_time'] < '1 minute', 'trip_time'].count()
    / df['trip_time'].count()
    * 100
)

1.1126361022936828

In [17]:
# What was the mean `total_amount` paid on rides that lasted less than 1 minute?
df.loc[df['trip_time'] < '1 minute', 'total_amount'].mean()

30.397584031219733

In [6]:
# How many rides lasted more than 10 hours?
# Select the `trip_time` values above the threshold in a single `.loc`
# lookup, then count them.
df.loc[df['trip_time'] > '10 hours', 'trip_time'].count()

16698

In [7]:
# What share of rides (as a percentage) lasted more than 10 hours?
# Numerator: rides over the threshold; denominator: all non-null `trip_time` values.
(
    df.loc[df['trip_time'] > '10 hours', 'trip_time'].count()
    / df['trip_time'].count()
    * 100
)

0.2646100045020782

In [8]:
# Add a `trip_time_group` column that buckets each ride by its duration:
#   `short`  -> more than 0 seconds, up to and including 10 minutes
#   `medium` -> more than 10 minutes, up to and including 1 hour
#   `long`   -> more than 1 hour, up to and including 100 hours
# `pd.cut` is called with its default right-closed bins, so each bucket
# includes its upper edge but not its lower edge. Durations of exactly
# 0 seconds, negative durations, and durations above 100 hours fall outside
# every bin and get NaN.
# NOTE(review): the exercise text describes `short` as < 10 minutes and
# `medium` as >= 10 minutes; with the current call a ride of exactly
# 10 minutes is `short`. Confirm the intended boundaries (`right=False`
# would flip the closed side).

df['trip_time_group'] = pd.cut(
    df['trip_time'],
    bins=[pd.Timedelta(edge)
          for edge in ('0 seconds', '10 minutes', '1 hour', '100 hours')],
    labels=['short', 'medium', 'long'],
)

In [9]:
# What was the average passenger count in each trip-time group?
# NOTE(review): this cell was originally headed "What proportion of rides
# were in each group?", but the expression averages `passenger_count` per
# group. The proportion of rides per group would be
# `df['trip_time_group'].value_counts(normalize=True)`. Confirm which
# question is intended.
df['passenger_count'].groupby(df['trip_time_group']).mean()

trip_time_group
short     1.552411
medium    1.585806
long      1.700859
Name: passenger_count, dtype: float64