In [6]:
# NOTE(review): %pylab star-imports numpy and matplotlib names into the notebook
# namespace (see the "Populating the interactive namespace" message it prints) and
# is discouraged by IPython; nothing visible in this notebook appears to need
# those names -- confirm before swapping it for explicit imports.
%pylab inline
import pandas as pd
from pandas import Series, DataFrame

Populating the interactive namespace from numpy and matplotlib


In [1]:
# (1) Dates and times in general
# (2) Dates and times in Pandas -- loading data
# (3) Intervals / timedeltas -- creating and calculating
# (4) Resampling and aggregate measures

In [2]:
# http://files.lerner.co.il/data-science-exercise-files.zip

# Two types of data structures for dates and times

- timestamp, datetime object: reflects a specific point in time
    - your birthday
    - your anniversary
    - when a meeting is going to happen
- timedelta: reflects a span of time (a duration) with no fixed start point — measured in days/hours/seconds/etc.
    - how long is a meeting going to last?
    - how many seconds have you been alive?
    - how much longer must I sit through this class?

In [3]:
!ls -l taxi.csv

-rw-r--r-- 1 reuven staff 1566751 Feb 16  2016 taxi.csv


In [4]:
!head taxi.csv

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.954429626464844,40.764141082763672,1,N,-73.974754333496094,40.754093170166016,2,17,0,0.5,0,0,0.3,17.8
2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,.46,-73.971443176269531,40.758941650390625,1,N,-73.978538513183594,40.761909484863281,1,6.5,0,0.5,1,0,0.3,8.3
2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,.87,-73.978111267089844,40.738433837890625,1,N,-73.990272521972656,40.745437622070313,1,8,0,0.5,2.2,0,0.3,11
2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892333984375,40.773529052734375,1,N,-73.971527099609375,40.760330200195312,1,13.5,0,0.5,2.86,0,0.3,17.16
1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.40,-73.979087829589844,40.776771545410156,1

In [8]:
# First pass at loading the file: keep only the five columns used in this
# walkthrough and let pandas infer every dtype on its own.
df = pd.read_csv(
    'taxi.csv',
    usecols=[
        'tpep_pickup_datetime',
        'tpep_dropoff_datetime',
        'passenger_count',
        'trip_distance',
        'total_amount',
    ],
)

df.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount
0,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,17.8
1,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,8.3
2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,11.0
3,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,17.16
4,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,10.3


In [9]:
df.shape

(9999, 5)

In [10]:
df.dtypes

tpep_pickup_datetime      object
tpep_dropoff_datetime     object
passenger_count            int64
trip_distance            float64
total_amount             float64
dtype: object

In [11]:
# Second pass: same five columns, but ask read_csv to parse the pickup and
# dropoff columns as datetimes instead of leaving them as strings.
df = pd.read_csv(
    'taxi.csv',
    usecols=[
        'tpep_pickup_datetime',
        'tpep_dropoff_datetime',
        'passenger_count',
        'trip_distance',
        'total_amount',
    ],
    parse_dates=[
        'tpep_pickup_datetime',
        'tpep_dropoff_datetime',
    ],
)

df.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount
0,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,17.8
1,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,8.3
2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,11.0
3,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,17.16
4,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,10.3


In [12]:
df.dtypes

tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                   int64
trip_distance                   float64
total_amount                    float64
dtype: object

In [13]:
help(pd.read_csv)

Help on function read_csv in module pandas.io.parsers:

read_csv(filepath_or_buffer: Union[str, pathlib.Path, IO[~AnyStr]], sep=',', delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, skipfooter=0, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, cache_dates=True, iterator=False, chunksize=None, compression='infer', thousands=None, decimal: str = '.', lineterminator=None, quotechar='"', quoting=0, doublequote=True, escapechar=None, comment=None, encoding=None, dialect=None, error_bad_lines=True, warn_bad_lines=True, delim_whitespace=False, low_memory=True, memory_map=False, float_precision=None)
    Read a comma-separated values (csv) file in

# Exercise: Load taxi data

1. Load the data from `taxi.csv` into a data frame
2. Make sure (using `df.dtypes`) that the pickup and dropoff columns are indeed `datetime64` objects.

In [14]:
df.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount
0,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,17.8
1,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,8.3
2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,11.0
3,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,17.16
4,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,10.3


In [20]:
df['tpep_pickup_datetime'].dt.year

0       2015
1       2015
2       2015
3       2015
4       2015
        ... 
9994    2015
9995    2015
9996    2015
9997    2015
9998    2015
Name: tpep_pickup_datetime, Length: 9999, dtype: int64

# Exercise: Most popular travel hours

1. Using our datetime information, which hour was the most popular for taking taxis in New York from our sample data? 
2. What were all of the different hours for which we have data?  Can we then trust the assertion made in question 1?

In [23]:
# Tally the pickups by hour of day; value_counts() lists the busiest hour first.
(
    df['tpep_pickup_datetime']
    .dt.hour
    .value_counts()
)

11    4396
15    2536
0     2439
16     628
Name: tpep_pickup_datetime, dtype: int64

In [24]:
# timestamp + delta = timestamp
# timestamp - timestamp = delta

In [25]:
df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']

0      0 days 00:28:23
1      0 days 00:08:26
2      0 days 00:10:59
3      0 days 00:19:31
4      0 days 00:13:17
             ...      
9994   0 days 00:11:19
9995   0 days 00:15:17
9996   0 days 00:24:25
9997   0 days 00:06:08
9998   0 days 00:23:29
Length: 9999, dtype: timedelta64[ns]

In [26]:
# timestamp - timestamp = delta: store each ride's duration as a new column.
df['trip_time'] = df['tpep_dropoff_datetime'].sub(df['tpep_pickup_datetime'])

In [27]:
df.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount,trip_time
0,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,17.8,0 days 00:28:23
1,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,8.3,0 days 00:08:26
2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,11.0,0 days 00:10:59
3,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,17.16,0 days 00:19:31
4,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,10.3,0 days 00:13:17


In [30]:
# Rows whose duration is under ten minutes; the string is compared as a timedelta.
df.loc[df['trip_time'] < '00:10:00'].shape

(3758, 6)

# Exercise: Trip times

1. How many trips took longer than 1 hour?
2. Of the trips that took longer than 1 hour, how much did they cost, on average?
3. What was the average number of passengers in trips that took < 10 minutes?

In [36]:
# Exercise 1: keep only the rides that lasted longer than one hour.
df.loc[df['trip_time'] > '1 hour']

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount,trip_time
7,2015-06-02 11:19:35,2015-06-02 12:36:46,4,11.90,73.84,0 days 01:17:11
88,2015-06-02 11:20:37,2015-06-02 12:35:21,2,23.76,94.00,0 days 01:14:44
126,2015-06-02 11:21:03,2015-06-03 00:00:00,3,1.06,11.16,0 days 12:38:57
174,2015-06-02 11:21:42,2015-06-02 12:22:51,1,6.65,45.96,0 days 01:01:09
246,2015-06-02 11:19:46,2015-06-02 12:26:33,1,0.00,3.30,0 days 01:06:47
...,...,...,...,...,...,...
7907,2015-06-04 15:23:01,2015-06-04 16:30:03,1,26.60,96.84,0 days 01:07:02
8042,2015-06-06 16:53:20,2015-06-06 17:54:20,1,17.98,58.34,0 days 01:01:00
8171,2015-06-06 16:53:33,2015-06-06 18:01:48,1,18.01,58.34,0 days 01:08:15
8201,2015-06-06 16:53:56,2015-06-06 17:54:22,1,17.36,69.00,0 days 01:00:26


In [40]:
df.query('trip_time > "1 hour"')['total_amount']

7        73.84
88       94.00
126      11.16
174      45.96
246       3.30
         ...  
7907     96.84
8042     58.34
8171     58.34
8201     69.00
8513    160.05
Name: total_amount, Length: 170, dtype: float64

In [41]:
# Exercise 2: average total fare across the rides that lasted over an hour.
(
    df.query('trip_time > "1 hour"')
    .loc[:, 'total_amount']
    .mean()
)

62.57652941176472

In [42]:
df.query('trip_time > "1 hour"')['total_amount'].describe()

count    170.000000
mean      62.576529
std       15.843810
min        3.300000
25%       58.340000
50%       58.615000
75%       69.980000
max      160.050000
Name: total_amount, dtype: float64

In [45]:
# Exercise 3: average passenger count across the rides shorter than ten minutes.
(
    df.query('trip_time < "10 minutes"')
    .loc[:, 'passenger_count']
    .mean()
)

1.6375731772219266

In [46]:
df.query('trip_time > "1 hour"')['passenger_count'].mean()

1.6941176470588235

In [47]:
df.head(20)

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount,trip_time
0,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,17.8,0 days 00:28:23
1,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,8.3,0 days 00:08:26
2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,11.0,0 days 00:10:59
3,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,17.16,0 days 00:19:31
4,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,10.3,0 days 00:13:17
5,2015-06-02 11:19:33,2015-06-02 11:28:48,1,1.4,10.55,0 days 00:09:15
6,2015-06-02 11:19:34,2015-06-02 11:38:46,1,1.8,16.3,0 days 00:19:12
7,2015-06-02 11:19:35,2015-06-02 12:36:46,4,11.9,73.84,0 days 01:17:11
8,2015-06-02 11:19:36,2015-06-02 11:45:19,1,1.27,15.8,0 days 00:25:43
9,2015-06-02 11:19:38,2015-06-02 11:23:50,1,0.6,6.3,0 days 00:04:12


In [49]:
df[df['tpep_pickup_datetime'] == '2015-06-02 11:19:32']

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount,trip_time
4,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,10.3,0 days 00:13:17
231,2015-06-02 11:19:32,2015-06-02 11:41:50,1,1.68,18.36,0 days 00:22:18
232,2015-06-02 11:19:32,2015-06-02 11:36:04,6,4.2,17.3,0 days 00:16:32
459,2015-06-02 11:19:32,2015-06-02 11:27:28,1,1.1,7.8,0 days 00:07:56
460,2015-06-02 11:19:32,2015-06-02 11:38:08,3,2.1,16.55,0 days 00:18:36
1068,2015-06-02 11:19:32,2015-06-02 11:30:07,1,1.0,8.8,0 days 00:10:35


In [50]:
# Use a datetime column as the index into the data frame.
# set_index() moves the column out of the frame, so running this cell a second
# time would raise KeyError; the guard keeps the cell safe to re-run.

if df.index.name != 'tpep_pickup_datetime':
    df = df.set_index('tpep_pickup_datetime')
df.head()

Unnamed: 0_level_0,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount,trip_time
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,17.8,0 days 00:28:23
2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,8.3,0 days 00:08:26
2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,11.0,0 days 00:10:59
2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,17.16,0 days 00:19:31
2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,10.3,0 days 00:13:17


In [52]:
# (1) with a datetime index, I can retrieve rows by datetime

df.loc['2015-06-02 11:19:32']

Unnamed: 0_level_0,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount,trip_time
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,10.3,0 days 00:13:17
2015-06-02 11:19:32,2015-06-02 11:41:50,1,1.68,18.36,0 days 00:22:18
2015-06-02 11:19:32,2015-06-02 11:36:04,6,4.2,17.3,0 days 00:16:32
2015-06-02 11:19:32,2015-06-02 11:27:28,1,1.1,7.8,0 days 00:07:56
2015-06-02 11:19:32,2015-06-02 11:38:08,3,2.1,16.55,0 days 00:18:36
2015-06-02 11:19:32,2015-06-02 11:30:07,1,1.0,8.8,0 days 00:10:35


In [53]:
# (2) with a datetime index, I can retrieve fuzzy dates

df.loc['2015-06-02 11:19']  # no seconds == wildcard seconds


Unnamed: 0_level_0,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount,trip_time
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,17.80,0 days 00:28:23
2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,8.30,0 days 00:08:26
2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,11.00,0 days 00:10:59
2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,17.16,0 days 00:19:31
2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.40,10.30,0 days 00:13:17
...,...,...,...,...,...
2015-06-02 11:19:56,2015-06-02 11:24:18,1,0.60,7.80,0 days 00:04:22
2015-06-02 11:19:57,2015-06-02 11:26:37,1,0.90,8.16,0 days 00:06:40
2015-06-02 11:19:58,2015-06-02 11:27:59,1,0.90,9.80,0 days 00:08:01
2015-06-02 11:19:59,2015-06-02 11:39:51,1,2.01,16.80,0 days 00:19:52


In [54]:
df.loc['2015-06-02 11']  # no minutes == wildcard seconds + minutes


Unnamed: 0_level_0,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount,trip_time
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,17.80,0 days 00:28:23
2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,8.30,0 days 00:08:26
2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,11.00,0 days 00:10:59
2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,17.16,0 days 00:19:31
2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.40,10.30,0 days 00:13:17
...,...,...,...,...,...
2015-06-02 11:33:35,2015-06-02 11:42:16,1,0.73,7.80,0 days 00:08:41
2015-06-02 11:33:35,2015-06-02 11:38:53,1,0.96,8.50,0 days 00:05:18
2015-06-02 11:33:36,2015-06-02 11:52:32,2,3.50,18.80,0 days 00:18:56
2015-06-02 11:33:35,2015-06-02 11:55:02,5,2.14,14.80,0 days 00:21:27


In [56]:
# (3) Use a slice to retrieve several rows from the data frame, by timestamp.
# NOTE(review): the rows are still in file order rather than sorted by pickup time
# (the hour-level lookup just before this ends with 11:33:35, 11:33:36, 11:33:35).
# Newer pandas releases appear to reject label slices on a non-monotonic
# DatetimeIndex when a bound is not an exact index value -- confirm, and consider
# df.sort_index() before slicing.

df.loc['2015-06-02 11:19:30':'2015-06-02 11:20:15']

Unnamed: 0_level_0,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount,trip_time
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,8.30,0 days 00:08:26
2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,11.00,0 days 00:10:59
2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,17.16,0 days 00:19:31
2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.40,10.30,0 days 00:13:17
2015-06-02 11:19:33,2015-06-02 11:28:48,1,1.40,10.55,0 days 00:09:15
...,...,...,...,...,...
2015-06-02 11:20:12,2015-06-02 11:27:39,2,1.07,7.80,0 days 00:07:27
2015-06-02 11:20:12,2015-06-02 11:27:53,2,0.86,8.76,0 days 00:07:41
2015-06-02 11:20:14,2015-06-02 11:49:25,1,2.30,18.30,0 days 00:29:11
2015-06-02 11:20:14,2015-06-02 11:44:26,5,2.31,16.30,0 days 00:24:12


# Exercises: Datetime indexes

1. Set the index to be `tpep_pickup_datetime`.
2. Get all of the records which took place between 2015-06-02 11:25 and 11:27. How many rides were there, and how much did they pay, on average?
3. Find the average trip distance where the date is `2015-06-02`.


In [66]:
# Exercise 1: reload the data with the pickup timestamp as the index right away,
# rather than calling set_index() after the fact.
df = pd.read_csv(
    'taxi.csv',
    usecols=[
        'tpep_pickup_datetime',
        'tpep_dropoff_datetime',
        'passenger_count',
        'trip_distance',
        'total_amount',
    ],
    parse_dates=[
        'tpep_pickup_datetime',
        'tpep_dropoff_datetime',
    ],
    index_col='tpep_pickup_datetime',
)

df.head()

Unnamed: 0_level_0,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,17.8
2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,8.3
2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,11.0
2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,17.16
2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,10.3


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9999 entries, 2015-06-02 11:19:29 to 2015-06-01 00:13:04
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   tpep_dropoff_datetime  9999 non-null   datetime64[ns]
 1   passenger_count        9999 non-null   int64         
 2   trip_distance          9999 non-null   float64       
 3   total_amount           9999 non-null   float64       
dtypes: datetime64[ns](1), float64(2), int64(1)
memory usage: 390.6 KB


In [73]:
df.loc['2015-06-02 11:25':'2015-06-02 11:27']

Unnamed: 0_level_0,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-06-02 11:25:00,2015-06-02 11:32:39,1,1.00,7.80
2015-06-02 11:25:01,2015-06-02 11:26:49,1,0.60,4.80
2015-06-02 11:25:01,2015-06-02 11:29:56,5,0.43,6.80
2015-06-02 11:25:03,2015-06-02 11:44:57,1,3.00,19.75
2015-06-02 11:25:03,2015-06-02 11:50:38,1,5.80,28.56
...,...,...,...,...
2015-06-02 11:27:57,2015-06-02 11:30:41,1,0.60,10.30
2015-06-02 11:27:58,2015-06-02 11:37:44,1,2.27,11.30
2015-06-02 11:27:59,2015-06-02 12:23:22,1,11.40,59.79
2015-06-02 11:27:59,2015-06-02 11:55:14,1,1.50,20.15


In [74]:
df.loc['2015-06-02 11:25':'2015-06-02 11:27'].shape

(953, 4)

In [76]:
# Average fare for the rides picked up in the 11:25-11:27 window; rows and
# column are selected in a single .loc call.
df.loc['2015-06-02 11:25':'2015-06-02 11:27', 'total_amount'].mean()

16.93557187827912

In [78]:
# Exercise 3: average trip distance over every pickup dated 2015-06-02.
df.loc['2015-06-02', 'trip_distance'].mean()

2.6611988171064604

In [79]:
df.head(20)

Unnamed: 0_level_0,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,17.8
2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,8.3
2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,11.0
2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,17.16
2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,10.3
2015-06-02 11:19:33,2015-06-02 11:28:48,1,1.4,10.55
2015-06-02 11:19:34,2015-06-02 11:38:46,1,1.8,16.3
2015-06-02 11:19:35,2015-06-02 12:36:46,4,11.9,73.84
2015-06-02 11:19:36,2015-06-02 11:45:19,1,1.27,15.8
2015-06-02 11:19:38,2015-06-02 11:23:50,1,0.6,6.3


In [80]:
df.resample('1H')

<pandas.core.resample.DatetimeIndexResampler object at 0x1243aea60>

In [81]:
# Bucket the rides into one-hour bins by pickup time and average the distance
# in each bin; hours with no rides come back as NaN.
df['trip_distance'].resample('1H').mean()

tpep_pickup_datetime
2015-06-01 00:00:00    4.312809
2015-06-01 01:00:00         NaN
2015-06-01 02:00:00         NaN
2015-06-01 03:00:00         NaN
2015-06-01 04:00:00         NaN
                         ...   
2015-06-06 12:00:00         NaN
2015-06-06 13:00:00         NaN
2015-06-06 14:00:00         NaN
2015-06-06 15:00:00         NaN
2015-06-06 16:00:00    2.852166
Freq: H, Name: trip_distance, Length: 137, dtype: float64

In [82]:
# Same hourly averages, keeping only the hours that actually contain rides.
(
    df['trip_distance']
    .resample('1H')
    .mean()
    .dropna()
)

tpep_pickup_datetime
2015-06-01 00:00:00    4.312809
2015-06-02 11:00:00    2.661199
2015-06-04 15:00:00    2.986285
2015-06-06 16:00:00    2.852166
Name: trip_distance, dtype: float64

In [83]:
# Scratch cell cleared: it contained only keyboard-mash text, which stopped
# Restart & Run All with a SyntaxError.