In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
filename = '../data/nyc_taxi_2019-07.csv'

df = pd.read_csv(filename,
                usecols=['tpep_pickup_datetime', 'trip_distance', 'passenger_count', 'total_amount'],
                 parse_dates=['tpep_pickup_datetime'])

df.head()

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,total_amount
0,2019-07-01 00:51:04,1.0,0.0,4.94
1,2019-07-01 00:46:04,1.0,4.16,20.3
2,2019-07-01 00:25:09,1.0,18.8,70.67
3,2019-07-01 00:33:32,1.0,18.46,66.36
4,2019-07-01 00:00:55,0.0,1.7,15.3


# Beyond 1

Export the `tpep_pickup_datetime` date in Unix time -- i.e., the number of seconds since 1 January 1970. This will be an integer value.

In [3]:
df['tpep_pickup_datetime'] = df['tpep_pickup_datetime'].view(np.int64) / 10**9

df.to_csv('ex40b1_taxi_07_2019.csv',
         sep='\t',
         columns=['tpep_pickup_datetime', 'passenger_count', 'trip_distance', 'total_amount'])

# Beyond 2

Read the data frame from "Beyond 1" back into a data frame. Read the `tpep_pickup_datetime` column into a string column, and use `pd.to_datetime` to convert it into a datetime column.

In [4]:
df = pd.read_csv('ex40b1_taxi_07_2019.csv',
           sep='\t',
           usecols=['tpep_pickup_datetime', 'passenger_count', 'trip_distance', 'total_amount'])

df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'], unit='s', origin='unix')
df.head()

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,total_amount
0,2019-07-01 00:51:04,1.0,0.0,4.94
1,2019-07-01 00:46:04,1.0,4.16,20.3
2,2019-07-01 00:25:09,1.0,18.8,70.67
3,2019-07-01 00:33:32,1.0,18.46,66.36
4,2019-07-01 00:00:55,0.0,1.7,15.3


# Beyond 3

Repeat "Beyond 2", but using date_format and `lambda`

In [5]:
import datetime 

df = pd.read_csv('ex40b1_taxi_07_2019.csv',
           sep='\t',
           usecols=['tpep_pickup_datetime', 'passenger_count', 'trip_distance', 'total_amount'],
           parse_dates=['tpep_pickup_datetime'],
           date_parser=lambda s: datetime.datetime.fromtimestamp(float(s)))

df.head()

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,total_amount
0,2019-07-01 03:51:04,1.0,0.0,4.94
1,2019-07-01 03:46:04,1.0,4.16,20.3
2,2019-07-01 03:25:09,1.0,18.8,70.67
3,2019-07-01 03:33:32,1.0,18.46,66.36
4,2019-07-01 03:00:55,0.0,1.7,15.3


In [6]:
df.dtypes

tpep_pickup_datetime    datetime64[ns]
passenger_count                float64
trip_distance                  float64
total_amount                   float64
dtype: object