In [22]:
import pygdf
import numpy as np
from pygdf.dataframe import DataFrame
from numba import cuda
import json
from pyarrow import RecordBatchStreamReader, RecordBatchStreamWriter
import pyarrow as pa
import sys, pandas as pd
from pygdf.gpuarrow import GpuArrowReader
import pygdf.gpuarrow as gpuarrow
from python_scripts.numbaHistinMem import numba_gpu_histogram
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
columns = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount"
columns_list = columns.split(',')

In [3]:
df_temp = pd.read_csv("nyc_taxi_data/yellow_tripdata_v1.csv", names=columns_list)

In [4]:
df_temp.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2017-01-09 11:13:28,2017-01-09 11:25:45,1,3.3,1,N,263,161,1,12.5,0.0,0.5,2.0,0.0,0.3,15.3
1,1,2017-01-09 11:32:27,2017-01-09 11:36:01,1,0.9,1,N,186,234,1,5.0,0.0,0.5,1.45,0.0,0.3,7.25
2,1,2017-01-09 11:38:20,2017-01-09 11:42:05,1,1.1,1,N,164,161,1,5.5,0.0,0.5,1.0,0.0,0.3,7.3
3,1,2017-01-09 11:52:13,2017-01-09 11:57:36,1,1.1,1,N,236,75,1,6.0,0.0,0.5,1.7,0.0,0.3,8.5
4,2,2017-01-01 00:00:00,2017-01-01 00:00:00,1,0.02,2,N,249,234,2,52.0,0.0,0.5,0.0,0.0,0.3,52.8


In [5]:
len(df_temp)

113496874

In [6]:
df_temp.dtypes

VendorID                   int64
tpep_pickup_datetime      object
tpep_dropoff_datetime     object
passenger_count            int64
trip_distance            float64
RatecodeID                 int64
store_and_fwd_flag        object
PULocationID               int64
DOLocationID               int64
payment_type               int64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
dtype: object

In [7]:
df_temp_1 = df_temp[['PULocationID', 'DOLocationID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'total_amount']]

In [8]:
df_temp_1.head()

Unnamed: 0,PULocationID,DOLocationID,tpep_pickup_datetime,tpep_dropoff_datetime,total_amount
0,263,161,2017-01-09 11:13:28,2017-01-09 11:25:45,15.3
1,186,234,2017-01-09 11:32:27,2017-01-09 11:36:01,7.25
2,164,161,2017-01-09 11:38:20,2017-01-09 11:42:05,7.3
3,236,75,2017-01-09 11:52:13,2017-01-09 11:57:36,8.5
4,249,234,2017-01-01 00:00:00,2017-01-01 00:00:00,52.8


In [9]:
df_temp_1['tpep_dropoff_datetime'] = pd.to_datetime(df_temp_1.tpep_dropoff_datetime).astype(np.int64) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [10]:
df_temp_1['tpep_pickup_datetime'] = pd.to_datetime(df_temp_1.tpep_pickup_datetime).astype(np.int64) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [11]:
df_temp_1.dtypes

PULocationID               int64
DOLocationID               int64
tpep_pickup_datetime       int64
tpep_dropoff_datetime      int64
total_amount             float64
dtype: object

In [12]:
len(df_temp_1)

113496874

In [32]:
df_temp_1.to_csv('yellow_tripdata_v2.csv')

In [14]:
df_temp_1.head()

Unnamed: 0,PULocationID,DOLocationID,tpep_pickup_datetime,tpep_dropoff_datetime,total_amount
0,263,161,1483960408000000000,1483961145000000000,15.3
1,186,234,1483961547000000000,1483961761000000000,7.25
2,164,161,1483961900000000000,1483962125000000000,7.3
3,236,75,1483962733000000000,1483963056000000000,8.5
4,249,234,1483228800000000000,1483228800000000000,52.8


## Convert to arrow

In [21]:
def writeArrowFile(path,pa_df):
    path = path+".arrow"
    file = open(path, 'wb')
    writer = pa.ipc.RecordBatchStreamWriter(file, pa_df.schema)
    writer.write_batch(pa_df)
    writer.close()
    file.close()

In [23]:
def convertToArrow(df):
    return pa.RecordBatch.from_pandas(df)

In [24]:
path = 'node_server/uploads/nyc_data_v1'
writeArrowFile(path, convertToArrow(df_temp_1))

In [25]:
def readArrowToDF(source):
    reader = pa.RecordBatchStreamReader(source)
    pa_df = reader.read_all()
    return pa_df.to_pandas()

In [26]:
df = readArrowToDF(path+".arrow")

In [27]:
df.head()

Unnamed: 0,PULocationID,DOLocationID,tpep_pickup_datetime,tpep_dropoff_datetime,total_amount
0,263,161,1483960408000000000,1483961145000000000,15.3
1,186,234,1483961547000000000,1483961761000000000,7.25
2,164,161,1483961900000000000,1483962125000000000,7.3
3,236,75,1483962733000000000,1483963056000000000,8.5
4,249,234,1483228800000000000,1483228800000000000,52.8
