In [57]:
import pygdf
import numpy as np
from pygdf.dataframe import DataFrame
from numba import cuda
import json

In [58]:
from pyarrow import RecordBatchStreamReader
import sys, pandas as pd

In [59]:
from pygdf.gpuarrow import GpuArrowReader
import pygdf.gpuarrow as gpuarrow

In [60]:
from python_scripts.numbaHistinMem import numba_gpu_histogram

## reading and converting to pygdf

In [61]:
def readArrowToDF(source):
    reader = RecordBatchStreamReader(source)
    pa_df = reader.read_all()
    return pa_df.to_pandas()

In [62]:
gdf_df = DataFrame.from_pandas(readArrowToDF("node_server/uploads/uber-dataset.arrow"))

In [63]:
gdf_df.head().to_pandas()

Unnamed: 0,sourceid,dstid,hod,mean_travel_time,standard_deviation_travel_time,geometric_mean_travel_time,geometric_standard_deviation_travel_time
0,6.0,5.0,17.0,2917.179932,790.280029,2810.97998,1.32
1,3.0,38.0,20.0,339.73999,254.410004,287.540009,1.67
2,2.0,61.0,7.0,1443.359985,374.619995,1397.72998,1.29
3,7.0,25.0,2.0,152.0,180.399994,95.339996,2.61
4,2.0,79.0,22.0,2320.409912,324.850006,2299.219971,1.14


In [53]:
gdf_df = gdf_df.query('sourceid%2 == 0')



In [55]:
len(gdf_df)

4985156

In [40]:
%%time
def test(sourceid,out1):
    for in1 in range(len(sourceid)):
        if sourceid[in1]%2 ==0:
            out1[in1] = sourceid[in1]
            
out_df = gdf_df.apply_rows(test, incols=['sourceid'], outcols={'out1':np.float64},kwargs={})

CPU times: user 204 ms, sys: 4.22 ms, total: 208 ms
Wall time: 205 ms




In [44]:
gdf_df = out_df.query('out1 != 0').loc[:,gdf_df.columns]



In [49]:
len(gdf_df)

4985156

In [66]:
%timeit gdf_df.query('sourceid>100 and sourceid<200')



78.1 ms ± 446 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [67]:
%timeit gdf_df.loc[100:200, ['sourceid','hod']]

82.9 ms ± 119 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
%timeit gdf_df.groupby(by=['sourceid']).agg({'sourceid':['count']})



446 ms ± 14.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%time
a = gdf_df['sourceid'].to_gpu_array()
a = a.copy_to_host()

In [29]:
%timeit np.unique(a)

374 ms ± 1.11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
gdf1 = gdf_df

In [18]:
%timeit gdf_df.query('hod>17 and mean_travel_time>999')



86.7 ms ± 4.06 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
len(gdf_df)

10241570

In [12]:
len(gdf1)

10241570

In [14]:
gdf_df.query('mean_travel_time>9999').nlargest(10,'mean_travel_time')



ValueError: n out-of-bound

### Reading file from csv using pandas and converting it to pygdf

In [12]:
%timeit DataFrame.from_pandas(pd.read_csv('node-server/uploads/uber-dataset'))

5.24 s ± 16.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Reading file from arrow using pyarrow and converting it to pygdf

In [13]:
%timeit DataFrame.from_pandas(readArrowToDF("node-server/uploads/uber-dataset.arrow"))

236 ms ± 6.03 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
len(gdf_df1.columns)

2

In [9]:
len(gdf_df.columns)

7

In [11]:
gdf_df.query("hod == 17.0 and mean_travel_time<200").head()



  sourceid dstid  hod mean_travel_time standard_deviation_travel_time geometric_mean_travel_time geometric_standard_deviation_travel_time
2112     90.0 233.0 17.0           169.55                         173.29                     104.93                                      3.0
3502     78.0 965.0 17.0           132.61                         113.72                      90.51                                     2.63
3699    103.0  61.0 17.0           163.01                         193.34                     117.64                                     2.11
5682    103.0 472.0 17.0            97.06                         122.24                      68.36                                     2.11
5836    109.0 481.0 17.0           198.19                          128.6                     168.64                                     1.81

In [9]:
gdf_df.loc[190:200].to_pandas()

Unnamed: 0,sourceid,dstid,hod,mean_travel_time,standard_deviation_travel_time,geometric_mean_travel_time,geometric_standard_deviation_travel_time
190,42.0,73.0,12.0,2535.939941,444.480011,2506.689941,1.15
191,9.0,408.0,1.0,1359.400024,399.390015,1306.839966,1.32
192,47.0,30.0,8.0,1649.040039,339.970001,1610.209961,1.25
193,5.0,451.0,13.0,232.330002,35.98,229.699997,1.16
194,8.0,423.0,23.0,1155.0,86.419998,1151.920044,1.07
195,9.0,416.0,2.0,1933.780029,446.929993,1898.859985,1.19
196,9.0,417.0,7.0,1172.670044,107.209999,1167.329956,1.1
197,46.0,47.0,7.0,1382.670044,504.130005,1292.119995,1.45
198,41.0,98.0,12.0,1827.310059,349.890015,1794.969971,1.21
199,47.0,46.0,13.0,1600.290039,571.619995,1510.280029,1.4


In [12]:
a = 'A'
b= 'B'

In [16]:
str((gdf_df[a].min(),gdf_df[b].min()))

'(1.0, 1.0)'

In [14]:
abcd = gdf_df.nlargest(5,['A']).to_pandas().to_dict()



In [15]:
dict = {}
for i in abcd:
    dict[i] = list(abcd[i].values())

In [16]:
json.dumps(dict)

'{"A": [999.0, 999.0, 999.0, 999.0, 999.0], "B": [599.0, 777.0, 749.0, 929.0, 459.0]}'

In [50]:
temp_reader = pa.RecordBatchFileReader("node-server/uploads/uber-dataset.arrow")

In [51]:
print(temp_reader.num_record_batches)
batch = temp_reader.get_record_batch(0)

1


In [53]:
batch.serialize()

<pyarrow.lib.Buffer at 0x7f60e92b21f0>

In [49]:
def readArrowToDF(source):
    temp_reader = open(source)
    print(len(temp_reader))
    return temp_reader
#     batch = temp_reader.read_all()
#     return batch.serialize().to_pybytes(), batch.schema.serialize().to_pybytes()

In [50]:
# df_buffer,schema_buffer = 
t = readArrowToDF("node-server/uploads/uber-dataset.arrow")

1


In [86]:
with open("node-server/uploads/uber-dataset.arrow", "rb") as f:
    byte = f.read()

In [87]:
len(byte)

368697000

In [61]:
len(contents)

0

In [38]:
a = pa.BufferReader(df_buffer).read()

In [40]:
a[:99]

b'\xdc\x01\x00\x00\x14\x00\x00\x00\x00\x00\x00\x00\x0c\x00\x16\x00\x06\x00\x05\x00\x08\x00\x0c\x00\x0c\x00\x00\x00\x00\x03\x03\x00\x18\x00\x00\x00\xc8\xdc\xf9\x15\x00\x00\x00\x00\x00\x00\n\x00\x18\x00\x0c\x00\x04\x00\x08\x00\n\x00\x00\x00\x1c\x01\x00\x00\x10\x00\x00\x00"F\x9c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'

In [88]:
df_buffer = byte

In [89]:
schema_buffer

b'\x8c\x07\x00\x00\x10\x00\x00\x00\x0c\x00\x0e\x00\x06\x00\x05\x00\x08\x00\x00\x00\x0c\x00\x00\x00\x00\x01\x03\x00\x10\x00\x00\x00\x00\x00\n\x00\x0c\x00\x00\x00\x04\x00\x08\x00\n\x00\x00\x00L\x05\x00\x00\x04\x00\x00\x00\x01\x00\x00\x00\x0c\x00\x00\x00\x08\x00\x0c\x00\x04\x00\x08\x00\x08\x00\x00\x00\x08\x00\x00\x00\x10\x00\x00\x00\x06\x00\x00\x00pandas\x00\x00\x15\x05\x00\x00{"index_columns": ["__index_level_0__"], "column_indexes": [{"name": null, "field_name": null, "pandas_type": "unicode", "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}], "columns": [{"name": "sourceid", "field_name": "sourceid", "pandas_type": "float32", "numpy_type": "float32", "metadata": null}, {"name": "dstid", "field_name": "dstid", "pandas_type": "float32", "numpy_type": "float32", "metadata": null}, {"name": "hod", "field_name": "hod", "pandas_type": "float32", "numpy_type": "float32", "metadata": null}, {"name": "mean_travel_time", "field_name": "mean_travel_time", "pandas_type": "float32", "nump

In [90]:
cpu_schema = np.ndarray(shape=len(schema_buffer), dtype=np.byte,
                            buffer=bytearray(schema_buffer))
cpu_data = np.ndarray(shape=len(df_buffer), dtype=np.byte,
                          buffer=bytearray(df_buffer))


In [7]:
schema_data = b"""\x00\x01\x00\x00\x10\x00\x00\x00\x0c\x00\x0e\x00\x06\x00\x05\x00\x08\x00\x00\x00\x0c\x00\x00\x00\x00\x01\x02\x00\x10\x00\x00\x00\x00\x00\n\x00\x08\x00\x00\x00\x04\x00\x00\x00\n\x00\x00\x00\x04\x00\x00\x00\x02\x00\x00\x00l\x00\x00\x00\x04\x00\x00\x00\xb0\xff\xff\xff\x00\x00\x01\x038\x00\x00\x00\x1c\x00\x00\x00\x14\x00\x00\x00\x04\x00\x00\x00\x02\x00\x00\x00\x1c\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x9a\xff\xff\xff\x00\x00\x01\x00\x8c\xff\xff\xff \x00\x01\x00\x94\xff\xff\xff\x01\x00\x02\x00\x08\x00\x00\x00dest_lon\x00\x00\x00\x00\x14\x00\x18\x00\x08\x00\x06\x00\x07\x00\x0c\x00\x00\x00\x10\x00\x14\x00\x00\x00\x14\x00\x00\x00\x00\x00\x01\x03H\x00\x00\x00$\x00\x00\x00\x14\x00\x00\x00\x04\x00\x00\x00\x02\x00\x00\x00,\x00\x00\x00\x18\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06\x00\x08\x00\x06\x00\x06\x00\x00\x00\x00\x00\x01\x00\xf8\xff\xff\xff \x00\x01\x00\x08\x00\x08\x00\x04\x00\x06\x00\x08\x00\x00\x00\x01\x00\x02\x00\x08\x00\x00\x00dest_lat\x00\x00\x00\x00\x00\x00\x00\x00"""  # noqa: E501
recbatch_data = b"""\xdc\x00\x00\x00\x14\x00\x00\x00\x00\x00\x00\x00\x0c\x00\x16\x00\x06\x00\x05\x00\x08\x00\x0c\x00\x0c\x00\x00\x00\x00\x03\x02\x00\x18\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\n\x00\x18\x00\x0c\x00\x04\x00\x08\x00\n\x00\x00\x00|\x00\x00\x00\x10\x00\x00\x00\x17\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\x00\x00\x00\x00\x80\x00\x00\x00\x00\x00\x00\x00\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x17\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xbf0\x1dB\xd9$\'B\x02E\xecA\xd9$\'B\xbf0\x1dB\x9c\xb3\x1cB\xd1)\xedAw\x7f\x10B\x02E\xecArc\x03B\x02E\xecArc\x03B\xd9$\'B\x93\xb2\x18BC\xf7!B\xd9$\'B\x91\xa7\x06Bg\x8e\xf1A\xd9$\'Bw\x7f\x10B]n\xe3A\xd9$\'B\x02E\xecA\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x85m\xbd\xc2>\x81\xaf\xc2\x87\xf0\xc4\xc2>\x81\xaf\xc2\x85m\xbd\xc2\x1eV\x99\xc2\xcb\x8e\xbe\xc2;[\xad\xc2\x87\xf0\xc4\xc2\x1b\xb4\xc1\xc2\x87\xf0\xc4\xc2\x1b\xb4\xc1\xc2>\x81\xaf\xc2\xd5x\xab\xc2;w\xa0\xc2>\x81\xaf\xc2C\xa5\xcb\xc2\xf9V\xc3\xc2>\x81\xaf\xc2;[\xad\xc2\xce\xa1\xa2\xc2>\x81\xaf\xc2\x87\xf0\xc4\xc2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"""  # noqa: E501
cpu_schema = np.ndarray(shape=len(schema_data), dtype=np.byte,
                            buffer=bytearray(schema_data))
cpu_data = np.ndarray(shape=len(recbatch_data), dtype=np.byte,
                          buffer=bytearray(recbatch_data))

In [91]:
gpu_data = cuda.to_device(cpu_data)

In [92]:
del cpu_data

In [93]:
reader = GpuArrowReader(cpu_schema,gpu_data)

In [94]:
temp_df = reader.to_dict()

In [95]:
temp_df.keys()

odict_keys(['sourceid', 'dstid', 'hod', 'mean_travel_time', 'standard_deviation_travel_time', 'geometric_mean_travel_time', 'geometric_standard_deviation_travel_time', '__index_level_0__'])

In [102]:
len(temp_df['geometric_standard_deviation_travel_time'])

2560392

In [30]:
df = DataFrame(temp_df.items())



ValueError: Length of values does not match index length

In [19]:
df.to_pandas()

Unnamed: 0,dest_lat,dest_lon
0,39.297604,-94.713905
1,41.785984,-87.752426
2,29.533695,-98.46978
3,41.785984,-87.752426
4,39.297604,-94.713905
5,39.1754,-76.668198
6,29.645418,-95.278893
7,36.124477,-86.678185
8,29.533695,-98.46978
9,32.847115,-96.851768


In [65]:
masked_col = reader[6]

In [74]:
temp_df.keys()

odict_keys(['sourceid', 'dstid', 'hod', 'mean_travel_time', 'standard_deviation_travel_time', 'geometric_mean_travel_time', 'geometric_standard_deviation_travel_time', '__index_level_0__'])

In [75]:
a = temp_df.pop('__index_level_0__',None)

In [78]:
len(temp_df)

7

In [18]:
df = DataFrame(reader.to_dict().items())



ValueError: Length of values does not match index length

In [30]:
buffer = bytearray(df_buffer)

In [6]:
def readArrow(source):
    reader = pa.RecordBatchStreamReader(source)
    return reader.read_all()

In [7]:
df1 = readArrow("node-server/uploads/uber-dataset.arrow")

In [9]:
df1.to

pyarrow.Table
sourceid: float
dstid: float
hod: float
mean_travel_time: float
standard_deviation_travel_time: float
geometric_mean_travel_time: float
geometric_standard_deviation_travel_time: float
__index_level_0__: int64
metadata
--------
{b'pandas': b'{"index_columns": ["__index_level_0__"], "column_indexes": [{"na'
            b'me": null, "field_name": null, "pandas_type": "unicode", "numpy_'
            b'type": "object", "metadata": {"encoding": "UTF-8"}}], "columns":'
            b' [{"name": "sourceid", "field_name": "sourceid", "pandas_type": '
            b'"float32", "numpy_type": "float32", "metadata": null}, {"name": '
            b'"dstid", "field_name": "dstid", "pandas_type": "float32", "numpy'
            b'_type": "float32", "metadata": null}, {"name": "hod", "field_nam'
            b'e": "hod", "pandas_type": "float32", "numpy_type": "float32", "m'
            b'etadata": null}, {"name": "mean_travel_time", "field_name": "mea'
            b'n_travel_time", "pandas_t

In [5]:
gpu_df = DataFrame.from_pandas(df)

In [19]:
gpu_df[gpu_df.columns[0]].to_gpu_array()

<numba.cuda.cudadrv.devicearray.DeviceNDArray at 0x7f4245ece2e8>

In [70]:
%timeit a = gpu_df.geometric_mean_travel_time.to_gpu_array()

80.7 µs ± 671 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [69]:
a

<numba.cuda.cudadrv.devicearray.DeviceNDArray at 0x7f4752fccf28>

In [55]:
gpu_df.nlargest(2,['mean_travel_time'])



  sourceid  dstid  hod mean_travel_time standard_deviation_travel_time geometric_mean_travel_time geometric_standard_deviation_travel_time
1438470    234.0  729.0 15.0         10578.17                        1514.47                   10469.62                                     1.15
4034052    234.0 1132.0 15.0         10531.17                        1517.13                   10421.73                                     1.16

In [23]:
%timeit df['hod'].head()

63.2 µs ± 297 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [37]:
%timeit gpu_df.loc[2:6]

86.7 ms ± 31.2 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [41]:
from numba import cuda

In [43]:
a = cuda.to_device(gpu_df.to_pandas)

In [45]:
a.copy_to_host().head()

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [28]:
%timeit df.loc[df['mean_travel_time'] < 100]

8.98 ms ± 6.57 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [8]:
df = DataFrame()
nelem = 10**6  # A million item
df['key1'] = np.random.randint(0, 5, nelem)
df['key2'] = np.random.randint(0, 3, nelem)
df['val1'] = np.arange(1, 1 + nelem)
df['val2'] = np.random.random(nelem)

In [4]:
df.head().to_pandas()

Unnamed: 0,key1,key2,val1,val2
0,1,0,1,0.832418
1,0,1,2,0.70272
2,2,1,3,0.814997
3,4,0,4,0.698803
4,2,2,5,0.632527


In [5]:
df.groupby(by=['key1', 'key2']).mean()



   key1 key2               val1                val2
 0    0    0 499991.13897751615   0.500500273341694
 1    0    1 498149.71224435116 0.49915224768559957
 2    0    2   500246.831658405 0.49932934396900014
 3    1    0  499929.9718701189  0.4997970229389601
 4    1    1  501476.9940390855  0.5006826483188622
 5    1    2  500412.3009169631  0.5002174350180542
 6    2    0  499810.1914909582  0.5017281713406735
 7    2    1  500080.0818064206  0.5018919081998916
 8    2    2  500726.5189073721 0.49894799193962097
 9    3    0  500682.3600508145 0.49976123494834945
10    3    1 499071.18907015416 0.49736679651401494
11    3    2  500453.4085117837   0.500835048713539
12    4    0 499907.88592142676  0.5009347385916217
13    4    1 500420.31843726116  0.5011883284453145
14    4    2  498677.7382152609 0.49821989306868103