In [2]:
import pyarrow as pa

from pygdf.dataframe import DataFrame

import pandas as pd
import json
import numpy as np
from python_scripts.numbaHistinMem import numba_gpu_histogram

In [3]:
def readArrowToDF(source):
    """Read an Arrow record-batch stream into a pandas DataFrame.

    Parameters
    ----------
    source
        Handed unchanged to ``pa.RecordBatchStreamReader``; this notebook
        passes a path string pointing at an ``.arrow`` file.

    Returns
    -------
    The pandas object produced by ``to_pandas()`` on the fully-read stream.
    """
    # Pull every record batch out of the stream in one go, then convert.
    table = pa.RecordBatchStreamReader(source).read_all()
    return table.to_pandas()

In [4]:
def arrowToDisk(df, destination):
    """Write a pandas DataFrame to ``<destination>.arrow`` as an Arrow stream.

    Parameters
    ----------
    df
        pandas DataFrame to serialise; it is converted to a single
        ``pa.RecordBatch``.
    destination
        Output path without extension; ``".arrow"`` is appended.

    Raises
    ------
    Any error raised while opening the file or writing the batch is
    propagated; the writer and the file handle are closed first.
    """
    pa_df = pa.RecordBatch.from_pandas(df)
    path = destination+".arrow"
    # `with` releases the file handle even when writing fails part-way.
    with open(path, 'wb') as file:
        writer = pa.ipc.RecordBatchStreamWriter(file, pa_df.schema)
        try:
            writer.write_batch(pa_df)
        finally:
            # Close the writer before the underlying file is closed.
            writer.close()

## Read the uber-dataset-v2 Arrow file

In [5]:
# Load the full source dataset from its Arrow stream file into a pandas DataFrame.
df = readArrowToDF("node_server/uploads/uber-dataset-v2.arrow")

In [6]:
def process(df,num_of_replicas, name, num_rows = 0):
    """Replicate ``df``, optionally trim it, print its length and write it to disk.

    Parameters
    ----------
    df
        pandas DataFrame to replicate.
    num_of_replicas
        Number of extra copies stacked under the original, so the result
        holds ``num_of_replicas + 1`` copies with a fresh 0-based index.
    name
        File name (without extension) under ``node_server/uploads/``;
        ``arrowToDisk`` appends ``.arrow``.
    num_rows
        ``0`` (default) keeps every row. Any other value keeps index labels
        ``0..num_rows`` inclusive, i.e. ``num_rows + 1`` rows when that many
        are available (the recorded outputs of this notebook rely on this).
    """
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # `[df] + [df] * n` always contains the original, even when n <= 0.
    df_final = pd.concat([df] + [df]*num_of_replicas, ignore_index=True)
    if num_rows != 0:
        # Label-based slice on the regenerated index: the end label is included.
        df_final = df_final.loc[0:num_rows]
    # Both cases report the final row count and write to the same location.
    print(len(df_final))
    arrowToDisk(df_final,'node_server/uploads/'+name)

In [6]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,sourceid,dstid,hod,mean_travel_time,standard_deviation_travel_time,geometric_mean_travel_time,geometric_standard_deviation_travel_time,source_lat,source_long,dst_lat,dst_long
0,650,181,19,1187.61,481.13,1104.91,1.45,-122.402451,37.761448,-122.259369,37.816994
1,232,2221,8,1242.33,183.8,1229.94,1.15,-122.128365,37.463764,-122.030777,37.236847
2,809,2460,23,1276.63,226.23,1257.19,1.19,-122.248901,37.817375,-122.407974,37.802288
3,1838,2363,0,431.26,193.26,405.81,1.38,-122.072083,37.369617,-122.001808,37.316303
4,82,910,8,1858.73,342.7,1825.93,1.21,-122.406548,37.751953,-122.50721,37.777042


## Create a 1-col dataset

### non-filter

In [5]:
# Single-column ('hod') frame cast to float32.
uber_1_col = pd.DataFrame(
    df[['hod']],
    dtype=np.float32,
)
# Drop the full source frame to release memory before replicating.
# Later sections that read `df` need it loaded again.
del df

In [6]:
# Original plus 16 replicas = 17 stacked copies with a fresh 0-based index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
uber_1_col_final = pd.concat([uber_1_col] * (16 + 1), ignore_index=True)
print(len(uber_1_col_final))
# The single-copy frame is no longer needed once the replicated one exists.
del uber_1_col

821521243


In [7]:
# Persist the replicated 1-column frame; arrowToDisk appends ".arrow" to the path.
arrowToDisk(uber_1_col_final,'node_server/uploads/uber-1-col-nonfilter')

### filter

In [5]:
# `df` is deleted at the end of the non-filter section, so reload it here;
# otherwise this cell raises NameError when the notebook is run top to bottom.
df = readArrowToDF("node_server/uploads/uber-dataset-v2.arrow")
# Single-column ('hod') frame cast to float32.
uber_1_col = pd.DataFrame(df[['hod']], dtype=np.float32)

In [6]:
# Original plus 6 replicas = 7 stacked copies with a fresh 0-based index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
uber_1_col_final = pd.concat([uber_1_col] * (6 + 1), ignore_index=True)
print(len(uber_1_col_final))

338273453


In [7]:
# Persist the 7x-replicated 1-column frame; arrowToDisk appends ".arrow" to the path.
arrowToDisk(uber_1_col_final,'node_server/uploads/uber-1-col')

## Create a 2-col dataset

In [None]:
# Two-column subset of the source frame, cast to float32.
uber_2_col = pd.DataFrame(
    df[[
        'hod',
        'mean_travel_time',
    ]],
    dtype=np.float32,
)

In [9]:
# Show the column dtypes (float32 is requested when the frame is built).
print(uber_2_col.dtypes)

# Roughly 48 million rows (48,324,779 in the recorded output).
len(uber_2_col)

48324779

#### Multiply the row count by replicating the data

In [10]:
# Original plus 4 replicas = 5 stacked copies with a fresh 0-based index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
uber_2_col_final = pd.concat([uber_2_col] * (4 + 1), ignore_index=True)
len(uber_2_col_final)

241623895

In [30]:
# Persist the replicated 2-column frame; arrowToDisk appends ".arrow" to the path.
arrowToDisk(uber_2_col_final,'node_server/uploads/uber-2-col')

## Create a 3-col dataset

In [7]:
# Three-column subset of the source frame, cast to float32.
uber_3_col = pd.DataFrame(
    df[[
        'hod',
        'mean_travel_time',
        'standard_deviation_travel_time',
    ]],
    dtype=np.float32,
)

In [8]:
# Show the column dtypes (float32 is requested when the frame is built).
print(uber_3_col.dtypes)

# Roughly 48 million rows (48,324,779 in the recorded output).
len(uber_3_col)

hod                               float32
mean_travel_time                  float32
standard_deviation_travel_time    float32
dtype: object


48324779

In [9]:
# Original plus 4 replicas = 5 stacked copies with a fresh 0-based index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
uber_3_col_final = pd.concat([uber_3_col] * (4 + 1), ignore_index=True)
len(uber_3_col_final)

241623895

In [10]:
# Trim the replicated frame. `.loc` slices by label and includes the end label,
# so on the 0-based index created with ignore_index=True this keeps labels
# 0..200623895, i.e. 200,623,896 rows.
uber_3_col_final = uber_3_col_final.loc[0:200623895]

In [11]:
# Persist the trimmed 3-column frame; arrowToDisk appends ".arrow" to the path.
arrowToDisk(uber_3_col_final,'node_server/uploads/uber-3-col')

## Create a 4-col dataset

In [29]:
# Four-column subset of the source frame, cast to float32.
uber_4_col = pd.DataFrame(
    df[[
        'hod',
        'mean_travel_time',
        'standard_deviation_travel_time',
        'geometric_mean_travel_time',
    ]],
    dtype=np.float32,
)

In [30]:
# Stack the frame with 3 replicas, trim via num_rows, and write it under
# node_server/uploads/ as uber-4-col. The slice inside process() is inclusive,
# so the printed length is num_rows + 1 (180,623,896 in the recorded output).
process(uber_4_col, 3, 'uber-4-col',180623895)

180623896


## Create a 5-col dataset

In [None]:
# Five-column subset of the source frame, cast to float32.
uber_5_col = pd.DataFrame(
    df[[
        'hod',
        'mean_travel_time',
        'standard_deviation_travel_time',
        'geometric_mean_travel_time',
        'geometric_standard_deviation_travel_time',
    ]],
    dtype=np.float32,
)

In [7]:
# Stack the frame with 3 replicas, trim via num_rows, and write it under
# node_server/uploads/ as uber-5-col. The slice inside process() is inclusive,
# so the printed length is num_rows + 1 (160,623,896 in the recorded output).
process(uber_5_col, 3, 'uber-5-col',160623895)

160623896


## Create a 6-col dataset

In [7]:
# Six-column subset (time columns plus source/destination coordinates), cast to float32.
uber_6_col = pd.DataFrame(
    df[[
        'hod',
        'mean_travel_time',
        'source_lat',
        'source_long',
        'dst_lat',
        'dst_long',
    ]],
    dtype=np.float32,
)

In [8]:
# Stack the frame with 3 replicas, trim via num_rows, and write it under
# node_server/uploads/ as uber-6-col. The slice inside process() is inclusive,
# so the printed length is num_rows + 1 (140,623,896 in the recorded output).
process(uber_6_col, 3, 'uber-6-col',140623895)

140623896


## Check pygdf performance

In [32]:
# Read back the 6-column file written by the process() call in the previous section.
df1 = readArrowToDF('node_server/uploads/uber-6-col.arrow')

In [33]:
# Build a pygdf DataFrame from the pandas frame.
gdf = DataFrame.from_pandas(df1)

In [34]:
# The pandas copy is not used after the pygdf frame is built; drop the name.
del df1

In [35]:
# Keep a second name bound to the full frame. This is a reference, not a copy;
# the query cells below rebind `gdf`, and `backup` is used to restore it afterwards.
backup = gdf

In [36]:
# List the column names of the pygdf frame.
gdf.columns

('hod', 'mean_travel_time', 'source_lat', 'source_long', 'dst_lat', 'dst_long')

In [23]:
# Row count of the pygdf frame (140,623,896 in the recorded output).
len(gdf)

140623896

In [24]:
# Time the project's histogram helper on the 'hod' column.
# NOTE(review): the second argument (24) is presumably the bin count - confirm against numba_gpu_histogram.
%timeit numba_gpu_histogram(gdf['hod'].to_gpu_array(),24)

73.1 ms ± 8.63 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [25]:
# Keep rows with 10 < hod < 20. This rebinds `gdf`; the unfiltered frame stays reachable via `backup`.
gdf = gdf.query('hod>10 and hod<20')

In [26]:
# Narrow the already-filtered frame further to 12 < hod < 18.
gdf = gdf.query('hod>12 and hod<18')

In [27]:
# Take the 100 rows ranked highest by 'hod', convert them to pandas, then to a plain dict.
temp_gdf = gdf.nlargest(100,['hod']).to_pandas().to_dict()

In [28]:
# Drop the name bound to the filtered frame.
del gdf

In [29]:
# Rebind `gdf` to the full, unfiltered frame kept in `backup`.
gdf = backup

In [30]:
# Remove both names bound to the full frame.
del gdf, backup