In [1]:
import os

# Restrict this process to a single GPU. This runs before the cell that
# imports cudf/cupy, so those libraries only see the selected device.
GPU_id = 3
os.environ['CUDA_VISIBLE_DEVICES'] = '%d' % GPU_id

In [2]:
import warnings
warnings.filterwarnings("ignore")

import cudf as gd
import cupy as cp
import pandas as pd
import numpy as np
import os
import time
import nvstrings
import matplotlib.pyplot as plt
%matplotlib inline

### Global

In [3]:
# Wall-clock seconds per pipeline step, keyed by the step label.
# Only GPU_RUN_TIME is filled in by the cells in this section.
GPU_RUN_TIME, CPU_RUN_TIME = {}, {}
# Step labels in execution order; used later to build the timing table.
STEPS = []

### Functions

In [4]:
def on_gpu(words,func,arg=None,dtype=np.int32):
    """Call a method of `words` that writes its result into a device buffer.

    Parameters
    ----------
    words : object
        Must provide ``size()`` and a method named `func` that accepts a
        ``devptr`` keyword (presumably an nvstrings instance -- confirm
        against the caller).
    func : str
        Name of the method to invoke on `words`.
    arg : optional
        When not None, passed as the single positional argument to the method.
    dtype : data-type, default np.int32
        Element type of the output buffer.

    Returns
    -------
    cupy.ndarray
        Buffer of length ``words.size()`` that the method was asked to fill.

    Raises
    ------
    AttributeError
        If `words` has no attribute named `func`.
    """
    # Allocate one output slot per string. cp.array(n) would instead build a
    # 0-d array whose only element is the number n.
    res = cp.empty(words.size(), dtype=dtype)
    # CuPy exposes the raw device address as ndarray.data.ptr.
    devptr = res.data.ptr
    # Look the method up directly instead of eval()-ing a formatted string.
    method = getattr(words, func)
    if arg is None:
        method(devptr=devptr)
    else:
        method(arg, devptr=devptr)
    return res

def count_items(data,cols):
    """Count rows per (cols..., item_id) group, plus a per-`cols` share.

    Parameters
    ----------
    data : DataFrame
        Must contain 'item_id', 'step' and every column listed in `cols`.
    cols : list of str
        Extra grouping columns; an empty list gives counts per item only.

    Returns
    -------
    DataFrame
        Columns are ``cols + ['item_id', 'count_item_<tag>']`` where <tag> is
        'global' for empty `cols` and the '_'-joined column names otherwise.
        When `cols` is non-empty, 'count_item_<tag>_norm' is appended: the
        group count divided by the row count of the matching `cols` group.
    """
    tag = '_'.join(cols) if len(cols) else 'global'
    count_col = 'count_item_%s'%tag
    keys = cols + ['item_id']

    dg = data.groupby(keys, as_index=False).agg({'step':['count']})
    dg.columns = keys + [count_col]

    # Without extra grouping columns there is no denominator to normalise by.
    if len(cols) == 0:
        return dg

    total_col = 'count_item_%s_all'%tag
    norm_col = 'count_item_%s_norm'%tag

    totals = data.groupby(cols, as_index=False).agg({'step':['count']})
    totals.columns = cols + [total_col]

    dg = dg.merge(totals,on=cols,how='left')
    dg[norm_col] = dg[count_col] / dg[total_col]
    del totals
    # The denominator was only needed to compute the share; drop it.
    return dg.drop(total_col,axis=1)

### Read csv data

In [5]:
# Create the local cache directory if it is missing. exist_ok avoids the
# separate exists() check and the race between that check and mkdir.
os.makedirs('cache', exist_ok=True)

In [6]:
# Directory holding the train.csv and test.csv files read by the next cell.
path = '/datasets/trivago/data/'

### cudf read csv

In [7]:
%%time
# Columns to load from the raw CSVs.
cols = ['city','user_id', 'session_id', 'step', 'action_type', 'reference']

# Read train then test with the same column subset.
train_gd, test_gd = [
    gd.read_csv('%s/%s.csv'%(path,split),usecols=cols)
    for split in ('train','test')
]

# Stack both splits and fix the column order.
data_gd = gd.concat([train_gd,test_gd])[cols]

# Same columns, with the last one ('reference') renamed to 'item_id'.
cols = cols[:-1] + ['item_id']
data_gd.columns = cols

print('combined',data_gd.shape)

combined (19715327, 6)
CPU times: user 2.69 s, sys: 1.69 s, total: 4.38 s
Wall time: 6.11 s


In [8]:
%%time
cols = ['user_id','session_id','item_id','city','device']
# Ask the parquet reader for just these columns rather than loading the whole
# file and selecting afterwards; the trailing [cols] keeps the order explicit.
# NOTE(review): this reads from '../cache' while an earlier cell creates
# './cache' -- confirm which directory is intended.
data_pair_gd = gd.read_parquet('../cache/data_pair.parquet',columns=cols)[cols]

CPU times: user 428 ms, sys: 960 ms, total: 1.39 s
Wall time: 2.11 s


In [9]:
# The per-split frames are no longer needed once data_gd has been built.
del train_gd
del test_gd

### Only keep interaction rows

In [10]:
# Label under which the next timed cell records its runtime.
step = 'contrain string'
# Register the label once, so re-running this cell does not add a duplicate
# entry that would be counted twice when the timings are summed.
if step not in STEPS:
    STEPS.append(step)

### cudf find string within string

In [11]:
%%time
start = time.time()

# Flag rows whose action_type contains 'interaction'. np.bool_ is used because
# the bare np.bool alias was deprecated in NumPy 1.20 and removed in 1.24.
data_gd['is_interaction'] = on_gpu(data_gd['action_type'].data,'contains',
                                   arg='interaction',dtype=np.bool_)
# Keep only the flagged rows and convert their item ids to integers.
data_interaction_gd = data_gd[data_gd['is_interaction']]
data_interaction_gd['item_id'] = data_interaction_gd['item_id'].astype(int)
print(data_interaction_gd['item_id'].unique().shape)
GPU_RUN_TIME[step] = time.time() - start

(268470,)
CPU times: user 2.22 s, sys: 824 ms, total: 3.04 s
Wall time: 4.05 s


### Count items, with and without grouping by other columns

In [12]:
# Label under which the next timed cell records its runtime.
step = 'count items'
# Register the label once, so re-running this cell does not add a duplicate
# entry that would be counted twice when the timings are summed.
if step not in STEPS:
    STEPS.append(step)

In [13]:
%%time
start = time.time()
# Build one count table per grouping, in the order of the names on the left;
# the empty key list yields the per-item counts with no extra grouping.
(count_user_session_gd,
 count_user_session_city_gd,
 count_user_city_gd,
 count_city_gd,
 count_global_gd) = [
    count_items(data_interaction_gd,cols=group_cols)
    for group_cols in (
        ['user_id','session_id'],
        ['user_id','session_id','city'],
        ['user_id','city'],
        ['city'],
        [],
    )
]
GPU_RUN_TIME[step] = time.time() - start

CPU times: user 1.22 s, sys: 1.46 s, total: 2.68 s
Wall time: 5.86 s


### Merge the count encodings into the pair data

In [14]:
# Label under which the next timed cell records its runtime.
step = 'merge'
# Register the label once, so re-running this cell does not add a duplicate
# entry that would be counted twice when the timings are summed.
if step not in STEPS:
    STEPS.append(step)

### cudf merge

In [15]:
%%time
start = time.time()
# Left-join each count table onto the pair frame using that table's grouping
# columns plus item_id. count_global_gd is not merged in this cell.
for counts_gd, join_keys in (
    (count_user_session_gd, ['user_id','session_id','item_id']),
    (count_user_session_city_gd, ['user_id','session_id','city','item_id']),
    (count_user_city_gd, ['user_id','city','item_id']),
    (count_city_gd, ['city','item_id']),
):
    data_pair_gd = data_pair_gd.merge(counts_gd,on=join_keys,how='left')
GPU_RUN_TIME[step] = time.time() - start

CPU times: user 5.36 s, sys: 4.09 s, total: 9.45 s
Wall time: 12.3 s


### Visualize the timing

In [16]:
# Sum only the real steps: if this cell is re-run, STEPS already contains
# 'Overall' and including it would inflate the total.
GPU_RUN_TIME['Overall'] = sum(
    GPU_RUN_TIME[name] for name in STEPS if name != 'Overall'
)
# Add the summary row once, even if this cell is executed again.
if 'Overall' not in STEPS:
    STEPS.append('Overall')

# One row per step, in registration order, with its GPU wall-clock seconds.
timing = pd.DataFrame({
    'step': STEPS,
    'GPU': [GPU_RUN_TIME[name] for name in STEPS],
})
timing

Unnamed: 0,step,GPU
0,contrain string,4.048962
1,count items,5.857505
2,merge,12.288357
3,Overall,22.194824
