In [1]:
import os
# Select which GPU the process may see; this runs in the first cell, ahead of
# the library imports in the next one.
GPU_id = 6
os.environ.update({'CUDA_VISIBLE_DEVICES': '%d' % GPU_id})

In [2]:
import warnings
warnings.filterwarnings("ignore")
import cudf as gd
import pandas as pd
import numpy as np
import xgboost as xgb
import os
import time
import nvstrings
from librmm_cffi import librmm
import matplotlib.pyplot as plt
%matplotlib inline

### Global

In [3]:
# Timing registries: each dict maps a step label to elapsed seconds, and STEPS
# keeps the labels in the order the steps were registered.
GPU_RUN_TIME, CPU_RUN_TIME = {}, {}
STEPS = list()

### Functions

In [4]:
def on_gpu(words,func,arg=None,dtype=np.int32):
    """Run a method of ``words`` that writes its result into a device buffer.

    A device array with ``words.size()`` elements of ``dtype`` is allocated
    through ``librmm``; its raw device pointer is then handed to the method
    named ``func`` so the method can fill it.

    Parameters
    ----------
    words : object exposing ``size()`` and the method named by ``func``
        (presumably an nvstrings collection — confirm against callers).
    func : str
        Name of the method to invoke on ``words``.
    arg : optional
        When not None, passed as the first positional argument, before the
        output pointer. A literal ``None`` therefore cannot be forwarded.
    dtype : numpy dtype, default ``np.int32``
        Element type of the output buffer.

    Returns
    -------
    The device array that received the method's output.

    Raises
    ------
    AttributeError
        If ``words`` has no attribute called ``func``.
    """
    res = librmm.device_array(words.size(), dtype=dtype)
    out_ptr = res.device_ctypes_pointer.value
    # Resolve the method by name instead of eval()-ing a formatted source
    # string: same call, but `func` can no longer smuggle in arbitrary code.
    method = getattr(words, func)
    if arg is None:
        method(out_ptr)
    else:
        method(arg, out_ptr)
    return res

### Read data

In [5]:
# Make sure the cache directory used by the parquet export exists.
# exist_ok=True replaces the separate exists()/mkdir() pair, so there is no
# window between the check and the creation, and re-running the cell is a no-op.
os.makedirs('../../cache', exist_ok=True)

In [6]:
# Directory holding train.csv, test.csv and submission_popular.csv.
# Set TRIVAGO_DATA_DIR to point somewhere else; the original location is the default.
path = os.environ.get('TRIVAGO_DATA_DIR', '/datasets/trivago/data/')

In [7]:
# Register the label under which the next timed cell stores its duration.
step = 'read csv'
STEPS += [step]

### pandas read csv

In [8]:
%%time
# Load train, test and the sample submission, then stack train and test into a
# single frame so every later step handles both at once.
start = time.time()
train_pd = pd.read_csv(os.path.join(path, 'train.csv'))
test_pd = pd.read_csv(os.path.join(path, 'test.csv'))
submission_pd = pd.read_csv(os.path.join(path, 'submission_popular.csv'))
print("train & test",train_pd.shape,test_pd.shape)
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# the test rows reuse the train rows' labels, and any later index-aligned
# operation on the combined frame has to cope with duplicate labels.
data_pd = pd.concat([train_pd,test_pd], ignore_index=True)
print('combined',data_pd.shape)
CPU_RUN_TIME[step] = time.time() - start

train & test (15932992, 12) (3782335, 12)
combined (19715327, 12)
CPU times: user 28.6 s, sys: 6.12 s, total: 34.7 s
Wall time: 34.7 s


In [9]:
# The combined frame supersedes the two raw frames; drop these names.
del train_pd
del test_pd

### Only keep click out rows

In [10]:
# Register the label under which the next timed cell stores its duration.
step = 'string comparsion and masking'
STEPS += [step]

### pandas string comparison and masking

In [11]:
%%time
start = time.time()
# Filter with the boolean mask directly instead of adding and then dropping a
# temporary 'is_click_out' column. .copy() makes the result an independent
# frame, so the column assignment below is not a write to a slice.
data_pd = data_pd[data_pd['action_type']=='clickout item'].copy()
print("# of clickouts:",data_pd.shape[0])
# Rows without a reference are the ones reported as 'true test' below.
data_pd['clickout_missing'] = data_pd['reference'].isnull()

# Select the 'true test' rows once and reuse the shape for the report and the check.
true_test_shape = data_pd[data_pd['clickout_missing']].shape
print('true test',true_test_shape)
assert submission_pd.shape[0] == true_test_shape[0]
print('true test shape match submission shape')
CPU_RUN_TIME[step] = time.time() - start

# of clickouts: 2115365
true test (253573, 13)
true test shape match submission shape
CPU times: user 5.2 s, sys: 2.86 s, total: 8.05 s
Wall time: 8.05 s


In [12]:
%%time
# Number the remaining rows 0..n-1; row_id is the join key used further down.
data_pd['row_id'] = np.arange(len(data_pd))

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 6.5 ms


### Create recommendation list from `impressions`

In [13]:
# Register the label under which the next timed cell stores its duration.
step = 'string column split & expand'
STEPS += [step]

### pandas string column split and expand

In [14]:
%%time
start = time.time()
# Split each "|"-delimited string column into one column per position:
# impressions first, then prices.
candidates_pd, prices_pd = (
    data_pd[column].str.split("|", expand = True)
    for column in ('impressions', 'prices')
)
CPU_RUN_TIME[step] = time.time() - start

CPU times: user 33.4 s, sys: 3.69 s, total: 37.1 s
Wall time: 37.1 s


In [15]:
%%time
# The raw delimited strings have been expanded already; remove both in one call.
data_pd.drop(['impressions','prices'],axis=1,inplace=True)

CPU times: user 412 ms, sys: 136 ms, total: 548 ms
Wall time: 544 ms


### Assign string columns to dataframe

In [16]:
# Register the label under which the next timed cell stores its duration.
step = 'assign string columns to dataframe'
STEPS += [step]

In [17]:
%%time
start = time.time()
# .copy() makes this an independent frame, so the column assignments in the
# loop are ordinary inserts rather than writes to a selection of data_pd
# (pandas reports those as SettingWithCopyWarning, which this notebook silences).
data_pd_rec_list = data_pd[['row_id']].copy()
for i in range(candidates_pd.shape[1]):
    # Position i of the split impressions and the matching split price.
    data_pd_rec_list['item_%d'%i] = candidates_pd[i]
    data_pd_rec_list['price_%d'%i] = prices_pd[i]
# Index by row_id so the later stack() keeps row_id as the outer key.
data_pd_rec_list = data_pd_rec_list.set_index('row_id')
CPU_RUN_TIME[step] = time.time() - start

CPU times: user 13.5 s, sys: 2.1 s, total: 15.6 s
Wall time: 15.6 s


### Create data pair

In [18]:
# Register the label under which the next timed cell stores its duration.
step = 'create data pair'
STEPS += [step]

In [19]:
%%time
start = time.time()

# Reshape the wide item_*/price_* columns into long format: one row per
# (row_id, candidate position).
cols = [i for i in data_pd_rec_list.columns if i.startswith('item_')]
# Parse the position number once per column label here, rather than running a
# Python lambda on every row of the stacked frame further down.
order_by_label = {label: int(label.split('_')[1]) for label in cols}
items = data_pd_rec_list[cols].stack().reset_index()
items.columns = ['row_id','candidate_order','item_id']

cols = [i for i in data_pd_rec_list.columns if i.startswith('price_')]
prices = data_pd_rec_list[cols].stack().reset_index()
prices.columns = ['row_id','candidate_order','price']

# Both stacked frames carry a fresh 0..n-1 index after reset_index(), so this
# assignment pairs item k with price k by position.
# NOTE(review): assumes every row has as many prices as impressions — confirm.
items['price'] = prices['price'].astype(int)
items['candidate_order'] = items['candidate_order'].map(order_by_label).astype(int)

# Keep only rows whose row_id has more than one candidate.
count = items['row_id'].value_counts()
items['row_id_count'] = items['row_id'].map(count)
items = items[items['row_id_count']>1]

data_pd['clickout_missing'] = data_pd['clickout_missing'].astype(int)
data_pair_pd = items.merge(data_pd,on='row_id',how='left')

# Missing references become -1 so the column can be cast to int; target is 1
# where the candidate item equals the reference.
data_pair_pd['reference'] = data_pair_pd['reference'].fillna(-1).astype(int)
data_pair_pd['item_id'] = data_pair_pd['item_id'].fillna(-1).astype(int)
data_pair_pd['target'] = data_pair_pd['reference'] == data_pair_pd['item_id']
data_pair_pd['target'] = data_pair_pd['target'].astype(int)

CPU_RUN_TIME[step] = time.time() - start

CPU times: user 1min 19s, sys: 23.7 s, total: 1min 43s
Wall time: 1min 43s


#### Save To Parquet

#### Get Model Ready & Export to Parquet
Split the current dataframe into training, validation, and test sets, and export each one to Parquet for downstream processing.

In [20]:
# Rows with a known reference (clickout_missing == 0) form the training pool;
# the rest form the test set.
# .copy() makes the training pool an independent frame, so adding 'is_va' below
# is a plain column insert rather than a write to a slice of data_pair_pd.
train_pair_pd = data_pair_pd[data_pair_pd['clickout_missing']==0].copy()
test_pair_pd = data_pair_pd[data_pair_pd['clickout_missing']>0]
# Rows whose row_id is a multiple of 5 are held out for validation.
train_pair_pd['is_va'] = train_pair_pd.row_id%5 == 0
train_pair = train_pair_pd[train_pair_pd['is_va']==0]
valid_pair = train_pair_pd[train_pair_pd['is_va']>0]

In [24]:
# The split marker is no longer needed once the two subsets exist.
train_pair, valid_pair = (
    subset.drop('is_va', axis=1) for subset in (train_pair, valid_pair)
)

In [22]:
# Show the (rows, columns) shape of the validation subset.
valid_pair.shape

(8551343, 18)

In [25]:
# Write the frames to the cache directory in a fixed order: the full pair table
# (for the Fastai version) first, then the valid/train/test subsets (for the
# Tensor version).
for _frame, _target in (
    (data_pair_pd, '../../cache/data_pair.parquet'),
    (valid_pair, '../../cache/valid.parquet'),
    (train_pair, '../../cache/train.parquet'),
    (test_pair_pd, '../../cache/test.parquet'),
):
    _frame.to_parquet(_target)

### Visualize the timing

In [None]:
# Total the per-step timings. 'Overall' is left out of the sum and appended to
# STEPS only once, so re-running this cell neither double-counts the total nor
# duplicates the row.
CPU_RUN_TIME['Overall'] = sum(CPU_RUN_TIME[i] for i in STEPS if i != 'Overall')
if 'Overall' not in STEPS:
    STEPS.append('Overall')

# One row per step, in registration order, with 'Overall' last.
timing = pd.DataFrame({'step': STEPS, 'CPU': [CPU_RUN_TIME[i] for i in STEPS]})
timing