In [1]:
import os
# Expose only one GPU to this process. The variable is set here, before torch
# and cudf are imported in the next cell.
GPU_id = 0
os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_id)
import warnings
# Silence all Python warnings for the rest of the notebook session.
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
import torch
import pandas as pd
import numpy as np
from time import time 

from fastai import *
from fastai.basic_data import *
from fastai.basic_data import *
from fastai.tabular import *
from fastai.basic_data import DataBunch
from fastai.tabular import TabularModel

import cudf

from preproc import *
from batchloader import *
from helpers import get_mean_reciprocal_rank, roc_auc_score
cudf.__version__

'0.9.0'

In [3]:
%load_ext snakeviz
# load snakeviz if you want to run profiling 

<h1> <center> <a id=batchdatabunch>New Data Bunch </a></center> </h1> 

### Define a custom fastai DataBunch that uses a BatchDataLoader (over TensorBatchDataset) instead of the usual torch DataLoader 

In [4]:
class BatchDataBunch(DataBunch):
    """`DataBunch` variant whose loaders are `BatchDataLoader`s over `TensorBatchDataset`s.

    All state (`tfms`, `device`, `path`, `dls`, `train_dl`, `valid_dl`, `test_dl`,
    `empty_val`) is stored on the class itself and `create` returns the class,
    not an instance, so a later `create` call overwrites the loaders set by an
    earlier one.
    """

    @classmethod
    def remove_tfm(cls, tfm:Callable)->None:
        "Remove `tfm` from `cls.tfms` if it is present."
        if tfm in cls.tfms: cls.tfms.remove(tfm)

    @classmethod
    def add_tfm(cls, tfm:Callable)->None:
        "Append `tfm` to `cls.tfms`."
        cls.tfms.append(tfm)

    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=64, val_bs=None,
                      num_workers:int=defaults.cpus, device:torch.device=None,
                      collate_fn:Callable=data_collate, tfms: List[Callable]=None,
                       size:int=None, **kwargs)->'BatchDataBunch':
        """Build the train/valid/test `BatchDataLoader`s and attach them to the class.

        Parameters
        ----------
        train_ds, valid_ds, test_ds
            Inputs handed to `TensorBatchDataset`. `valid_ds` and `test_ds` may be
            `None`, in which case no loader is built for that split.
        path
            Stored on the class as a `Path`.
        bs
            Batch size of the training dataset.
        val_bs
            Batch size of the validation and test datasets; defaults to `bs`.
        device
            Device passed to every `BatchDataLoader`; defaults to `defaults.device`.
        tfms
            Stored (listified) in `cls.tfms`.
        num_workers, collate_fn, size, **kwargs
            Accepted for signature compatibility but not used by this implementation.

        Returns
        -------
        The class itself (not an instance), with the loaders set as class attributes.
        """
        cls.tfms = listify(tfms)
        val_bs = ifnone(val_bs, bs)
        cls.empty_val = valid_ds is None
        cls.device = defaults.device if device is None else device
        cls.path = Path(path)

        def _loader(ds, batch_size, shuffle):
            # Wrap the raw tensors in a batch dataset and put a loader on top of it.
            dataset = TensorBatchDataset(ds, batch_size=batch_size)
            return BatchDataLoader(dataset, shuffle=shuffle, pin_memory=False,
                                   drop_last=False, device=cls.device)

        # Only the training split is shuffled. Validation and test honour `val_bs`.
        train_dl = _loader(train_ds, bs, True)
        valid_dl = None if valid_ds is None else _loader(valid_ds, val_bs, False)
        test_dl = None if test_ds is None else _loader(test_ds, val_bs, False)

        assert not isinstance(train_dl, DeviceDataLoader)

        cls.dls = [dl for dl in (train_dl, valid_dl, test_dl) if dl is not None]
        cls.train_dl = train_dl
        cls.valid_dl = valid_dl
        # `test_dl` is only assigned when a test set was supplied.
        if test_dl is not None:
            cls.test_dl = test_dl

        return cls
    


## Config of the fastest workflow 

In [5]:
# Learning rate handed to `learn.fit_one_cycle`.
lr = 0.09
# Number of rows per batch, passed as `bs` when the databunch is created.
batch_size = 50 * 4096
# Forwarded to `PreprocessDF(to_cpu=...)`.
to_cpu = False

## Process data : 

In [6]:
# %%snakeviz 
# uncomment the line above to generate the snakeviz profile of preprocessing 

data_path = '../cache/'
TEST = 'test'
VALID = 'valid'
TRAIN = 'train'

start0 = time()  # wall-clock start of the whole preprocessing cell
data = {}        # split name -> (x, y) as returned by proc.preproc_dataframe

############################
#                          #
# Fit processing train set #
#                          #
############################
start = time()
path = os.path.join(data_path, TRAIN + '.parquet')
ds = cudf.read_parquet(path)
print(f"read {TRAIN} used {time()-start:.2f} seconds.")

print(ds.head())

# get variable names 
start = time()
# Categorical inputs: fixed id-like columns plus every `is_*` flag column.
cat_names = ['user_id','item_id','platform','city','device','current_filters'] + [i for i in ds.columns if i.startswith('is_')]
# Continuous inputs: price/order plus every count*, *rank* and delta_* column.
cont_names = ['price','candidate_order'] + [i for i in ds.columns if i.startswith('count') or 'rank' in i or i.startswith('delta_')]
print(f"get variables names used {time()-start:.2f} seconds.")

# init the processing class 
proc = PreprocessDF(cat_names=cat_names, cont_names=cont_names, label_name='target', to_cpu=to_cpu)

# Fit training 
start = time()
x, y = proc.preproc_dataframe(ds, mode=TRAIN)
print(f"processing {TRAIN} used {time()-start:.2f} seconds.")
del ds  # release the raw frame before loading the next split
data[TRAIN] = (x, y)

############################
#                          #
# Transform test and valid #
#                          #
############################  
ds_name = [TEST, VALID]
for name in ds_name:
    path = os.path.join(data_path, name + '.parquet')

    # Restart the timer before reading; otherwise the reported read time also
    # includes the previous split's processing time.
    start = time()
    ds = cudf.read_parquet(path)
    print(f"read {name} used {time()-start:.2f} seconds.")

    start = time()
    x, y = proc.preproc_dataframe(ds, mode=name)
    print(f"processing {name} used {time()-start:.2f} seconds.")
    data[name] = (x, y)
    del ds

print(f"The whole processing used {time()-start0:.2f} seconds.")

read train used 5.29 seconds.
    row_id  candidate_order  item_id  price  row_id_count       user_id  \
25       1                0    55109    162            25  00RL8Z82B2Z1   
26       1                1   129343     25            25  00RL8Z82B2Z1   
27       1                2    54824    150            25  00RL8Z82B2Z1   
28       1                3  2297972    143            25  00RL8Z82B2Z1   
29       1                4   109014    101            25  00RL8Z82B2Z1   

       session_id   timestamp  step    action_type  reference platform  \
25  aff3928535f48  1541038485    16  clickout item    1257342       AU   
26  aff3928535f48  1541038485    16  clickout item    1257342       AU   
27  aff3928535f48  1541038485    16  clickout item    1257342       AU   
28  aff3928535f48  1541038485    16  clickout item    1257342       AU   
29  aff3928535f48  1541038485    16  clickout item    1257342       AU   

                 city  device current_filters  clickout_missing  target  


## Fastai training 

In [7]:
def _split_tensors(split_name):
    "Return `[x[0], x[1], y.long()]` for the `(x, y)` pair stored in `data[split_name]`."
    features, labels = data[split_name]
    return [features[0], features[1], labels.long()]

train = _split_tensors('train')
validation = _split_tensors('valid')
test = _split_tensors('test')

# Only the train and validation splits are handed to the databunch.
databunch = BatchDataBunch.create(train, validation, bs=batch_size, device='cuda')

In [8]:
%%snakeviz
emb_sz = [(938604, 16), (903867, 16), (56, 4), (32763, 8), (4, 1), (27842, 8)]  

model = TabularModel(emb_szs = emb_sz, n_cont=len(cont_names), out_sz=2, layers=[64, 32]).cuda()

learn =  Learner(databunch, model, metrics=None)

learn.loss_func = torch.nn.CrossEntropyLoss()

start = time()
learn.fit_one_cycle(1, lr)
t_final = time() - start 

epoch,train_loss,valid_loss,time
0,0.15194,0.145247,00:19


 
*** Profile stats marshalled to file '/tmp/tmpg9grtpb_'. 
Embedding SnakeViz in this document...


In [9]:
# get validation metrics 
# Reload the validation rows from the same parquet file that was preprocessed
# above (data_path / VALID), so the ids and labels come from the data the model
# was scored on. NOTE(review): the positional join below assumes preprocessing
# keeps the row order of the file -- confirm in preproc.py.
ds = pd.read_parquet(os.path.join(data_path, VALID + '.parquet'))
yp,y_valid = learn.get_preds(databunch)

# Column 1 of the two-column prediction output is used as the score for both metrics.
prob = yp.numpy()[:,1]
if len(prob) != len(ds):
    raise ValueError(f"got {len(prob)} predictions for {len(ds)} validation rows")

cv = ds[['row_id','reference','item_id', 'target']].copy()
cv['prob'] = prob
# Within each row_id, candidates end up ordered from highest to lowest score.
cv = cv.sort_values(by=['row_id','prob'],ascending=False)
auc = roc_auc_score(y_valid.numpy().ravel(), prob)
mean_reciprocal_rank = get_mean_reciprocal_rank(cv)

In [10]:
# Report ranking quality, classification quality and training wall-clock time.
for template, value in (
    ("the mrr of the best mdodel is: %s ", mean_reciprocal_rank),
    ("the auc of the best mdodel is: %s ", auc),
    ("the best mdodel's training time is %s ", t_final),
):
    print(template % value)

the mrr of the best mdodel is: 0.4684004794019787 
the auc of the best mdodel is: 0.803668717305735 
the best mdodel's training time is 19.02902913093567 
