In [1]:
import os
GPU_id = 1
os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_id)
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline


In [2]:
import torch
import pandas as pd
import numpy as np
from time import time 

from fastai import *
from fastai.basic_data import *
from fastai.basic_data import *
from fastai.tabular import *
from fastai.basic_data import DataBunch
from fastai.tabular import TabularModel

import cudf

from preproc import *
from batchloader import *
from helpers import get_mean_reciprocal_rank, roc_auc_score

- In this notebook we want to benchmark the processing and training time for three diffrent models: 

- The two first models are using our CuDF processing workflow <a href=#cudf_workflow> section I </a>:
     1.  <a href=#first_model> Model 1 </a> : CuDF processing with CPU a copy
     2.  <a href=#second_model> Model 2 </a> : CuDF processing in-memory without copy    

           
 - <a href=#third_model> Model 3 </a> : In the second <a href=#fastai_workflow> section II </a>, we are using the Fastai processing workflow to get the scores of the best model found in the section I.  We directly process and create databunch from data_pair_all.pkl dataframe 

**N.B** : For each model, you need to re-start the kernel to free the GPU memory and be able to run all the experiments 

In [3]:
%load_ext snakeviz
# load snakeviz if you want to run profiling 

<h1> <center> <a id=batchdatabunch>New Data Bunch </a></center> </h1> 

### Define a custom databunch fastai that takes a TensorBatchDataLoader instead of the usual torch DataLoader 

In [5]:
class BatchDataBunch(DataBunch):
    
    @classmethod
    def remove_tfm(cls, tfm:Callable)->None:
        "Remove `tfm` from `self.tfms`."
        if tfm in cls.tfms: cls.tfms.remove(tfm)
            
    @classmethod
    def add_tfm(cls,tfm:Callable)->None:
        "Add `tfm` to `self.tfms`."
        cls.tfms.append(tfm)

    
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=64, val_bs=None, 
                      num_workers:int=defaults.cpus, device:torch.device=None,
                      collate_fn:Callable=data_collate, tfms: List[Callable]=None, 
                       size:int=None, **kwargs)->'BatchDataBunch':
        
        
        cls.tfms = listify(tfms)
        
        
        val_bs = ifnone(val_bs, bs)
        
        datasets = [TensorBatchDataset(train_ds, batch_size=bs), 
                    TensorBatchDataset(valid_ds, batch_size=bs)]
        
        if valid_ds is not None:
            cls.empty_val = False
        else:
            cls.empty_val = True
            
        if test_ds is not None:
            datasets.append(TensorBatchDataset(test_ds, batch_size=bs))
        else: 
            datasets.append(test_ds)
        
        cls.device = defaults.device if device is None else device
        
        dls = [BatchDataLoader(d, shuffle=s, pin_memory=False, drop_last=False, device=cls.device) for d,s in
               zip(datasets,(True,False,False)) if d is not None]

        cls.path = path 
        
        cls.dls = dls
    
        
        
        assert not isinstance(dls[0],DeviceDataLoader)
        
        
        # load batch in device 
        
        if test_ds is not None:
            cls.train_dl, cls.valid_dl, cls.test_dl = dls
        else: 
            cls.train_dl, cls.valid_dl = dls
            
            
        cls.path = Path(path)
        return cls
    


- To use the new BatchDatabunch class, we have to build the following processed tensors ( using cudf)  : 
    - train : cat_tensor, cont_tensor, label_tensor 
    
    - valid : cat_tensor, cont_tensor, label_tensor 
    
    - test : cat_tensor, cont_tensor, label_tensor 
    
- The size of vocaublary of each categorical variable need to be known 

<h2> <a >Tabular Data in-memory  </a> </h2>

In [6]:
to_cpu = False 

<h3> Processing: Definition of train, validation and test tensors </h3>

In [7]:
# %%snakeviz 
# uncomment the line above to generate the snakeviz profile of preprocessing 

data_path = './parquet_data/data_pair_all/'
TEST = 'test'
VALID = 'valid'
TRAIN = 'train'

start0 = time()
data = {}

############################
#                          #
# Fit processing train set #
#                          #
############################
start = time()
path = os.path.join(data_path,TRAIN+'.parquet' )
ds = cudf.read_parquet(path)
print(f"read {TRAIN} used {time()-start:.2f} seconds.")

# get variable names 
start = time()
cat_names = ['user_id','item_id','platform','city','device','current_filters'] + [i for i in ds.columns if i.startswith('is_')]
cont_names = ['price','candidate_order'] + [i for i in ds.columns if i.startswith('count') or 'rank' in i or i.startswith('delta_')]
print(f"get variables names used {time()-start:.2f} seconds.")

# init the processing class 
proc = PreprocessDF(cat_names=cat_names, cont_names=cont_names, label_name='target', to_cpu=to_cpu)

# Fit training 
start = time()
x, y = proc.preproc_dataframe(ds, mode=TRAIN)
print(f"processing {TRAIN} used {time()-start:.2f} seconds.")
del ds
data[TRAIN] = (x, y)

############################
#                          #
# Transform test and valid #
#                          #
############################  
ds_name = [TEST, VALID]
for name in ds_name:
    path = os.path.join(data_path,name+'.parquet' )
    ds = cudf.read_parquet(path)

    print(f"read {name} used {time()-start:.2f} seconds.")
    start = time()
    x, y = proc.preproc_dataframe(ds, mode=name)
    print(f"processing {name} used {time()-start:.2f} seconds.")
    data[name] = (x, y)
    del ds

print(f"The whole processing used {time()-start0:.2f} seconds.")

read train used 7.86 seconds.
get variables names used 0.00 seconds.
processing train used 11.92 seconds.
read test used 12.70 seconds.
processing test used 9.36 seconds.
read valid used 10.35 seconds.
processing valid used 11.71 seconds.
The whole processing used 42.63 seconds.


<h3> Benchmark : Get the best (batch size, learning rate)</h3> 

- Fine tune the best couple (batch_size, lr) : The criterion used is the CrossEntropy loss function 
    - The range of batch sizes is : 4096, 8192, 20480, 40960, 81920, 204800, 409600, 819200
    - The range of max learning rate was set w.r.t to the plot of the results of the Fastai method find_lr : [6e-2, 9e-2, 2e-1] 
    
    
- **N.B:** Some of the batch_sizes require more than one epoch to get the best score (numbers shown in the paper). However, to the complexity of the notebook, we'll run all the training with 1 epoch as our best model (fastest training time) converges in 1 epoch. 

In [8]:
batch_sizes = [4096, 8192, 20480, 40960, 81920, 204800, 409600, 819200]
lrs = [6e-2, 9e-2, 2e-1] 

In [9]:
# Define batch databunch 
from fastai.tabular import TabularModel
from time import time 
benchmark_results = [] 

for batch_size in batch_sizes: 
    train = [data['train'][0][0], data['train'][0][1], data['train'][1].long()]
    validation = [data['valid'][0][0], data['valid'][0][1], data['valid'][1].long()]
    test = [data['test'][0][0], data['test'][0][1], data['test'][1].long()]
    databunch = BatchDataBunch.create(train, validation, device='cuda', bs=batch_size)   
    del train 
    del validation 
    del test
    for learning_rate in lrs: 
        #define the model 
        
        emb_sz = [(938604, 16), (903867, 16), (56, 4), (32763, 8), (4, 1), (27842, 8), 
                  (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]  


        model = TabularModel(emb_szs = emb_sz, n_cont=len(cont_names), out_sz=2, layers=[64, 32])

        learn =  Learner(databunch, model, metrics=None)
        
        learn.loss_func = torch.nn.CrossEntropyLoss()
        
        # launch training 
        start = time()
        learn.fit_one_cycle(1, learning_rate)
        t_final = time() - start 
        benchmark_results.append([batch_size, learning_rate, learn.recorder.val_losses[0], 1, t_final] ) 
        print('training for the couple: lr: %s, bs: %s used %.2f' %(learning_rate, batch_size, t_final))

        del learn 
        del model
        torch.cuda.empty_cache()   
        
    del databunch
    torch.cuda.empty_cache()   

epoch,train_loss,valid_loss,time
0,0.113469,0.117696,02:56


training for the couple: lr: 0.06, bs: 4096 used 176.37


epoch,train_loss,valid_loss,time
0,0.113058,0.116693,03:01


training for the couple: lr: 0.09, bs: 4096 used 181.84


epoch,train_loss,valid_loss,time
0,0.115227,0.120717,02:53


training for the couple: lr: 0.2, bs: 4096 used 173.59


epoch,train_loss,valid_loss,time
0,0.111515,0.117041,02:23


training for the couple: lr: 0.06, bs: 8192 used 143.79


epoch,train_loss,valid_loss,time
0,0.111174,0.1161,02:27


training for the couple: lr: 0.09, bs: 8192 used 147.70


epoch,train_loss,valid_loss,time
0,0.114644,0.11782,02:23


training for the couple: lr: 0.2, bs: 8192 used 143.74


epoch,train_loss,valid_loss,time
0,0.109096,0.118919,02:13


training for the couple: lr: 0.06, bs: 20480 used 133.46


epoch,train_loss,valid_loss,time
0,0.110107,0.116204,02:14


training for the couple: lr: 0.09, bs: 20480 used 134.21


epoch,train_loss,valid_loss,time
0,0.110983,0.45233,02:13


training for the couple: lr: 0.2, bs: 20480 used 133.54


epoch,train_loss,valid_loss,time
0,0.110156,0.115422,02:16


training for the couple: lr: 0.06, bs: 40960 used 136.28


epoch,train_loss,valid_loss,time
0,0.110243,0.116298,02:15


training for the couple: lr: 0.09, bs: 40960 used 135.80


epoch,train_loss,valid_loss,time
0,0.11085,0.132129,02:15


training for the couple: lr: 0.2, bs: 40960 used 135.39


epoch,train_loss,valid_loss,time
0,0.112239,0.11568,02:18


training for the couple: lr: 0.06, bs: 81920 used 138.78


epoch,train_loss,valid_loss,time
0,0.112199,0.11537,02:18


training for the couple: lr: 0.09, bs: 81920 used 138.36


epoch,train_loss,valid_loss,time
0,0.113172,0.116391,02:18


training for the couple: lr: 0.2, bs: 81920 used 138.35


epoch,train_loss,valid_loss,time
0,0.125682,0.117607,01:37


training for the couple: lr: 0.06, bs: 204800 used 97.95


epoch,train_loss,valid_loss,time
0,0.123097,0.11706,01:37


training for the couple: lr: 0.09, bs: 204800 used 97.68


epoch,train_loss,valid_loss,time
0,0.121157,0.11704,01:38


training for the couple: lr: 0.2, bs: 204800 used 98.01


epoch,train_loss,valid_loss,time
0,0.165766,0.123385,01:46


training for the couple: lr: 0.06, bs: 409600 used 106.95


epoch,train_loss,valid_loss,time
0,0.159684,0.126204,01:46


training for the couple: lr: 0.09, bs: 409600 used 106.64


epoch,train_loss,valid_loss,time
0,0.151141,0.123508,01:46


training for the couple: lr: 0.2, bs: 409600 used 106.78


epoch,train_loss,valid_loss,time
0,0.199256,0.130854,01:48


training for the couple: lr: 0.06, bs: 819200 used 108.96


epoch,train_loss,valid_loss,time
0,0.195795,0.131586,01:48


training for the couple: lr: 0.09, bs: 819200 used 108.70


epoch,train_loss,valid_loss,time
0,0.191146,0.136116,01:48


training for the couple: lr: 0.2, bs: 819200 used 108.75


In [10]:
results = pd.DataFrame(benchmark_results)
results.columns = ['batch size', 'learning rate', 'validation loss', 'N epochs', 'training time']

In [11]:
results.sort_values(by=['training time', 'validation loss'], ascending=True).head(10)

Unnamed: 0,batch size,learning rate,validation loss,N epochs,training time
16,204800,0.09,0.11706,1,97.675189
15,204800,0.06,0.117607,1,97.948497
17,204800,0.2,0.11704,1,98.00945
19,409600,0.09,0.126204,1,106.638807
20,409600,0.2,0.123508,1,106.780997
18,409600,0.06,0.123385,1,106.953093
22,819200,0.09,0.131586,1,108.699999
23,819200,0.2,0.136116,1,108.745657
21,819200,0.06,0.130854,1,108.957399
6,20480,0.06,0.118919,1,133.462606


**Conclusion** The best trade-off between training time and validation loss is reached for the couple **(204800, 0.09)**

<h3> Compute average validation scores of the best model </h3>

In [12]:
from helpers import get_mean_reciprocal_rank, roc_auc_score
path = os.path.join(data_path,'valid.parquet' )
ds = pd.read_parquet(path)

In [13]:
## Mean / std of scores : 5 runs 
aucs = []
mrrs = []
times = []
best_bs = 4096*50
best_lr = 9e-2

# create databunch 
train = [data['train'][0][0], data['train'][0][1], data['train'][1].long()]
validation = [data['valid'][0][0], data['valid'][0][1], data['valid'][1].long()]
test = [data['test'][0][0], data['test'][0][1], data['test'][1].long()]
databunch = BatchDataBunch.create(train, validation, device='cuda', bs=best_bs)   

del train 
del validation 
del test

emb_sz = [(938604, 16), (903867, 16), (56, 4), (32763, 8), (4, 1), (27842, 8), 
          (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]  


# run the model 5 times to get 5 scores 

for i in range(5): 
    # define the model
    model = TabularModel(emb_szs = emb_sz, n_cont=len(cont_names), out_sz=2, layers=[64, 32])
    model = model.cuda()
    learn =  Learner(databunch, model, metrics=None)
    learn.loss_func = torch.nn.CrossEntropyLoss()
    
    # train the model 
    start = time()
    learn.fit_one_cycle(1, best_lr)
    tf = time()-start
    
    # get validation metrics 
    yp,y_valid = learn.get_preds(databunch)
    cv = ds[['row_id','reference','item_id', 'target']].copy()
    cv['prob'] = yp.numpy()[:,1]
    cv = cv.sort_values(by=['row_id','prob'],ascending=False)
    auc = roc_auc_score(y_valid.numpy().ravel(),yp.numpy()[:,1])
    mean_reciprocal_rank = get_mean_reciprocal_rank(cv)
    
    aucs.append(auc)
    mrrs.append(mean_reciprocal_rank)
    times.append(tf)
    
    del model 
    del learn
    torch.cuda.empty_cache()
    

In [14]:
print("the mrr of the best mdodel is: %s +/- %s" %(np.mean(mrrs), np.std(mrrs)))

print("the auc of the best mdodel is: %s +/- %s" %(np.mean(aucs), np.std(aucs)))

print("the best mdodel's training time is %s +/- %s" %(np.mean(times), np.std(times)))

the mrr of the best mdodel is: 0.611057369826576 +/- 0.0012801593160030626
the auc of the best mdodel is: 0.8766171778021861 +/- 0.004299151814914718
the best mdodel's training time is 97.94034910202026 +/- 0.126405132714989
