In [1]:
import os
GPU_id = 6
os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_id)
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline


In [2]:
import torch
import pandas as pd
import numpy as np
from time import time 

from fastai import *
from fastai.basic_data import *
from fastai.basic_data import *
from fastai.tabular import *
from fastai.basic_data import DataBunch
from fastai.tabular import TabularModel

import cudf

from preproc import *
from batchloader import *
from helpers import get_mean_reciprocal_rank, roc_auc_score

- In this notebook we want to benchmark the processing and training time for three diffrent models: 

- The two first models are using our CuDF processing workflow <a href=#cudf_workflow> section I </a>:
     1.  <a href=#first_model> Model 1 </a> : CuDF processing with CPU a copy
     2.  <a href=#second_model> Model 2 </a> : CuDF processing in-memory without copy    

           
 - <a href=#third_model> Model 3 </a> : In the second <a href=#fastai_workflow> section II </a>, we are using the Fastai processing workflow to get the scores of the best model found in the section I.  We directly process and create databunch from data_pair_all.pkl dataframe 

**N.B** : For each model, you need to re-start the kernel to free the GPU memory and be able to run all the experiments 

In [3]:
%load_ext snakeviz
# load snakeviz if you want to run profiling 

<h1> <center> <a id=batchdatabunch>New Data Bunch </a></center> </h1> 

### Define a custom databunch fastai that takes a TensorBatchDataLoader instead of the usual torch DataLoader 

In [4]:
class BatchDataBunch(DataBunch):
    
    @classmethod
    def remove_tfm(cls, tfm:Callable)->None:
        "Remove `tfm` from `self.tfms`."
        if tfm in cls.tfms: cls.tfms.remove(tfm)
            
    @classmethod
    def add_tfm(cls,tfm:Callable)->None:
        "Add `tfm` to `self.tfms`."
        cls.tfms.append(tfm)

    
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=64, val_bs=None, 
                      num_workers:int=defaults.cpus, device:torch.device=None,
                      collate_fn:Callable=data_collate, tfms: List[Callable]=None, 
                       size:int=None, **kwargs)->'BatchDataBunch':
        
        
        cls.tfms = listify(tfms)
        
        
        val_bs = ifnone(val_bs, bs)
        
        datasets = [TensorBatchDataset(train_ds, batch_size=bs), 
                    TensorBatchDataset(valid_ds, batch_size=bs)]
        
        if valid_ds is not None:
            cls.empty_val = False
        else:
            cls.empty_val = True
            
        if test_ds is not None:
            datasets.append(TensorBatchDataset(test_ds, batch_size=bs))
        else: 
            datasets.append(test_ds)
        
        cls.device = defaults.device if device is None else device
        
        dls = [BatchDataLoader(d, shuffle=s, pin_memory=False, drop_last=False, device=cls.device) for d,s in
               zip(datasets,(True,False,False)) if d is not None]

        cls.path = path 
        
        cls.dls = dls
    
        
        
        assert not isinstance(dls[0],DeviceDataLoader)
        
        
        # load batch in device 
        
        if test_ds is not None:
            cls.train_dl, cls.valid_dl, cls.test_dl = dls
        else: 
            cls.train_dl, cls.valid_dl = dls
            
            
        cls.path = Path(path)
        return cls
    


- To use the new BatchDatabunch class, we have to build the following processed tensors ( using cudf)  : 
    - train : cat_tensor, cont_tensor, label_tensor 
    
    - valid : cat_tensor, cont_tensor, label_tensor 
    
    - test : cat_tensor, cont_tensor, label_tensor 
    
- The size of vocaublary of each categorical variable need to be known 

- The two first models are using our CuDF processing workflow <a href=#cudf_workflow> section II </a>:
     1.  <a href=#first_model> Model 1 </a> : CuDF processing with CPU a copy
     2.  <a href=#second_model> Model 2 </a> : CuDF processing in-memory without copy    

           
 - The <a href=#fastai_workflow> third model </a> will use the Fastai processing workflow: Directly process and create databunch from data_pair_all.pkl dataframe 

<h1> <center>  <a id=cudf_workflow> Test of Tabular Learner with CuDF workflow </a></center> </h1>

**N.B:** For this section, you need to define the new custom BatchDataBunch class, if not go back to <a href=#batchdatabunch> section 1 </a>

<h2> 1. <a id=first_model> First model: Tabular Data copied to cpu </a> </h2>

In [5]:
to_cpu = True 

<h3> <a id=cudf_proc> Processing: Definition of train, validation and test tensors </a></h3>

In [6]:
# %%snakeviz 
# uncomment the line above to generate the snakeviz profile of preprocessing 

data_path = '../cache/'
TEST = 'test'
VALID = 'valid'
TRAIN = 'train'

start0 = time()
data = {}

############################
#                          #
# Fit processing train set #
#                          #
############################
start = time()
path = os.path.join(data_path,TRAIN+'.parquet' )
ds = cudf.read_parquet(path)
print(f"read {TRAIN} used {time()-start:.2f} seconds.")

# get variable names 
start = time()
cat_names = ['user_id','item_id','platform','city','device','current_filters'] + [i for i in ds.columns if i.startswith('is_')]
cont_names = ['price','candidate_order'] + [i for i in ds.columns if i.startswith('count') or 'rank' in i or i.startswith('delta_')]
print(f"get variables names used {time()-start:.2f} seconds.")

# init the processing class 
proc = PreprocessDF(cat_names=cat_names, cont_names=cont_names, label_name='target', to_cpu=to_cpu)

# Fit training 
start = time()
x, y = proc.preproc_dataframe(ds, mode=TRAIN)
print(f"processing {TRAIN} used {time()-start:.2f} seconds.")
del ds
data[TRAIN] = (x, y)

############################
#                          #
# Transform test and valid #
#                          #
############################  
ds_name = [TEST, VALID]
for name in ds_name:
    path = os.path.join(data_path,name+'.parquet' )
    ds = cudf.read_parquet(path)

    print(f"read {name} used {time()-start:.2f} seconds.")
    start = time()
    x, y = proc.preproc_dataframe(ds, mode=name)
    print(f"processing {name} used {time()-start:.2f} seconds.")
    data[name] = (x, y)
    del ds

print(f"The whole processing used {time()-start0:.2f} seconds.")

read train used 6.77 seconds.
get variables names used 0.00 seconds.
processing train used 2.78 seconds.
read test used 3.13 seconds.
processing test used 5.11 seconds.
read valid used 6.58 seconds.
processing valid used 23.40 seconds.
The whole processing used 39.88 seconds.


<h3> Benchmark : Get the best (batch size, learning rate)</h3> 

- Fine tune the best couple (batch_size, lr) : The criterion used is the CrossEntropy loss function 
    - The range of batch sizes is : 4096, 8192, 20480, 40960, 81920, 204800, 409600, 819200
    - The range of max learning rate was set w.r.t to the plot of the results of the Fastai method find_lr : [6e-2, 9e-2, 2e-1] 
    
    
- **N.B:** Some of the batch_sizes require more than one epoch to get the best score (numbers shown in the paper). However, to the complexity of the notebook, we'll run all the training with 1 epoch as our best model (fastest training time) converges in 1 epoch. 

In [7]:
batch_sizes = [4096, 8192, 20480, 40960, 81920, 204800, 409600, 819200]
lrs = [6e-2, 9e-2, 2e-1] 

In [8]:
# Define batch databunch 
benchmark_results = [] 

for batch_size in batch_sizes: 
    train = [data['train'][0][0], data['train'][0][1], data['train'][1].long()]
    validation = [data['valid'][0][0], data['valid'][0][1], data['valid'][1].long()]
    test = [data['test'][0][0], data['test'][0][1], data['test'][1].long()]
    databunch = BatchDataBunch.create(train, validation, device='cuda', bs=batch_size)   
    
    for learning_rate in lrs: 
        print('Launch training for the couple: lr: %s, bs: %s ' %(learning_rate, batch_size))
        #define the model 
        emb_sz = [(938604, 16), (904722, 16), (56, 4), (32763, 8), (4, 1), (27842, 8), 
                  (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]    

        model = TabularModel(emb_szs = emb_sz, n_cont=25, out_sz=2, layers=[64, 32])

        learn =  Learner(databunch, model, metrics=None)
        
        learn.loss_func = torch.nn.CrossEntropyLoss()
        
        # launch training 
        start = time()
        learn.fit_one_cycle(1, learning_rate)
        t_final = time() - start 
        benchmark_results.append([batch_size, learning_rate, learn.recorder.val_losses[0], 1, t_final] ) 
        del learn 
    del databunch

Launch training for the couple: lr: 0.06, bs: 4096 


epoch,train_loss,valid_loss,time


IndexError: index 6 is out of bounds for dimension 1 with size 6

In [None]:
results = pd.DataFrame(benchmark_results)
results.columns = ['batch size', 'learning rate', 'validation loss', 'N epochs', 'training time']

In [None]:
results.sort_values(by=['validation loss', 'training time'], ascending=True).head(10)

- **Conclusion** The best trade-off between training time and validation loss is reached for the couple **(204800, 0.09)**

<h3> Compute average validation scores of the best model </h3>

In [None]:
ds = pd.read_parquet("./parquet_data/data_pair_all/valid.parquet")

In [None]:
## Mean / std of scores : 5 runs 
aucs = []
mrrs = []
times = []
best_bs = 204800*50
best_lr = 9e-2

emb_sz = [(938604, 16), (903867, 16), (56, 4), (32763, 8), (4, 1), (27842, 8), 
          (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]    

train = [data['train'][0][0], data['train'][0][1], data['train'][1].long()]
validation = [data['valid'][0][0], data['valid'][0][1], data['valid'][1].long()]
test = [data['test'][0][0], data['test'][0][1], data['test'][1].long()]
databunch = BatchDataBunch.create(train, validation, device='cuda', bs=best_bs)   

for i in range(5): 
    # define the model
    model = TabularModel(emb_szs = emb_sz, n_cont=25, out_sz=2, layers=[64, 32])
    model = model.cuda()
    learn =  Learner(databunch, model, metrics=None)
    learn.loss_func = torch.nn.CrossEntropyLoss()
    
    # train the model 
    start = time()
    learn.fit_one_cycle(1, best_lr)
    tf = time()-start
    
    # get validation metrics 
    yp,y_valid = learn.get_preds(databunch)
    cv = ds[['row_id','reference','item_id', 'target']].copy()
    cv['prob'] = yp.numpy()[:,1]
    cv = cv.sort_values(by=['row_id','prob'],ascending=False)
    auc = roc_auc_score(y_valid.numpy().ravel(),yp.numpy()[:,1])
    mean_reciprocal_rank = get_mean_reciprocal_rank(cv)
    
    aucs.append(auc)
    mrrs.append(mean_reciprocal_rank)
    times.append(tf)
    

In [None]:
print("the mrr of the best mdodel is: %s +/- %s" %(np.mean(mrrs), np.std(mrrs)))

print("the auc of the best mdodel is: %s +/- %s" %(np.mean(aucs), np.std(aucs)))

print("the best mdodel's training time is %s +/- %s" %(np.mean(times), np.std(times)))

<h2> 2. <a id=second_model> Second model: Tabular Data in-memory  </a> </h2>

In [None]:
to_cpu = False 

<h3> Processing: Definition of train, validation and test tensors </h3>

In [None]:
# %%snakeviz 
# uncomment the line above to generate the snakeviz profile of preprocessing 

data_path = './parquet_data/data_pair_all'
TEST = 'test'
VALID = 'valid'
TRAIN = 'train'

start0 = time()
data = {}

############################
#                          #
# Fit processing train set #
#                          #
############################
start = time()
path = os.path.join(data_path,TRAIN+'.parquet' )
ds = cudf.read_parquet(path)
print(f"read {TRAIN} used {time()-start:.2f} seconds.")

# get variable names 
start = time()
cat_names = ['user_id','item_id','platform','city','device','current_filters'] + [i for i in ds.columns if i.startswith('is_')]
cont_names = ['price','candidate_order'] + [i for i in ds.columns if i.startswith('count') or 'rank' in i or i.startswith('delta_')]
print(f"get variables names used {time()-start:.2f} seconds.")

# init the processing class 
proc = PreprocessDF(cat_names=cat_names, cont_names=cont_names, label_name='target', to_cpu=to_cpu)

# Fit training 
start = time()
x, y = proc.preproc_dataframe(ds, mode=TRAIN)
print(f"processing {TRAIN} used {time()-start:.2f} seconds.")
del ds
data[TRAIN] = (x, y)

############################
#                          #
# Transform test and valid #
#                          #
############################  
ds_name = [TEST, VALID]
for name in ds_name:
    path = os.path.join(data_path,name+'.parquet' )
    ds = cudf.read_parquet(path)

    print(f"read {name} used {time()-start:.2f} seconds.")
    start = time()
    x, y = proc.preproc_dataframe(ds, mode=name)
    print(f"processing {name} used {time()-start:.2f} seconds.")
    data[name] = (x, y)
    del ds

print(f"The whole processing used {time()-start0:.2f} seconds.")

<h3> Benchmark : Get the best (batch size, learning rate)</h3> 

- Fine tune the best couple (batch_size, lr) : The criterion used is the CrossEntropy loss function 
    - The range of batch sizes is : 4096, 8192, 20480, 40960, 81920, 204800, 409600, 819200
    - The range of max learning rate was set w.r.t to the plot of the results of the Fastai method find_lr : [6e-2, 9e-2, 2e-1] 
    
    
- **N.B:** Some of the batch_sizes require more than one epoch to get the best score (numbers shown in the paper). However, to the complexity of the notebook, we'll run all the training with 1 epoch as our best model (fastest training time) converges in 1 epoch. 

In [None]:
batch_sizes = [4096, 8192, 20480, 40960, 81920, 204800, 409600, 819200]
lrs = [6e-2, 9e-2, 2e-1] 

In [None]:
# Define batch databunch 
from fastai.tabular import TabularModel
from time import time 
benchmark_results = [] 

for batch_size in batch_sizes: 
    train = [data['train'][0][0], data['train'][0][1], data['train'][1].long()]
    validation = [data['valid'][0][0], data['valid'][0][1], data['valid'][1].long()]
    test = [data['test'][0][0], data['test'][0][1], data['test'][1].long()]
    databunch = BatchDataBunch.create(train, validation, device='cuda', bs=batch_size)   
    del train 
    del validation 
    del test
    for learning_rate in lrs: 
        #define the model 
        emb_sz = [(938604, 16), (903867, 16), (56, 4), (32763, 8), (4, 1), (27842, 8), 
                  (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]    

        model = TabularModel(emb_szs = emb_sz, n_cont=25, out_sz=2, layers=[64, 32])

        learn =  Learner(databunch, model, metrics=None)
        
        learn.loss_func = torch.nn.CrossEntropyLoss()
        
        # launch training 
        start = time()
        learn.fit_one_cycle(1, learning_rate)
        t_final = time() - start 
        benchmark_results.append([batch_size, learning_rate, learn.recorder.val_losses[0], 1, t_final] ) 
        print('training for the couple: lr: %s, bs: %s used %.2f' %(learning_rate, batch_size, t_final))

        del learn 
        del model
        torch.cuda.empty_cache()   
        
    del databunch
    torch.cuda.empty_cache()   

In [None]:
results = pd.DataFrame(benchmark_results)
results.columns = ['batch size', 'learning rate', 'validation loss', 'N epochs', 'training time']

In [None]:
results.sort_values(by=['training time', 'validation loss'], ascending=True).head(10)

**Conclusion** The best trade-off between training time and validation loss is reached for the couple **(204800, 0.09)**

<h3> Compute average validation scores of the best model </h3>

In [None]:
from helpers import get_mean_reciprocal_rank, roc_auc_score
ds = pd.read_parquet("./parquet_data/data_pair_all/valid.parquet")

In [None]:
## Mean / std of scores : 5 runs 
aucs = []
mrrs = []
times = []
best_bs = 4096*50
best_lr = 9e-2

# create databunch 
train = [data['train'][0][0], data['train'][0][1], data['train'][1].long()]
validation = [data['valid'][0][0], data['valid'][0][1], data['valid'][1].long()]
test = [data['test'][0][0], data['test'][0][1], data['test'][1].long()]
databunch = BatchDataBunch.create(train, validation, device='cuda', bs=best_bs)   

del train 
del validation 
del test

emb_sz = [(938604, 16), (903867, 16), (56, 4), (32763, 8), (4, 1), (27842, 8), 
          (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]  

# run the model 5 times to get 5 scores 

for i in range(5): 
    # define the model
    model = TabularModel(emb_szs = emb_sz, n_cont=25, out_sz=2, layers=[64, 32])
    model = model.cuda()
    learn =  Learner(databunch, model, metrics=None)
    learn.loss_func = torch.nn.CrossEntropyLoss()
    
    # train the model 
    start = time()
    learn.fit_one_cycle(1, best_lr)
    tf = time()-start
    
    # get validation metrics 
    yp,y_valid = learn.get_preds(databunch)
    cv = ds[['row_id','reference','item_id', 'target']].copy()
    cv['prob'] = yp.numpy()[:,1]
    cv = cv.sort_values(by=['row_id','prob'],ascending=False)
    auc = roc_auc_score(y_valid.numpy().ravel(),yp.numpy()[:,1])
    mean_reciprocal_rank = get_mean_reciprocal_rank(cv)
    
    aucs.append(auc)
    mrrs.append(mean_reciprocal_rank)
    times.append(tf)
    
    del model 
    del learn
    torch.cuda.empty_cache()
    

In [None]:
print("the mrr of the best mdodel is: %s +/- %s" %(np.mean(mrrs), np.std(mrrs)))

print("the auc of the best mdodel is: %s +/- %s" %(np.mean(aucs), np.std(aucs)))

print("the best mdodel's training time is %s +/- %s" %(np.mean(times), np.std(times)))

<h1> <center>  <a id=fastai_workflow> Test of Tabular Learner with Fastai workflow </a></center> </h1>

- As the processing time is taking more than 6minutes and our purpose is to benchmark the best model using our proposed workflow against the Fastai workflow. We'll directly compute the scores of the Tabular model with batch size of 204800 and learning rate 0.09 

<h3> <a id=third_model> Fastai model </a> </h3> 

In [None]:
batch_size = 4096*50

In [None]:
from fastai import *
from fastai.basic_data import *
from fastai.basic_data import *
from fastai.tabular import *
from fastai.basic_data import DataBunch
from batchloader import *

In [None]:
%%time
import pandas as pd
data_pair = pd.read_pickle('/rapids/notebooks/jperez/recsys/cache/data_pair_all.pkl')

<h3> Create pre-processed databunch </h3> 

In [None]:
%%time
# split to train / test 
train = data_pair[data_pair.clickout_missing==0]
test = data_pair[data_pair.clickout_missing>0]
print(train.shape,test.shape)

# get categorical and continious variables names 
cat_names = ['user_id','item_id','platform','city','device','current_filters'] + [i for i in train.columns if i.startswith('is_')]
cont_names = ['price','candidate_order'] + [i for i in train.columns if i.startswith('count') or 'rank' in i or i.startswith('delta_')]

# define validation rows
train['is_va'] = train.row_id%5 == 0
del data_pair

In [None]:
%%time
procs = [FillMissing, Categorify, Normalize]

test_list = TabularList.from_df(test, path='./', cat_names=cat_names, cont_names=cont_names)
data = (TabularList.from_df(train, path='./', cat_names=cat_names, cont_names=cont_names, procs=procs)
                           .split_from_df('is_va')
                           .label_from_df(cols='target')
                           .add_test(test_list)
                           .databunch(num_workers=8,bs=batch_size, device='cuda'))

<h3> Compute average validation scores of the best model  </h3> 

In [None]:
## Mean / std of scores : 5 runs 
aucs = []
mrrs = []
times = []
best_bs = 4096*50
best_lr = 9e-2

emb_sz = [(938604, 16), (903867, 16), (56, 4), (32763, 8), (4, 1), (27842, 8), 
          (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]

for i in range(5): 
    # define the model
    model = TabularModel(emb_szs = emb_sz, n_cont=25, out_sz=2, layers=[64, 32])
    model = model.cuda()
    learn =  Learner(data, model, metrics=None)
    learn.loss_func = torch.nn.CrossEntropyLoss()
    
    # train the model 
    start = time()
    learn.fit_one_cycle(1, best_lr)
    tf = time()-start
    
    # get validation metrics 
    yp,y_valid = learn.get_preds()
    cv = train.loc[train['is_va']>0,['row_id','reference','item_id', 'target']].copy()
    cv['prob'] = yp.numpy()[:,1]
    cv = cv.sort_values(by=['row_id','prob'],ascending=False)
    auc = roc_auc_score(y_valid.numpy().ravel(),yp.numpy()[:,1])
    mean_reciprocal_rank = get_mean_reciprocal_rank(cv)
    
    aucs.append(auc)
    mrrs.append(mean_reciprocal_rank)
    times.append(tf)
    

In [None]:
print("the mrr of the best mdodel is: %s +/- %s" %(np.mean(mrrs), np.std(mrrs)))

print("the auc of the best mdodel is: %s +/- %s" %(np.mean(aucs), np.std(aucs)))

print("the best mdodel's training time is %s +/- %s" %(np.mean(times), np.std(times)))