This notebook illustrates how to speed up inference by:

    - sorting the input texts by decreasing length
    
    - batching samples of similar token length together, so that each sample is only padded to the longest length in its batch.
    
Since most of the input texts are short, this method significantly speeds up inference compared to methods that pad every sample to a long fixed length.

Make sure your model has been trained to be robust against inputs of different lengths (which shouldn't be an issue, since transformers take an input attention mask).

With this code, I can run 3x deberta-xlarge and 2x deberta-large models (5 models in total) in under 3 hrs on the Kaggle test set.
    

In [None]:
%reset -f 

if 1:
    # Overwrite three files of the installed `transformers` package with the
    # versions shipped in the input dataset (DeBERTa v2/v3 fast tokenizer).
    # https://www.kaggle.com/nbroad/deberta-v2-3-fast-tokenizer
    import shutil
    from pathlib import Path

    transformers_path = Path('/opt/conda/lib/python3.7/site-packages/transformers') 
    input_dir = Path('../input/feedback-prize-submit-02/deberta_v2_convert_tokenizer')
    deberta_v2_path = transformers_path / 'models' / 'deberta_v2' 

    # (source file, destination file) pairs, in the order they are installed.
    patch_plan = [(input_dir / 'convert_slow_tokenizer.py', transformers_path / 'convert_slow_tokenizer.py')]
    patch_plan += [
        (input_dir / filename, deberta_v2_path / filename)
        for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']
    ]

    for source_file, target_file in patch_plan:
        # remove any existing copy first, then install the replacement
        if target_file.exists():
            target_file.unlink() 
        shutil.copy(source_file, target_file)

#----------------------------------------------------------------------------------------------------------
import sys
sys.path.append('../input/feedback-prize-submit-01')
sys.path.append('../input/feedback-prize-submit-02')

import os
import numpy as np
import glob
import pandas as pd
from timeit import default_timer as timer

  
import torch
from torch.nn.parallel.data_parallel import data_parallel

from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from torch.utils.data.sampler import *

import gc
import psutil 

import torch.cuda.amp as amp
# When True, the forward pass runs inside torch.cuda.amp.autocast (mixed precision).
is_amp   = True  #True #False
# When True, each batch is moved to the GPU and the CUDA cache is emptied after every checkpoint.
is_cuda  = True
# When True, run on (a slice of) training fold 0 and print the F1 score; otherwise run on the test set.
is_debug = True


#helper
def time_to_str(t, mode='min'): 
    """Format a duration given in seconds as a short human-readable string.

    t    : duration in seconds (truncated to an integer before formatting).
    mode : 'min' -> '%2d hr %02d min', 'sec' -> '%2d min %02d sec'.

    Raises NotImplementedError for any other mode.
    """
    if mode not in ('min', 'sec'):
        raise NotImplementedError

    whole_seconds = int(t)
    if mode == 'min':
        # whole minutes, split into hours and leftover minutes
        hours, minutes = divmod(whole_seconds // 60, 60)
        return '%2d hr %02d min'%(hours, minutes)

    minutes, seconds = divmod(whole_seconds, 60)
    return '%2d min %02d sec'%(minutes, seconds)
        

#https://stackoverflow.com/questions/61366458/how-to-find-memory-usage-of-kaggle-notebook
def memory_used_to_str():
    """Return a short message with the current process memory usage in GiB.

    Reads the first field of psutil's memory_info() for this process
    (documented by psutil as the resident set size) and rounds it to
    two decimals.
    """
    current_process = psutil.Process(os.getpid())
    used_gb = current_process.memory_info()[0] / 2. ** 30  # bytes -> GiB
    return 'use ram memory gb ' + str(np.round(used_gb, 2))

#kaggle limit is 13 gb

In [None]:
#config 

# Discourse classes, in label order. Each class gets a 'B-' (begin) and an
# 'I-' (inside) marker with consecutive labels: B = 2*k+1, I = 2*k+2.
_discourse_type = [
    'Lead',
    'Position',
    'Claim',
    'Counterclaim',
    'Rebuttal',
    'Evidence',
    'Concluding Statement',
]

discourse_marker_to_label = {'O': 0}
for _k, _name in enumerate(_discourse_type):
    discourse_marker_to_label['B-' + _name] = 2 * _k + 1
    discourse_marker_to_label['I-' + _name] = 2 * _k + 2
discourse_marker_to_label['IGNORE'] = -100

# inverse lookup: label -> marker string
label_to_discourse_marker = dict(
    (label, marker) for marker, label in discourse_marker_to_label.items()
)
num_discourse_marker = 15 #len(label_to_discourse_marker)-1 #15

# minimum number of words a predicted span must have, per class
length_threshold = dict(zip(_discourse_type, [9, 5, 3, 6, 4, 14, 11]))

# minimum mean word score a predicted span must have, per class
probability_threshold = dict(zip(_discourse_type, [0.70, 0.55, 0.55, 0.50, 0.55, 0.65, 0.70]))

# maximum token length (including the cls/sep tokens) fed to the models
max_length = 1600

#-------------------------------
submit_dir = ''

# Choose which essays to run on: the test folder, or (debug) the essays
# referenced by the first 2000 rows of training fold 0.
if not is_debug:
    text_dir = '../input/feedback-prize-2021/test'
    valid_id = [ f.split('/')[-1][:-4] for f in glob.glob(text_dir+'/*.txt') ] 
else:
    text_dir = '../input/feedback-prize-2021/train'
    df = pd.read_csv('../input/feedback-prize-submit-01/train.fold10.csv')
    valid_df = df[df['fold'] == 0].reset_index(drop=True).iloc[:2000]
    valid_id = valid_df['id'].unique()

# Order the ids by text file size, largest first, so that neighbouring
# samples (and therefore batches) have similar lengths.
size = [os.path.getsize('%s/%s.txt'%(text_dir, id)) for id in valid_id]
order = sorted(range(len(size)), key=lambda i: -size[i])
valid_id = [valid_id[i] for i in order]
num_valid = len(valid_id)
print('len(valid_id)',len(valid_id))

#print(valid_id)
#print([os.path.getsize(text_dir+'/%s.txt'%id) for id in valid_id] )



In [None]:
#data

# Read every selected essay, replace non-breaking spaces with plain spaces and
# strip leading/trailing whitespace; collect the results as (id, text) rows.
df_text = []
for essay_id in valid_id:
    with open('%s/%s.txt'%(text_dir, essay_id), 'r') as f:
        raw_text = f.read()
    df_text.append((essay_id, raw_text.replace(u'\xa0', u' ').strip()))

df_text = pd.DataFrame(df_text, columns=['id','text'])
print('df_text.shape',df_text.shape)
print(df_text)

class FeedbackDataset(Dataset):
    """Tokenizes essays one at a time and pads them to a fixed `max_length`.

    df_text    : DataFrame with columns 'id' and 'text'.
    tokenizer  : tokenizer exposing `encode_plus`, `cls_token_id`,
                 `sep_token_id`, `pad_token_id` and `padding_side`.
    max_length : total token length of every returned sample, including the
                 cls and sep tokens.
    """

    def __init__(self, df_text, tokenizer, max_length = 1600):

        self.df_text  = df_text
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.length = len(self.df_text)

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        """Return a dict describing sample `index`.

        Keys: 'index', 'id', 'text', 'token_offset' (str of the tokenizer's
        offset mapping), 'token_id' / 'token_mask' (long tensors of length
        `self.max_length`) and 'token_length' (number of non-padding tokens).

        Raises NotImplementedError if the tokenizer does not pad on the right.
        """
        d    = self.df_text.iloc[index]
        id   = d['id']
        text = d['text']

        #text to token
        # Use the dataset's own max_length (previously hard-coded to 1600) so
        # that truncation and padding below stay consistent with each other.
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=False,
            return_offsets_mapping=True,
            max_length=self.max_length,
            truncation=True,
        )
        token_id     =  encoded['input_ids']
        token_offset =  encoded['offset_mapping']

        # add start token, truncate so the end token always fits, add end token
        token_id = [self.tokenizer.cls_token_id] + token_id
        token_id = token_id[: self.max_length - 1]
        token_id = token_id + [self.tokenizer.sep_token_id]

        token_length = len(token_id)

        # padding
        token_mask = [1] * token_length

        # Pad against self.max_length, not the module-level `max_length`
        # global the original code accidentally relied on.
        padding_length = self.max_length - token_length
        if padding_length > 0:
            if self.tokenizer.padding_side == 'right':
                token_id    = token_id    + [self.tokenizer.pad_token_id] * padding_length
                token_mask  = token_mask  + [0] * padding_length
            else:
                raise NotImplementedError

        #-------------------------------------
        r = {}
        r['index'] = index
        r['id'   ] = id
        r['text' ] = text
        r['token_offset'] = str(token_offset) #force batch loader store as list
        r['token_id'    ] = torch.tensor(token_id,    dtype=torch.long)
        r['token_mask'  ] = torch.tensor(token_mask,  dtype=torch.long)
        r['token_length'] = token_length
        return r



In [None]:
#net

from bigbird_base_model import Net as BidBirdBaseNet
from longformer_base_model import Net as LongformerBaseNet
from bigbird_large_model import Net as BidBirdLargeNet
from longformer_large_model import Net as LongformerLargeNet
from funnel_medium_model import Net as FunnelMediumNet
from funnel_large_model import Net as FunnelLargeNet
from deberta_base_model import Net as DebertaBaseNet
from deberta_large_model import Net as DebertaLargeNet
from deberta_xlarge_model import Net as DebertaXLargeNet
from deberta_v3_large_model import Net as DebertaV3LargeNet


 
 

# Models to run. Each entry is a dict with:
#   'net'        : the Net class to instantiate (called as net(arch) in run_submit)
#   'arch'       : path passed to the Net constructor
#   'checkpoint' : list of weight files; each is loaded in turn and their
#                  probabilities are averaged in run_submit
#   'batch_size' : DataLoader batch size used for this model
# The trailing comma after each entry keeps `ensemble` a tuple even with one entry.
ensemble =(

   {
        'net'  : DebertaXLargeNet,
        'arch' : '../input/feedback-prize-submit-01/microsoft-deberta-xlarge',
        'checkpoint' : 	[  
                '../input/feedback-prize-submit-01/debert-xlarge-10kf-1600-fine-03-fold-1-00015000.model.pth',
                #'../input/feedback-prize-submit-01/debert-xlarge-10kf-1600-03-fold-0-00014000.model.pth',  
           ],
        'batch_size' : 6,
   },

#    {
#         'net'  : DebertaLargeNet,
#         'arch' : '../input/feedback-prize-submit-01/microsoft-deberta-large',
#         'checkpoint' : 	[
#                 '../input/feedback-prize-submit-01/debert-large-10kf-1600-03-fold-0-00013000.model.pth',  
#                 '../input/feedback-prize-submit-01/debert-large-10kf-1600-03-fold-1-00013000.model.pth',
#                 #'../input/feedback-prize-submit-02/deberta-large-10kf-1600-ds-fold-0-00012000.model.pth',
#             ],
#         'batch_size' : 8,
#    },
 
)
# number of entries in the ensemble (not the number of checkpoints)
num_model = len(ensemble)

In [None]:
#processing

def text_to_word(text):
    """Split `text` on whitespace and locate every word in the original text.

    Returns (word, word_offset):
        word        : list of whitespace-separated words, as from text.split()
        word_offset : list of (start, end) character offsets, one per word,
                      such that text[start:end] == word

    Raises NotImplementedError if a word cannot be found again in `text`
    (not expected, since the words come from `text` itself).
    """
    word = text.split()
    word_offset = []

    cursor = 0  # search resumes right after the previously located word
    for w in word:
        # str.find with a start index avoids copying the remaining text for
        # every word, as text[start:].find(w) did.
        start = text.find(w, cursor)
        if start == -1:
            raise NotImplementedError

        end = start + len(w)
        word_offset.append((start, end))
        cursor = end

    return word, word_offset

def word_probability_to_predict_df(text_to_word_probability, id):
    """Decode per-word class probabilities into discourse spans.

    text_to_word_probability : array of shape (num_word, num_marker); the
                               argmax over the last axis is the word's label.
    id                       : essay id copied into every output row.

    Returns a DataFrame with columns ['id', 'class', 'predictionstring',
    'score'], one row per span. 'predictionstring' is the space-joined word
    indices and 'score' is str() of the list of per-word max probabilities.

    A span starts at any word whose label is neither 'O' nor 'IGNORE' and
    extends while the following words carry the matching 'I-' label.

    Inputs with fewer than two words yield an empty DataFrame (the original
    code raised IndexError on them).

    NOTE(review): the scan stops when it reaches index num_word-1, so the last
    word is never part of a span and a span that would start at num_word-2 is
    not emitted. This looks like an off-by-one, but it is kept unchanged so
    that existing results stay reproducible - confirm before changing.
    """
    columns = ['id', 'class', 'predictionstring', 'score']
    len_word = len(text_to_word_probability)
    if len_word < 2:
        # nothing to scan; the loop below would index past the end
        return pd.DataFrame([], columns=columns)

    word_predict = text_to_word_probability.argmax(-1)
    word_score   = text_to_word_probability.max(-1)
    predict_df = []

    t = 0
    while 1:
        # look for the first word of the next span
        if word_predict[t] not in [
            discourse_marker_to_label['O'],
            discourse_marker_to_label['IGNORE'],
        ]:
            start = t
            b_marker_label = word_predict[t]
        else:
            t = t+1
            if t== len_word-1: break
            continue

        t = t+1
        if t== len_word-1: break

        #----
        # label that continuation words must carry: 'B-x' continues with
        # 'I-x' (the next label), while a span opened by 'I-x' continues with 'I-x'
        if   label_to_discourse_marker[b_marker_label][0]=='B':
            i_marker_label = b_marker_label+1
        elif label_to_discourse_marker[b_marker_label][0]=='I':
            i_marker_label = b_marker_label
        else:
            raise NotImplementedError

        while 1:
            #print(t)
            if (word_predict[t] != i_marker_label) or (t ==len_word-1):
                # span covers words [start, end); word `end` is re-examined by
                # the outer loop as a possible start of the next span
                end = t
                prediction_string = ' '.join([str(i) for i in range(start,end)]) #np.arange(start,end).tolist()
                discourse_type = label_to_discourse_marker[b_marker_label][2:]
                discourse_score = word_score[start:end].tolist()
                predict_df.append((id, discourse_type, prediction_string, str(discourse_score)))
                #print(predict_df[-1])
                break
            else:
                t = t+1
                continue
        if t== len_word-1: break

    predict_df = pd.DataFrame(predict_df, columns=columns)
    return predict_df

def do_threshold(submit_df, use=['length','probability']):
    """Filter predicted spans by per-class length and score thresholds.

    submit_df : DataFrame with columns 'id', 'class', 'predictionstring' and
                'score' ('score' is the str() of a list of floats).
    use       : which filters to apply; any of 'length' (drop spans with
                fewer words than length_threshold[class]) and 'probability'
                (drop spans whose mean score is below
                probability_threshold[class]).

    Returns a new DataFrame with columns ['id', 'class', 'predictionstring'];
    `submit_df` itself is not modified. Classes without a threshold entry are
    never filtered.
    """
    import ast  # local import: only needed to parse the score strings

    df = submit_df.copy()
    df = df.fillna('')

    if 'length' in use:
        df['l'] = df.predictionstring.apply(lambda x: len(x.split()))
        for key, value in length_threshold.items():
            #value=3
            too_short = (df['class'] == key) & (df['l'] < value)
            df = df[~too_short]

    if 'probability' in use:
        df = df.copy()  # own the data before adding a column to a filtered frame
        # literal_eval only accepts Python literals, unlike eval
        df['s'] = df.score.apply(lambda x: np.mean(ast.literal_eval(x)))
        for key, value in probability_threshold.items():
            too_unsure = (df['class'] == key) & (df['s'] < value)
            df = df[~too_unsure]

    df = df[['id', 'class', 'predictionstring']]
    return df

#evaluation for debug ----
# https://www.kaggle.com/cpmpml/faster-metric-computation

def compute_overlap(predict, truth):
    """
    Calculates the overlap between prediction and
    ground truth and overlap percentages used for determining
    true positives.

    predict, truth : sets of word indices. Either may be NaN (a float) for
                     rows that exist on only one side of an outer merge.

    Returns (overlap1, overlap2) where
        overlap1 = |truth & predict| / |truth|
        overlap2 = |truth & predict| / |predict|
    or (0, 0) when an input is not a set or is empty.
    """
    try:
        intersect = len(truth & predict)
        return (intersect / len(truth), intersect / len(predict))
    except (TypeError, ZeroDivisionError):
        # TypeError: at least one input is NaN rather than a set
        # ZeroDivisionError: an empty set
        return (0, 0)

def compute_f1_score_one(predict_df, truth_df, discourse_type):
    """
    A function that scores for the kaggle
        Student Writing Competition

    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation

    predict_df     : predictions with columns 'id', 'class', 'predictionstring'.
    truth_df       : ground truth with columns 'id', 'discourse_type',
                     'predictionstring'.
    discourse_type : the single class to score.

    Returns the F1 score TP / (TP + 0.5*(FP+FN)) for that class.
    """
    # keep only the rows of the requested class
    t_df = truth_df.loc[truth_df['discourse_type'] == discourse_type,   ['id', 'predictionstring']].reset_index(drop=True)
    p_df = predict_df.loc[predict_df['class'] == discourse_type,  ['id', 'predictionstring']].reset_index(drop=True)

    # give every prediction / truth a running id and turn the
    # space-separated word indices into sets
    p_df.loc[:,'predict_id'] = p_df.index
    t_df.loc[:,'truth_id'] = t_df.index
    p_df.loc[:,'predictionstring'] = [set(p.split(' ')) for p in p_df['predictionstring']]
    t_df.loc[:,'predictionstring'] = [set(p.split(' ')) for p in t_df['predictionstring']]

    # Step 1. all ground truths and predictions for a given class are compared.
    joined = p_df.merge(t_df,
                           left_on='id',
                           right_on='id',
                           how='outer',
                           suffixes=('_p','_t')
                          )
    overlap = [compute_overlap(*predictionstring) for predictionstring in zip(joined.predictionstring_p, joined.predictionstring_t)]

    # 2. If the overlap between the ground truth and prediction is >= 0.5,
    # and the overlap between the prediction and the ground truth >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    joined['potential_TP'] = [(o[0] >= 0.5 and o[1] >= 0.5) for o in overlap]
    joined['max_overlap' ] = [max(*o) for o in overlap]
    joined_tp = joined.query('potential_TP').reset_index(drop=True)
    # one prediction per (essay, truth) pair: the one with the largest overlap
    tp_pred_ids = joined_tp\
        .sort_values('max_overlap', ascending=False) \
        .groupby(['id','truth_id'])['predict_id'].first()

    # 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    # NOTE(review): with how='outer', rows that exist on only one side presumably
    # carry NaN in predict_id / truth_id, and that NaN would be counted as one
    # extra id in the two sets below - confirm whether this is intended.
    fp_pred_ids = set(joined['predict_id'].unique()) - set(tp_pred_ids)

    matched_gt_ids   = joined_tp['truth_id'].unique()
    unmatched_gt_ids = set(joined['truth_id'].unique()) -  set(matched_gt_ids)

    # Get numbers of each type
    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    # NOTE(review): raises ZeroDivisionError if TP, FP and FN are all zero.
    f1 = TP / (TP + 0.5*(FP+FN))
    return f1

def compute_lb_f1_score(predict_df, truth_df):
    """Score every discourse type present in `truth_df`.

    Returns a dict mapping discourse type -> F1 score from
    compute_f1_score_one, in the order the types first appear in truth_df.
    """
    #f1 = np.mean([v for v in class_scores.values()])
    return {
        discourse_type: compute_f1_score_one(predict_df, truth_df, discourse_type)
        for discourse_type in truth_df.discourse_type.unique()
    }



In [None]:
## main submission function !!!!


def run_submit():
    """Run the whole ensemble over `df_text` and write the submission file.

    Steps:
      1. For every ensemble entry and every checkpoint of it, predict
         per-token class probabilities for all essays (stored as uint8).
      2. Average the checkpoints of each entry.
      3. Per essay, spread token probabilities over the characters each token
         covers, average over the ensemble entries, then average per word.
      4. Decode the word probabilities into spans, apply the length and
         probability thresholds and write 'submission.csv'.
      5. In debug mode, print the F1 score against `valid_df` and timings.

    Reads the module-level configuration (ensemble, df_text, max_length,
    is_debug, is_amp, is_cuda, ...). Returns nothing.
    """
    if is_debug: print("THIS IS DEBUG ####################################")
    all_time = 0  # accumulated inference time in seconds, over all checkpoints
    print('start', memory_used_to_str())

    ensemble_result = []
    for m in range(num_model):
        model = ensemble[m]
        num_net = len(model['checkpoint'])

        # the net is built once per entry; each checkpoint is loaded into it below
        net = model['net'](model['arch'])
        tokenizer = net.get_tokenizer()

        # SequentialSampler keeps the order of df_text, so result row i
        # corresponds to df_text row i
        valid_dataset = FeedbackDataset(df_text, tokenizer, max_length)
        valid_loader  = DataLoader(
            valid_dataset,
            sampler = SequentialSampler(valid_dataset),
            batch_size  = model['batch_size'], #4, #
            drop_last   = False,
            num_workers = 2, #0, #
            pin_memory  = False,
            #collate_fn = null_collate_fn,
        )


        model_result = []
        for n in range(num_net):
            # load the weights on the CPU first (map_location keeps the storage as is)
            net.load_state_dict(torch.load(model['checkpoint'][n], map_location=lambda storage, loc: storage)['state_dict'],strict=False)
            #net.half()
            net.eval()
            # NOTE(review): moved to the GPU unconditionally, regardless of is_cuda
            net.cuda()
            print('load ok : [model=%d, net=%d] %s : %s'%(m, n, net.arch, model['checkpoint'][n].split('/')[-1]))
            print('         ', memory_used_to_str())
            #---

            net_result  = {
                'token_offset':[],
                'probability' :[],
            }

            T=0  # number of essays processed so far
            start_timer = timer()
            for t, batch in enumerate(valid_loader):
                batch_size = len(batch['index'])
                # crop the fixed-length padded batch to its longest real sample
                length = batch['token_length'].max().item()
                token_mask_short = batch['token_mask'][:,:length]
                token_id_short   = batch['token_id'][:,:length]
                if is_cuda:
                    token_mask_short = token_mask_short.cuda()
                    token_id_short = token_id_short.cuda()

                with torch.no_grad():
                    with amp.autocast(enabled=is_amp):
                        probability = data_parallel(net,(token_id_short, token_mask_short))

                        # quantize to uint8 (presumably to limit host memory) and
                        # pad the token axis back to max_length so that all
                        # batches can be concatenated
                        probability = (probability*255).byte().data.cpu().numpy()
                        probability = np.pad(probability, ((0, 0),(0,max_length-length),(0,0)), 'constant', constant_values=0)
                        net_result['probability'].append( probability )
                        # the dataset stored the offsets as str(); parse them back
                        net_result['token_offset' ] += [eval(x) for x in batch['token_offset']]

                T += batch_size
                print('\r\t%d/%d  %s'%(T, len(valid_dataset), time_to_str(timer() - start_timer,'sec')),end='',flush=True)
            #------------
            all_time += timer() - start_timer
            print('')

            model_result.append({
                'probability' : np.concatenate(net_result['probability']),
                'token_offset': np.array(net_result['token_offset'], object)
            })

            # sys.getsizeof(model_result[0]['probability'])/ 2. ** 30
            # model_result[0]['probability'].shape (202, 1600, 15)

            #del net_result['probability'], net_result['token_offset']
            gc.collect()
            if is_cuda: torch.cuda.empty_cache()
            #print('         ', memory_used_to_str())
            print('gc.collect()', memory_used_to_str())
            print('')

        #--------------------------------------------
        #average
        # mean over this entry's checkpoints, then back to uint8
        probability = 0
        for n in range(num_net):
            probability += model_result[n]['probability']/255.0
        probability = probability/num_net
        probability = (probability*255).astype(np.uint8)
        # the tokenizer is shared by all checkpoints of an entry, so the offsets
        # of the first checkpoint are used
        ensemble_result.append({
            'probability' : probability,
            'token_offset': model_result[0]['token_offset']
        })


    #------------------------------------------------------------------------

    submit_df = []
    for i in range(num_valid):
        d  = df_text.iloc[i]
        id = d.id
        text = d.text
        word, word_offset = text_to_word(text)
        #print(i,id[i], len(text), len(word))

        #ensemble -----
        # per-character probabilities, summed over the ensemble entries
        token_to_text_probability = np.full((len(text),num_discourse_marker),0, np.float32)
        for m in range(num_model):
            # [1:] skips position 0, where the dataset put the cls token
            p = ensemble_result[m]['probability'][i][1:]/255
            for t,(start,end) in enumerate(ensemble_result[m]['token_offset'][i]):
                if t==max_length-1: break #assume max_length, else use token_mask to get length
                token_to_text_probability[start:end]+=p[t] #**0.5
        token_to_text_probability = token_to_text_probability/num_model
        #ensemble -----


        # per-word probability = mean over the characters of the word
        text_to_word_probability = np.full((len(word),num_discourse_marker),0, np.float32)
        for t,(start,end) in enumerate(word_offset):
            text_to_word_probability[t]=token_to_text_probability[start:end].mean(0)

        predict_df = word_probability_to_predict_df(text_to_word_probability, id)
        submit_df.append(predict_df)
        #print('\r preparing submit_df :', i, id, len(text), len(word), end ='', flush=True)
    print('')

    #----------------------------------------
    submit_df = pd.concat(submit_df).reset_index(drop=True)
    # un-thresholded predictions (still including the 'score' column)
    # NOTE(review): with submit_dir == '' this path is '/submission.csv', i.e. the
    # filesystem root - confirm that this is the intended location.
    submit_df.to_csv(submit_dir + '/submission.csv', index=False)
    submit_df = do_threshold(submit_df, use=['length', 'probability'])
    submit_df.to_csv('submission.csv', index=False)

    print('------------------')
    # NOTE(review): iloc[t] raises IndexError if fewer than 3 rows survive the thresholds
    for t in range(3): print(submit_df.iloc[t],'\n')
    print('submission ok!----')

    if is_debug:
        print("THIS IS DEBUG ####################################")
        f1_score = compute_lb_f1_score(submit_df, valid_df)
        print('f1 macro : %f\n' % np.mean([v for v in f1_score.values()]))
        for k,v in f1_score.items():
            print('%20s : %05f'%(k,v))
        print('')

        #all_time = timer() - start_timer0
        print('all_time : %s'%(time_to_str(all_time,'sec')))
        # linear extrapolation of the measured inference time to 10,000 essays
        print('estimated for 10k text files : %s'%(time_to_str(all_time/len(valid_id)*10_000,'min')))



In [None]:
#check function
def run_check_dataset():
    """Print the token ids and masks of the first few samples of the dataset.

    The tokenizer comes from the first ensemble entry, built the same way as
    in run_submit (the original code referenced an undefined global `net`).
    At most 5 samples are shown; fewer if the dataset is smaller.
    """
    model = ensemble[0]
    tokenizer = model['net'](model['arch']).get_tokenizer()
    dataset = FeedbackDataset(df_text, tokenizer, max_length)

    for i in range(min(5, len(dataset))):
        r = dataset[i]
        print(r['index'],'-----------')
        for k in ['token_id', 'token_mask']:
            v = r[k]
            print(k)
            print('\t',v.shape, v.is_contiguous())
            print('\t',v)
        print('')

In [None]:
# '''
# cross validation results 
# WITHOUT SORTED TEXT INPUT #############################################
# ../input/feedback-prize-submit-01/microsoft-deberta-large ( one model )
# 202/202   1 min 36 sec

# f1 macro : 0.680797
# estimated for 10k text files :  1 hr 19 min

# ----
# ../input/feedback-prize-submit-01/microsoft-deberta-xlarge ( one model )
# 202/202   3 min 10 sec

# f1 macro : 0.687624
# estimated for 10k text files :  2 hr 36 min


# WITH SORTED TEXT INPUT #############################################

# ../input/feedback-prize-submit-01/microsoft-deberta-xlarge ( one model )
# 202/202   0 min 59 sec
    
    
# f1 macro : 0.687624
# estimated for 10k text files :  0 hr 49 min

In [None]:
# Entry point of the notebook: run inference and write submission.csv.
# Uncomment run_check_dataset() to print a few tokenized samples instead.
#run_check_dataset()
run_submit()