### Sequence bucketing - PyTorch implementation

I've unsuccessfully tried a couple of times to implement Sequence Bucketing in PyTorch. Recently I found this notebook that solves this problem: [Notebook](https://www.kaggle.com/shahules/guide-pytorch-data-samplers-sequence-bucketing/notebook)

Thanks [Shahules](https://www.kaggle.com/shahules)

I just modified it for this competition and made it a little bit more pythonic.

In my case it speeds up training from 18:40 min to 8:54 min for one epoch with batch size 4. If you are already truncating your input, you may see a smaller improvement than I did.

If you have any questions or ideas for improvements, please let me know!

In [None]:
import os

import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, Subset, DataLoader, RandomSampler, SequentialSampler

from transformers import LongformerTokenizerFast

from sklearn.model_selection import train_test_split

from tqdm import tqdm

import matplotlib.pyplot as plt

### 1. Data loading and preprocessing

In [None]:
# Notebook-wide settings: the pretrained checkpoint whose tokenizer is loaded
# and the batch size used by the DataLoaders.
config = {
    'model_name': 'allenai/longformer-base-4096',
    'batch_size': 4,
}

# Full paths of all training essays.
TEXT_FILES = os.listdir('../input/feedback-prize-2021/train')
TEXT_FILES = [f'../input/feedback-prize-2021/train/{file}' for file in TEXT_FILES]

# Map essay id (file name without the '.txt' suffix) -> essay text.
text_data = dict()
for file_path in tqdm(TEXT_FILES):
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        idx = os.path.basename(file_path).split('.txt')[0]
        # Strip trailing whitespace while reading instead of in a second pass.
        text_data[idx] = file.read().rstrip()


Here, I am going to use the Longformer tokenizer.

In [None]:
# Load the fast tokenizer that belongs to the configured checkpoint.
tokenizer = LongformerTokenizerFast.from_pretrained(config['model_name'])

# Tokenize every essay and keep one [input_ids, attention_mask] pair per text.
# Only the texts are needed here, so iterate over the dict values.
data_tokenized = []
for text in tqdm(text_data.values()):
    encoded = tokenizer(text, add_special_tokens=True)
    data_tokenized.append([encoded['input_ids'], encoded['attention_mask']])

# One row per essay, in the iteration order of `text_data`.
tokenized_df = pd.DataFrame(data_tokenized, columns=['input_ids', 'attention_mask'])
tokenized_df.head()

#### Sequence length histogram

In [None]:
# Length of each attention mask, used as the sequence length of the essay.
seq_len = tokenized_df['attention_mask'].map(len)

plt.rcParams['figure.figsize'] = (17, 8)
# 100 bin edges evenly spaced between 0 and 2000 tokens.
bins = np.linspace(start=0, stop=2000, num=100)

plt.hist(seq_len, bins=bins, alpha=0.75, label='sequence length')
# Mark the mean length with a red vertical line.
plt.vlines(x=seq_len.mean(), ymin=0, ymax=700, colors='red', label='mean sequence length')
plt.legend(loc='upper right')
plt.show()

### 2. Create custom PyTorch Dataset

In [None]:
class LongformerDataset(Dataset):
    """Dataset for the longformer model.

    Wraps a DataFrame that has an ``input_ids`` and an ``attention_mask``
    column and serves one row per sample.
    """

    def __init__(self, data: pd.DataFrame):
        # Frame holding one tokenized sample per row.
        self.data = data

    def __len__(self):
        """Return the number of samples (rows of the wrapped frame)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at *position* ``idx`` as a dictionary.

        Samplers hand out positions in ``range(len(self))``, so rows are
        looked up positionally with ``iloc``. A label-based lookup would fail
        or return the wrong row whenever the frame does not carry a default
        ``0..n-1`` index (e.g. after filtering or shuffling the frame).
        """
        return {
            'input_ids': self.data['input_ids'].iloc[idx],
            'attention_mask': self.data['attention_mask'].iloc[idx]
        }
      

Now, instead of using a PyTorch Sampler, we write a class that assembles each batch ourselves and pass an instance of it as the `collate_fn` argument when creating a DataLoader.

See this discussion: [Link](https://discuss.pytorch.org/t/how-to-use-collate-fn/27181)

In [None]:
from typing import List

class Collate:
    """Collate function that pads each batch to its own longest sequence.

    Args:
        pad_token_id: Id used to fill ``input_ids`` up to the batch maximum.
            When left as ``None`` the id is read from the module-level
            ``tokenizer`` at call time, so ``Collate()`` keeps working as
            before.
    """

    def __init__(self, pad_token_id=None):
        self.pad_token_id = pad_token_id

    def __call__(self, batch: List[dict]) -> dict:
        """Pad and stack a list of samples.

        Args:
            batch: Samples as returned by the Dataset's ``__getitem__``; each
                is a dict with list-valued ``input_ids`` and
                ``attention_mask`` entries.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` as ``torch.long``
            tensors; every row is padded to the longest ``input_ids`` of the
            batch.
        """
        pad_id = self.pad_token_id
        if pad_id is None:
            # Fall back to the notebook's global tokenizer.
            pad_id = tokenizer.pad_token_id

        # since our custom Dataset's __getitem__ method returns dictionary
        # the collate_fn function will receive list of dictionaries
        input_ids = [sample['input_ids'] for sample in batch]
        attention_mask = [sample['attention_mask'] for sample in batch]

        # calculate max token length of this batch
        batch_max = max(len(ids) for ids in input_ids)

        # add padding (new lists are built, the samples are not modified);
        # padded positions get attention mask 0
        input_ids = [ids + (batch_max - len(ids)) * [pad_id] for ids in input_ids]
        attention_mask = [mask + (batch_max - len(mask)) * [0] for mask in attention_mask]

        # convert to tensors
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
        }


### 4. Create Dataloaders

In [None]:
# Batch-padding callable and the Dataset wrapping the tokenized frame.
collate = Collate()
dataset = LongformerDataset(tokenized_df)

# Shuffled 80/20 split into training and validation samples.
train_data, val_data = train_test_split(dataset, shuffle=True, test_size=0.2)

# Training batches are drawn in random order.
train_dataloader = DataLoader(
    train_data,
    batch_size=config['batch_size'],
    shuffle=True,
    collate_fn=collate,
)

# Validation batches keep the order of `val_data`.
val_dataloader = DataLoader(
    val_data,
    batch_size=config['batch_size'],
    shuffle=False,
    collate_fn=collate,
)

### Let's see the average batch max length

In [None]:
# Padded sequence length (second tensor dimension) of every training batch.
train_batch_sizes = pd.Series([batch['input_ids'].size(1) for batch in train_dataloader])

# Series.mad() was removed in pandas 2.0, so compute the mean absolute
# deviation around the mean explicitly.
mean_abs_deviation = (train_batch_sizes - train_batch_sizes.mean()).abs().mean()

print(f'Mean: {round(train_batch_sizes.mean(), 2)}')
print(f'Mean absolute deviation: {round(mean_abs_deviation, 2)}')

With larger batch sizes, each batch is more likely to contain at least one long sequence, so there are more padding tokens on average and therefore a smaller runtime improvement!

In [None]:
# Repeat the measurement with a larger batch size.
config['batch_size'] = 8

train_dataloader = DataLoader(train_data,
                              batch_size=config['batch_size'],
                              collate_fn=collate,
                              shuffle=True)

# Padded sequence length (second tensor dimension) of every training batch.
train_batch_sizes = pd.Series([batch['input_ids'].size(1) for batch in train_dataloader])

# Series.mad() was removed in pandas 2.0, so compute the mean absolute
# deviation around the mean explicitly.
mean_abs_deviation = (train_batch_sizes - train_batch_sizes.mean()).abs().mean()

print(f'Mean: {round(train_batch_sizes.mean(), 2)}')
print(f'Mean absolute deviation: {round(mean_abs_deviation, 2)}')