In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
from tqdm import tqdm
import pandas as pd
import pickle

import torch
import torch.nn as nn
from torch.optim import lr_scheduler

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

import transformers
import tokenizers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import wandb

from sklearn.metrics import roc_auc_score

from modules.training import SBERT, SentencesDataset

In [3]:
# NOTE(review): `path_to_model` is not referenced by any other cell visible in this notebook.
path_to_model = 'distiluse-base-multilingual-cased'
# NOTE(review): this reads the *test* parquet file although the results are named X_train / y_train -- confirm.
df = pd.read_parquet('data/test.parquet')
# Lower-cased 'question' column as the input; every remaining column is kept as the target frame.
X_train, y_train = df['question'].str.lower(), df.drop('question', axis=1)

In [2]:
# Load the tokenizer and the BERT encoder from the same pretrained checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    'DeepPavlov/bert-base-multilingual-cased-sentence'
)
model = transformers.BertModel.from_pretrained(
    'DeepPavlov/bert-base-multilingual-cased-sentence'
)

Downloading: 100%|██████████| 642/642 [00:00<00:00, 353kB/s]
Downloading: 100%|██████████| 996k/996k [00:02<00:00, 371kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 34.6kB/s]
Downloading: 100%|██████████| 24.0/24.0 [00:00<00:00, 8.12kB/s]
Downloading: 100%|██████████| 711M/711M [39:17<00:00, 302kB/s]


In [3]:
# Encode one sample sentence, then decode the ids back to text to see which special tokens were added.
enc = tokenizer.encode_plus('Привет, мир!')
tokenizer.decode(enc['input_ids'])

'[CLS] Привет, мир! [SEP]'

In [4]:
# Convert every field of `enc` to a tensor and add a leading dimension of size 1.
# The values are overwritten in place, so re-running this cell unsqueezes the tensors once more.
for key in enc:
    enc[key] = torch.tensor(enc[key]).unsqueeze(0)
enc

{'input_ids': tensor([[  101, 14337, 41102,   117, 29345,   106,   102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [6]:
# Run the encoder on the single sample and show the size of output element [1] (recorded result: [1, 768]).
model(input_ids=enc['input_ids'], attention_mask=enc['attention_mask'], token_type_ids=enc['token_type_ids'])[1].size()

torch.Size([1, 768])

In [8]:
# Scratch check of dict membership: 'd' is a key of `d`, so the condition is False and nothing is printed.
d = {'d': 0}
if 'd' not in d:
    print('yep')

In [27]:
class DatasetClass(torch.utils.data.Dataset):
    """
    Map-style dataset of pre-tokenized texts with per-batch ("smart") padding.

    Every text is tokenized once in ``__init__``. ``collate_fn`` then pads a batch only up to the
    longest sequence inside that batch instead of padding everything to a global maximum length.
    The instance's ``collate_fn`` must be passed to the ``DataLoader``; the default collate function
    cannot stack the variable-length token lists returned by ``__getitem__``.
    """
    # Tokenizer output fields that are padded and stacked into batch tensors.
    _PADDED_KEYS = ('input_ids', 'token_type_ids', 'attention_mask')
    # Value appended to every field when padding (same constant for all three fields).
    _PAD_VALUE = 0

    def __init__(self, features: np.array, target: np.array, tokenizer_path: str, show_progress_bar: bool = True):
        """
        Tokenize all texts up front and keep the targets next to them.

        :param features:
            iterable of raw texts
        :param target:
            per-sample targets, indexable with the same positions as ``features``;
            ``collate_fn`` takes ``argmax`` over axis 1 of the batched targets
        :param tokenizer_path:
            name or path handed to ``transformers.AutoTokenizer.from_pretrained``
        :param show_progress_bar:
            whether the tokenization progress bar is displayed
        """
        self.show_progress_bar = show_progress_bar
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path)
        self.features = self.make_tokens(features)
        self.target = target

    def make_tokens(self, texts):
        """
        Encode every text with ``tokenizer.encode_plus``.

        :param texts:
            iterable of raw texts
        :return:
            list with one ``encode_plus`` result per text, in input order
        """
        # The progress bar honours the flag given to the constructor.
        progress = tqdm(texts, desc='Tokenizing...', disable=not self.show_progress_bar)
        return [self.tokenizer.encode_plus(text) for text in progress]

    def collate_fn(self, batch):
        """
        Transforms a list of ``(tokens, label)`` samples into a batch of tensors for the model.

        The cached samples are never modified: padded copies are built for every batch, so the same
        sample can be collated any number of times and in any batch composition.

        :param batch:
            list of ``(tokens, label)`` pairs as returned by ``__getitem__``
        :return:
            ``(tokens_dict, labels)`` where ``tokens_dict`` maps each padded field to a long tensor
            with one row per sample, and ``labels`` is the argmax over axis 1 of the stacked targets
        """
        tokens = [token for token, _ in batch]
        labels = [label for _, label in batch]

        max_len = max(len(token['input_ids']) for token in tokens)

        tokens_dict = {}
        for key in self._PADDED_KEYS:
            rows = []
            for token in tokens:
                # All fields of a sample are padded by the amount its input_ids are short of max_len.
                padding_length = max_len - len(token['input_ids'])
                rows.append(list(token[key]) + [self._PAD_VALUE] * padding_length)
            tokens_dict[key] = torch.tensor(rows, dtype=torch.long)

        # np.asarray first: building a tensor from a list of separate arrays is slow.
        label_tensor = torch.tensor(np.asarray(labels), dtype=torch.long)
        return tokens_dict, torch.argmax(label_tensor, dim=1)

    def __getitem__(self, item):
        """Return the ``(tokens, target)`` pair stored at position ``item``."""
        return self.features[item], self.target[item]

    def __len__(self):
        """Number of tokenized samples."""
        return len(self.features)

In [10]:
# Load the test parquet file and split it into the text column and the remaining columns.
test = pd.read_parquet('data/test.parquet')
# NOTE(review): `columns[1:]` assumes 'question' is the first column -- confirm against the parquet schema.
annos = test.columns[1:]
features, labels = test['question'], test.drop('question', axis=1)
# Drop the name of the full frame once it has been split.
del test

In [28]:
# Smoke-test the dataset on the first 100 rows only.
ds = DatasetClass(features.to_numpy()[:100], labels.to_numpy()[:100], "DeepPavlov/bert-base-multilingual-cased-sentence")
# The dataset's own collate_fn is handed to the loader; batches hold 2 samples.
dl = torch.utils.data.DataLoader(ds, batch_size=2, collate_fn=ds.collate_fn)

Tokenizing...: 100%|██████████| 100/100 [00:00<00:00, 1175.37it/s]


In [29]:
# Pull the first batch from a fresh iterator over `dl`.
batch = next(iter(dl))

In [30]:
# Display the fetched batch: a (token tensors dict, labels) tuple.
batch

({'input_ids': tensor([[   101,  28572,  10241,  10913,  41102,    106,  12624,  99842,    131,
            18963,  33190,  16541,  36257,  10517,  10913,  11429, 109120,  58154,
            77870,  10179,    117,  16429,  37235,  11092,  74308,  88908,  73578,
            10851,    551,  50985,  42500,  11805,  30317,    136,    102,      0,
                0,      0,      0,      0,      0,      0,      0,      0,      0,
                0,      0,      0,      0,      0,      0,      0,      0,      0,
                0,      0,      0,      0,      0],
          [   101,    509,  35261,  19544,  91369,  45544,    543,  19689,  94364,
              136,    516,  10433,  17186,  10122,  64553,  33023,  24425,    543,
            19689,  94364,    118,  18963,  16541,  36257,  10517,  86860,  18805,
            45680,    549,    558, 101351,  71004, 105378,  15692,  18805,  98813,
              136,    521,  10316,  24751,  98813,  10122,    109,  10407,  14736,
              558,  10

In [226]:
# Record the installed transformers version next to the results.
transformers.__version__

'2.8.0'

In [224]:
# Try the device-transfer helper on one batch.
# NOTE(review): the helper is defined in a later cell, so on a fresh kernel that cell has to run first.
__transfer_data_to_device(next(iter(dl)), 'cuda')

<class 'dict'>


({'input_ids': tensor([[   101,  15393,  26856,    106,  27548,    156,   7497,  11089,  17141,
             2790,  18797,  22648,  50312,    626,    128,  25121,  21256,  16875,
            55047,    861,  25795,  66965,  30829,    166,    102,      0,      0,
                0,      0,      0,      0,      0,      0,      0,      0,      0,
                0,      0,      0,      0,      0,      0,      0,      0],
          [   101,    777,  10962,  22655,  12620,    845,   9378,  17134,    166,
           116161,   3650,   1469,  12942,  67606,    845,   9378,  17134,    130,
             7497,  17141,  17050,   8528,  12313,    851,  21934,  15411,  16244,
             5247,   8528,  19715,    166,  32091,  19715,   1469,    112,   7993,
              798,  75837,    845,   6620,  70207,  34272,    166,    102]]),
  'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [223]:
def __transfer_data_to_device(batch, device, gpu_id=None):
    """
    Recursively move ``batch`` to the requested device.

    :param batch:
        an object exposing ``cuda``/``to``, or an arbitrarily nested list / tuple / namedtuple / dict
        of such objects; anything else is returned unchanged
    :param device:
        ``'gpu'`` or ``'cuda'`` (treated the same) to move to CUDA, ``'tpu'`` to move to an XLA device;
        any other value only walks the containers and moves nothing
    :param gpu_id:
        CUDA device index; ``None`` lets ``cuda()`` pick the current device
    :return:
        the moved object. Lists and dicts are updated in place and returned; tuples and namedtuples
        are rebuilt.
    """
    if device == 'tpu':
        # XLA support is optional: `XLA_AVAILABLE` and `xm` are only used when the surrounding
        # module defines them, instead of raising NameError when they are missing.
        xla_model = globals().get('xm')
        if globals().get('XLA_AVAILABLE', False) and xla_model is not None:
            # base case: object can be directly moved using `to`
            if callable(getattr(batch, 'to', None)):
                return batch.to(xla_model.xla_device())

    # 'cuda' is accepted as an alias of 'gpu' because that is the spelling torch itself uses.
    if device in ('gpu', 'cuda'):
        # base case: object can be directly moved using `cuda` or `to`
        if callable(getattr(batch, 'cuda', None)):
            # non_blocking will be ignored if tensor is not pinned.
            # so we can always set it to True
            return batch.cuda(gpu_id, non_blocking=True)

        if callable(getattr(batch, 'to', None)):
            # Only pass an index to torch.device when one was actually given.
            target = torch.device('cuda') if gpu_id is None else torch.device('cuda', gpu_id)
            # non_blocking will be ignored if tensor is not pinned.
            # so we can always set it to True
            return batch.to(target, non_blocking=True)

    # when list: elements are replaced in place
    if isinstance(batch, list):
        for i, x in enumerate(batch):
            batch[i] = __transfer_data_to_device(x, device, gpu_id)
        return batch

    # when tuple
    if isinstance(batch, tuple):
        moved = [__transfer_data_to_device(x, device, gpu_id) for x in batch]
        # when namedtuple: rebuild with the same type so field names survive
        if hasattr(batch, '_fields'):
            return type(batch)(*moved)
        return tuple(moved)

    # when dict: values are replaced in place (the key set does not change)
    if isinstance(batch, dict):
        for k, v in batch.items():
            batch[k] = __transfer_data_to_device(v, device, gpu_id)
        return batch

    # nothing matches, return the value as is without transform
    return batch

In [213]:
# Debug probe: prints 'callable' only if the first element of `batch` exposes a callable `cuda` attribute.
if callable(getattr(batch[0], 'cuda', None)):
    print('callable')

In [156]:
# Compare the size of the padded input ids with the size of model output element [1] for the same batch.
print(f"Input batch size: {batch[0]['input_ids'].size()}")
print(f"Model output size: {model(input_ids=batch[0]['input_ids'], attention_mask=batch[0]['attention_mask'], token_type_ids=batch[0]['token_type_ids'])[1].size()}")

Input batch size: torch.Size([2, 44])
Model output size: torch.Size([2, 768])


In [103]:
def full_batch():
    """Start a fresh pass over the global loader `dl` and fetch only its first batch (used for timing)."""
    next(iter(dl), None)

In [104]:
# Time `full_batch()` with IPython's %timeit magic.
%timeit full_batch()

1.76 s ± 25.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [158]:
# Inspect the pooler sub-module of the loaded BERT model.
model.pooler

BertPooler(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (activation): Tanh()
)

In [160]:
class DeepPavlovBertClassModel(pl.LightningModule):
    """
    Sentence classifier: pretrained BERT pooled output -> dropout -> linear layer.

    ``hparams`` attributes read by this class: ``model_path``, ``tokenizer_path``, ``dropout_prob``,
    ``num_classes``, ``weigths`` (sic, class weights for the loss), ``lr``, ``num_epochs``,
    ``train_batch_size``, ``valid_batch_size``, ``train_file_path``, ``val_file_path``.
    Optional: ``ignore_index`` (target value excluded from the loss; defaults to -100, i.e. no class
    is excluded).
    """
    def __init__(self, hparams):
        super(DeepPavlovBertClassModel, self).__init__()
        self.hparams = hparams
        #define model layers
        self.bert = transformers.BertModel.from_pretrained(hparams.model_path)
        self.drop = nn.Dropout(hparams.dropout_prob)
        self.lin = nn.Linear(self.bert.pooler.dense.out_features, hparams.num_classes)
        #define loss, metric and softmax
        self.soft = nn.Softmax(dim=1)
        # Targets are class indices of whole sentences, not token ids, so index 0 is a real class
        # and must not be ignored; exclusion is only applied when the config asks for it.
        self.loss_fn = nn.CrossEntropyLoss(weight=hparams.weigths,
                                           ignore_index=getattr(hparams, 'ignore_index', -100))
        self.auc = roc_auc_score

    def forward(self, input):
        """
        Compute class logits for a batch.

        :param input:
            dict with ``input_ids``, ``attention_mask`` and ``token_type_ids`` tensors
        :return:
            logits produced by the linear layer from element [1] of the BERT output
        """
        #get sentence embeddings
        embs = self.bert(input_ids=input['input_ids'], attention_mask=input['attention_mask'], token_type_ids=input['token_type_ids'])[1]
        logits = self.lin(self.drop(embs))

        return logits

    def _macro_auc(self, logits, y):
        """
        Macro one-vs-one ROC AUC of ``softmax(logits)`` against the class indices ``y``.

        Returns a CPU float64 tensor; 0.0 when the metric raises ValueError (for example when a
        class has no positive sample in ``y``).
        """
        one_hot = np.eye(self.hparams.num_classes, dtype=int)[y.detach().cpu().numpy()]
        probs = self.soft(logits.detach()).cpu().numpy()
        try:
            score = self.auc(one_hot, probs, average='macro', multi_class='ovo')
        except ValueError:
            score = 0.0
        return torch.tensor(score, dtype=torch.float64)

    def prepare_data(self):
        """Read the train/validation parquet files, build both datasets and the total step count."""
        df_train = pd.read_parquet(self.hparams.train_file_path)
        df_val = pd.read_parquet(self.hparams.val_file_path)

        self.train_ds = SentencesDataset(features=df_train['question'].to_numpy(), target=df_train.drop('question', axis=1).to_numpy(),
                                         tokenizer_path=self.hparams.tokenizer_path)
        self.val_ds = SentencesDataset(features=df_val['question'].to_numpy(), target=df_val.drop('question', axis=1).to_numpy(),
                                       tokenizer_path=self.hparams.tokenizer_path)

        # Number of optimizer steps over the whole run; consumed by the LR schedule.
        self.num_train_steps = int(len(self.train_ds) / self.hparams.train_batch_size * self.hparams.num_epochs)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        loader = torch.utils.data.DataLoader(self.train_ds, collate_fn=self.train_ds.collate_fn,
                                             batch_size=self.hparams.train_batch_size,
                                             num_workers=4, shuffle=True)
        return loader

    def val_dataloader(self):
        """Loader over the validation dataset, collated by the validation dataset itself."""
        loader = torch.utils.data.DataLoader(self.val_ds, collate_fn=self.val_ds.collate_fn,
                                             batch_size=self.hparams.valid_batch_size,
                                             num_workers=4)
        return loader

    def training_step(self, batch, batch_idx):
        """Loss and batch AUC for one training batch."""
        x, y = batch

        preds = self(x)
        loss = self.loss_fn(preds, y)
        auc = self._macro_auc(preds, y)

        logs = {'train_loss': loss, 'train_auc': auc}

        # Logits are detached before being handed back: step outputs are kept until the end of the
        # epoch and would otherwise keep every batch's autograd graph alive.
        return {'loss': logs['train_loss'], 'log': logs, 'labels': y, 'preds': preds.detach()}

    def training_epoch_end(self, outputs):
        """Mean loss and AUC over all training batches of the epoch."""
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean()

        label = torch.cat([x['labels'] for x in outputs], dim=0)
        preds = torch.cat([x['preds'] for x in outputs], dim=0)
        auc = self._macro_auc(preds, label)

        logs = {'train_epoch_loss': avg_loss, 'train_epoch_auc': auc}
        return {'log': logs}

    def validation_step(self, batch, batch_nb):
        """Loss and batch AUC for one validation batch; both are sent to the logger right away."""
        x, y = batch

        preds = self(x)
        loss = self.loss_fn(preds, y)
        auc = self._macro_auc(preds, y)

        logs = {'val_loss': loss, 'val_auc': auc}
        self.logger.experiment.log(logs)

        return {'val_loss': logs['val_loss'], 'labels': y, 'preds': preds.detach()}

    def validation_epoch_end(self, outputs):
        """Mean loss and AUC over the whole validation set; ``val_epoch_auc`` is exposed for callbacks."""
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()

        label = torch.cat([x['labels'] for x in outputs], dim=0)
        preds = torch.cat([x['preds'] for x in outputs], dim=0)
        auc = self._macro_auc(preds, label)

        logs = {'val_epoch_loss': avg_loss, 'val_epoch_auc': auc}
        return {'val_epoch_auc': logs['val_epoch_auc'], 'log': logs}

    def configure_optimizers(self):
        """AdamW with weight decay disabled for biases / LayerNorm, plus a linear decay schedule."""
        param_optimizer = list(self.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        ]
        optimizer = AdamW(optimizer_parameters, lr=self.hparams.lr)
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=0,
                                                    num_training_steps=self.num_train_steps)

        # num_training_steps counts optimizer steps, so the schedule has to advance every step
        # rather than once per epoch.
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

    def predict(self, texts, batch_size=32):
        """
        Class probabilities for raw texts.

        :param texts:
            texts to classify
        :param batch_size:
            number of texts per forward pass
        :return:
            tensor of softmax probabilities, one row per text, on the device used for inference
        """
        ds = SentencesDataset(features=texts, target=np.zeros(len(texts)), tokenizer_path=self.hparams.tokenizer_path)
        loader = torch.utils.data.DataLoader(ds, collate_fn=ds.collate_fn, batch_size=batch_size, num_workers=4)

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.to(device)
        # Inference must run in eval mode so dropout is switched off; the previous mode is restored.
        was_training = self.training
        self.eval()
        preds = []
        try:
            with torch.no_grad():
                for batch in tqdm(loader, total=len(loader), desc=f"Predicting on {device}"):
                    #move to proper device
                    features = {name: tensor.to(device) for name, tensor in batch[0].items()}
                    preds.append(self.soft(self(features)))
        finally:
            self.train(was_training)

        return torch.cat(preds, dim=0)

In [161]:
# Load the class weights used by the config cell (it calls len() and .values() on the result).
# NOTE(review): pickle.load can execute arbitrary code -- only open files from a trusted source.
with open('data/label_weights.pkl', 'rb') as f:
    class_weights = pickle.load(f)

In [201]:
from argparse import Namespace

# Hyper-parameters and file locations consumed by DeepPavlovBertClassModel.
# 'weigths' keeps its spelling because the model reads the attribute under that exact name.
config = Namespace(**{
    'num_epochs': 5,
    'train_batch_size': 2,
    'valid_batch_size': 4,
    'dropout_prob': 0.1,
    'num_classes': len(class_weights),
    'lr': 3e-5,
    'weigths': torch.tensor(list(class_weights.values())),
    'model_path': 'DeepPavlov/rubert-base-cased-sentence',
    'tokenizer_path': 'DeepPavlov/rubert-base-cased-sentence',
    'train_file_path': 'data/train.parquet',
    'val_file_path': 'data/val.parquet',
})

In [169]:
import gc

def memory_cleanup():
    """
    Release memory that is no longer referenced.

    Runs the garbage collector and then, when CUDA is available, returns PyTorch's cached GPU
    blocks to the driver. Tensors that are still referenced somewhere (for example by a notebook
    variable) are NOT freed; drop those references first.
    https://github.com/huggingface/transformers/issues/1742
    """
    # Looping over gc.get_objects() and `del`-ing the loop variable only unbinds that local name
    # and frees nothing, so collecting unreachable objects is all that can be done here.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [205]:
# Invoke the cleanup helper (garbage collection followed by a CUDA cache release).
memory_cleanup()

In [178]:
# Instantiate the classifier from `config`; this rebinds `model`, which earlier cells used for the bare BERT encoder.
model = DeepPavlovBertClassModel(config)

In [183]:
# Predict class probabilities for every text in `features`, 4 texts per batch.
# The recorded run needed about 52 minutes for 17922 batches on cuda.
preds = model.predict(features, batch_size=4)

Predicting on cuda: 100%|██████████| 17922/17922 [51:56<00:00,  5.75it/s]


In [197]:
# NaN check on the predictions: a single NaN anywhere makes the overall sum NaN.
array = preds.cpu().numpy()
array_sum = np.sum(array)
array_has_nan = np.isnan(array_sum)
print(array_has_nan)

False


In [196]:
# Replace NaN/inf entries with finite numbers; `array` is overwritten with the result.
array = np.nan_to_num(array)

In [198]:
# Macro ROC AUC of the predictions against the test labels.
# NOTE(review): no training cell precedes this one in file order, so the recorded ~0.49 presumably
# reflects a classifier whose linear head is still untrained -- confirm.
roc_auc_score(labels.to_numpy(), array, average='macro', multi_class='ovo')

0.49466620870990863

In [206]:
# Fresh model for the training run.
model = DeepPavlovBertClassModel(config)

run_name = 'test'
wandb_logger = WandbLogger(project='ODS_QA', name=run_name)
stop_patience = 4 #it's 2, but there is a bug in lightning, where early_stopping_callback called twice
# Stop when the validation AUC has not improved for `stop_patience` checks.
early_stopping = EarlyStopping(monitor='val_epoch_auc',
                               min_delta=0,
                               patience=stop_patience,
                               verbose=True,
                               mode='max')
# AUC is a higher-is-better metric, so the mode has to be given explicitly: automatic mode
# detection does not recognise this monitor name and would keep the checkpoint with the LOWEST AUC.
checkpoint_callback = ModelCheckpoint(filepath=wandb_logger.experiment.dir+'/'+run_name+'_{epoch}-{val_epoch_auc:.2f}',
                                      save_top_k=1,
                                      monitor='val_epoch_auc',
                                      mode='max')
# fast_dev_run=True only pushes a single batch through the loops as a smoke test.
# NOTE(review): the recorded run stopped with CUDA out of memory on a 3.78 GiB GPU.
trainer = Trainer(fast_dev_run=True, gpus=1, early_stopping_callback=early_stopping, checkpoint_callback=checkpoint_callback, logger=wandb_logger, max_epochs=config.num_epochs)
trainer.fit(model)

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
Running in fast_dev_run mode: will run a full train, val and test loop using a single batch
GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]

    | Name                                             | Type              | Params
-----------------------------------------------------------------------------------
0   | bert                                             | BertModel         | 177 M 
1   | bert.embeddings                                  | BertEmbeddings    | 92 M  
2   | bert.embeddings.word_embeddings                  | Embedding         | 91 M  
3   | bert.embeddings.position_embeddings              | Embedding         | 393 K 
4   | bert.embeddings.token_type_embeddings            | Embedding         | 1 K   
5   | bert.embeddings.LayerNorm                        | LayerNorm         | 1 K   
6   | bert.

RuntimeError: CUDA out of memory. Tried to allocate 18.00 MiB (GPU 0; 3.78 GiB total capacity; 2.29 GiB already allocated; 27.62 MiB free; 2.38 GiB reserved in total by PyTorch)