# DeBERTa + RoBERTa
This is an ensemble of deberta-base and roberta-base that highlights how the two tokenizers differ
in the character ranges reported by their offset mappings.

The DeBERTa tokenizer includes the preceding space ' ' in its offset_mapping, while many other tokenizers, including RoBERTa's, do not.

### tokenize('Hello world!')

begin, end, text[begin:end]


DeBERTa
```
  0   5 'Hello'
  5  11 ' world'
 11  12 '!'
```

RoBERTa
```
  0   5  'Hello'
  6  11  'world'
 11  12  '!'
```

If you do not handle this difference appropriately, you might:

* mark the first white space of a span as positive, or
* mark the white spaces between positive words as negative,

and get unnecessarily worse scores.

This notebook:
1. *Always* assigns each token's probability to the preceding ' ' as well when building character-wise probabilities (DeBERTa style).
2. Removes the leading ' ' from each predicted location.

You can do more sophisticated location extraction, but it is always
important to account for the difference in the offset mappings when you map token probabilities to character-wise probabilities.

## Scores

```
1-fold deberta-base: 0.856 (Version 2)
1-fold roberta-base: 0.846 (Version 3)
Mean of 1-fold each: 0.859 (Version 4)
Mean of 5-fold each: 0.868 (Version 5)
Weighted mean:       This version
```

The ensemble is a 0.6/0.4 weighted mean of DeBERTa and RoBERTa, with 5 folds each.

## Reference

The model is Nakama's great baseline:

https://www.kaggle.com/code/yasufuminakama/nbme-deberta-base-baseline-train/notebook



In [None]:
import numpy as np
import pandas as pd
import os
import sys
import time
import pickle

import torch
import torch.nn as nn

from torch.utils.data import DataLoader
from transformers import AutoConfig, AutoModel, AutoTokenizer

%env TOKENIZERS_PARALLELISM=false

# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Data

In [None]:
di = '/kaggle/input/nbme-score-clinical-patient-notes/'

# Rows to predict: id, case_num, pn_num, feature_num
test = pd.read_csv(di + 'test.csv')

# Features: feature_num -> case_num
features = pd.read_csv(di + 'features.csv')
# Fix typo I-year -> 1-year in the row labelled 27
features.loc[27, 'feature_text'] = 'Last-Pap-smear-1-year-ago'

# Patient notes: Main texts
patient_notes = pd.read_csv(di + 'patient_notes.csv')

# Attach the feature columns and the patient-note columns to every test row.
# Left joins keep each test row even when no match is found.
test = (
    test
    .merge(features, on=['feature_num', 'case_num'], how='left')
    .merge(patient_notes, on=['pn_num', 'case_num'], how='left')
)

test.head()

In [None]:
class Dataset(torch.utils.data.Dataset):
    """
    Fixed-length view over tokenized samples.

    Dataset(data)
      data (np.array or list-like): dicts holding 'input_ids' (token ids)
        and 'n' (number of tokens)
      max_length (int): every item is zero-padded or truncated to this length
    """
    def __init__(self, data, *, max_length=512):
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        """
        Return sample i padded/truncated to max_length.

        Returns: dict
          input_ids (np.array[int]): token ids followed by zeros
          attention_mask (np.array[int]): 1 on kept tokens, 0 on padding
          n (int): number of tokens kept, at most max_length
        """
        d = self.data[i]
        n = min(d['n'], self.max_length)

        # Slice the source as well: without `[:n]` a sample longer than
        # max_length raises a broadcast error instead of being truncated.
        # NOTE: truncation keeps the first max_length tokens only.
        input_ids = np.zeros(self.max_length, dtype=int)
        input_ids[:n] = d['input_ids'][:n]

        attention_mask = np.zeros(self.max_length, dtype=int)
        attention_mask[:n] = 1

        return {'input_ids': input_ids,
                'attention_mask': attention_mask,
                'n': n}

## Tokenize

In [None]:
def create_data(train, tokenizer, *, max_length=1024):
    """
    Tokenize every (patient note, feature text) pair of a DataFrame.

    Args:
      train (pd.DataFrame): rows with `id`, `pn_history` and `feature_text`
      tokenizer: callable tokenizer exposing `sep_token_id`
      max_length (int): truncation length handed to the tokenizer

    Returns: np.array[dict], one dict per row
      row_id: value of the `id` column
      input_ids (np.array[int])
      text (str): the patient note
      n (int): seq length or number of tokens
      offset_mapping: offsets as returned by the tokenizer
    """
    sep_id = tokenizer.sep_token_id
    # text and feature_text are separated by [SEP]; roberta emits sep sep
    # between them, hence one more separator in total.
    is_roberta = tokenizer.__class__.__name__ == 'RobertaTokenizerFast'
    expected_seps = 3 if is_roberta else 2

    samples = []
    for _, row in train.iterrows():
        note = row.pn_history

        encoded = tokenizer(note, row.feature_text,
                            add_special_tokens=True, max_length=max_length,
                            truncation=True,
                            return_offsets_mapping=True)

        token_ids = np.array(encoded['input_ids'], dtype=int)
        assert np.sum(token_ids == sep_id) == expected_seps

        # No padding is requested, so every position must be a real token
        mask = np.array(encoded['attention_mask'])
        assert np.all(mask == 1)

        samples.append({'row_id': row['id'],
                        'input_ids': token_ids,
                        'text': note,
                        'n': len(token_ids),
                        'offset_mapping': encoded['offset_mapping']})

    return np.array(samples)



## Model

In [None]:
class Model(nn.Module):
    """
    Transformer encoder followed by dropout and a one-unit linear head
    applied to every token position.

    Args:
      model_dir: location passed to AutoConfig / AutoModel
      dropout (float): dropout probability before the linear head
      pretrained (bool): load transformer weights from model_dir when True,
        otherwise build the transformer from its config only
    """
    def __init__(self, model_dir, *, dropout=0.2, pretrained=True):
        super().__init__()

        config = AutoConfig.from_pretrained(model_dir, add_pooling_layer=False)
        self.transformer = (
            AutoModel.from_pretrained(model_dir, config=config)
            if pretrained
            else AutoModel.from_config(config)
        )

        self.fc_dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(config.hidden_size, 1)

        self._init_weights(self.fc, config)

    def _init_weights(self, module, config):
        """Draw weights from a normal with std config.initializer_range; zero the bias if present."""
        module.weight.data.normal_(mean=0.0, std=config.initializer_range)
        if module.bias is None:
            return
        module.bias.data.zero_()

    def forward(self, input_ids, attention_mask):
        """Return one raw score (no sigmoid applied) per token position."""
        encoded = self.transformer(input_ids, attention_mask)
        # Per-token hidden states; last dimension is config.hidden_size
        hidden = encoded['last_hidden_state']

        return self.fc(self.fc_dropout(hidden))

## Predict

In [None]:
def character_prob(pred):
    """
    Spread token-level probabilities over the characters of the note.

    Args:
      pred (dict): pred['y_pred'] (one probability per token) and pred['d']
        (the sample, with its 'text' and 'offset_mapping')

    Returns:
      y_prob (np.array[float]): character-level probabilities, one per
        character of the text; characters not reached stay 0
    """
    sample = pred['d']
    char_probs = np.zeros(len(sample['text']))

    cursor = 0  # end offset of the last token written so far
    for token_prob, span in zip(pred['y_pred'], sample['offset_mapping']):
        begin, end = span

        # A (0, 0) offset after real tokens marks the end of the patient note
        reached_note_end = cursor > 0 and begin == 0 and end == 0
        if reached_note_end:
            break

        # Start at the previous token's end, not at `begin`, so the ' '
        # in front of the token receives the token's probability too.
        char_probs[cursor:end] = token_prob
        cursor = end

    return char_probs


def predict(data, model_info, *, batch_size=12):
    """
    Run one trained model over `data` and return its predictions.

    Args:
      data (list[dict]): tokenized samples (row_id, input_ids, text, n,
        offset_mapping)
      model_info (sequence): transformer name (deberta-base), model name
        (nbaseline), run name (deberta_base), fold index and a flag that
        selects the early-stopping ('_best') weights; later entries are
        not read here
      batch_size (int): batch size of the DataLoader

    Result: preds, probs
      preds (list[dict]): token-level model prediction
      probs (list[dict]): character-level probability
    """
    print(model_info)

    transformer_name, model_name, run_name, ifold, use_best = (
        model_info[k] for k in range(5))
    suffix = '_best' if use_best else ''  # True = with early stopping

    transformer_dir = '/kaggle/input/clinical-public/transformers/%s' % transformer_name
    weight_dir = '/kaggle/input/clinical-public/%s_%s' % (model_name, run_name)  # nbaseline_deberta_base

    # Data
    loader = DataLoader(Dataset(data), batch_size=batch_size)

    # Model: build the architecture only, then load the trained weights
    net = Model(transformer_dir, pretrained=False)

    weight_filename = '%s/model%d%s.pytorch' % (weight_dir, ifold, suffix)
    state = torch.load(weight_filename, map_location=device)
    net.load_state_dict(state)
    net.eval()
    net.to(device)

    preds = []
    probs = []
    for batch in loader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)

        with torch.no_grad():
            logits = net(ids, mask)

        token_probs = logits.sigmoid().cpu().numpy()

        for row, length in enumerate(batch['n']):
            # The loader is not shuffled, so the number of samples already
            # collected is the position of this sample in `data`.
            sample = data[len(preds)]

            # Keep only the real tokens; padding positions are dropped
            pred = {'d': sample,
                    'y_pred': token_probs[row, :length].copy()}

            prob = {'row_id': sample['row_id'],
                    'text':   sample['text'],
                    'y_prob': character_prob(pred)}

            preds.append(pred)
            probs.append(prob)

    del net

    return preds, probs


def add_probs(probs, probs2, w):
    """
    probs += w * prob2

    Accumulates in place. On the first call (probs is None) probs2 itself is
    scaled by w and returned, so it becomes the accumulator and is modified.

    Args:
      probs (list[dict] or None): running weighted sum, or None to start one
      probs2 (list[dict]): predictions to add; each has row_id and y_prob
      w (float): weight applied to probs2

    Returns: the accumulator list
    """
    if probs is not None:
        assert len(probs) == len(probs2)

        for total, extra in zip(probs, probs2):
            # Both lists must describe the same rows in the same order
            assert total['row_id'] == extra['row_id']
            total['y_prob'] += w * extra['y_prob']

        return probs

    # First model: scale its probabilities and adopt the list as accumulator
    for entry in probs2:
        entry['y_prob'] *= w
    return probs2

In [None]:
# Weighted mean of 0.6 deberta - 0.4 roberta
# Each entry: (transformer name, model name, run name, fold, best flag, weight)
models = [('deberta-base', 'nbaseline', 'deberta_base', ifold, True, 0.6 / 5)
          for ifold in range(5)] + \
         [('roberta-base', 'nbaseline', 'roberta_base', ifold, True, 0.4 / 5)
          for ifold in range(5)]

n = 0
w_sum = 0
probs = None

# Tokenization depends only on the transformer, not on the fold, so the test
# set is tokenized once per transformer and reused for all of its folds
# instead of being re-tokenized for every model.
data_by_transformer = {}

for model in models:
    transformer_name = model[0]  # deberta-base
    if transformer_name not in data_by_transformer:
        transformer_dir = '/kaggle/input/clinical-public/transformers/%s/tokenizer' % transformer_name
        tokenizer = AutoTokenizer.from_pretrained(transformer_dir)
        data_by_transformer[transformer_name] = create_data(test, tokenizer)
    data = data_by_transformer[transformer_name]
    w = model[5]

    # Only the character-level probabilities are needed for the ensemble
    _, probs1 = predict(data, model)
    probs = add_probs(probs, probs1, w)
    n += 1
    w_sum += w


print('Weighted average of %d models. Weight sum %.3f' % (n, w_sum))
# The weights must sum to 1 for `probs` to be a weighted mean
assert 0.999 < w_sum < 1.001

In [None]:
def format_spans(indices):
    """
    Collapse character indices into 'begin end' spans joined by ';'.

    Runs of consecutive indices (each one greater than the previous by 1)
    form one span; `end` is exclusive. An empty input gives ''.
    """
    spans = []
    start = stop = None  # current run as [start, stop)
    for idx in indices:
        if start is not None and idx == stop:
            # Index continues the current run
            stop = idx + 1
            continue

        if start is not None:
            # Run is broken: store it before opening a new one
            spans.append((start, stop))
        start, stop = idx, idx + 1

    if start is not None:
        spans.append((start, stop))

    return ';'.join('%d %d' % span for span in spans)


def create_submission(probs, *, th=0.5):
    """
    Threshold character-level probabilities into the submission table.

    Args:
      probs (list[dict]): character-level predictions, each with row_id,
        text and y_prob
      th (float): a character is positive when its probability is >= th

    Returns: pd.DataFrame with columns `id` and `location`
    """
    assert len(probs) == len(test)

    ids = [entry['row_id'] for entry in probs]
    locs = []

    for entry in probs:
        chars = entry['text']
        char_probs = entry['y_prob']

        positives = []
        prev = None  # last index of the open span; None while outside a span
        for idx, (ch, p) in enumerate(zip(chars, char_probs)):
            if not (p >= th):
                prev = None  # Negative; reset span
                continue

            if prev is None:
                if ch == ' ':
                    continue  # Do not include first space in span
            else:
                assert prev + 1 == idx  # Positive character is continuing

            prev = idx
            positives.append(idx)

        locs.append(format_spans(positives))

    return pd.DataFrame({'id': ids, 'location': locs})


# Turn the ensembled character-level probabilities into the submission table
# (default threshold th=0.5) and preview its first rows.
submit = create_submission(probs)
submit.head()

Example:

```
	id	location
0	00016_000	696 724
1	00016_001	668 693
2	00016_002	203 217
3	00016_003	70 91
4	00016_004	222 258
```

In [None]:
# Write the submission table to submission.csv without the DataFrame index
submit.to_csv('submission.csv', index=False)

## Additional info

### KFold

```
nfold = 5
kfold = StratifiedGroupKFold(nfold, shuffle=True, random_state=42)
groups = train['pn_num'].values
cases = train['case_num'].values

for ifold, (idx_train, idx_val) in enumerate(kfold.split(data, cases, groups=groups)):
    pass
```

### Training

RoBERTa

```
get_cosine_schedule_with_warmup

learning rate: 1e-4 with 0.1 epochs of warmup
5 epochs
```

DeBERTa

Same as the original baseline.

