# Custom DeBERTa V3

> Model definitions go here

In [None]:
#| default_exp models

In [None]:
#| export
import torch
import torch.nn as nn
import transformers
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    BertForSequenceClassification,
)

In [None]:
import pandas as pd

from en_grammar_checker.config import Config
from en_grammar_checker.datasets import get_train_data_loader

In [None]:
#| hide
# Shared settings object; later cells read base_model_name, train_path and
# num_classes from it.
cnfg = Config()

In [None]:
# Load the configuration of the pretrained base model. It is handed to
# AutoModel further down, and its pooler_hidden_size is inspected later on.
model_config = AutoConfig.from_pretrained(cnfg.base_model_name)



In [None]:
# Load the tokenizer that belongs to the base model so that its behaviour
# (special tokens, returned fields) can be inspected in the cells below.
my_tokenizer = AutoTokenizer.from_pretrained(cnfg.base_model_name)



In [None]:
# dir(my_tokenizer)

In [None]:
my_tokenizer.encode_plus

<bound method PreTrainedTokenizerBase.encode_plus of DebertaV2TokenizerFast(name_or_path='microsoft/deberta-v3-large', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	128000: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=

In [None]:
# Tokenize a few near-identical sentences to see how punctuation and a
# hand-written "[CLS]" prefix show up in the encoded ids.
sample_texts = [
    "I am done.",
    "Thanks! I am done",
    "I am done!",
    "[CLS]I am done!",
]

# One encoding is printed per line, in the same order as the texts above.
for sample_text in sample_texts:
    print(my_tokenizer(sample_text))

{'input_ids': [1, 273, 481, 619, 260, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}
{'input_ids': [1, 1195, 300, 273, 481, 619, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [1, 273, 481, 619, 300, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}
{'input_ids': [1, 1, 273, 481, 619, 300, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [None]:
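# Note: the tokenizer adds the [CLS] start token and the [SEP] end-of-sentence token on its own,
# so writing "[CLS]" in the text yields a duplicate (two leading 1s in the last output above).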

### Check Model Architecture

In [None]:
# Instantiate the pretrained base model with the config loaded earlier; the
# next cell displays its module structure.
base_model = AutoModel.from_pretrained(cnfg.base_model_name, config=model_config)

In [None]:
base_model

DebertaV2Model(
  (embeddings): DebertaV2Embeddings(
    (word_embeddings): Embedding(128100, 1024, padding_idx=0)
    (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
    (dropout): StableDropout()
  )
  (encoder): DebertaV2Encoder(
    (layer): ModuleList(
      (0-23): 24 x DebertaV2Layer(
        (attention): DebertaV2Attention(
          (self): DisentangledSelfAttention(
            (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (pos_dropout): StableDropout()
            (dropout): StableDropout()
          )
          (output): DebertaV2SelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
            (dropout): StableDropout()
          )
        )
        

In [None]:
# The training file is tab-separated and has no header row, so the column
# names are supplied explicitly.
df_train = pd.read_csv(
    f"{cnfg.train_path}",
    sep="\t",
    names=["sentence_source", "label", "label_notes", "sentence"],
    header=None,
)

# Pull a single batch from the training loader to use as sample model input.
# X and X2 are passed below as input_ids and attention_mask respectively;
# Y is presumably the label tensor -- confirm against get_train_data_loader.
train_dataloader = get_train_data_loader(cnfg, df_train)
train_dataloader_iterator = iter(train_dataloader)
X, X2, Y = next(train_dataloader_iterator)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Inspection only: run the forward pass without recording an autograd graph,
# which would otherwise keep the encoder's intermediate activations alive.
with torch.no_grad():
    output_ = base_model(input_ids=X, attention_mask=X2)

In [None]:
# dir(output_)

In [None]:
output_.last_hidden_state.shape

torch.Size([8, 512, 1024])

In [None]:
# The hidden state at sequence position 0 belongs to the [CLS] token.
# Integer indexing drops the sequence axis, leaving one vector per example.
cls_embedding = output_.last_hidden_state[:, 0, :]

In [None]:
cls_embedding.shape

torch.Size([8, 1024])

In [None]:
# cls_embedding

In [None]:
model_config.pooler_hidden_size

1024

In [None]:
#| export
class EnDeBertaClassifier(nn.Module):
    """Sequence classifier: a pretrained encoder plus a small head on the first token.

    The encoder named by ``cnfg.base_model_name`` is loaded through ``AutoModel``.
    The hidden state at sequence position 0 (the ``[CLS]`` token for the DeBERTa
    tokenizer) is passed through dropout -> linear -> tanh -> linear -> softmax.

    Args:
        cnfg: Configuration object. This class reads ``cnfg.base_model_name``
            (checkpoint to load) and ``cnfg.num_classes`` (width of the output).

    Raises:
        ValueError: If the loaded model config exposes neither ``hidden_size``
            nor ``pooler_hidden_size``.
    """

    def __init__(self, cnfg):
        super().__init__()

        self.model_config = AutoConfig.from_pretrained(cnfg.base_model_name)
        self.base_model = AutoModel.from_pretrained(
            cnfg.base_model_name, config=self.model_config
        )

        # The head consumes the encoder's hidden state, so its input width must
        # be ``hidden_size``; ``pooler_hidden_size`` only sets the width of the
        # intermediate projection. Each value falls back to the other so that
        # configs defining just one of them still work, and the layer shapes are
        # unchanged whenever both are equal.
        pooler_size = getattr(self.model_config, "pooler_hidden_size", None)
        hidden_size = getattr(self.model_config, "hidden_size", None) or pooler_size
        pooler_size = pooler_size or hidden_size
        if hidden_size is None:
            raise ValueError(
                "Model config defines neither `hidden_size` nor `pooler_hidden_size`."
            )

        ## Classifier
        # Layer order (and therefore the state_dict keys) is kept as
        # dropout, linear, tanh, linear, softmax.
        # NOTE(review): the head ends in Softmax, so forward() returns class
        # probabilities rather than raw logits. nn.CrossEntropyLoss applies
        # log-softmax itself, so pairing it with this output would normalise
        # twice -- confirm which loss the training loop uses.
        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(hidden_size, pooler_size),
            nn.Tanh(),
            nn.Linear(pooler_size, cnfg.num_classes),
            nn.Softmax(dim=1),
        )

    def forward(self, input_ids, attention_mask):
        """Classify a batch of tokenized sequences.

        Args:
            input_ids: Token ids, forwarded unchanged to the encoder
                (assumed to be shaped (batch, seq_len) -- TODO confirm).
            attention_mask: Attention mask, forwarded unchanged to the encoder.

        Returns:
            Tensor with one row per example and ``cnfg.num_classes`` columns.
            Rows are softmax-normalised, so each row sums to 1.
        """
        encoder_output = self.base_model(
            input_ids=input_ids, attention_mask=attention_mask
        )
        # Position 0 holds the [CLS] token; integer indexing drops the sequence axis.
        cls_embedding = encoder_output.last_hidden_state[:, 0]

        class_probs = self.classifier(cls_embedding)

        return class_probs

In [None]:
my_model = EnDeBertaClassifier(cnfg)



In [None]:
my_model

EnDeBertaClassifier(
  (base_model): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=

In [None]:
# Shape check only, so skip autograd bookkeeping for this forward pass.
with torch.no_grad():
    logits = my_model(input_ids=X, attention_mask=X2)

In [None]:
logits.shape

torch.Size([8, 2])

In [None]:
#| hide
import nbdev

nbdev.nbdev_export()