# KoELECTRA

In [None]:
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from transformers.activations import get_activation
from transformers import (
  ElectraPreTrainedModel,
  ElectraModel,
  ElectraConfig,
  ElectraTokenizer,
  BertConfig,
  BertTokenizer
)

# MODEL_CLASSES = {
#     "koelectra-base": (ElectraConfig, koElectraForSequenceClassification, ElectraTokenizer),
#     "koelectra-small": (ElectraConfig, koElectraForSequenceClassification, ElectraTokenizer),
#     "koelectra-base-v2": (ElectraConfig, koElectraForSequenceClassification, ElectraTokenizer),
#     "koelectra-small-v2": (ElectraConfig, koElectraForSequenceClassification, ElectraTokenizer),
# }


# def load_tokenizer(args):
#   return MODEL_CLASSES[args.model_type][2].from_pretrained(args.model_name_or_path)


class ElectraClassificationHead(nn.Module):
  """Sentence-level classification head fed by the first-token state.

  Pipeline: dropout -> linear (hidden -> 4 * hidden) -> GELU -> dropout ->
  linear (4 * hidden -> num_labels).
  """

  def __init__(self, config, num_labels):
    """Build the two projections and the shared dropout layer.

    Args:
      config: object exposing ``hidden_size`` and ``hidden_dropout_prob``.
      num_labels: width of the output projection.
    """
    super().__init__()
    hidden_size = config.hidden_size
    expanded_size = 4 * hidden_size
    self.dense = nn.Linear(hidden_size, expanded_size)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.out_proj = nn.Linear(expanded_size, num_labels)

  def forward(self, features, **kwargs):
    """Map encoder states to logits.

    Args:
      features: tensor indexed as ``[:, 0, :]``; only position 0 of the
        second axis is used.
      **kwargs: accepted and ignored.

    Returns:
      The output of ``out_proj`` for the selected position.
    """
    # BERT uses tanh at this point; the ELECTRA authors appear to use gelu.
    gelu = get_activation("gelu")

    first_token = features[:, 0, :]  # <s> token (equiv. to [CLS])
    projected = self.dense(self.dropout(first_token))
    activated = self.dropout(gelu(projected))
    return self.out_proj(activated)

class koElectraForSequenceClassification(ElectraPreTrainedModel):
  """ELECTRA encoder followed by ``ElectraClassificationHead``."""

  def __init__(self,
               config,
               num_labels=None):
    """Create the encoder and the classification head.

    Args:
      config: ELECTRA configuration passed to the base class and the encoder.
      num_labels: number of output logits. When omitted, ``config.num_labels``
        is used.
    """
    super().__init__(config)
    self.num_labels = config.num_labels if num_labels is None else num_labels
    self.electra = ElectraModel(config)
    self.classifier = ElectraClassificationHead(config, self.num_labels)
    self.init_weights()

  def forward(
          self,
          input_ids=None,
          attention_mask=None,
          token_type_ids=None,
          position_ids=None,
          head_mask=None,
          inputs_embeds=None,
          labels=None,
          output_attentions=None,
          output_hidden_states=None,
  ):
    r"""
    Run the encoder and the classification head, optionally computing a loss.

    labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
        Labels for computing the sequence classification/regression loss.
        Indices should be in :obj:`[0, ..., self.num_labels - 1]`.
        If :obj:`self.num_labels == 1` a regression loss is computed (Mean-Square loss),
        If :obj:`self.num_labels > 1` a classification loss is computed (Cross-Entropy).

    Returns a tuple: ``(loss, logits, ...)`` when ``labels`` is given, otherwise
    ``(logits, ...)``. The trailing entries are whatever the encoder returned
    after its first element.
    """
    # Pass everything by keyword: the positional order of ElectraModel.forward
    # differs between transformers releases (newer ones insert encoder/cache
    # arguments after ``inputs_embeds``), so positional passing can route
    # ``output_attentions`` / ``output_hidden_states`` into the wrong slots.
    discriminator_hidden_states = self.electra(
      input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids,
      position_ids=position_ids,
      head_mask=head_mask,
      inputs_embeds=inputs_embeds,
      output_attentions=output_attentions,
      output_hidden_states=output_hidden_states,
    )

    sequence_output = discriminator_hidden_states[0]
    logits = self.classifier(sequence_output)

    # add hidden states and attention if they are here
    outputs = (logits,) + tuple(discriminator_hidden_states[1:])

    if labels is not None:
      if self.num_labels == 1:
        #  We are doing regression
        loss_fct = MSELoss()
        loss = loss_fct(logits.view(-1), labels.view(-1))
      else:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
      outputs = (loss,) + outputs

    return outputs  # (loss), (logits), (hidden_states), (attentions)

def koelectra_input(tokenizer, str, device = None, max_seq_len = 512):
  """Encode one text into fixed-length model inputs with a batch axis of 1.

  Args:
    tokenizer: object with ``encode(text)`` returning a sequence of token ids.
      Its ``pad_token_id`` attribute is used for padding when present and not
      ``None``; otherwise 0 is used.
    str: the text to encode (name kept for caller compatibility).
    device: optional device for the returned tensors.
    max_seq_len: exact length of the returned sequences.

  Returns:
    Dict with ``'input_ids'`` and ``'attention_mask'`` tensors, each of shape
    ``(1, max_seq_len)``.

  Raises:
    ValueError: if ``max_seq_len`` is smaller than 1.
  """
  if max_seq_len < 1:
    raise ValueError(f"max_seq_len must be >= 1, got {max_seq_len}")

  # Copy so the list returned by the tokenizer is never mutated.
  token_ids = list(tokenizer.encode(str))

  if len(token_ids) > max_seq_len:
    # Without truncation the pad length would be negative and the output
    # would exceed max_seq_len. The final id is kept on the assumption that
    # encode() ends with a closing special token (e.g. [SEP]) - confirm for
    # the tokenizer in use.
    token_ids = token_ids[:max_seq_len - 1] + token_ids[-1:]

  pad_id = getattr(tokenizer, "pad_token_id", None)
  if pad_id is None:
    pad_id = 0

  padding_length = max_seq_len - len(token_ids)
  input_ids = token_ids + [pad_id] * padding_length
  attention_mask = [1] * len(token_ids) + [0] * padding_length

  return {
    'input_ids': torch.tensor([input_ids], device=device),
    'attention_mask': torch.tensor([attention_mask], device=device),
  }


In [None]:
# Load the KoELECTRA base discriminator checkpoint into the custom
# classification model defined above.
# num_labels=359 is the extra constructor argument of
# koElectraForSequenceClassification (presumably forwarded to __init__ by
# from_pretrained - confirm against the installed transformers version).
model_name_or_path = "monologg/koelectra-base-discriminator"
electra_config = ElectraConfig.from_pretrained(model_name_or_path)
model = koElectraForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
                                                            config=electra_config,
                                                            num_labels=359)

In [None]:
# Print the module tree of the classification model.
print(model)

koElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32200, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm):

In [None]:
from transformers import ElectraForPreTraining

from torchsummary import summary
from pytorch_model_summary import summary as pt_summary

In [None]:
# Load the KoELECTRA base discriminator checkpoint from the Hugging Face Hub
# using the pre-training model class.
model_name = "monologg/koelectra-base-discriminator"
model99 = ElectraForPreTraining.from_pretrained(model_name)

# Count every element of every parameter tensor of a model
def count_parameters(model99):
    """Return the total number of scalar values across ``model99.parameters()``."""
    total = 0
    for tensor in model99.parameters():
        total += tensor.numel()
    return total

# Print the parameter count with thousands separators.
print(f"Number of parameters: {count_parameters(model99):,}")

Number of parameters: 110,771,713


In [None]:
# Load the KoELECTRA base generator checkpoint from the Hugging Face Hub.
# The generator is a masked-language model, so it is loaded with
# ElectraForMaskedLM. Loading it with ElectraForPreTraining attaches a
# discriminator head whose weights are not in this checkpoint (transformers
# warns that 'discriminator_predictions.*' is newly initialized), so any
# parameter count taken from that object does not describe the generator.
from transformers import ElectraForMaskedLM

model_name = "monologg/koelectra-base-generator"
model99 = ElectraForMaskedLM.from_pretrained(model_name)

# Count every element of every parameter tensor of a model
def count_parameters(model99):
    """Return the total number of scalar values across ``model99.parameters()``."""
    total = 0
    for tensor in model99.parameters():
        total += tensor.numel()
    return total

# Print the parameter count with thousands separators.
print(f"Number of parameters: {count_parameters(model99):,}")


config.json:   0%|          | 0.00/463 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/140M [00:00<?, ?B/s]

Some weights of ElectraForPreTraining were not initialized from the model checkpoint at monologg/koelectra-base-generator and are newly initialized: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of parameters: 34,865,921


In [None]:
# Inspect the model structure: print each parameter's qualified name
# together with the shape of its tensor.
for name, param in model.named_parameters():
    print(name, param.shape)

electra.embeddings.word_embeddings.weight torch.Size([32200, 768])
electra.embeddings.position_embeddings.weight torch.Size([512, 768])
electra.embeddings.token_type_embeddings.weight torch.Size([2, 768])
electra.embeddings.LayerNorm.weight torch.Size([768])
electra.embeddings.LayerNorm.bias torch.Size([768])
electra.encoder.layer.0.attention.self.query.weight torch.Size([768, 768])
electra.encoder.layer.0.attention.self.query.bias torch.Size([768])
electra.encoder.layer.0.attention.self.key.weight torch.Size([768, 768])
electra.encoder.layer.0.attention.self.key.bias torch.Size([768])
electra.encoder.layer.0.attention.self.value.weight torch.Size([768, 768])
electra.encoder.layer.0.attention.self.value.bias torch.Size([768])
electra.encoder.layer.0.attention.output.dense.weight torch.Size([768, 768])
electra.encoder.layer.0.attention.output.dense.bias torch.Size([768])
electra.encoder.layer.0.attention.output.LayerNorm.weight torch.Size([768])
electra.encoder.layer.0.attention.output.

In [None]:
# Summarize model architecture.
# The dummy batch holds random token ids in [0, 32200) - the size of the word
# embedding table printed above - with shape (1, 512).
dummy_input5 = torch.randint(0, 32200, (1, 512))  # (batch_size, sequence_length)
# max_depth=None with show_parent_layers=True lists nested layers with their parent path.
print(pt_summary(model, dummy_input5, show_input=True, max_depth=None, show_parent_layers=True))

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
                                                                                                       Parent Layers          Layer (type)           Input Shape         Param #     Tr. Param #
                                                   koElectraForSequenceClassification/ElectraModel/ElectraEmbeddings           Embedding-1              [1, 512]      24,729,600      24,729,600
                                                   koElectraForSequenceClassification/ElectraModel/ElectraEmbeddings           Embedding-2              [1, 512]           1,536           1,536
                                                   koElectraForSequenceClassification/ElectraModel/ElectraEmbeddings           Embedding-3              [1, 512]         393,216         393,216
                                   

---

# KoGPT2

In [None]:
!pip install kogpt2_transformers

Collecting kogpt2_transformers
  Downloading kogpt2_transformers-0.4.0-py3-none-any.whl.metadata (3.9 kB)
Downloading kogpt2_transformers-0.4.0-py3-none-any.whl (4.9 kB)
Installing collected packages: kogpt2_transformers
Successfully installed kogpt2_transformers-0.4.0


In [None]:
# model configuration
# logger = logging.getLogger(__name__)

#KoGPT2 configuration
# kogpt2_config = {
#     "initializer_range": 0.02,
#     "layer_norm_epsilon": 1e-05,
#     "n_ctx": 1024,
#     "n_embd": 768,
#     "n_head": 12,
#     "n_layer": 12,
#     "n_positions": 1024,
#     "vocab_size": 50000,
#     "activation_function": "gelu"
# }

import torch.nn as nn
from kogpt2_transformers import get_kogpt2_model


class DialogKoGPT2(nn.Module):
  """Thin ``nn.Module`` wrapper around the model from ``get_kogpt2_model()``."""

  def __init__(self):
    """Create the wrapper and load the wrapped model into ``self.kogpt2``."""
    super().__init__()
    self.kogpt2 = get_kogpt2_model()

  def generate(self,
               input_ids,
               do_sample=True,
               max_length=30,
               top_p=0.92,
               top_k=50,
               temperature=0.6,
               no_repeat_ngram_size=None,
               num_return_sequences=1,
               early_stopping=False,
               ):
    """Forward all arguments unchanged to the wrapped model's ``generate``.

    Only the default values are supplied by this wrapper; every option is
    passed on by keyword.
    """
    generation_options = {
      "do_sample": do_sample,
      "max_length": max_length,
      "top_p": top_p,
      "top_k": top_k,
      "temperature": temperature,
      "no_repeat_ngram_size": no_repeat_ngram_size,
      "num_return_sequences": num_return_sequences,
      "early_stopping": early_stopping,
    }
    return self.kogpt2.generate(input_ids, **generation_options)

  def forward(self, input, labels = None):
    """Call the wrapped model, passing ``labels`` only when it is not None."""
    label_kwargs = {} if labels is None else {"labels": labels}
    return self.kogpt2(input, **label_kwargs)



In [None]:
# Instantiate the KoGPT2 wrapper (its constructor calls get_kogpt2_model()).
model2 = DialogKoGPT2()

In [None]:
print(model2)  # config code not executed

DialogKoGPT2(
  (kogpt2): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50000, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D(nf=2304, nx=768)
            (c_proj): Conv1D(nf=768, nx=768)
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D(nf=3072, nx=768)
            (c_proj): Conv1D(nf=768, nx=3072)
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=768, out_feature

In [None]:
print(model2)  # config code executed  # the result is the same

DialogKoGPT2(
  (kogpt2): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50000, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D(nf=2304, nx=768)
            (c_proj): Conv1D(nf=768, nx=768)
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D(nf=3072, nx=768)
            (c_proj): Conv1D(nf=768, nx=3072)
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=768, out_feature

In [None]:
# Per-layer summary of the KoGPT2 wrapper, without the parent-layer column.
# NOTE(review): dummy_input is only assigned in a later cell of this notebook;
# that cell has to be run before this one.
print(pt_summary(model2, dummy_input, show_input=True, max_depth=None, show_parent_layers=False))

-----------------------------------------------------------------------------
            Layer (type)         Input Shape         Param #     Tr. Param #
             Embedding-1           [1, 1024]      38,400,000      38,400,000
             Embedding-2           [1, 1024]         786,432         786,432
               Dropout-3      [1, 1024, 768]               0               0
             LayerNorm-4      [1, 1024, 768]           1,536           1,536
                Conv1D-5      [1, 1024, 768]       1,771,776       1,771,776
                Conv1D-6      [1, 1024, 768]         590,592         590,592
               Dropout-7      [1, 1024, 768]               0               0
             LayerNorm-8      [1, 1024, 768]           1,536           1,536
                Conv1D-9      [1, 1024, 768]       2,362,368       2,362,368
    NewGELUActivation-10     [1, 1024, 3072]               0               0
               Conv1D-11     [1, 1024, 3072]       2,360,064       2,360,06

In [None]:
# Rebuild the wrapper and summarize it at the default depth.
model2 = DialogKoGPT2()

# Random token ids in [0, 50000) - the size of the wte embedding printed
# above - for one sequence of 1024 positions.
dummy_input = torch.randint(0, 50000, (1, 1024))  # (batch_size, sequence_length)

print(pt_summary(model2, dummy_input, show_input=True))

-------------------------------------------------------------------------
        Layer (type)         Input Shape         Param #     Tr. Param #
   GPT2LMHeadModel-1           [1, 1024]     124,242,432     124,242,432
Total params: 124,242,432
Trainable params: 124,242,432
Non-trainable params: 0
-------------------------------------------------------------------------


In [None]:
# Same summary as for model2, but with a batch of two sequences.
model3 = DialogKoGPT2()

dummy_input3 = torch.randint(0, 50000, (2, 1024))  # (batch_size, sequence_length)

print(pt_summary(model3, dummy_input3, show_input=True))

-------------------------------------------------------------------------
        Layer (type)         Input Shape         Param #     Tr. Param #
   GPT2LMHeadModel-1           [2, 1024]     124,242,432     124,242,432
Total params: 124,242,432
Trainable params: 124,242,432
Non-trainable params: 0
-------------------------------------------------------------------------


In [None]:
# Per-layer summary for the batch-of-two input, including each layer's parent path.
print(pt_summary(model3, dummy_input3, show_input=True, max_depth=None, show_parent_layers=True))

----------------------------------------------------------------------------------------------------------------------------------------------
                                                    Parent Layers             Layer (type)         Input Shape         Param #     Tr. Param #
                           DialogKoGPT2/GPT2LMHeadModel/GPT2Model              Embedding-1           [2, 1024]      38,400,000      38,400,000
                           DialogKoGPT2/GPT2LMHeadModel/GPT2Model              Embedding-2           [1, 1024]         786,432         786,432
                           DialogKoGPT2/GPT2LMHeadModel/GPT2Model                Dropout-3      [2, 1024, 768]               0               0
                 DialogKoGPT2/GPT2LMHeadModel/GPT2Model/GPT2Block              LayerNorm-4      [2, 1024, 768]           1,536           1,536
   DialogKoGPT2/GPT2LMHeadModel/GPT2Model/GPT2Block/GPT2Attention                 Conv1D-5      [2, 1024, 768]       1,771,776       1,771,776