In [1]:
from configuration import CFG
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from typing import Tuple

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
""" 인스턴스 상태로 임베딩 셰어링 하는 것과 개별 가중치 매트릭스 단위로 접근해서 셰여링 하는게 좋은지 """


class GeneratorEmbedding(nn.Module):
    """ BERT Embedding Module class
    This module has option => whether or not to use ALBERT Style Factorized Embedding
    This Module set & initialize 3 Embedding Layers:
        1) Word Embedding 2) Absolute Positional Embedding
    Args:
        cfg: configuration.py
    Notes:
        Absolute Positional Embedding added at bottom layers
    """
    def __init__(self, cfg: CFG) -> None:
        super(GeneratorEmbedding, self).__init__()
        self.cfg = cfg
        self.max_seq = cfg.max_seq
        self.word_embedding = nn.Embedding(len(cfg.tokenizer), cfg.dim_model)
        self.abs_pos_emb = nn.Embedding(cfg.max_seq, cfg.dim_model)  # Absolute Position Embedding for EMD Layer
        self.layer_norm1 = nn.LayerNorm(cfg.dim_model, eps=cfg.layer_norm_eps)  # for word embedding
        self.layer_norm2 = nn.LayerNorm(cfg.dim_model, eps=cfg.layer_norm_eps)  # for word embedding
        self.hidden_dropout = nn.Dropout(p=cfg.hidden_dropout_prob)

        # ALBERT Style Factorized Embedding
        if self.cfg.is_mf_embedding:
            self.word_embedding = nn.Embedding(len(cfg.tokenizer), int(cfg.dim_model/6))
            self.projector = nn.Linear(int(cfg.dim_model/6), cfg.dim_model)  # project to original hidden dim

    def forward(self, inputs: Tensor) -> Tuple[nn.Embedding, nn.Embedding]:
        if self.cfg.is_mf_embedding:
            word_embeddings = self.hidden_dropout(
                self.layer_norm1(self.projector(self.word_embedding(inputs)))
            )
        else:
            word_embeddings = self.hidden_dropout(
                self.layer_norm1(self.word_embedding(inputs))
            )
        abs_pos_emb = self.hidden_dropout(
            self.layer_norm2(self.abs_pos_emb(torch.arange(inputs.shape[1], device="cuda").repeat(inputs.shape[0]).view(inputs.shape[0], -1)))
        )
        return word_embeddings, abs_pos_emb
    
class DiscriminatorEmbedding(nn.Module):
    """ BERT Embedding Module class
    This module has option => whether or not to use ALBERT Style Factorized Embedding
    This Module set & initialize 3 Embedding Layers:
        1) Word Embedding 2) Absolute Positional Embedding
    Args:
        cfg: configuration.py
    Notes:
        Absolute Positional Embedding added at bottom layers
    """
    def __init__(self, cfg: CFG) -> None:
        super(DiscriminatorEmbedding, self).__init__()
        self.cfg = cfg
        self.max_seq = cfg.max_seq
        self.word_embedding = nn.Embedding(len(cfg.tokenizer), 1024)
        self.abs_pos_emb = nn.Embedding(cfg.max_seq, 1024)  # Absolute Position Embedding for EMD Layer
        self.layer_norm1 = nn.LayerNorm(cfg.dim_model, eps=1e-9)  # for word embedding
        self.layer_norm2 = nn.LayerNorm(cfg.dim_model, eps=1e-9)  # for word embedding
        self.hidden_dropout = nn.Dropout(p=cfg.hidden_dropout_prob)

        # ALBERT Style Factorized Embedding
        if self.cfg.is_mf_embedding:
            self.word_embedding = nn.Embedding(len(cfg.tokenizer), int(cfg.dim_model/6))
            self.projector = nn.Linear(int(cfg.dim_model/6), cfg.dim_model)  # project to original hidden dim

    def forward(self, inputs: Tensor) -> Tuple[nn.Embedding, nn.Embedding]:
        if self.cfg.is_mf_embedding:
            word_embeddings = self.hidden_dropout(
                self.layer_norm1(self.projector(self.word_embedding(inputs)))
            )
        else:
            word_embeddings = self.hidden_dropout(
                self.layer_norm1(self.word_embedding(inputs))
            )
        abs_pos_emb = self.hidden_dropout(
            self.layer_norm2(self.abs_pos_emb(torch.arange(inputs.shape[1], device="cuda").repeat(inputs.shape[0]).view(inputs.shape[0], -1)))
        )
        return word_embeddings, abs_pos_emb

In [12]:
generator = GeneratorEmbedding(CFG)
discriminator = DiscriminatorEmbedding(CFG)

In [13]:
generator, discriminator

(GeneratorEmbedding(
   (word_embedding): Embedding(128001, 768)
   (abs_pos_emb): Embedding(512, 768)
   (layer_norm1): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
   (layer_norm2): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
   (hidden_dropout): Dropout(p=0.1, inplace=False)
 ),
 DiscriminatorEmbedding(
   (word_embedding): Embedding(128001, 1024)
   (abs_pos_emb): Embedding(512, 1024)
   (layer_norm1): LayerNorm((768,), eps=1e-09, elementwise_affine=True)
   (layer_norm2): LayerNorm((768,), eps=1e-09, elementwise_affine=True)
   (hidden_dropout): Dropout(p=0.1, inplace=False)
 ))

In [11]:
""" 인스턴스 자체를 공유하는 경우, 해당 인스턴 내부에 포함된 다른 모듈 정보도 복사,
이것을 원치 않는다면 임베딩 관련 attr만 찍어서 셰어링 할 것
"""

discriminator = generator
discriminator

GeneratorEmbedding(
  (word_embedding): Embedding(128001, 768)
  (abs_pos_emb): Embedding(512, 768)
  (layer_norm1): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
  (layer_norm2): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
  (hidden_dropout): Dropout(p=0.1, inplace=False)
)

In [35]:
discriminator.word_embedding.weight = generator.word_embedding.weight
discriminator.abs_pos_emb.weight = generator.abs_pos_emb.weight

In [36]:
generator.word_embedding.weight, discriminator.word_embedding.weight

(Parameter containing:
 tensor([[ 2.2952, -1.1103, -0.6854,  ..., -0.4859,  0.7391,  0.0829],
         [-0.0511,  0.2567,  0.4269,  ...,  0.3777, -0.4286, -1.1887],
         [ 1.4421,  0.0392,  0.1582,  ..., -0.8192, -0.0563,  0.0259],
         ...,
         [-1.1109, -1.9360,  0.7205,  ...,  0.8326,  0.7596, -0.2808],
         [-0.5263,  1.5916, -1.1737,  ..., -2.0828,  1.2085, -0.7531],
         [ 0.2427, -1.7848, -1.4473,  ...,  1.2797,  1.2451, -1.5539]],
        requires_grad=True),
 Parameter containing:
 tensor([[ 2.2952, -1.1103, -0.6854,  ..., -0.4859,  0.7391,  0.0829],
         [-0.0511,  0.2567,  0.4269,  ...,  0.3777, -0.4286, -1.1887],
         [ 1.4421,  0.0392,  0.1582,  ..., -0.8192, -0.0563,  0.0259],
         ...,
         [-1.1109, -1.9360,  0.7205,  ...,  0.8326,  0.7596, -0.2808],
         [-0.5263,  1.5916, -1.1737,  ..., -2.0828,  1.2085, -0.7531],
         [ 0.2427, -1.7848, -1.4473,  ...,  1.2797,  1.2451, -1.5539]],
        requires_grad=True))

In [37]:
generator.abs_pos_emb.weight, discriminator.abs_pos_emb.weight

(Parameter containing:
 tensor([[-0.1623,  1.3104, -0.6516,  ..., -0.5250, -0.0222, -1.2774],
         [ 0.3852,  0.7240, -0.7078,  ..., -0.6339,  0.3923, -0.4718],
         [-0.4134, -0.7664,  1.3895,  ..., -1.0974,  1.5678, -0.4249],
         ...,
         [ 0.7117,  1.4549, -0.5336,  ...,  0.1409,  0.2749,  1.7711],
         [-0.5776, -0.6636,  1.5503,  ..., -1.1855,  0.1657,  0.4000],
         [ 1.3280, -1.6936, -1.2606,  ..., -0.4680,  0.9651, -0.9574]],
        requires_grad=True),
 Parameter containing:
 tensor([[-0.1623,  1.3104, -0.6516,  ..., -0.5250, -0.0222, -1.2774],
         [ 0.3852,  0.7240, -0.7078,  ..., -0.6339,  0.3923, -0.4718],
         [-0.4134, -0.7664,  1.3895,  ..., -1.0974,  1.5678, -0.4249],
         ...,
         [ 0.7117,  1.4549, -0.5336,  ...,  0.1409,  0.2749,  1.7711],
         [-0.5776, -0.6636,  1.5503,  ..., -1.1855,  0.1657,  0.4000],
         [ 1.3280, -1.6936, -1.2606,  ..., -0.4680,  0.9651, -0.9574]],
        requires_grad=True))

In [22]:
test = nn.Linear(5, 5)
test2 = nn.Linear(5, 5)

test.weight, test2.weight

(Parameter containing:
 tensor([[ 0.0128,  0.4399,  0.2408, -0.3927,  0.1460],
         [-0.0897,  0.1777, -0.0144,  0.2699,  0.3550],
         [-0.3313, -0.3217,  0.1335,  0.2754, -0.0162],
         [ 0.1812, -0.0031, -0.0985,  0.0917,  0.4457],
         [ 0.4281, -0.1821,  0.3798, -0.3857,  0.0316]], requires_grad=True),
 Parameter containing:
 tensor([[ 0.0360, -0.2234, -0.3651, -0.3511, -0.2490],
         [ 0.2894, -0.3327,  0.2404,  0.3974, -0.3460],
         [ 0.0452,  0.0221, -0.1934, -0.3077,  0.0728],
         [ 0.4087,  0.2177, -0.2958, -0.2741, -0.1044],
         [ 0.0724,  0.2305,  0.1937, -0.0123, -0.2719]], requires_grad=True))

In [18]:
test.weight = test2.weight
test.weight

Parameter containing:
tensor([[-0.4118, -0.1691,  0.0908, -0.0913,  0.3356],
        [ 0.2840, -0.2020,  0.3218, -0.1826,  0.3488],
        [-0.4029,  0.2939, -0.1424,  0.0622,  0.3741],
        [ 0.1679,  0.1159, -0.0755,  0.2663, -0.2795],
        [-0.3321,  0.1732,  0.3661,  0.0217,  0.2214]], requires_grad=True)

In [23]:
""" to cuda 하기 싫으면 이거 쓰기 """

test.register_buffer("test", torch.ones(5))
test.test.requires_grad = True
test.test.requires_grad