In [1]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader, Subset

import torchvision
from torchvision.datasets import CIFAR10
from torchvision.models import resnet18

from transformers import AutoModel, AutoTokenizer

In [19]:
class CustomCNN(nn.Module):
    """Three-stage convolutional feature extractor.

    Every stage is a 3x3 convolution (stride 1, padding 1) followed by ReLU
    and a 3x3 average pool with stride 2. Channel widths go 3 -> 64 -> 64 -> 256.
    The padded convolutions keep the spatial size; only the un-padded pooling
    shrinks it, so a 32x32 input ends up 3x3 (32 -> 15 -> 7 -> 3).
    """

    def __init__(self):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1)
        self.conv1 = nn.Conv2d(3, 64, **conv_kwargs)
        self.conv2 = nn.Conv2d(64, 64, **conv_kwargs)
        self.conv3 = nn.Conv2d(64, 256, **conv_kwargs)
        # The pooling and activation modules hold no parameters, so one
        # instance of each is shared by all three stages.
        self.pool = nn.AvgPool2d(kernel_size=3, stride=2)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply conv -> ReLU -> pool three times and return the feature map."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.pool(self.relu(conv(x)))
        return x

In [34]:
import torchvision.transforms as T

# ToTensor converts the PIL image to a float tensor; Normalize then applies the
# scalar mean/std (0.5 / 0.2) to every channel.
transform = T.Compose([T.ToTensor(), T.Normalize(0.5, 0.2)])
# download=True fetches CIFAR-10 into the root directory when it is not there
# yet, so this cell also runs on a fresh checkout; an existing copy is reused.
cifar10 = CIFAR10("./datasets/cifar10", train=True, download=True, transform=transform)
# Shuffled mini-batches of 16 images, loaded by 4 worker processes.
train_loader = DataLoader(cifar10, batch_size=16, shuffle=True, pin_memory=True, num_workers=4)

In [35]:
# Index a single sample; the dataset item unpacks into an (image, label) pair.
x, y = cifar10[10]
# Check which device the image tensor lives on.
x.device

device(type='cpu')

In [36]:
# Shape of one transformed image tensor.
x.shape

torch.Size([3, 32, 32])

In [37]:
# Pull one mini-batch and push its images through the bare CNN to see the
# feature-map shape that the classifier head has to consume.
# NOTE(review): `model` is rebound to different networks in later cells.
batch = next(iter(train_loader))
model = CustomCNN()
print(batch[0].shape)  # batch[0] holds the images
out = model(batch[0])
out.shape

torch.Size([16, 3, 32, 32])


torch.Size([16, 256, 3, 3])

In [38]:
class Classifier(nn.Module):
    """Two-layer MLP head: Linear(in, in) -> ReLU -> Linear(in, num_classes).

    Args:
        in_features: size of the last input dimension; also used as the
            hidden width.
        num_classes: number of output scores.
    """

    def __init__(self, in_features, num_classes) -> None:
        super().__init__()
        self.in_features, self.num_classes = in_features, num_classes

        # Build the layers in forward order so parameter creation order
        # matches their position in the Sequential.
        hidden = nn.Linear(in_features, in_features)
        head = nn.Linear(in_features, num_classes)
        self.classifier = nn.Sequential(hidden, nn.ReLU(), head)

    def forward(self, x):
        """Return the unnormalised class scores for ``x``."""
        scores = self.classifier(x)
        return scores

In [39]:
class ImageClassifier(nn.Module):
    """CNN feature extractor followed by an MLP classification head.

    Args:
        num_classes: number of output scores (default 10).
        image_size: height and width of the square 3-channel inputs the
            model will be fed (default 32).

    The head's input width is measured from the feature extractor instead of
    being hard-coded, so it stays correct if the CNN or the image size
    changes. With the defaults it is 256*3*3 = 2304.
    """

    def __init__(self, num_classes: int = 10, image_size: int = 32) -> None:
        super().__init__()
        self.cnn = CustomCNN()
        self.flatten = nn.Flatten()
        # Probe the CNN with an all-zero image to learn how many features it
        # emits per sample. No gradients are needed and torch.zeros draws no
        # random numbers, so seeded weight initialisation is unaffected.
        with torch.no_grad():
            probe = torch.zeros(1, 3, image_size, image_size)
            n_features = self.flatten(self.cnn(probe)).shape[1]
        self.classifier = Classifier(n_features, num_classes)

    def forward(self, x):
        """Map a batch of images to a (batch, num_classes) tensor of scores."""
        features = self.flatten(self.cnn(x))
        return self.classifier(features)

In [40]:
# Smoke-test the full image classifier on the images of the batch fetched
# earlier; the result should have one row per image and one column per class.
model = ImageClassifier()
model(batch[0]).shape

torch.Size([16, 10])

In [41]:
# Checkpoint identifier shared by the tokenizer and the encoder below.
MODEL_NAME = "klue/bert-base"

# Load the matching tokenizer and the bare encoder (AutoModel resolves to
# BertModel here, i.e. without the pre-training heads).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
# upstage.ai

Downloading:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [73]:
# Sample Korean sentence used to exercise the tokenizer and the model.
input_txt = "시퀀스 투시퀀스의 인코더 디코더 구조를 따라가지만 CNN, RNN 기반으로 이뤄진 기존 모델과 다르게 단순히 어텐션 구조만으로 전체 모델을 구성하여 어텐션 기법의 중요성을 강조한 논문입니다."
# Called without return_tensors, so the encoding holds plain Python lists.
tokenized_out = tokenizer(input_txt)

In [74]:
# Inspect the encoding: input_ids, token_type_ids and attention_mask.
tokenized_out

{'input_ids': [2, 1, 1, 1506, 2258, 2320, 887, 2258, 2320, 3962, 2138, 9219, 3683, 15639, 16, 54, 2111, 2111, 4568, 6233, 8815, 4078, 4347, 2145, 4405, 2318, 6044, 1406, 2436, 2197, 3962, 2154, 6233, 3910, 4347, 2069, 3896, 7488, 1406, 2436, 2197, 7385, 2079, 3748, 2047, 2069, 3986, 2470, 6022, 12190, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [78]:
# Turn each list in the encoding into a tensor and prepend a batch axis,
# yielding one (1, seq_len) tensor per model input.
input_ids, token_type_ids, attention_mask = (
    torch.tensor(tokenized_out[field]).unsqueeze(0)
    for field in ('input_ids', 'token_type_ids', 'attention_mask')
)

In [80]:
# This forward pass is only for inspecting the outputs, so run it without
# autograd bookkeeping instead of building a graph through the whole encoder.
with torch.no_grad():
    model_out = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
# List the fields the encoder returns.
model_out.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [81]:
# Per-token hidden states: (batch, seq_len, hidden).
model_out['last_hidden_state'].shape

torch.Size([1, 52, 768])

In [83]:
# One pooled vector per sequence: (batch, hidden).
model_out['pooler_output'].shape

torch.Size([1, 768])

In [88]:
class BertClassifier(nn.Module):
    """Pretrained BERT-style encoder with a small MLP head on the pooled output.

    Args:
        model_name: checkpoint passed to ``AutoModel.from_pretrained``
            (default ``"klue/bert-base"``).
        num_labels: number of logits produced by the head (default 1).
    """

    def __init__(self, model_name: str = "klue/bert-base", num_labels: int = 1) -> None:
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        # Read the head's input width from the encoder config rather than
        # hard-coding 768, so checkpoints with another hidden size also work.
        hidden_size = self.bert.config.hidden_size
        # NOTE(review): the head mixes GELU and ReLU; kept as-is to preserve
        # behaviour -- confirm whether this is intentional.
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.GELU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        """Return the head's logits for the pooled encoder output.

        ``token_type_ids`` and ``attention_mask`` are forwarded to the encoder
        unchanged; when omitted they are passed as ``None``.
        """
        output = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        logits = self.classifier(output['pooler_output'])
        return logits

In [90]:
# Instantiate the custom classifier (this loads the checkpoint again) and run
# the sample sentence through it; the head is untrained, so the value itself
# carries no meaning yet.
custom_bert = BertClassifier()
custom_bert(input_ids, token_type_ids, attention_mask)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[0.0321]], grad_fn=<AddmmBackward0>)

In [86]:
# Minimal alternative head: one linear layer mapping the 768-wide pooled
# output to a single logit.
classifier = nn.Linear(768, 1)
classifier(model_out['pooler_output'])

tensor([[-0.1689]], grad_fn=<AddmmBackward0>)

BPE — Byte Pair Encoding, a subword tokenization algorithm.

In [85]:
# Sort the vocabulary's (token, id) pairs by id.
vocab = sorted(tokenizer.vocab.items(), key=lambda token_and_id: token_and_id[1])
vocab[2]  # entry at index 2: the [CLS] (classification) token

('[CLS]', 2)

In [87]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint named by MODEL_NAME with the library's ready-made
# sequence-classification head, reusing the constant instead of repeating
# the literal.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# Display the module tree.
model

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [91]:
from transformers import DefaultDataCollator

In [None]:
# DataLoader needs a dataset as its first argument. DefaultDataCollator works
# on dict-like features, so wrap the tokenized sample sentence in a one-item
# list (a list is a valid map-style dataset) and let the collator turn it
# into a batch of PyTorch tensors.
text_loader = DataLoader(
    [dict(tokenized_out)],
    batch_size=1,
    collate_fn=DefaultDataCollator("pt"),
)
# Fetch one collated batch to confirm the pipeline works end to end.
next(iter(text_loader))["input_ids"].shape