# BERT 구현 및 nlp 
 1. 데이터 전처리 및 BERT model pretraining
 2. Naver 영화리뷰 감정분석 분류 모델로 BERT 사용해서 분류하기  
   
**Referenced by: https://paul-hyun.github.io/bert-01/**

In [1]:
import json
import torch
import torch.nn as nn
import sentencepiece as spm

""" configuration json을 읽어들이는 class """
class Config(dict): 
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__

    @classmethod
    def load(cls, file):
        with open(file, 'r') as f:
            config = json.loads(f.read())
            return Config(config)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# vocab loading
# 미리 만든 vocab 로드
vocab_file = "/opt/workspace/Seohyeon/Projects/LMMs/data_preprocessing/kowiki.model"
vocab = spm.SentencePieceProcessor()
vocab.load(vocab_file)

True

In [4]:
config = Config({
    "n_enc_vocab": len(vocab),
    "n_enc_seq": 256,
    "n_seg_type": 2,
    "n_layer": 6,
    "d_hidn": 256,
    "i_pad": 0,
    "d_ff": 1024,
    "n_head": 4,
    "d_head": 64,
    "dropout": 0.1,
    "layer_norm_epsilon": 1e-12
})
print(config)

{'n_enc_vocab': 8007, 'n_enc_seq': 256, 'n_seg_type': 2, 'n_layer': 6, 'd_hidn': 256, 'i_pad': 0, 'd_ff': 1024, 'n_head': 4, 'd_head': 64, 'dropout': 0.1, 'layer_norm_epsilon': 1e-12}


## Model Architecture

In [5]:
""" attention pad mask """
def get_attn_pad_mask(seq_q, seq_k, i_pad):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    pad_attn_mask = seq_k.data.eq(i_pad)
    pad_attn_mask= pad_attn_mask.unsqueeze(1).expand(batch_size, len_q, len_k)
    return pad_attn_mask

""" attention decoder mask """
def get_attn_decoder_mask(seq):
    subsequent_mask = torch.ones_like(seq).unsqueeze(-1).expand(seq.size(0), seq.size(1), seq.size(1))
    subsequent_mask = subsequent_mask.triu(diagonal=1) # upper triangular part of a matrix(2-D)
    return subsequent_mask

""" scale dot product attention """
class ScaledDotProductAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.dropout = nn.Dropout(config.dropout)
        self.scale = 1 / (self.config.d_head ** 0.5)
    
    def forward(self, Q, K, V, attn_mask):
        # (bs, n_head, n_q_seq, n_k_seq)
        scores = torch.matmul(Q, K.transpose(-1, -2))
        scores = scores.mul_(self.scale)
        scores.masked_fill_(attn_mask, -1e9)
        # (bs, n_head, n_q_seq, n_k_seq)
        attn_prob = nn.Softmax(dim=-1)(scores)
        attn_prob = self.dropout(attn_prob)
        # (bs, n_head, n_q_seq, d_v)
        context = torch.matmul(attn_prob, V)
        # (bs, n_head, n_q_seq, d_v), (bs, n_head, n_q_seq, n_v_seq)
        return context, attn_prob
    
    """ multi head attention """
class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.W_Q = nn.Linear(self.config.d_hidn, self.config.n_head * self.config.d_head)
        self.W_K = nn.Linear(self.config.d_hidn, self.config.n_head * self.config.d_head)
        self.W_V = nn.Linear(self.config.d_hidn, self.config.n_head * self.config.d_head)
        self.scaled_dot_attn = ScaledDotProductAttention(self.config)
        self.linear = nn.Linear(self.config.n_head * self.config.d_head, self.config.d_hidn)
        self.dropout = nn.Dropout(config.dropout)
    
    def forward(self, Q, K, V, attn_mask):
        batch_size = Q.size(0)
        # (bs, n_head, n_q_seq, d_head)
        q_s = self.W_Q(Q).view(batch_size, -1, self.config.n_head, self.config.d_head).transpose(1,2)
        # (bs, n_head, n_k_seq, d_head)
        k_s = self.W_K(K).view(batch_size, -1, self.config.n_head, self.config.d_head).transpose(1,2)
        # (bs, n_head, n_v_seq, d_head)
        v_s = self.W_V(V).view(batch_size, -1, self.config.n_head, self.config.d_head).transpose(1,2)

        # (bs, n_head, n_q_seq, n_k_seq)
        attn_mask = attn_mask.unsqueeze(1).repeat(1, self.config.n_head, 1, 1)

        # (bs, n_head, n_q_seq, d_head), (bs, n_head, n_q_seq, n_k_seq)
        context, attn_prob = self.scaled_dot_attn(q_s, k_s, v_s, attn_mask)
        # (bs, n_head, n_q_seq, h_head * d_head)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.config.n_head * self.config.d_head)
        # (bs, n_head, n_q_seq, e_embd)
        output = self.linear(context)
        output = self.dropout(output)
        # (bs, n_q_seq, d_hidn), (bs, n_head, n_q_seq, n_k_seq)
        return output, attn_prob
    
    """ feed forward """
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.conv1 = nn.Conv1d(in_channels=self.config.d_hidn, out_channels=self.config.d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=self.config.d_ff, out_channels=self.config.d_hidn, kernel_size=1)
        self.active = F.gelu
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, inputs):
        # (bs, d_ff, n_seq)
        output = self.conv1(inputs.transpose(1, 2))
        output = self.active(output)
        # (bs, n_seq, d_hidn)
        output = self.conv2(output).transpose(1, 2)
        output = self.dropout(output)
        # (bs, n_seq, d_hidn)
        return output
    

1) Encoder Layer
- 표준 Transformer EncoderLayer와 동일

In [6]:
""" Encoder Layer """
class EncoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.self_attn = MultiHeadAttention(self.config)
        self.layer_norm1 = nn.LayerNorm(self.config.d_hidn, eps=self.config.layer_norm_epsilon)
        self.pos_ffn = PoswiseFeedForwardNet(self.config)
        self.layer_norm2 = nn.LayerNorm(self.config.d_hidn, eps=self.config.layer_norm_epsilon)
    
    def forward(self, inputs, attn_mask):
        # (bs, n_enc_seq, d_hidn), (bs, n_head, n_enc_seq, n_enc_seq)
        att_outputs, attn_prob = self.self_attn(inputs, inputs, inputs, attn_mask)
        att_outputs = self.layer_norm1(inputs + att_outputs)
        # (bs, n_enc_seq, d_hidn)
        ffn_outputs = self.pos_ffn(att_outputs)
        ffn_outputs = self.layer_norm2(ffn_outputs + att_outputs)
        # (bs, n_enc_seq, d_hidn), (bs, n_head, n_enc_seq, n_enc_seq)
        return ffn_outputs, attn_prob 

2. Encoder
- 표준 Transformer Encoder와 아래가 다름
    - 1) position을 학습 (line 8: freeze=False) 
    - 2) Segment Embedding 추가 (line 9)
    - 3) Encoder input에 Segment 정보 추가 (line 13)
    - 4) Token, Position 및 Segment 3가지 Embedding 더함 (line 19: 표준 Transformer에서는 Token, Position, 2가지 embedding 더함)

In [7]:
""" encoder """
class Encoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.enc_emb = nn.Embedding(self.config.n_enc_vocab, self.config.d_hidn)
        self.pos_emb = nn.Embedding(self.config.n_enc_seq + 1, self.config.d_hidn)
        self.seg_emb = nn.Embedding(self.config.n_seg_type, self.config.d_hidn)

        self.layers = nn.ModuleList([EncoderLayer(self.config) for _ in range(self.config.n_layer)])
    
    def forward(self, inputs, segments):
        positions = torch.arange(inputs.size(1), device=inputs.device, dtype=inputs.dtype).expand(inputs.size(0), inputs.size(1)).contiguous() + 1
        pos_mask = inputs.eq(self.config.i_pad)
        positions.masked_fill_(pos_mask, 0)

        # (bs, n_enc_seq, d_hidn)
        outputs = self.enc_emb(inputs) + self.pos_emb(positions)  + self.seg_emb(segments)

        # (bs, n_enc_seq, n_enc_seq)
        attn_mask = get_attn_pad_mask(inputs, inputs, self.config.i_pad)

        attn_probs = []
        for layer in self.layers:
            # (bs, n_enc_seq, d_hidn), (bs, n_head, n_enc_seq, n_enc_seq)
            outputs, attn_prob = layer(outputs, attn_mask)
            attn_probs.append(attn_prob)
        # (bs, n_enc_seq, d_hidn), [(bs, n_head, n_enc_seq, n_enc_seq)]
        return outputs, attn_probs

In [9]:
""" bert """
class BERT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.encoder = Encoder(self.config)

        self.linear = nn.Linear(config.d_hidn, config.d_hidn)
        self.activation = torch.tanh
    
    def forward(self, inputs, segments):
        # (bs, n_seq, d_hidn), [(bs, n_head, n_enc_seq, n_enc_seq)]
        outputs, self_attn_probs = self.encoder(inputs, segments)
        # (bs, d_hidn)
        outputs_cls = outputs[:, 0].contiguous()
        outputs_cls = self.linear(outputs_cls)
        outputs_cls = self.activation(outputs_cls)
        # (bs, n_enc_seq, n_enc_vocab), (bs, d_hidn), [(bs, n_head, n_enc_seq, n_enc_seq)]
        
        return outputs, outputs_cls, self_attn_probs
    
    def save(self, epoch, loss, path):
        torch.save({
            "epoch": epoch,
            "loss": loss,
            "state_dict": self.state_dict()
        }, path)
    
    def load(self, path):
        save = torch.load(path)
        self.load_state_dict(save["state_dict"])
        return save["epoch"], save["loss"]

## BERT의 LOSS FUNCTION
- MLM (Masked Language Model): mask된 부분의 단어 예측 
    - 전체 단어의 15%를 선택 후, 그중 80%는 [mask], 10%는 현재 단어 유지, 10%는 임의의 단어로 대체
- NSP (Next Sentence Prediction): 첫번째 [CLS] 토큰으로 문장 A와 B의 관계를 예측
    - A 다음문장이 B가 맞을 경우는  True, A 다음문장이 B가 아닐 경우  False로 예측

### BERT를 Pretrain 하기위한 클래스
1. BERT의 결과를 입력으로 NSP를 예측하기위한 projection_cls 선언 (line: 9)
2. BERT의 결과를 입력으로 MLM을 예측하기위한 projection_lm을 선언 (line: 11)
3. projection_lm은 Encoder의 Embedding과 weight를 share함 (line: 12)
4. inputs, segments를 입력으로 BERT 실행 (line: 16)
5. outputs_cls를 입력으로 projection_cls를 실행해서 NSP를 예측하도록 함 (line: 18)
6. outputs를 입력으로 projection_lm을 실행해서 MLM 예측하도록 함 (line: 20)

In [8]:
"""BERT pretrain"""
class BERTPretrain(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.bert = BERT(self.config)
        # classifier
        self.projection_cls = nn.Linear(self.config.d_hidn, 2, bias=False) # NSP (next sentence prediction)을 위한 classifier
        # LM
        self.projection_lm = nn.Linear(self.config.d_hidn, self.config.n_enc_vocab, bias=False) # MLM (Masked language model)을 위한 classifier
        self.projection_lm.weight = self.bert.encoder.enc_emb.weight # 인코더의 임베딩과 weight를 공유
    
    def forward(self, inputs, segments):
        # (bs, n_enc_seq, d_hidn), (bs, d_hidn), [(bs, n_head, n_enc_seq, n_enc_seq)]
        outputs, outputs_cls, attn_probs = self.bert(inputs, segments)
        # (bs, 2)
        logits_cls = self.projection_cls(outputs_cls)
        # (bs, n_enc_seq, n_enc_vocab)
        logits_lm = self.projection_lm(outputs)
        # (bs, n_enc_vocab), (bs, n_enc_seq, n_enc_vocab), [(bs, n_head, n_enc_seq, n_enc_seq)]
        return logits_cls, logits_lm, attn_probs
    
    

## Pretrain Data 생성
[MASK] 생성함수

In [9]:
""" 마스크 생성 """
from random import randrange, shuffle
import random 

def create_pretrain_mask(tokens, mask_cnt, vocab_list):
    cand_idx = []
    # 1) token을 단어별로 index 배열 형태로 저장
    for (i, token) in enumerate(tokens):
        if token == "[CLS]" or token == "[SEP]":
            continue
        if 0 < len(cand_idx) and not token.startswith(u"\u2581"): # u"\u2581" : 단어의 시작을 의미
            cand_idx[-1].append(i) 
        else:
            cand_idx.append([i])
    # 2) random 선택을 위해 단어의 index 섞음
    shuffle(cand_idx) 

    mask_lms = []
    # 3) mask_lms의 개수가 mask_cnt를 넘지 않도록 함 (mask_cnt는 전체 token의 15%)
    for index_set in cand_idx: 
        if len(mask_lms) >= mask_cnt:
            break
        if len(mask_lms) + len(index_set) > mask_cnt:
            continue
        for index in index_set:
            masked_token = None
            if random.random() < 0.8: # 80% 마스킹
                masked_token = "[MASK]"
            else:
                if random.random() < 0.5: # 10% 는 original로 유지
                    masked_token = tokens[index]
                else: # 10%는 random으로 vocab_list에서 임의의 값 선택
                    masked_token = random.choice(vocab_list)
            # 4) mask된 index값과 정답 label을 maSK_lms에 저장
            mask_lms.append({"index": index, "label": tokens[index]})
            # 5) token index의 값을 mask
            tokens[index] = masked_token
    # 6) Random하게 mask된 값을 index순으로 정렬
    mask_lms = sorted(mask_lms, key=lambda x: x["index"])
    # 7) 정렬된 값을 이용해 mask_idx와 mask_label을 생성
    mask_idx = [p["index"] for p in mask_lms]
    mask_label = [p["label"] for p in mask_lms]

    return tokens, mask_idx, mask_label    


In [10]:
""" 최대 길이 초과하는 토큰 자르기
1) token A의 길이가 길 경우 앞에서부터 토큰을 제거 (line: 11~12)
2) token B의 길이가 길 경우 뒤에서부터 토큰을 제거 (line: 13~14)
 """
def trim_tokens(tokens_a, tokens_b, max_seq):
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_seq:
            break

        if len(tokens_a) > len(tokens_b):
            del tokens_a[0]
        else:
            tokens_b.pop()

### 단락별 pretrain 데이터 생성

In [11]:
""" doc 별 pretrain 데이터 생성 """
def create_pretrain_instances(docs, doc_idx, doc, n_seq, mask_prob, vocab_list):
    # for [CLS], [SEP], [SEP]
    max_seq = n_seq - 3 # 특수 토큰 총 3개
    tgt_seq = max_seq

    instances = []
    current_chunk = []
    current_length = 0
    for i in range(len(doc)):
        current_chunk.append(doc[i]) # 단락을 줄 단위로 돌며 current_chunk에 line 추가
        current_length += len(doc[i]) # current_length에 line의 token수를 더함
        if i == len(doc) - 1 or current_length >= tgt_seq: # 마지막 line이거나, current_length가 tgt_seq를 넘을 경우 학습데이터 만듦
            if 0 < len(current_chunk): # 
                a_end = 1
                if 1 < len(current_chunk):
                    a_end = randrange(1, len(current_chunk)) # randrange(1, len(current_chunk)) : 1~len(current_chunk) 중 랜덤값 반환
                tokens_a = []
                for j  in range(a_end): # token 수 랜덤으로 선택해서 tokens_a에 추가
                    tokens_a.extend(current_chunk[j])
                
                tokens_b = []
                # 50%의 확률로 다른 단락에서 tokens_b 만듦
                if len(current_chunk) == 1 or random.random() < 0.5: 
                    is_next = 0 # is_next의 값은 false (0)
                    tokens_b_len = tgt_seq - len(tokens_a) 
                    random_doc_idx = doc_idx
                    while doc_idx == random_doc_idx:
                        random_doc_idx = randrange(0, len(docs))
                    random_doc = docs[random_doc_idx]

                    random_start = randrange(0, len(random_doc))
                    for j in range(random_start, len(random_doc)):
                        tokens_b.extend(random_doc[j])
                # 50%의 확률로 current_chunk에서 tokens_a 이후부터 tokens_b 만듦
                else:
                    is_next = 1 #is_next의 값은 true (1)
                    for j in range(a_end, len(current_chunk)):
                        tokens_b.extend(current_chunk[j])

                trim_tokens(tokens_a, tokens_b, max_seq) # token 크기 줄이기
                assert 0 < len(tokens_a) # 조건이 참이 아니면 AssertionError 발생
                assert 0 < len(tokens_b)

                tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"] # '[CLS]'+ tokens_a + '[SEP]' + tokens_b + '[SEP]' 형태로 tokens 생성
                segment = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1) # tokens_a의 길이 + 2 만큼 0으로, tokens_b의 길이 + 1 만큼 1로 segment 생성

                tokens, mask_idx, mask_label = create_pretrain_mask(tokens, int((len(tokens)-3)*mask_prob), vocab_list) # mask 생성 (전체 token 수의 15% 만큼 mask)

                instance = {
                    "tokens": tokens,
                    "segment": segment,
                    "is_next": is_next,
                    "mask_idx": mask_idx,
                    "mask_label": mask_label
                }
                instances.append(instance)

            current_chunk = []
            current_length = 0
    return instances

### Pretrain 데이터 생성 함수 정의
- 말뭉치를 읽어 pretrain data 만듦

In [12]:
import os
from tqdm import tqdm
""" pretrain 데이터 생성 """
def make_pretrain_data(vocab, in_file, out_file, count, n_seq, mask_prob):
    vocab_list = [] # 단어목록 생성
    for id in range(vocab.get_piece_size()): 
        if not vocab.is_unknown(id): # 생성 시, unknown은 제거
            vocab_list.append(vocab.id_to_piece(id))

    line_cnt = 0
    with open(in_file, "r") as in_f:
        for line in in_f:
            line_cnt += 1 # line 수 count

    docs = []
    with open(in_file, "r") as f:
        doc = []
        with tqdm(total=line_cnt, desc=f"Loading") as pbar:
            for i, line in enumerate(f):
                line = line.strip()
                if line == "":
                    if 0 < len(doc):
                        docs.append(doc)
                        doc = []
                else:
                    pieces = vocab.encode_as_pieces(line)
                    if 0 < len(pieces):
                        doc.append(pieces)
                pbar.update(1)

        if doc:
            docs.append(doc)

    for index in range(count):
        output = out_file.format(index)
        if os.path.isfile(output):
            continue
        with open(output, "w") as out_f:
            with tqdm(total=len(docs), desc=f"Making") as pbar:
                for i, doc in enumerate(docs):
                    instances = create_pretrain_instances(docs, i, doc, n_seq, mask_prob, vocab_list)
                    for instance in instances:
                        out_f.write(json.dumps(instance)) # python의 객체를 json 문자열로 변환 (json = JavaScript Object Notation)
                        out_f.write("\n")
                    pbar.update(1)

### Pretrain 데이터 생성 실행
**Settings**
- 말뭉치 개수(count) = 10
- sequence 길이 (n_seq) = 256
- Mask 확률 (mask_prob) = 15 %

In [13]:
in_file = "/opt/workspace/Seohyeon/Projects/LMMs/data_preprocessing/DATA/kowiki.txt"
out_file = "/opt/workspace/Seohyeon/Projects/LMMs/data_preprocessing/DATA/kowiki_bert_{}.json"
count = 10 
n_seq = 256
mask_prob = 0.15

make_pretrain_data(vocab, in_file, out_file, count, n_seq, mask_prob)


Loading:  39%|███▉      | 2058360/5234475 [02:09<02:22, 22295.86it/s]

##

### Pretrain DataSet 생성

In [16]:
import numpy as np
""" pretrain 데이터셋 """
class PretrainDataSet(torch.utils.data.Dataset):
    def __init__(self, vocab, infile):
        self.vocab = vocab
        self.labels_cls = []
        self.labels_lm = []
        self.sentences = []
        self.segments = []

        line_cnt = 0
        with open(infile, "r") as f:
            for line in f:
                line_cnt += 1
        
        with open(infile, "r") as f:
            for i, line in enumerate(tqdm(f, total=line_cnt, desc=f"Loading {infile}", unit="lines")):
                instance = json.loads(line)
                self.labels_cls.append(instance["is_next"]) # tokens_a와 tokens_b가 인접한 문장인지 여부
                sentences = [vocab.piece_to_id(p) for p in instance["tokens"]] # token을 id(숫자)로 변환
                self.sentences.append(sentences)
                self.segments.append(instance["segment"]) # tokens_a(0)와  tokens_b(1)를 구분하기 위한 값
                mask_idx = np.array(instance["mask_idx"], dtype=np.int) # mask_idx: tokens내의 mask index
                mask_label = np.array([vocab.piece_to_id(p) for p in instance["mask_label"]], dtype=np.int) # mask_label: tokens내의 mask된 부분의 정답
                label_lm = np.full(len(sentences), dtype=np.int, fill_value=-1) # 값이 모두 -1인 label_lm 변수 생성
                label_lm[mask_idx] = mask_label # label_lm의  mask_idx 위치에 mask_label 값 저장 => mask_idx 위치는 mask_label이 나머지는 -1이 됨
                self.labels_lm.append(label_lm) 

    def __len__(self):
        assert len(self.labels_cls) == len(self.labels_lm)
        assert len(self.labels_cls) == len(self.sentences)
        assert len(self.labels_cls) == len(self.segments)
        return len(self.labels_cls)
    
    def __getitem__(self, item):
        return (torch.tensor(self.labels_cls[item]),
                torch.tensor(self.labels_lm[item]),
                torch.tensor(self.sentences[item]),
                torch.tensor(self.segments[item]))


### 배치단위로 데이터 처리하기 위한 collate_fn 생성

In [17]:
""" pretrain data collate_fn """
def pretrain_collate_fn(inputs):
    labels_cls, labels_lm, inputs, segments = list(zip(*inputs))

    # labels_lm의 길이가 같아지도록 짧은 문장에 padding(-1) 추가
    labels_lm = torch.nn.utils.rnn.pad_sequence(labels_lm, batch_first=True, padding_value=-1) # pad_sequence(): stacks a list of Tensors along a new dimension, and pads them to equal length
    # inputs의 길이가 같아지도록 짧은 문장에 padding(0) 추가
    inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
    # segments의 길이가 같아지도록 짧은 문장에 padding(0) 추가
    segments = torch.nn.utils.rnn.pad_sequence(segments, batch_first=True, padding_value=0)

    batch = [
        torch.stack(labels_cls, dim=0), # labels_cls는 길이가 1고정이므로 stack함수를 이용해 tensor로 만듦
        labels_lm,
        inputs,
        segments
    ]
    return batch

### DataLoader
- 위에서 정의한 DataSet과 collate_fn을 이용해 학습용 (train_loader) DataLoader 생성
- 위에서 생성한 pretrain data중 첫 번째 값을 읽음

In [18]:
""" pretrain 데이터 로더 """
batch_size = 128
dataset = PretrainDataSet(vocab, "/opt/workspace/Seohyeon/Projects/LMMs/data_preprocessing/DATA/kowiki_bert_0.json")
train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=pretrain_collate_fn)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  mask_idx = np.array(instance["mask_idx"], dtype=np.int) # mask_idx: tokens내의 mask index
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  mask_label = np.array([vocab.piece_to_id(p) for p in instance["mask_label"]], dtype=np.int) # mask_label: tokens내의 mask된 부분의 정답
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  label_lm = np.full(len(sentences), dtype=np.int, fill_value=-1) # 값이 모두 -1인 label_lm 변수 생성
Loading C:/PRML/personal_project/LLMs/data_preprocessing/DATA/kowiki_bert_0.json: 100%|██████████| 1042228/1042228 [05:00<00:00, 3467.47lines/s]


### Pretrain
**Train**
- BERT model을 pretrain하기 위한 함수

In [19]:
""" 모델 epoch 학습 """
def train_epoch(config, epoch, model, criterion_lm, criterion_cls, optimizer, train_loader):
    losses = []
    model.train()

    with tqdm(total=len(train_loader), desc=f"Train({epoch})") as pbar:
        for i, value in enumerate(train_loader):
            labels_cls, labels_lm, inputs, segments = map(lambda v: v.to(config.device), value)

            labels_lm = labels_lm.long()

            optimizer.zero_grad()
            outputs = model(inputs, segments)
            logits_cls, logits_lm = outputs[0], outputs[1] # NSP(logit_cls), MLM(logit_lm) 반환

            # A 문장 뒤 B 문장인지 T/F 분류하는 loss
            loss_cls = criterion_cls(logits_cls, labels_cls)
            # Mask된 token을 예측하는 loss
            loss_lm = criterion_lm(logits_lm.view(-1, logits_lm.size(2)), labels_lm.view(-1)) # (bs, seq_len, vocab_size) -> (bs*seq_len, vocab_size) 
                                                                                              # 각 토큰에 대한 모델의 예측(각 단어의 logit 값) 비교
            loss = loss_cls + loss_lm

            loss_val = loss_lm.item()
            losses.append(loss_val)

            loss.backward()
            optimizer.step()

            pbar.update(1)
            pbar.set_postfix_str(f"Loss: {loss_val:.3f} ({np.mean(losses):.3f})")
            

    return np.mean(losses)


In [20]:
config.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(config)

learning_rate = 5e-5
n_epoch = 20
count = 10 

{'n_enc_vocab': 8007, 'n_enc_seq': 256, 'n_seg_type': 2, 'n_layer': 6, 'd_hidn': 256, 'i_pad': 0, 'd_ff': 1024, 'n_head': 4, 'd_head': 64, 'dropout': 0.1, 'layer_norm_epsilon': 1e-12, 'device': device(type='cuda', index=0)}


# Main: Operating code for training BERT

In [21]:
import torch.nn.functional as F
# 모델 선언
model = BERTPretrain(config)

save_pretrain = "/opt/workspace/Seohyeon/Projects/LMMs/BERT/pretrained_BERT/save_best_model.pth"
best_epoch, best_loss = 0, 0
# 기존에 학습된 pretrain값이 있으면 로드
if os.path.isfile(save_pretrain):
    best_epoch, best_loss = model.bert.load(save_pretrain)
    print(f"load pretrain from: {save_pretrain}, epoch={best_epoch}, loss={best_loss}")
    best_epoch += 1

model.to(config.device)

criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
criterion_cls = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

losses = []
offset = best_epoch
for step in range(n_epoch):
    epoch = step + offset
    if 0 < step:
        del train_loader
        dataset = PretrainDataSet(vocab, f"/opt/workspace/Seohyeon/Projects/LMMs/data_preprocessing/DATA/kowiki_bert_{epoch % count}.json")
        train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=pretrain_collate_fn)
    
    loss = train_epoch(config, epoch, model, criterion_lm, criterion_cls, optimizer, train_loader)
    losses.append(loss)
    model.bert.save(epoch, loss, save_pretrain)

load pretrain from: C:/PRML/personal_project/LLMs/BERT/pretrained_BERT/save_best_model.pth, epoch=0, loss=1.9920589530471646


Train(1): 100%|██████████| 32570/32570 [1:08:21<00:00,  7.94it/s, Loss: 0.794 (1.283)]
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  mask_idx = np.array(instance["mask_idx"], dtype=np.int) # mask_idx: tokens내의 mask index
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  mask_label = np.array([vocab.piece_to_id(p) for p in instance["mask_label"]], dtype=np.int) # mask_label: tokens내의 mask된 부분의 정답
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  label_lm = np.full(len(sentences), dtype=np.int, fill_value=-1) # 값이 모두 -1인 label_lm 변수 생성
Loading C:/PRML/personal_project/LLMs/data_preprocessing/DATA/kowiki_bert_2.json: 100%|██████████| 1042228/1042228 [07:25<00:00, 2340.39lines/s]
Train(2): 100%|██████████| 32570/32570 [1:10:38<00:00,  7.69it/s, Loss: 0.873 (0.912)]
Lo

KeyboardInterrupt: 

### Results

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

# data
data = {
    "loss": losses
}
df = pd.DataFrame(data)
display(df)

# graph
plt.figure(figsize=[8, 4])
plt.plot(losses)
plt.xlabel('Depth')
plt.xlim((0, n_epoch - 1))
plt.ylabel('Position')
plt.show()

# 2. Naver 영화리뷰 감정분석 분류 

In [None]:
""" naver movie classfication """
class MovieClassification(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.bert = BERT(self.config)
        # classfier
        self.projection_cls = nn.Linear(self.config.d_hidn, self.config.n_output, bias=False)
    
    def forward(self, inputs, segments):
        # (bs, n_enc_seq, d_hidn), (bs, d_hidn), [(bs, n_head, n_enc_seq, n_enc_seq)]
        outputs, outputs_cls, attn_probs = self.bert(inputs, segments)
        # (bs, n_output)
        logits_cls = self.projection_cls(outputs_cls)
        # (bs, n_output), [(bs, n_head, n_enc_seq, n_enc_seq)]
        return logits_cls, attn_probs

In [None]:
""" 영화 분류 데이터셋 """
class MovieDataSet(torch.utils.data.Dataset):
    def __init__(self, vocab, infile):
        self.vocab = vocab
        self.labels = []
        self.sentences = []
        self.segments = []

        line_cnt = 0
        with open(infile, "r") as f:
            for line in f:
                line_cnt += 1

        with open(infile, "r") as f:
            for i, line in enumerate(tqdm(f, total=line_cnt, desc="Loading Dataset", unit=" lines")):
                data = json.loads(line)
                self.labels.append(data["label"])
                sentence = [vocab.piece_to_id("[CLS]")] + [vocab.piece_to_id(p) for p in data["doc"]] + [vocab.piece_to_id("[SEP]")]
                self.sentences.append(sentence)
                self.segments.append([0] * len(sentence))
    
    def __len__(self):
        assert len(self.labels) == len(self.sentences)
        assert len(self.labels) == len(self.segments)
        return len(self.labels)
    
    def __getitem__(self, item):
        return (torch.tensor(self.labels[item]),
                torch.tensor(self.sentences[item]),
                torch.tensor(self.segments[item]))

In [None]:
""" movie data collate_fn """
def movie_collate_fn(inputs):
    labels, inputs, segments = list(zip(*inputs))

    inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=0)
    segments = torch.nn.utils.rnn.pad_sequence(segments, batch_first=True, padding_value=0)

    batch = [
        torch.stack(labels, dim=0),
        inputs,
        segments,
    ]
    return batch

In [None]:
""" 데이터 로더 """
batch_size = 128
train_dataset = MovieDataSet(vocab, "/opt/workspace/Seohyeon/Projects/LMMs/data_preprocessing/DATA/ratings_train.json")
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=movie_collate_fn)
test_dataset = MovieDataSet(vocab, "/opt/workspace/Seohyeon/Projects/LMMs/data_preprocessing/DATA/ratings_test.json")
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=movie_collate_fn)

In [None]:
""" 모델 epoch 평가 """
def eval_epoch(config, model, data_loader):
    matchs = []
    model.eval()

    n_word_total = 0
    n_correct_total = 0
    with tqdm(total=len(data_loader), desc=f"Valid") as pbar:
        for i, value in enumerate(data_loader):
            labels, inputs, segments = map(lambda v: v.to(config.device), value)

            outputs = model(inputs, segments)
            logits_cls = outputs[0]
            _, indices = logits_cls.max(1)

            match = torch.eq(indices, labels).detach()
            matchs.extend(match.cpu())
            accuracy = np.sum(matchs) / len(matchs) if 0 < len(matchs) else 0

            pbar.update(1)
            pbar.set_postfix_str(f"Acc: {accuracy:.3f}")
    return np.sum(matchs) / len(matchs) if 0 < len(matchs) else 0

In [None]:
""" 모델 epoch 학습 """
def train_epoch(config, epoch, model, criterion_cls, optimizer, train_loader):
    losses = []
    model.train()

    with tqdm(total=len(train_loader), desc=f"Train({epoch})") as pbar:
        for i, value in enumerate(train_loader):
            labels, inputs, segments = map(lambda v: v.to(config.device), value)

            optimizer.zero_grad()
            outputs = model(inputs, segments)
            logits_cls = outputs[0]

            loss_cls = criterion_cls(logits_cls, labels)
            loss = loss_cls

            loss_val = loss_cls.item()
            losses.append(loss_val)

            loss.backward()
            optimizer.step()

            pbar.update(1)
            pbar.set_postfix_str(f"Loss: {loss_val:.3f} ({np.mean(losses):.3f})")
    return np.mean(losses)

In [None]:
config.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config.n_output = 2
print(config)

learning_rate = 5e-5
n_epoch = 10

In [None]:
def train(model):
    model.to(config.device)

    criterion_cls = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    best_epoch, best_loss, best_score = 0, 0, 0
    losses, scores = [], []
    for epoch in range(n_epoch):
        loss = train_epoch(config, epoch, model, criterion_cls, optimizer, train_loader)
        score = eval_epoch(config, model, test_loader)

        losses.append(loss)
        scores.append(score)

        if best_score < score:
            best_epoch, best_loss, best_score = epoch, loss, score
    print(f">>>> epoch={best_epoch}, loss={best_loss:.5f}, socre={best_score:.5f}")
    return losses, scores

### Train (No pretrain)
- MovieClassification 생성 -> 추가적인 처리 없이 생성된 MovieClassification으로 학습 진행

In [None]:
model = MovieClassification(config)

losses_00, scores_00 = train(model)

### Train (20 epoch Pretrain)
- 20 epoch Pretrain된 모델을 이용해 학습 진행
- MovieClassification 생성 -> Pretrain모델 로드 -> MovieClassification으로 학습 진행

In [None]:
model = MovieClassification(config)

save_pretrain = "/opt/workspace/Seohyeon/Projects/LMMs/BERT/pretrained_BERT/save_best_model.pth"
model.bert.load(save_pretrain)

losses_20, scores_20 = train(model)

### Results
- Pretrain 안한 경우와 한 경우의 정확도 비교

In [None]:
# table
data = {
    "loss_00": losses_00,
    "socre_00": scores_00,
    "loss_20": losses_20,
    "socre_20": scores_20,
}
df = pd.DataFrame(data)
display(df)

# graph
plt.figure(figsize=[12, 4])
plt.plot(scores_00, label="score_00")
plt.plot(scores_20, label="score_20")
plt.legend()
plt.xlabel('Epoch')
plt.ylabel('Value')
plt.show()