# Multilingual BERT (Google)
---
**모델 설명**<br>
- 구글에서 개발한 다국어 자연어처리 모델

**모델 사이즈**<br>
- vocab size = 약 12만 (119,547)
- 12-layer

**학습 코퍼스**<br>
- 다국어(104개 언어)

### 필요한 라이브러리 가져오기

In [1]:
# !pip install transformers

In [2]:
import numpy as np
import pandas as pd
import os
import random

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertModel
from transformers import get_linear_schedule_with_warmup

from sklearn.preprocessing import LabelEncoder

In [3]:
# Train on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


### 하이퍼 파라미터 세팅

In [4]:
# Training hyperparameters read by the cells below.
CFG = dict(
    EPOCHS=2,
    BATCH_SIZE=8,
    LEARNING_RATE=1e-06,
    SEED=42,
)

### SEED 고정하기
코드의 재현성을 위한 함수

In [5]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed applied to ``random``, NumPy and PyTorch
            (CPU and all CUDA devices).
    """
    random.seed(seed)
    np.random.seed(seed)
    # Exported for any child processes; NOTE(review): the hash seed of the
    # already-running interpreter is presumably fixed at startup.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick kernels by timing them, which can vary
    # between runs, so it has to be off for reproducible results.
    torch.backends.cudnn.benchmark = False

In [6]:
# Apply the configured seed before the data and model cells below run.
seed_everything(CFG['SEED'])

### 1. 데이터 준비
- AI-Hub: 주제별 텍스트 일상 대화 데이터
- 20개 주제 중에서 10개 주제(식음료, 회사/아르바이트, 교육, 가족, 연애/결혼, 반려동물, 스포츠/레저, 여행, 미용, 영화/만화)의 데이터를 사용함

In [7]:
# Load the training and validation splits.
# NOTE(review): the paths look like a Colab-mounted Google Drive folder;
# adjust them when running elsewhere.
train_df = pd.read_csv('/content/drive/MyDrive/classification_model/data/train.csv')
valid_df = pd.read_csv('/content/drive/MyDrive/classification_model/data/valid.csv')

In [8]:
# Preview the first three rows of the training split.
train_df.head(3)

Unnamed: 0,text,label
0,이영애 새 드라마 보신 분 이영애 새 드라마 나와 난 방송은 못 믿겠더라 맛집 소개...,방송/연예
1,엄마 10월 그리고리 영화봤어요 아니 그게 뭔지 몰라 죽기 전에 봐야 할 영화 2번...,영화/만화
2,내가 곰곰이 생각해 보니 우리 가족은 웃음이 없는 것 같아요 나도 그렇게 생각했는데...,가족


In [9]:
# Preview the first three rows of the validation split.
valid_df.head(3)

Unnamed: 0,text,label
0,언니 지금 비타민 챙겨 먹고 있어 웅 챙겨 먹어 나이가 있어서 맞아 20대 때부터 ...,건강
1,여행사 껴서 여행 가는 거 생각보다 괜찮더라 맞아 나도 가끔 가는데 일정 생각 안하...,여행
2,재밌는 영화 추천 좀 해봐 키키 요새 영화 안 본지 오래됨 나두 그래 너무 재밌는 ...,영화/만화


In [10]:
# Topics kept for this classifier; rows with any other label are dropped.
label_list = ['영화/만화', '가족', '식음료', '교육', '회사/아르바이트', '스포츠/레저', '연애/결혼', '반려동물', '여행', '미용']

# Build the boolean masks first, then filter and renumber the rows from 0.
keep_train = train_df['label'].isin(label_list)
keep_valid = valid_df['label'].isin(label_list)
train_df = train_df.loc[keep_train].reset_index(drop=True)
valid_df = valid_df.loc[keep_valid].reset_index(drop=True)

In [11]:
# List the distinct labels remaining in the training split after filtering.
train_df['label'].unique()

array(['영화/만화', '가족', '식음료', '교육', '회사/아르바이트', '스포츠/레저', '연애/결혼', '반려동물',
       '여행', '미용'], dtype=object)

In [12]:
# Count the distinct labels remaining in the training split.
len(train_df['label'].unique())

10

라벨 인코딩

In [13]:
# Fit the encoder on the training labels only, then apply that same mapping
# to both splits so the integer ids agree.
le = LabelEncoder().fit(train_df['label'])
train_df['label'] = le.transform(train_df['label'])
valid_df['label'] = le.transform(valid_df['label'])

In [14]:
# Inspect the fitted label classes; a class's position is its encoded integer id.
le.classes_

array(['가족', '교육', '미용', '반려동물', '스포츠/레저', '식음료', '여행', '연애/결혼', '영화/만화',
       '회사/아르바이트'], dtype=object)

In [15]:
# Preview the training split now that the labels are integer-encoded.
train_df.head()

Unnamed: 0,text,label
0,엄마 10월 그리고리 영화봤어요 아니 그게 뭔지 몰라 죽기 전에 봐야 할 영화 2번...,8
1,내가 곰곰이 생각해 보니 우리 가족은 웃음이 없는 것 같아요 나도 그렇게 생각했는데...,0
2,오늘 하루도 잘 보냈니 내일 출근 전 아침 메뉴 하나 추천해줘 너도 잘 보냈냐 너 ...,5
3,나도 만화가 되고싶다 나도 만화 잘 그리고 싶어 어렸을떄 애들이 만화 많이 그렸는데...,8
4,는 날 닮은 것 같지 응 자기 닮았지 엄마도 내 어린시절을 많이 닮았다고 하셔 하하...,0


In [16]:
# Map each encoded integer id back to its label string.
LABEL_COLUMNS = dict(enumerate(le.classes_))
print(LABEL_COLUMNS)

{0: '가족', 1: '교육', 2: '미용', 3: '반려동물', 4: '스포츠/레저', 5: '식음료', 6: '여행', 7: '연애/결혼', 8: '영화/만화', 9: '회사/아르바이트'}


### 2. BERT Tokenizer 🤗
- CustomDataset 클래스

In [17]:
# Load the tokenizer for the same checkpoint the model uses; lower-casing is
# disabled, in line with the "cased" checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                          do_lower_case=False)

In [18]:
class CustomDataset(Dataset):
    """Dataset that tokenizes one DataFrame row per item.

    Args:
        df: DataFrame with a 'text' column and a 'label' column.
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` in
            this notebook).
        max_len: Length every token sequence is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the raw text, token ids, attention mask and label for row ``idx``.

        ``idx`` is a position (0 .. len - 1), so the lookup is positional and
        works whether or not the frame's index was reset after filtering.
        """
        text = self.df['text'].iloc[idx]
        label = self.df['label'].iloc[idx]

        encoded_dict = self.tokenizer.encode_plus(
            text=text,  # sequence to encode
            add_special_tokens=True,  # add '[CLS]' and '[SEP]'
            max_length=self.max_len,
            padding='max_length',  # pad up to max_length
            truncation=True,  # cut sequences longer than max_length
            return_attention_mask=True,  # build the attention mask
            return_token_type_ids=False,
            return_tensors='pt'  # return PyTorch tensors
        )

        return dict(
            text=text,
            # flatten() turns the tokenizer's returned tensors into 1-D
            # tensors for this single item.
            input_ids=encoded_dict['input_ids'].flatten(),
            attention_mask=encoded_dict['attention_mask'].flatten(),
            label=torch.tensor(label)
        )

In [19]:
# Fixed token length for every example (longer inputs are truncated, shorter
# ones are padded by CustomDataset).
MAX_LEN = 512

# Wrap both splits with the same tokenizer and length limit.
train_dataset, valid_dataset = (
    CustomDataset(frame, tokenizer, MAX_LEN) for frame in (train_df, valid_df)
)

In [20]:
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=True,
    num_workers=2)

# Validation only aggregates metrics over the whole split, so shuffling adds
# nothing; a fixed order keeps evaluation runs comparable.
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=False,
    num_workers=2)

### 3. BERT Model 🤗
- BaseModel 클래스

In [21]:
class BaseModel(nn.Module):
    """Multilingual BERT encoder with a dropout + linear classification head.

    Args:
        num_classes: Number of output classes; defaults to the number of
            classes known to the fitted label encoder ``le``.
    """

    def __init__(self, num_classes=len(le.classes_)):
        super().__init__()
        self.model = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.dropout = nn.Dropout(0.3)
        # Take the head's input width from the encoder config rather than
        # hard-coding 768, so the head always matches the loaded checkpoint.
        self.linear = nn.Linear(self.model.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return one row of class logits per input sequence.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Mask tensor passed to the encoder.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at position 0 of each sequence as its summary
        # (the dataset adds '[CLS]' via add_special_tokens).
        cls_token = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_token)
        return self.linear(x)

In [22]:
# Instantiate the classifier and move its parameters to the selected device.
model = BaseModel()
model.to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BaseModel(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

### 4. 학습(Train) & 평가(Evaluation)

In [23]:
# Create the folder that the checkpoints are written to.
path = 'model'

print('Create a model save folder...')
# exist_ok makes re-running this cell safe and avoids a separate
# check-then-create step.
os.makedirs(path, exist_ok=True)

Create a model save folder...


In [24]:
# Loss function: cross-entropy computed from the model's class logits.
loss_fn = nn.CrossEntropyLoss()

# Optimizer: AdamW over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=CFG['LEARNING_RATE'], eps=1e-8)

# Total number of optimizer steps = batches per epoch * number of epochs.
total_steps = len(train_dataloader) * CFG['EPOCHS']

# Learning-rate scheduler from the transformers helper, with no warmup steps,
# spanning total_steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

In [25]:
# Accuracy helper
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: NumPy array of integer class ids; flattened before comparing.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    matches = predicted == expected

    return matches.sum() / expected.size

In [26]:
for epoch in range(CFG['EPOCHS']):
    # ---------------------- Training ----------------------
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, CFG['EPOCHS']))
    print('Training...')

    model.train()  # training mode

    total_train_loss = 0

    for step, batch in enumerate(train_dataloader, 0):
        b_input_ids = batch['input_ids'].to(device, dtype=torch.long)
        b_input_mask = batch['attention_mask'].to(device, dtype=torch.long)
        b_labels = batch['label'].to(device, dtype=torch.long)

        optimizer.zero_grad()  # clear gradients left over from the previous step
        outputs = model(b_input_ids, b_input_mask)

        loss = loss_fn(outputs, b_labels)

        loss.backward()  # back-propagate the loss
        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()  # update the weights
        scheduler.step()  # advance the learning-rate schedule

        total_train_loss += loss.item()

        if step % 1000 == 0:
            print('Batch {:>5,} of {:>5,} | Train Loss: {:.3f}'.format(step, len(train_dataloader), loss.item()))

    avg_train_loss = total_train_loss / len(train_dataloader)
    print("Average Train Loss: {:.3f}".format(avg_train_loss))

    # ---------------------- Validation ----------------------
    print("")
    print("Running Validation...")

    model.eval()  # evaluation mode

    # Metrics are accumulated per sample rather than averaged per batch, so a
    # smaller final batch does not get the same weight as a full one.
    total_eval_loss = 0.0
    total_eval_correct = 0
    total_eval_samples = 0

    with torch.no_grad():  # no gradients are needed for evaluation
        for batch in valid_dataloader:
            b_input_ids = batch['input_ids'].to(device, dtype=torch.long)
            b_input_mask = batch['attention_mask'].to(device, dtype=torch.long)
            b_labels = batch['label'].to(device, dtype=torch.long)
            batch_size = b_labels.size(0)

            outputs = model(b_input_ids, b_input_mask)

            loss = loss_fn(outputs, b_labels)

            # NOTE(review): assumes loss_fn returns the batch mean (the
            # default reduction of nn.CrossEntropyLoss); multiplying by the
            # batch size turns it back into a per-sample sum.
            total_eval_loss += loss.item() * batch_size
            total_eval_correct += (outputs.argmax(dim=1) == b_labels).sum().item()
            total_eval_samples += batch_size

    print("Average Validation Loss: {:.3f}".format(total_eval_loss / total_eval_samples))
    print("Validation Accuracy: {:.3f}".format(total_eval_correct / total_eval_samples))

    # Keep one checkpoint per epoch.
    PATH = f'model/classification_model_{epoch + 1}.pt'
    torch.save(model.state_dict(), PATH)

Training...
Batch     0 of 5,571 | Train Loss: 2.484
Batch 1,000 of 5,571 | Train Loss: 1.519
Batch 2,000 of 5,571 | Train Loss: 0.685
Batch 3,000 of 5,571 | Train Loss: 2.847
Batch 4,000 of 5,571 | Train Loss: 0.608
Batch 5,000 of 5,571 | Train Loss: 0.837
Average Train Loss: 0.797

Running Validation...
Average Validation Loss: 0.354
Validation Accuracy: 0.899
Training...
Batch     0 of 5,571 | Train Loss: 0.153
Batch 1,000 of 5,571 | Train Loss: 0.041
Batch 2,000 of 5,571 | Train Loss: 0.160
Batch 3,000 of 5,571 | Train Loss: 0.716
Batch 4,000 of 5,571 | Train Loss: 0.103
Batch 5,000 of 5,571 | Train Loss: 0.019
Average Train Loss: 0.400

Running Validation...
Average Validation Loss: 0.340
Validation Accuracy: 0.910


|Epoch|Train Loss|Val loss|Val acc|
|:---:|:---:|:---:|:---:|
|1|0.797|0.354|0.899|
|2|0.400|0.340|0.910|

### 5. 추론(Inference)
- 추론을 위해 모델 저장하고 불러오기
- 저장한 모델을 불러와서 예측하기

In [27]:
# Save the trained weights (state_dict only) for later inference.
PATH = 'model/classification_model.pt'
torch.save(model.state_dict(), PATH)

In [28]:
class BaseModel(nn.Module):
    """Multilingual BERT encoder with a dropout + linear classification head.

    Args:
        num_classes: Number of output classes (10 topics by default).
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.model = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.dropout = nn.Dropout(0.3)
        # Take the head's input width from the encoder config rather than
        # hard-coding 768, so the head always matches the loaded checkpoint.
        self.linear = nn.Linear(self.model.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return one row of class logits per input sequence.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Mask tensor passed to the encoder.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at position 0 of each sequence as its summary
        # (the tokenizer call adds '[CLS]' via add_special_tokens).
        cls_token = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_token)
        return self.linear(x)

In [29]:
# Load the saved weights for inference.
PATH = 'model/classification_model.pt'

# Pick the GPU only when one is present, so this cell also runs on CPU-only
# machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BaseModel()
# map_location places the checkpoint tensors on `device`, which lets a
# checkpoint saved from a GPU be loaded without CUDA.
model.load_state_dict(torch.load(PATH, map_location=device))
model.to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BaseModel(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [30]:
def predict(tokenizer, label_columns, sent_list):
    """Predict the topic label for a list of sentences.

    The sentences are joined with single spaces into one text, tokenized to a
    fixed length of 512, and run through the module-level ``model`` on the
    module-level ``device``. The raw logits and their softmax are printed.

    Args:
        tokenizer: Tokenizer exposing ``encode_plus``.
        label_columns: Sequence (or mapping) from class index to label name.
        sent_list: List of sentence strings making up one conversation.

    Returns:
        The entry of ``label_columns`` at the index of the largest logit.
    """
    text = ' '.join(sent_list)

    encoded_dict = tokenizer.encode_plus(
        text=text,  # sequence to encode
        add_special_tokens=True,  # add '[CLS]' and '[SEP]'
        max_length=512,
        padding='max_length',  # pad up to max_length
        truncation=True,  # cut sequences longer than max_length
        return_attention_mask=True,  # build the attention mask
        return_token_type_ids=False,
        return_tensors='pt'  # return PyTorch tensors
    )

    model.eval()

    input_ids = encoded_dict['input_ids'].long().to(device)
    input_mask = encoded_dict['attention_mask'].long().to(device)

    # Inference needs no gradients; disabling autograd avoids building the
    # graph and keeping activations alive.
    with torch.no_grad():
        output = model(input_ids, input_mask)

    print(f"모델의 Output:\n {output}")
    print(f"Softmax를 통과한 Output:\n {nn.Softmax(dim=1)(output)}")

    output = output.cpu().numpy()
    # A single text is passed in, so take the argmax over the class axis of
    # the first (only) row.
    pred = int(np.argmax(output, axis=1)[0])

    return label_columns[pred]

In [33]:
# Re-create the tokenizer for the same checkpoint the model uses.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                          do_lower_case=False)

# Label names indexed by class id; predict() looks up the argmax index here,
# so the order has to match the label encoding used in training (le.classes_).
label_columns = ['가족', '교육', '미용', '반려동물', '스포츠/레저', '식음료', '여행', '연애/결혼', '영화/만화', '회사/아르바이트']

# Example conversation made of three sentences.
sent_list = ['오늘 점심으로 토스트와 커피를 먹을거예요', 
             '토스트를 구매하고 커피를 사러 갈거예요', 
             '아이스아메리카노 한잔 주세요']

predict(tokenizer, label_columns, sent_list)

모델의 Output:
 tensor([[-0.5934, -1.3689, -0.0824, -0.7619, -0.6494,  5.8656, -0.3038, -0.4132,
         -1.4283, -0.7134]], device='cuda:0', grad_fn=<AddmmBackward0>)
Softmax를 통과한 Output:
 tensor([[1.5450e-03, 7.1145e-04, 2.5757e-03, 1.3054e-03, 1.4610e-03, 9.8645e-01,
         2.0642e-03, 1.8503e-03, 6.7048e-04, 1.3704e-03]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)


'식음료'

In [34]:
# Re-create the tokenizer for the same checkpoint the model uses.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                          do_lower_case=False)

# Label names indexed by class id; predict() looks up the argmax index here,
# so the order has to match the label encoding used in training (le.classes_).
label_columns = ['가족', '교육', '미용', '반려동물', '스포츠/레저', '식음료', '여행', '연애/결혼', '영화/만화', '회사/아르바이트']

# Example conversation made of two sentences.
sent_list = ['우리 댕댕이 귀여워', '강아지랑 산책 중이예요']

predict(tokenizer, label_columns, sent_list)

모델의 Output:
 tensor([[-0.3285, -0.3316, -0.7047,  6.5010, -0.7961, -1.0644,  0.2368, -0.8782,
         -0.5075, -0.9096]], device='cuda:0', grad_fn=<AddmmBackward0>)
Softmax를 통과한 Output:
 tensor([[1.0726e-03, 1.0693e-03, 7.3637e-04, 9.9193e-01, 6.7204e-04, 5.1390e-04,
         1.8879e-03, 6.1905e-04, 8.9688e-04, 5.9994e-04]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)


'반려동물'

### END