In [5]:
import os
import random
import numpy as np
import torch
import warnings

warnings.filterwarnings('ignore')

# Seed configuration (random number generation)
SEED = 123

def seed_everything(seed=SEED):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and configures cuDNN for deterministic execution.

    Args:
        seed: Integer seed applied to every generator. Defaults to ``SEED``.
    """
    random.seed(seed)
    np.random.seed(seed)
    # The running interpreter fixed its hash seed at startup; this value is
    # only picked up by subprocesses started afterwards.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the deterministic setting above.
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

In [6]:
# Download the sample dataset file.
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded; import it explicitly so this cell works on a fresh kernel.
import urllib.request

# BBC News text dataset (bbc-text.csv)
url = 'https://storage.googleapis.com/download.tensorflow.org/data/bbc-text.csv'
urllib.request.urlretrieve(url, 'bbc-text.csv')

('bbc-text.csv', <http.client.HTTPMessage at 0x7e03ba5eb2e0>)

In [7]:
# Load the data
import json
from tqdm import tqdm
import numpy as np
import pandas as pd

# Read the downloaded CSV into a DataFrame and preview the first rows.
df = pd.read_csv('bbc-text.csv')
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [10]:
# Load the tokenizer and the sequence-classification model with the Hugging Face library
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Both artifacts are loaded from the same checkpoint name.
checkpoint = "abhishek/autonlp-bbc-news-classification-37229289"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [9]:
# Preview the first rows of the DataFrame again.
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [11]:
# Inspect the tokenized result for the first article
tokenized = tokenizer(df['text'].iloc[0], padding=True, truncation=True)
tokenized

{'input_ids': [101, 2694, 2925, 1999, 1996, 2398, 1997, 7193, 2007, 2188, 3004, 3001, 12123, 2152, 1011, 6210, 2694, 2015, 1998, 3617, 2678, 14520, 2015, 3048, 2046, 1996, 2542, 2282, 1996, 2126, 2111, 3422, 2694, 2097, 2022, 25796, 2367, 1999, 2274, 2086, 2051, 1012, 2008, 2003, 2429, 2000, 2019, 6739, 5997, 2029, 5935, 2012, 1996, 3296, 7325, 8139, 2265, 1999, 5869, 7136, 2000, 6848, 2129, 2122, 2047, 6786, 2097, 4254, 2028, 1997, 2256, 8837, 2627, 14428, 2015, 1012, 2007, 1996, 2149, 2877, 1996, 9874, 8497, 1998, 2060, 4180, 2097, 2022, 5359, 2000, 7193, 3081, 2188, 6125, 2083, 5830, 5871, 18126, 2015, 3316, 1998, 19595, 2326, 11670, 2000, 2392, 4734, 1998, 12109, 5733, 1012, 2028, 1997, 1996, 2087, 5720, 1011, 2055, 6786, 1997, 8292, 2015, 2038, 2042, 3617, 1998, 3167, 2678, 14520, 2015, 1006, 1040, 19716, 1998, 26189, 2099, 1007, 1012, 2122, 2275, 1011, 2327, 8378, 2066, 1996, 2149, 1055, 14841, 6767, 1998, 1996, 2866, 1055, 3712, 1009, 2291, 3499, 2111, 2000, 2501, 3573, 2377, 87

In [12]:
# Convert input_ids back to tokens
# Show only the first 10 input_ids
input_ids = tokenized['input_ids']
input_ids[:10]

[101, 2694, 2925, 1999, 1996, 2398, 1997, 7193, 2007, 2188]

In [13]:
# Check the converted result
# First 20 tokens: slice the ids first, then convert only that slice
print(tokenizer.convert_ids_to_tokens(input_ids[:20]))

['[CLS]', 'tv', 'future', 'in', 'the', 'hands', 'of', 'viewers', 'with', 'home', 'theatre', 'systems', 'plasma', 'high', '-', 'definition', 'tv', '##s', 'and', 'digital']


In [14]:
# Last 20 tokens: slice the ids first, then convert only that slice
print(tokenizer.convert_ids_to_tokens(input_ids[-20:]))

['networks', 'in', 'us', 'terms', 'or', 'channels', 'could', 'take', 'a', 'leaf', 'out', 'of', 'google', 's', 'book', 'and', 'be', 'the', 'search', '[SEP]']


In [15]:
# Build the label map
label_map = {
    'sport': 0,
    'business': 1,
    'politics': 2,
    'tech': 3,
    'entertainment': 4
}

# Encode the English category names as integers
encoded = df['category'].map(label_map)

# Series.map leaves NaN for any category that is missing from label_map.
# Fail here instead of letting NaN labels reach the split or the loss.
unmapped = df.loc[encoded.isna(), 'category'].unique().tolist()
if unmapped:
    raise ValueError(f"Categories missing from label_map: {unmapped}")

df['category_num'] = encoded.astype('int64')

In [16]:
# Split the dataset
# Split ratio: 0.8 : 0.2, stratified on the encoded category and seeded with SEED
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df['text'], df['category_num'],
                                                    stratify=df['category_num'],
                                                    test_size=0.2,
                                                    random_state=SEED
                                                   )

In [17]:
# Batch tokenization
# Check the maximum length applied when truncating
tokenizer.model_max_length

512

In [18]:
# Tokenize the first 10 texts in one call, padding/truncating them to a common length
batch_tokenized = tokenizer(df['text'].iloc[:10].tolist(), padding=True, truncation=True)

In [19]:
# Shape of the batched input_ids: (batch_size, sequence length)
np.array(batch_tokenized['input_ids']).shape

(10, 512)

In [15]:
#torchtextx를 사용하기 위한 버전
!pip install torch==2.2.0 torchtext==0.17.0


Collecting torch==2.2.0
  Downloading torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchtext==0.17.0
  Downloading torchtext-0.17.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.0)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.2.0)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collec

In [20]:
#데이터셋 생성
from torch.utils.data import DataLoader, Dataset
from torchtext.vocab import build_vocab_from_iterator


class CustomDataset(Dataset):
    """Map-style dataset that pairs each raw text with its label.

    Args:
        texts: Container of texts exposing ``len()`` and positional ``.iloc``
            access (for example a pandas Series).
        labels: Container of labels with the same interface, one per text.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels):
        super().__init__()
        # __len__ is driven by labels only; a mismatch would otherwise drop
        # trailing texts silently or fail with an IndexError mid-epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair at positional index ``idx``."""
        # .iloc is positional, so the index labels of the Series do not matter.
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        return text, label

In [21]:
# Create the custom datasets; the held-out test split is used as the validation set
train_ds = CustomDataset(x_train, y_train)
valid_ds = CustomDataset(x_test, y_test)

In [22]:
# Pull a single sample: index 0 is what iterating the dataset yields first
text, label = train_ds[0]
len(text), label

(1665, 2)

In [27]:
#DataLoader 생성
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

# Select the torch device ('cpu', 'cuda:0' or cuda:1)
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda:0


In [24]:
def collate_batch(batch, tokenizer):
    """Collate ``(text, label)`` pairs into tokenized inputs and a label tensor.

    Args:
        batch: Iterable of ``(text, label)`` pairs.
        tokenizer: Callable invoked with the list of texts and
            ``padding=True, truncation=True, return_tensors='pt'``.

    Returns:
        Tuple ``(text_tokenized, labels)`` where ``text_tokenized`` is whatever
        the tokenizer returned and ``labels`` is an int64 tensor.
    """
    pairs = list(batch)
    texts = [sample_text for sample_text, _ in pairs]
    targets = torch.tensor([sample_label for _, sample_label in pairs], dtype=torch.int64)

    # Pad so that shorter sentences match the length of the others in the batch.
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

    return encoded, targets

In [25]:
# Both loaders share the batch size and the tokenizing collate function;
# only the training loader shuffles.
def _tokenizing_collate(batch):
    return collate_batch(batch, tokenizer)

loader_kwargs = dict(batch_size=8, collate_fn=_tokenizing_collate)

train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)

valid_loader = DataLoader(valid_ds, shuffle=False, **loader_kwargs)

In [28]:
# Pull one batch from the training loader and move inputs and labels to the device
x, y = next(iter(train_loader))
x = x.to(device)
y = y.to(device)

In [32]:
# Shape of the sample batch's input_ids
x['input_ids'].shape

torch.Size([8, 512])

In [30]:
# Token ids of the sample batch
x['input_ids']

tensor([[  101,  3016, 18866,  ...,     0,     0,     0],
        [  101,  2148,  2924,  ...,     0,     0,     0],
        [  101,  4922,  2128,  ...,  7674, 17783,   102],
        ...,
        [  101,  1037,  1011,  ...,     0,     0,     0],
        [  101, 13587,  3514,  ...,     0,     0,     0],
        [  101, 19236,  1024,  ...,  1038,  1051,   102]], device='cuda:0')

In [33]:
#모델
from tqdm import tqdm  # Progress Bar 출력
import numpy as np
import torch.nn as nn
import torch.optim as optim

# Inspect the architecture of the pretrained model
print(model)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

In [34]:
# Freeze the weights: disable gradient tracking on every parameter of the model.
# The trailing semicolon keeps the returned module from being echoed as cell output.
model.requires_grad_(False);

In [35]:
# Replace the classifier head of the model.
# Take the input width from the model config (1024 for this checkpoint) and
# the output width from label_map, so the head follows the backbone and the
# label set instead of hard-coded numbers.
hidden_size = model.config.hidden_size
num_classes = len(label_map)

model.classifier = nn.Sequential(
    nn.Linear(hidden_size, 256),
    nn.BatchNorm1d(256),
    nn.ReLU(),
    nn.Linear(256, 32),
    nn.BatchNorm1d(32),
    nn.ReLU(),
    nn.Linear(32, num_classes)
)

In [36]:
# Confirm that the parameters of the replaced classifier can be updated
for param in model.classifier.parameters():
    print(param.requires_grad)

True
True
True
True
True
True
True
True
True
True


In [38]:
# Move the tensor under each input key (input_ids, token_type_ids, attention_mask) to the device
inputs = {k: v.to(device) for k, v in x.items()}
inputs

{'input_ids': tensor([[  101,  3016, 18866,  ...,     0,     0,     0],
         [  101,  2148,  2924,  ...,     0,     0,     0],
         [  101,  4922,  2128,  ...,  7674, 17783,   102],
         ...,
         [  101,  1037,  1011,  ...,     0,     0,     0],
         [  101, 13587,  3514,  ...,     0,     0,     0],
         [  101, 19236,  1024,  ...,  1038,  1051,   102]], device='cuda:0'),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')}

In [39]:
# Move the model to the device
model.to(device)

# Run a forward pass with inputs
output = model(**inputs)
# Inspect the returned SequenceClassifierOutput
output

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0086,  0.2367, -0.3304, -0.5951, -0.5327],
        [-0.0817,  0.1574, -0.4939, -0.6673, -0.5440],
        [-0.2159,  0.5163, -0.2780,  0.1354,  0.1225],
        [ 0.1838,  0.4520,  0.4209, -0.1438, -0.0150],
        [-0.3490,  0.4510, -0.3164,  0.0252,  0.1679],
        [-0.0828,  0.1375, -0.3727, -0.4824, -0.4159],
        [ 0.5877, -0.3459,  0.2237, -0.3377,  0.0716],
        [-0.1894,  0.8065, -0.5820, -0.3183, -0.0150]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [42]:
# Extract the logits: raw, unnormalized class scores (not probabilities)
output.logits

tensor([[-0.0086,  0.2367, -0.3304, -0.5951, -0.5327],
        [-0.0817,  0.1574, -0.4939, -0.6673, -0.5440],
        [-0.2159,  0.5163, -0.2780,  0.1354,  0.1225],
        [ 0.1838,  0.4520,  0.4209, -0.1438, -0.0150],
        [-0.3490,  0.4510, -0.3164,  0.0252,  0.1679],
        [-0.0828,  0.1375, -0.3727, -0.4824, -0.4159],
        [ 0.5877, -0.3459,  0.2237, -0.3377,  0.0716],
        [-0.1894,  0.8065, -0.5820, -0.3183, -0.0150]], device='cuda:0',
       grad_fn=<AddmmBackward0>)

In [43]:
# Move the model to the device
model.to(device)

# Loss definition: CrossEntropyLoss
loss_fn = nn.CrossEntropyLoss()

# Optimizer definition: hand Adam only the parameters that still require
# gradients, so it does not track the frozen ones.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable_params, lr=0.00005)

In [44]:
def model_train(model, data_loader, loss_fn, optimizer, device):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(**inputs)`` whose result exposes ``.logits``.
        data_loader: Sized iterable yielding ``(txt, lbl)`` batches, where
            ``txt`` is a mapping of tensors and ``lbl`` a label tensor.
        loss_fn: Callable ``loss_fn(logits, lbl)`` returning a scalar loss tensor.
        optimizer: Optimizer providing ``zero_grad()`` and ``step()``.
        device: Device that every batch is moved to.

    Returns:
        Tuple ``(train_loss, train_acc)``: the mean of the per-batch losses and
        the fraction of processed samples that were classified correctly.
    """
    # Switch to training mode before any batch is processed.
    model.train()

    # Accumulators for loss and accuracy.
    running_loss = 0
    corr = 0
    counts = 0

    # Wrap the loader in tqdm to show a progress bar while training.
    progress_bar = tqdm(data_loader, unit='batch', total=len(data_loader), mininterval=1)

    # Mini-batch training.
    for idx, (txt, lbl) in enumerate(progress_bar):
        # Move the inputs and labels to the device (cuda:0 or cpu).
        inputs = {k:v.to(device) for k, v in txt.items()}
        lbl = lbl.to(device)

        # Reset the accumulated gradients.
        optimizer.zero_grad()

        # Forward pass; keep only the logits.
        output = model(**inputs)
        output = output.logits

        # Compute the loss from the logits and the labels.
        loss = loss_fn(output, lbl)

        # Backpropagate and apply the update.
        loss.backward()
        optimizer.step()

        # Index of the largest logit is the predicted class.
        output = output.argmax(dim=1)

        # Count correct predictions and processed samples.
        corr += (output == lbl).sum().item()
        counts += len(lbl)

        # Accumulate the per-batch loss.
        running_loss += loss.item()

        # Show the running metrics on the progress bar.
        progress_bar.set_description(f"training loss: {running_loss/(idx+1):.5f}, training accuracy: {corr / counts:.5f}")

    # Divide by the samples actually processed, not len(data_loader.dataset):
    # the two differ when the loader uses drop_last or a sampler that does not
    # visit every item, and the old denominator under-reported accuracy then.
    acc = corr / counts

    # Return the mean loss and the accuracy.
    # train_loss, train_acc
    return running_loss / len(data_loader), acc

In [45]:
def model_evaluate(model, data_loader, loss_fn, device):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(**inputs)`` whose result exposes ``.logits``.
        data_loader: Sized iterable yielding ``(txt, lbl)`` batches, where
            ``txt`` is a mapping of tensors and ``lbl`` a label tensor.
        loss_fn: Callable ``loss_fn(logits, lbl)`` returning a scalar loss tensor.
        device: Device that every batch is moved to.

    Returns:
        Tuple ``(val_loss, val_acc)``: the mean of the per-batch losses and
        the fraction of processed samples that were classified correctly.
    """
    # Switch to evaluation mode so layers such as dropout change behavior.
    model.eval()

    # Disable gradient tracking for the whole evaluation pass.
    with torch.no_grad():
        # Accumulators for loss and accuracy.
        corr = 0
        counts = 0
        running_loss = 0

        # Evaluate batch by batch.
        for txt, lbl in data_loader:
            # Move the inputs and labels to the device (cuda:0 or cpu).
            inputs = {k:v.to(device) for k, v in txt.items()}
            lbl = lbl.to(device)

            # Forward pass; keep only the logits.
            output = model(**inputs)
            output = output.logits

            # Validation loss for this batch.
            loss = loss_fn(output, lbl)

            # Index of the largest logit is the predicted class.
            output = output.argmax(dim=1)

            # Count correct predictions and processed samples.
            corr += (output == lbl).sum().item()
            counts += len(lbl)

            # Accumulate the per-batch loss.
            running_loss += loss.item()

        # Divide by the samples actually processed, not len(data_loader.dataset):
        # the two differ when the loader uses drop_last or a sampler that does
        # not visit every item.
        acc = corr / counts

        # Return the results.
        # val_loss, val_acc
        return running_loss / len(data_loader), acc

In [46]:
# Set the maximum number of epochs.
num_epochs = 5

# Name used for the saved model checkpoint.
model_name = 'BBC-Text-CLF-BERT'

min_loss = np.inf

# Run training and validation for each epoch.
for epoch in range(num_epochs):
    # Model Training
    # Get the training loss and accuracy.
    train_loss, train_acc = model_train(model, train_loader, loss_fn, optimizer, device)

    # Get the validation loss and validation accuracy.
    val_loss, val_acc = model_evaluate(model, valid_loader, loss_fn, device)

    # If val_loss improved, update min_loss and save the model weights.
    if val_loss < min_loss:
        print(f'[INFO] val_loss has been improved from {min_loss:.5f} to {val_loss:.5f}. Saving Model!')
        min_loss = val_loss
        torch.save(model.state_dict(), f'{model_name}.pth')

    # Print the results for this epoch.
    print(f'epoch {epoch+1:02d}, loss: {train_loss:.5f}, acc: {train_acc:.5f}, val_loss: {val_loss:.5f}, val_accuracy: {val_acc:.5f}')

training loss: 0.74239, training accuracy: 0.92472: 100%|██████████| 223/223 [02:40<00:00,  1.39batch/s]


[INFO] val_loss has been improved from inf to 0.47634. Saving Model!
epoch 01, loss: 0.74239, acc: 0.92472, val_loss: 0.47634, val_accuracy: 0.99326


training loss: 0.57364, training accuracy: 0.98483: 100%|██████████| 223/223 [02:45<00:00,  1.35batch/s]


[INFO] val_loss has been improved from 0.47634 to 0.42014. Saving Model!
epoch 02, loss: 0.57364, acc: 0.98483, val_loss: 0.42014, val_accuracy: 0.99326


training loss: 0.50006, training accuracy: 0.98371: 100%|██████████| 223/223 [02:45<00:00,  1.34batch/s]


[INFO] val_loss has been improved from 0.42014 to 0.38643. Saving Model!
epoch 03, loss: 0.50006, acc: 0.98371, val_loss: 0.38643, val_accuracy: 0.99326


training loss: 0.44614, training accuracy: 0.99213: 100%|██████████| 223/223 [02:45<00:00,  1.34batch/s]


[INFO] val_loss has been improved from 0.38643 to 0.31494. Saving Model!
epoch 04, loss: 0.44614, acc: 0.99213, val_loss: 0.31494, val_accuracy: 0.99326


training loss: 0.40023, training accuracy: 0.98708: 100%|██████████| 223/223 [02:46<00:00,  1.34batch/s]


[INFO] val_loss has been improved from 0.31494 to 0.28365. Saving Model!
epoch 05, loss: 0.40023, acc: 0.98708, val_loss: 0.28365, val_accuracy: 0.99326


In [47]:
# Load the saved weights.
# map_location lets a checkpoint saved on GPU be restored on whatever device
# is in use now; weights_only=True restricts unpickling to tensor data, which
# is all a state_dict needs.
model.load_state_dict(torch.load(f'{model_name}.pth', map_location=device, weights_only=True))

<All keys matched successfully>

In [49]:
# Evaluate the restored weights on the validation loader.
# NOTE: model_evaluate itself calls model.eval() and runs under torch.no_grad(),
# so the two wrappers below are redundant but harmless.
model.eval()

with torch.no_grad():
    val_loss, val_acc = model_evaluate(model, valid_loader, loss_fn, device)

    print(f'loss: {val_loss:.5f}, accuracy: {val_acc:.5f}')

loss: 0.28365, accuracy: 0.99326
