In [1]:
!pip install transformers
!pip install sentencepiece
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-89tuhr7i/kobert-tokenizer_723976c74d3643e2a535de5b867a8aa6
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-89tuhr7i/kobert-tokenizer_723976c74d3643e2a535de5b867a8aa6
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: kobert_tokenizer
  Building wheel for kobert_tokenizer (setup.py) ... [?25l[?25hdone
  Created wheel for kobert_tokenizer: filename=kobert_tokenizer-0.1-py3-none-any.whl size=4633 sha256=fd64556dcd399dbea05f2543a864ebfe731ad264213194897579e199f42a1963
  Stored in directory: /tmp/pip-ephem-wheel-cache-yxo91zce/wheels/e9/1a/3f/a864970e8a169c176befa3c4a1e07aa612f69195907a4045fe
Successfully built kobert_tokenizer
Installing collected packages: kobert_tokenizer
Successfully ins

In [8]:
import torch
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel
import torch.nn as nn
import os

# GPU 설정
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# 모델 로드 경로 설정
MODEL_SAVE_PATH = "/content/drive/MyDrive/프로젝트/증권 뉴스 분류 및 개체명 인식/kobert_model.pth"
TOKENIZER_SAVE_FOLDER = "/content/drive/MyDrive/프로젝트/증권 뉴스 분류 및 개체명 인식/kobert_tokenizer"

# 필요한 파일 목록
required_files = ['tokenizer_config.json', 'spiece.model', 'special_tokens_map.json']

# 파일 존재 여부 확인
for file_name in required_files:
    file_path = os.path.join(TOKENIZER_SAVE_FOLDER, file_name)
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Error: {file_name} not found in the directory.")
    else:
        print(f"{file_name} found in the directory.")

# KoBERTClass 정의
class KoBERTClass(nn.Module):
    def __init__(self):
        super(KoBERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained('skt/kobert-base-v1')
        self.pre_classifier = nn.Linear(768, 768)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(768, 6) # 라벨 수에 맞게 조정

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

# 모델과 토크나이저 로드
model = KoBERTClass()
model.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=device))
tokenizer = KoBERTTokenizer.from_pretrained(TOKENIZER_SAVE_FOLDER)

# GPU로 모델 이동
model.to(device)

tokenizer_config.json found in the directory.
spiece.model found in the directory.
special_tokens_map.json found in the directory.


KoBERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [13]:
# 예제 문장 분류
example_sentence = [
    "애플이 스타업을 인수하려고 합니다",

]

# 문장을 토크나이저로 전처리
inputs = tokenizer.encode_plus(
    example_sentence,
    None,
    add_special_tokens=True,
    max_length=128,
    pad_to_max_length=True,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_tensors='pt'
)

ids = inputs['input_ids'].to(device, dtype=torch.long)
mask = inputs['attention_mask'].to(device, dtype=torch.long)
token_type_ids = inputs['token_type_ids'].to(device, dtype=torch.long)

# 모델을 평가 모드로 전환
model.eval()

# 모델에 입력하여 결과 도출
with torch.no_grad():
    outputs = model(ids, mask, token_type_ids)
    prediction = torch.argmax(outputs, dim=1).item()

# 라벨별 이름을 설정 (필요시 수정)
label_names = ["시황/전망", "기업/종목분석", "해외증시", "채권/선물", "공시/메모", "환율"]

# 결과 출력
print(f"문장: '{example_sentence}'")
print(f"분류 결과: {label_names[prediction]} (라벨: {prediction})")

문장: '애플이 스타업을 인수하려고 합니다'
분류 결과: 해외증시 (라벨: 2)
