In [1]:
!pip install transformers
!pip install sentencepiece
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-atdyr_46/kobert-tokenizer_817765ed7c164b9fa6068d82fab159ad
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-atdyr_46/kobert-tokenizer_817765ed7c164b9fa6068d82fab159ad
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: kobert_tokenizer
  Building wheel for kobert_tokenizer (setup.py) ... [?25l[?25hdone
  Created wheel for kobert_tokenizer: filename=kobert_tokenizer-0.1-py3-none-any.whl size=4633 sha256=4321bdd657c21b22c10d0e12f493d4d808a85dec994ea06b5af48176f1450859
  Stored in directory: /tmp/pip-ephem-wheel-cache-mppv9sne/wheels/e9/1a/3f/a864970e8a169c176befa3c4a1e07aa612f69195907a4045fe
Successfully built kobert_tokenizer
Installing collected packages: kobert_tokenizer
Successfully ins

In [8]:
import torch
from transformers import BertModel
from kobert_tokenizer import KoBERTTokenizer
import torch.nn as nn

# KoBERT 모델 정의 (반드시 학습할 때와 동일한 구조여야 함)
class KoBERTNER(nn.Module):
    def __init__(self, num_labels):
        super(KoBERTNER, self).__init__()
        self.bert = BertModel.from_pretrained('skt/kobert-base-v1')
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))
            return loss, logits
        return logits

# 태그 인덱스를 태그 이름으로 매핑 (반드시 학습할 때와 동일한 매핑이어야 함)
tag_to_id = {'B-ORG': 1, 'I-ORG': 2, 'B-MNY': 3, 'I-MNY': 4, 'B-PER': 5, 'I-PER': 6, 'O': 0}
id_to_tag = {v: k for k, v in tag_to_id.items()}
num_labels = len(tag_to_id)

# 모델과 토크나이저 로드
MODEL_SAVE_PATH = "/content/drive/MyDrive/프로젝트/증권 뉴스 분류 및 개체명 인식/kobert_ner_model.pth"
TOKENIZER_SAVE_FOLDER = "/content/drive/MyDrive/프로젝트/증권 뉴스 분류 및 개체명 인식/kobert_tokenizer_ner"
max_len = 128

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = KoBERTNER(num_labels)
model.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=device))
model.to(device)
model.eval()

tokenizer = KoBERTTokenizer.from_pretrained(TOKENIZER_SAVE_FOLDER)

In [9]:
# 음절 단위 토크나이저 함수 정의
def ner_tokenizer(sent, max_seq_length):
    pre_syllable = "-"
    input_ids = [0] * (max_seq_length - 1)
    attention_mask = [0] * (max_seq_length - 1)
    token_type_ids = [0] * max_seq_length
    sent = sent[:max_seq_length - 2]

    for i, syllable in enumerate(sent):
        if syllable == '-':
            pre_syllable = syllable
        if pre_syllable != "-":
            syllable = '##' + syllable
        pre_syllable = syllable

        input_ids[i] = tokenizer.convert_tokens_to_ids(syllable)
        attention_mask[i] = 1

    input_ids = [2] + input_ids
    input_ids[len(sent) + 1] = 3
    attention_mask = [1] + attention_mask
    attention_mask[len(sent) + 1] = 1

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': token_type_ids
    }

In [10]:
# 예측 함수 정의
def predict_sentence(model, tokenizer, sentence, tag_to_id, id_to_tag, max_len, device):
    model.eval()

    # 음절 단위로 토큰화
    tokenized = ner_tokenizer(sentence, max_len)
    tokens = tokenized['input_ids']
    attention_mask = tokenized['attention_mask']
    token_type_ids = tokenized['token_type_ids']

    # 데이터를 텐서로 변환
    input_ids = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(device)
    attention_mask = torch.tensor(attention_mask, dtype=torch.long).unsqueeze(0).to(device)
    token_type_ids = torch.tensor(token_type_ids, dtype=torch.long).unsqueeze(0).to(device)

    # 모델 예측
    with torch.no_grad():
        outputs = model(input_ids, attention_mask, token_type_ids)
        predictions = torch.argmax(outputs, dim=2).cpu().numpy().flatten()

    # 예측 결과 디코딩
    decoded_predictions = [id_to_tag[pred] for pred in predictions]

    return decoded_predictions

In [11]:
# 개체명 추출 함수 정의
def extract_entities(sentence, tags):
    entities = []
    entity = ""
    current_tag = None

    for char, tag in zip(sentence, tags):
        if tag.startswith("B-"):
            if entity:
                entities.append(entity)
            entity = char
            current_tag = tag[2:]
        elif tag.startswith("I-") and current_tag == tag[2:]:
            entity += char
        else:
            if entity:
                entities.append(entity)
                entity = ""
            current_tag = None

    if entity:
        entities.append(entity)

    return entities

In [12]:
# 테스트할 문장
input_sentence = "삼성전자는 한국의 대표적인 전자 기업입니다."

# 개체명 인식 수행
predicted_tags = predict_sentence(model, tokenizer, input_sentence, tag_to_id, id_to_tag, max_len, device)

# 예측 결과에서 개체명 추출
entities = extract_entities(input_sentence, predicted_tags)

# 예측 결과 출력
print(f"Input Sentence: {input_sentence}")
print(f"Predicted Tags: {predicted_tags}")
print(f"Predicted Entities: {entities}")

Input Sentence: 삼성전자는 한국의 대표적인 전자 기업입니다.
Predicted Tags: ['O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted Entities: ['성전자']


In [17]:
# 테스트할 문장
input_sentence = "LG전자는 한국의 대표적인 전자 기업입니다."

# 개체명 인식 수행
predicted_tags = predict_sentence(model, tokenizer, input_sentence, tag_to_id, id_to_tag, max_len, device)

# 예측 결과에서 개체명 추출
entities = extract_entities(input_sentence, predicted_tags)

# 예측 결과 출력
print(f"Input Sentence: {input_sentence}")
print(f"Predicted Tags: {predicted_tags}")
print(f"Predicted Entities: {entities}")

Input Sentence: LG전자는 한국의 대표적인 전자 기업입니다.
Predicted Tags: ['O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted Entities: ['G전자']
