In [1]:
import json
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification
from sklearn.metrics import classification_report, confusion_matrix
from torch.nn.utils.rnn import pad_sequence

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# 토크나이저와 모델
model = AutoModelForTokenClassification.from_pretrained('./koelectra_v3_ner_model5')
tokenizer = AutoTokenizer.from_pretrained('./koelectra_v3_ner_model5')
file_path = './data/test/test_data+text.json'

In [3]:
# data 불러오기
def load_data(file_path):
    """JSON 파일에서 NER 데이터 로드"""
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
        

In [4]:
# KoNERDataset전용 클래스 생성
class KoNERDataset(Dataset):
    
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

In [5]:
def custom_collate_fn(batch):
    """
    Custom collate function to handle variable-length sequences
    """
    # Sort the batch in the descending order of sequence lengths
    batch.sort(key=lambda x: len(x['input_ids']), reverse=True)
    
    # Separate the input components
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    labels = [item['labels'] for item in batch]

    # Pad sequences
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_mask_padded = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-100)

    return {
        'input_ids': input_ids_padded,
        'attention_mask': attention_mask_padded,
        'labels': labels_padded
    }

In [6]:
def prepare_koelectra_tagged_data(data, tokenizer, max_length=64):
    """KoElectra용 데이터 준비"""
    text = [d['text'] for d in data]
    tokens = [d['tokens'] for d in data]
    labels = [d['ner_tags'] for d in data]
    
    # Truncate or pad tokens and labels to max_length
    tokens = [tokens[:max_length] for tokens in tokens]
    labels = [labels[:max_length] for labels in labels]

    # KoElectra 토크나이저로 인코딩
    encodings = tokenizer(
        tokens, 
        is_split_into_words=True, 
        padding=True, 
        truncation=True, 
        max_length=max_length,
        return_tensors='pt'
    )

    return text, encodings, labels

In [9]:
def test_koelectra_ner_model(model, test_dataset, tokenizer, test_data):
    """모델 테스트 및 상세 성능 평가"""
    model.to(device)
    model.eval()

    all_preds = []
    all_labels = []
    error_cases = []

    # NER 태그 매핑
    id_to_label = {
        0: 'O',
        1: 'B-store',
        2: 'I-store',
        3: 'B-menu',
        4: 'I-menu',
        5: 'B-ea',
        6: 'I-ea'
    }

    with torch.no_grad():
        for batch_idx, batch in enumerate(test_dataset):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels']

            outputs = model(
                input_ids,
                attention_mask=attention_mask
            )

            predictions = torch.argmax(outputs.logits, dim=-1)

            # 예측과 레이블 수집
            for sample_idx, (pred, label, input_id) in enumerate(zip(predictions, labels, input_ids)):
                # 실제 시퀀스 길이 찾기 (패딩 제거)
                seq_length = torch.sum(input_id != 0).item()
                
                # 마스크 생성 및 적용
                mask = (label[:seq_length] != -100)
                
                batch_preds = pred[:seq_length][mask.squeeze()].cpu().numpy()
                batch_labels = label[:seq_length][mask.squeeze()].numpy()
                
                all_preds.extend(batch_preds)
                all_labels.extend(batch_labels)

                print(f"pred shape: {pred.shape}")
                print(f"seq_length: {seq_length}")
                print(f"mask shape: {mask.shape}")
                print(f"input_id shape: {input_id.shape}")


                # 잘못 예측된 케이스 추적
                for true_label, pred_label, token_id in zip(batch_labels, batch_preds, input_id[:seq_length][mask]):
                    if true_label != pred_label:
                        # 원본 텍스트와 토큰 복원
                        token = tokenizer.convert_ids_to_tokens(token_id.item())
                        
                        # 배치 인덱스와 샘플 인덱스를 고려한 원본 텍스트 찾기
                        data_idx = batch_idx * test_dataset.batch_size + sample_idx
                        original_text = test_data[data_idx]['text'] if data_idx < len(test_data) else "텍스트 없음"
                        
                        error_cases.append({
                            'text': original_text,
                            'token': token,
                            'true_label': id_to_label.get(true_label, 'Unknown'),
                            'pred_label': id_to_label.get(pred_label, 'Unknown')
                        })
    

In [10]:
def main_test():
    # 테스트 데이터 로드
    file_path = './data/test/test_data+text.json'
    test_data = load_data(file_path)
    print(f"Total test data: {len(test_data)}")

    # 모델과 토크나이저 로드
    model = AutoModelForTokenClassification.from_pretrained('./koelectra_v3_ner_model5')
    tokenizer = AutoTokenizer.from_pretrained('./koelectra_v3_ner_model5')

    # 테스트 데이터 준비 및 데이터셋 생성
    test_text, test_encodings, test_labels = prepare_koelectra_tagged_data(test_data, tokenizer)

    # 테스트 데이터셋 생성
    test_dataset = DataLoader(
        KoNERDataset(test_encodings, test_labels),
        batch_size=8,
        shuffle=False,
        collate_fn=custom_collate_fn
    )

    # 모델 테스트 및 에러 케이스 저장
    error_cases = test_koelectra_ner_model(model, test_dataset, tokenizer, test_data)

    # 선택적: 에러 케이스를 JSON으로 저장
    with open('ner_error_cases.json', 'w', encoding='utf-8') as f:
        json.dump(error_cases, f, ensure_ascii=False, indent=2)

if __name__ == '__main__':
    main_test()

Total test data: 79


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


pred shape: torch.Size([64])
seq_length: 37
mask shape: torch.Size([37])
input_id shape: torch.Size([64])
pred shape: torch.Size([64])
seq_length: 36
mask shape: torch.Size([36])
input_id shape: torch.Size([64])
pred shape: torch.Size([64])
seq_length: 37
mask shape: torch.Size([37])
input_id shape: torch.Size([64])
pred shape: torch.Size([64])
seq_length: 30
mask shape: torch.Size([30])
input_id shape: torch.Size([64])
pred shape: torch.Size([64])
seq_length: 36
mask shape: torch.Size([36])
input_id shape: torch.Size([64])
pred shape: torch.Size([64])
seq_length: 39
mask shape: torch.Size([39])
input_id shape: torch.Size([64])
pred shape: torch.Size([64])
seq_length: 34
mask shape: torch.Size([34])
input_id shape: torch.Size([64])


IndexError: The shape of the mask [49] at index 0 does not match the shape of the indexed tensor [51] at index 0

In [None]:
def main_test():
    # 테스트 데이터 로드
    file_path = './data/test/test_data+text.json'
    test_data = load_data(file_path)
    print(f"Total test data: {len(test_data)}")

    # 모델과 토크나이저 로드
    model = AutoModelForTokenClassification.from_pretrained('./koelectra_v3_ner_model5')
    tokenizer = AutoTokenizer.from_pretrained('./koelectra_v3_ner_model5')

    # 테스트 데이터 준비 및 데이터셋 생성
    test_text, test_encodings, test_labels = prepare_koelectra_tagged_data(test_data, tokenizer)

    # 테스트 데이터셋 생성
    test_dataset = DataLoader(
        KoNERDataset(test_encodings, test_labels),
        batch_size=8,
        shuffle=False,
        collate_fn=custom_collate_fn
    )

    # 모델 테스트 및 에러 케이스 저장
    error_cases = test_koelectra_ner_model(model, test_dataset, tokenizer, test_data)

    # 선택적: 에러 케이스를 JSON으로 저장
    with open('ner_error_cases.json', 'w', encoding='utf-8') as f:
        json.dump(error_cases, f, ensure_ascii=False, indent=2)

if __name__ == '__main__':
    main_test()