In [3]:
import multiprocessing
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import wandb
import datetime
import yaml
from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.loggers import WandbLogger
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizerFast, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

In [4]:
# Request the 'high' float32 matmul precision setting on PyTorch 2.0 or newer.
if torch.__version__ >= '2.0':
    torch.set_float32_matmul_precision('high')

# Seed value handed to set_seed() further down in this cell.
num_seed = 42

# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
def set_seed(seed: int):
    """Seed every random number generator used in this notebook.

    Sets PYTHONHASHSEED, seeds Python's `random`, NumPy and PyTorch (CPU and,
    when available, CUDA), switches cuDNN to deterministic mode with
    benchmarking disabled, and finally calls Lightning's `seed_everything`.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed, same order as before: Python, then NumPy, then PyTorch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.benchmark = False

    seed_everything(seed, workers=True)


set_seed(num_seed)
# Read the training settings from a YAML file
def load_config(yaml_file):
    """Load the hyperparameters from `yaml_file`.

    Args:
        yaml_file: Path of a YAML file containing the keys `lr`,
            `batch_size`, `max_epochs`, `max_len` and `num_classes`.

    Returns:
        A 5-tuple `(lr, batch_size, max_epochs, max_len, num_classes)` where
        `lr` has been converted with `float()` and the remaining values are
        returned exactly as parsed.

    Raises:
        KeyError: If one of the expected keys is absent.
    """
    with open(yaml_file, 'r') as handle:
        settings = yaml.safe_load(handle)

    wanted = ('lr', 'batch_size', 'max_epochs', 'max_len', 'num_classes')
    raw_lr, *others = (settings[key] for key in wanted)
    return (float(raw_lr), *others)


lr, batch_size, max_epochs, max_len, num_classes = load_config('config.yaml')

Seed set to 42


In [5]:
class BertLightningModel(LightningModule):
    """BERT encoder followed by a linear classification head.

    The head is applied to the hidden state at sequence position 0 (the
    `cls_output` in `forward`). Training minimises cross-entropy; validation
    and test epochs report accuracy and macro-F1 computed with scikit-learn
    over every prediction collected during the epoch.

    Args:
        bert_pretrained: Name or path passed to `BertModel.from_pretrained`.
        num_labels: Number of output classes. The default is the module-level
            `num_classes` value at the time this class is defined.
        lr: Learning rate for the optimizer. The default is the module-level
            `lr` value at the time this class is defined.
        total_steps: Stored on the instance and in the hyperparameters; the
            scheduler itself uses `trainer.estimated_stepping_batches`.
    """

    def __init__(self, bert_pretrained, num_labels=num_classes, lr=lr, total_steps=None):
        super().__init__()
        self.save_hyperparameters()
        self.bert = BertModel.from_pretrained(bert_pretrained)
        # Size the head from the encoder's own config rather than assuming a
        # 768-wide hidden state, so other BERT variants work unchanged.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.loss_fn = nn.CrossEntropyLoss()
        self.total_steps = total_steps
        # Per-epoch buffers; they are reset again in the on_*_epoch_start hooks.
        self.val_predictions = []
        self.val_targets = []
        self.val_losses = []
        self.test_predictions = []
        self.test_targets = []

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the class logits for a batch of tokenized inputs."""
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        cls_output = output.last_hidden_state[:, 0, :]
        logits = self.fc(cls_output)
        return logits

    def _gathered_scores(self, pred_batches, target_batches):
        """Return `(accuracy, macro_f1)` over the batches collected this epoch.

        When running on more than one process the per-rank tensors are
        gathered first so every rank scores the same, complete set.
        """
        preds = torch.cat(pred_batches)
        targets = torch.cat(target_batches)
        if self.trainer.world_size > 1:
            # all_gather stacks the per-rank tensors along a new leading
            # dimension; flatten back to 1-D so scikit-learn sees one label
            # per sample instead of a 2-D multi-output array.
            preds = self.all_gather(preds).reshape(-1)
            targets = self.all_gather(targets).reshape(-1)
        preds = preds.cpu().numpy()
        targets = targets.cpu().numpy()
        acc = accuracy_score(targets, preds)
        f1 = f1_score(targets, preds, average='macro')
        return acc, f1

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one training batch."""
        inputs, labels = batch
        outputs = self(**inputs)
        loss = self.loss_fn(outputs, labels)
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Collect predictions, targets and loss for one validation batch."""
        inputs, labels = batch
        outputs = self(**inputs)
        loss = self.loss_fn(outputs, labels)
        preds = outputs.argmax(dim=1)
        self.val_predictions.append(preds)
        self.val_targets.append(labels)
        self.val_losses.append(loss.detach())

    def on_validation_epoch_start(self):
        """Reset the validation buffers."""
        self.val_predictions = []
        self.val_targets = []
        self.val_losses = []

    def on_validation_epoch_end(self):
        """Log `val_loss`, `val_acc` and `val_f1` for the finished epoch."""
        loss = torch.stack(self.val_losses).mean()
        acc, f1 = self._gathered_scores(self.val_predictions, self.val_targets)
        # sync_dist averages the loss across processes so that callbacks
        # monitoring 'val_loss' see the same value on every rank.
        self.log('val_loss', loss, prog_bar=True, sync_dist=True)
        self.log('val_acc', acc, prog_bar=True)
        self.log('val_f1', f1, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Collect predictions and targets for one test batch."""
        inputs, labels = batch
        outputs = self(**inputs)
        preds = outputs.argmax(dim=1)
        self.test_predictions.append(preds)
        self.test_targets.append(labels)

    def on_test_epoch_start(self):
        """Reset the test buffers."""
        self.test_predictions = []
        self.test_targets = []

    def on_test_epoch_end(self):
        """Log `test_acc` and `test_f1` for the finished test run."""
        acc, f1 = self._gathered_scores(self.test_predictions, self.test_targets)
        self.log('test_acc', acc)
        self.log('test_f1', f1)

    def configure_optimizers(self):
        """Build AdamW plus a linear-decay schedule stepped every batch.

        The schedule uses zero warmup steps and spans
        `trainer.estimated_stepping_batches` optimizer steps.
        """
        optimizer = AdamW(self.parameters(), lr=self.hparams.lr, correct_bias=False, no_deprecation_warning=True)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=self.trainer.estimated_stepping_batches
        )
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

In [6]:
# Pretrained checkpoint name; passed to `from_pretrained` when the tokenizer is loaded.
CHECKPOINT_NAME = "kykim/bert-kor-base"

In [9]:
import os
import glob
import pandas as pd
import torch
from transformers import BertTokenizerFast
from pytorch_lightning import LightningModule

# Maps the classifier's integer class index to its label string.
label_mapping = {0: 'AS', 1: 'Q', 2: 'CL', 3: 'CP', 4: 'B'}

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model checkpoint path and tokenizer name
best_model_path = '/home/son/ml/nlp_classification/Project_nlp_classification/best_class5/final_model.ckpt'
CHECKPOINT_NAME = 'kykim/bert-kor-base'

# Load the model and the tokenizer.
# NOTE(security): loading this checkpoint goes through `torch.load` with
# `weights_only=False` (see the warning this cell emits), which unpickles
# arbitrary objects. Only point `best_model_path` at checkpoints you trust.
bert_model = BertLightningModel.load_from_checkpoint(best_model_path)
bert_model.eval()
bert_model.to(device)

tokenizer = BertTokenizerFast.from_pretrained(CHECKPOINT_NAME)

# Input folder and the workbooks to annotate; sorted so the processing order
# is the same on every run (glob makes no ordering promise).
folder_path = '/home/son/ml/nlp_classification/infer_datasets'
file_list = sorted(glob.glob(os.path.join(folder_path, '*.xlsx')))

# Maximum sequence length and batch size
max_len = 128
batch_size = 16


def predict_labels(texts, model, text_tokenizer, max_length, chunk_size, target_device):
    """Classify `texts` and return one label string per input.

    Args:
        texts: List of strings to classify. An empty list yields an empty list.
        model: Callable taking `input_ids`, `attention_mask` and
            `token_type_ids` and returning per-class logits.
        text_tokenizer: Tokenizer used to encode each chunk.
        max_length: Sequences are truncated and padded to this many tokens.
        chunk_size: Number of texts encoded and scored per forward pass.
        target_device: Device the encoded tensors are moved to.

    Returns:
        List of label strings looked up in `label_mapping`, in input order.
    """
    labels = []
    with torch.no_grad():
        for start in range(0, len(texts), chunk_size):
            # Tokenize one chunk at a time so only the current batch lives on
            # the device instead of the whole workbook.
            encoded = text_tokenizer(
                texts[start:start + chunk_size],
                return_tensors='pt',
                truncation=True,
                padding='max_length',
                max_length=max_length,
                add_special_tokens=True
            )
            logits = model(
                input_ids=encoded['input_ids'].to(target_device),
                attention_mask=encoded['attention_mask'].to(target_device),
                token_type_ids=encoded['token_type_ids'].to(target_device)
            )
            # Softmax is monotonic, so the argmax of the logits is already
            # the predicted class.
            class_ids = logits.argmax(dim=1).tolist()
            labels.extend(label_mapping[class_id] for class_id in class_ids)
    return labels


for file_path in file_list:
    df = pd.read_excel(file_path)

    if 'Text' not in df.columns:
        print(f"'Text' 컬럼이 {file_path}에 없습니다.")
        continue

    # Empty cells are read as NaN and would crash the tokenizer; score only
    # the rows that hold a value, coerced to str so numeric cells also work.
    has_text = df['Text'].notna()
    texts = df.loc[has_text, 'Text'].astype(str).tolist()

    # Convert the numeric classes to label strings
    predicted_labels = predict_labels(texts, bert_model, tokenizer, max_len, batch_size, device)

    # Index alignment leaves 'Prediction' empty on rows that had no text.
    df['Prediction'] = pd.Series(predicted_labels, index=df.index[has_text], dtype=object)
    # NOTE: this overwrites the input workbook in place.
    df.to_excel(file_path, index=False)

    print(f"{file_path} 파일에 예측 결과를 저장했습니다.")


/home/son/anaconda3/envs/nlp/lib/python3.10/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


/home/son/ml/nlp_classification/infer_datasets/A_W1_R.xlsx 파일에 예측 결과를 저장했습니다.
/home/son/ml/nlp_classification/infer_datasets/E_W1_R.xlsx 파일에 예측 결과를 저장했습니다.
/home/son/ml/nlp_classification/infer_datasets/C_W1_R.xlsx 파일에 예측 결과를 저장했습니다.
/home/son/ml/nlp_classification/infer_datasets/F_W1_R.xlsx 파일에 예측 결과를 저장했습니다.
/home/son/ml/nlp_classification/infer_datasets/H_W1_R.xlsx 파일에 예측 결과를 저장했습니다.
/home/son/ml/nlp_classification/infer_datasets/B_W1_R.xlsx 파일에 예측 결과를 저장했습니다.
/home/son/ml/nlp_classification/infer_datasets/I_W1_R.xlsx 파일에 예측 결과를 저장했습니다.
/home/son/ml/nlp_classification/infer_datasets/J_W1_R.xlsx 파일에 예측 결과를 저장했습니다.
/home/son/ml/nlp_classification/infer_datasets/G_W1_R.xlsx 파일에 예측 결과를 저장했습니다.
/home/son/ml/nlp_classification/infer_datasets/D_W1_R.xlsx 파일에 예측 결과를 저장했습니다.
