In [None]:
%pip install ipywidgets scikit-learn torch matplotlib seaborn

In [None]:
from utils import data_load

In [None]:
# Data settings: where the prepared inputs live and how they are loaded.
data_base_dir = "./model_data/"
bert_file = "bert_inputs.txt"
# Length limit handed to data_load; this is a separate constant from MAX_LEN used by the tokenizer.
MAXLEN = 512
# NOTE(review): the keyword is spelled `bert_flie`; presumably it matches the parameter name
# declared in utils.data_load -- confirm before "fixing" the spelling here.
# Later cells index the result as [0] texts, [1] labels, [2] sample ids.
nlp_data_label = data_load(base_dir=data_base_dir, bert_flie=bert_file, MAXLEN=MAXLEN)
# Cell output: number of loaded texts.
len(nlp_data_label[0])

In [None]:
# bert Setting
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
import torch
import transformers
# Token budget per sample: CustomDataset passes this to the tokenizer as `max_length`
# (with padding='max_length' and truncation=True). Not the same constant as MAXLEN above.
MAX_LEN = 500
class BERTClass(torch.nn.Module):
    """Multilingual BERT encoder followed by dropout and a linear classification head.

    Args:
        output_numbers: Number of logits produced by the head.
        device: Stored on the instance as ``self.device``. The constructor does not
            move the module; callers are expected to call ``.to(device)`` themselves.

    Attributes:
        l1: Pretrained ``bert-base-multilingual-cased`` encoder.
        config: The encoder's configuration object.
        l2: Dropout (p=0.3) applied to the pooled encoder output.
        l3: Linear layer mapping the encoder hidden size to ``output_numbers`` logits.
    """

    def __init__(self, output_numbers=13,
                 device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
        super().__init__()
        # NOTE(review): problem_type / num_labels are kept exactly as before; they presumably
        # only end up on the config because the bare BertModel has no classifier -- confirm.
        self.l1 = transformers.BertModel.from_pretrained('bert-base-multilingual-cased',
                                                         problem_type='multi_label_classification',
                                                         num_labels=output_numbers, output_attentions=True)
        self.config = self.l1.config
        self.l2 = torch.nn.Dropout(0.3)
        # Read the width from the encoder config instead of hard-coding 768, so the head
        # always matches whatever checkpoint was loaded (768 for bert-base).
        self.l3 = torch.nn.Linear(self.config.hidden_size, output_numbers)
        self.device = device

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return raw (unnormalised) logits for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            token_type_ids: Segment ids forwarded to the encoder.

        Returns:
            The output of the linear head applied to the dropped-out pooled encoder output.
        """
        encoder_outputs = self.l1(input_ids, attention_mask=attention_mask,
                                  token_type_ids=token_type_ids, return_dict=False)
        # Pick the pooled output by position. Unpacking into a fixed number of names breaks
        # as soon as the encoder returns more or fewer extras (e.g. attentions on/off).
        pooled_output = encoder_outputs[1]
        return self.l3(self.l2(pooled_output))


class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        indexlist: Per-sample identifiers, returned under the ``'pid'`` key.
        dataframe: Indexable collection of texts.
        real_label: Indexable collection of targets aligned with ``dataframe``.
        tokenizer: Object exposing ``encode_plus``.
        max_len: Character-level cut applied to each text before tokenizing;
            ``-1`` disables the cut.
    """

    def __init__(self, indexlist, dataframe, real_label, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.indexlist = indexlist
        self.comment_text = dataframe
        self.targets = real_label
        # A None slice bound keeps the whole string, which is how -1 means "no cut".
        self.max_len = None if max_len == -1 else max_len
        # Token-level length comes from the module-level MAX_LEN, not from `max_len`.
        self.MAX_LEN = MAX_LEN

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        """Tokenize sample ``index`` and return its tensors together with id and target."""
        text = str(self.comment_text[index])[:self.max_len]

        encoded = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.MAX_LEN,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )

        item = {'pid': self.indexlist[index]}
        for field in ('input_ids', 'attention_mask', 'token_type_ids'):
            item[field] = torch.tensor(encoded[field], dtype=torch.long)
        item['targets'] = torch.tensor(self.targets[index], dtype=torch.float)
        return item

    def get_decode_item(self, index):
        """Return the stored text for ``index`` as a string, without any truncation."""
        return str(self.comment_text[index])


def setting_bert_model(device, bert_model_path=None, output_numbers=13):
    """Build a BERTClass on ``device`` and optionally load saved weights into it.

    Args:
        device: Target torch device. It is passed to the constructor (so ``model.device``
            matches) and used to move the module.
        bert_model_path: Path to a saved ``state_dict``. ``None`` keeps the pretrained
            encoder with a freshly initialised head.
        output_numbers: Number of logits of the classification head. It must match the
            checkpoint when one is loaded, otherwise ``load_state_dict`` raises.

    Returns:
        The model, placed on ``device``.
    """
    # Both paths build the model the same way. Previously the checkpoint path omitted
    # `device=`, leaving `model.device` at the constructor default.
    model = BERTClass(output_numbers=output_numbers, device=device).to(device)
    if bert_model_path is not None:
        # map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
        state_dict = torch.load(f'{bert_model_path}', map_location=device)
        model.load_state_dict(state_dict)
    return model

In [None]:
from tqdm.notebook import tqdm
import torch.nn.functional as F

def validation_bert(model, x, device):
    """Run ``model`` over the loader ``x`` without gradients and collect per-sample results.

    Args:
        model: Callable as ``model(input_ids, attention_mask, token_type_ids)`` returning logits.
        x: Iterable of batches with keys ``input_ids``, ``attention_mask``,
            ``token_type_ids``, ``targets`` and ``pid``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Four lists in this order: predicted class indices, targets (cast to int64),
        the highest softmax probability per sample, and the sample ids.
    """

    def as_list(tensor):
        # Bring a tensor back to host memory as a plain Python list.
        return tensor.cpu().detach().numpy().tolist()

    model.eval()
    predicted, expected, confidences, sample_ids = [], [], [], []

    with torch.no_grad():
        for batch in tqdm(x, total=len(x)):
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
            segment_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.int64)

            logits = model(input_ids, attention_mask, segment_ids)
            # Confidence is the top softmax probability; the label is the argmax of the logits.
            top_probability = torch.max(F.softmax(logits, dim=-1), dim=-1).values
            top_class = logits.argmax(dim=1)

            expected.extend(as_list(targets))
            predicted.extend(as_list(top_class))
            sample_ids.extend(as_list(batch['pid']))
            confidences.extend(as_list(top_probability))

    return predicted, expected, confidences, sample_ids

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, auc, roc_auc_score
import pandas as pd
def bert_validation(device, datas, validation_count=100, max_lan=MAX_LEN, bert_model_file=None, resume=False,
                    output_numbers=12):
    """Validate a BERT classifier on the first ``validation_count`` samples.

    Args:
        device: Torch device used for the model and the batches.
        datas: Indexable with texts at ``[0]``, labels at ``[1]`` and sample ids at ``[2]``.
        validation_count: How many leading samples to evaluate. ``None`` or any negative
            value evaluates everything.
        max_lan: Character-level cut forwarded to ``CustomDataset`` (``-1`` disables it).
        bert_model_file: Path to a saved ``state_dict``; ``None`` uses a freshly built model.
        resume: Accepted for backward compatibility; not used by this function.
        output_numbers: Size of the classification head. Defaults to 12, the value that
            used to be hard-coded in the body.

    Returns:
        A DataFrame with columns ``pid``, ``target``, ``output`` and ``preds``
        (``preds`` holds the top softmax probability).
    """
    # A negative count must not reach the slice: `[:-1]` would silently drop the last
    # sample instead of selecting everything.
    if validation_count is None or validation_count < 0:
        limit = None
    else:
        limit = validation_count

    nlp_data = datas[0][:limit]
    labels = datas[1][:limit]
    nlp_index = datas[2][:limit]
    print("output_numbers:", output_numbers)

    cur_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    validation_set = CustomDataset(nlp_index, nlp_data, labels, cur_tokenizer, max_lan)
    # No shuffling for evaluation: the metrics do not depend on order, and a fixed order
    # keeps the returned rows aligned with the input data across runs.
    params = {
        'batch_size': 32,
        'shuffle': False,
        'num_workers': 0
    }
    validation_loader = DataLoader(validation_set, **params)

    bert_model = setting_bert_model(device, bert_model_file, output_numbers=output_numbers)
    outputs, targets, preds, idxs = validation_bert(bert_model, validation_loader, device=device)

    validation_acc = accuracy_score(targets, outputs)
    validation_f1 = f1_score(targets, outputs, average='weighted')
    # Built row-wise and transposed, so every column comes out with a common float dtype.
    df = pd.DataFrame([idxs, targets, outputs, preds], index = ['pid', 'target', 'output', 'preds']).T
    print("accuracy: ", validation_acc, "f1: ", validation_f1)
    return df

In [None]:
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
datas = nlp_data_label
bert_model_file = "model/bert_classification.pth"
# Number of leading samples to validate.
# NOTE(review): the original note says -1 selects everything -- confirm bert_validation
# treats negatives that way, because a plain `[:-1]` slice drops the last sample.
validation_count = 150
result_df = bert_validation(
    device,
    datas,
    validation_count=validation_count,
    max_lan=-1,  # -1 switches off CustomDataset's character-level cut
    bert_model_file=bert_model_file,
)

In [None]:
# Render the validation table returned by bert_validation (pid, target, output, preds).
display(result_df)

### 💡 코멘트
- 모델 실행 결과를 담은 `result_df`는 기본적으로 float 형식으로 출력됨.
- `pid`, `target`, `output`은 정수형(label, id)이므로 `.astype(int)`로 변환하여 가독성을 높임.
- 변환 후 CSV 저장 시 분석/리포트에 활용하기 편리함.

In [None]:
# pid / target / output are ids and class labels, so store them as integers for readability.
result_df[["pid", "target", "output"]] = result_df[["pid", "target", "output"]].astype(int)
# Cell output: number of validated samples per true label.
result_df['target'].value_counts()

### 📊 클래스별 성능 지표 산출

- `precision_recall_fscore_support(average=None)`로 **각 클래스별** Precision / Recall / F1 / Support를 계산.
- `confusion_matrix`로부터 각 클래스의 **TP/FP/FN/TN**을 구해,  
  **클래스별 Accuracy = (TP + TN) / N** 를 계산해 비교.
- `prevalence`는 전체 표본에서 해당 클래스가 차지하는 비율로, **불균형 데이터** 진단에 유용.

**컬럼 해석**
- `class`: 클래스 라벨
- `support`: 해당 클래스의 실제 표본 수
- `precision`: 그 클래스로 예측한 것 중 맞은 비율
- `recall`: 실제 그 클래스인 것 중 맞춘 비율 (민감도)
- `f1`: precision/recall의 조화 평균
- `accuracy_classwise`: 정의상 (TP+TN)/N (참고: 멀티클래스에서는 다른 클래스끼리의 예측이 모두 TN으로 집계되어 값이 높게 나오기 쉬우므로, 보통 클래스별 평가는 `recall` 중심으로 해석)
- `tp/fp/fn/tn`: 혼동행렬 기반 카운트
  - `tp` (True Positive): 실제 클래스가 해당 클래스이고, 모델도 해당 클래스로 올바르게 예측한 개수
  - `fp` (False Positive): 실제는 다른 클래스인데, 모델이 해당 클래스로 잘못 예측한 개수
  - `fn` (False Negative): 실제는 해당 클래스인데, 모델이 다른 클래스로 잘못 예측한 개수
  - `tn` (True Negative): 실제도 다른 클래스이고, 모델도 다른 클래스로 예측한 개수
- `prevalence`: 데이터 내 클래스 비율

> TIP: 클래스별 성능을 볼 때는 **recall**(놓치지 않는가)과 **precision**(헛집지 않는가)을 함께 보며,  
> `support`가 매우 작은 클래스는 분산이 크므로 주의해서 해석하세요.


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# 1) Label arrays
y_true = result_df["target"].astype(int).to_numpy()
y_pred = result_df["output"].astype(int).to_numpy()
# Use every class that shows up in the targets OR the predictions. With target-only labels,
# confusion_matrix drops any sample predicted as an unseen class, which shrinks N and
# undercounts FN, so the counts below would disagree with the precision/recall values.
labels = np.union1d(y_true, y_pred).tolist()

# 2) Per-class precision / recall / F1 / support
#    (a class that only appears in the predictions gets support 0 and recall 0)
prec, rec, f1, support = precision_recall_fscore_support(
    y_true, y_pred, labels=labels, average=None, zero_division=0
)

# 3) Confusion matrix and per-class accuracy (TP + TN) / N
#    cm[i, j] counts samples whose true label is labels[i] and predicted label is labels[j]:
#    rows are the truth, columns are the prediction.
cm = confusion_matrix(y_true, y_pred, labels=labels)
print("cm: \n", cm)
N = cm.sum()

# One-vs-rest counts for every class at once.
tp = np.diag(cm)              # predicted as the class and truly the class
fn = cm.sum(axis=1) - tp      # rest of the true-class row
fp = cm.sum(axis=0) - tp      # rest of the predicted-class column
tn = N - tp - fn - fp         # everything that involves neither
per_class_acc = ((tp + tn) / N).tolist()
tp_list, fp_list, fn_list, tn_list = (
    counts.astype(int).tolist() for counts in (tp, fp, fn, tn)
)

# 4) Collect everything in one DataFrame
per_class_df = pd.DataFrame({
    "class": labels,
    "support": support,
    "precision": prec,
    "recall": rec,
    "f1": f1,
    "accuracy_classwise": per_class_acc,
    "tp": tp_list,
    "fp": fp_list,
    "fn": fn_list,
    "tn": tn_list,
    "prevalence": np.array(support) / N,
}).sort_values("class").reset_index(drop=True)

# 5) Round the ratio columns and show the table in reading order.
#    show_cols only affects the display; per_class_df keeps its column order for export.
show_cols = ["class","support","prevalence","precision","recall","f1","accuracy_classwise","tp","fp","fn","tn"]
ratio_cols = ["precision","recall","f1","accuracy_classwise","prevalence"]
per_class_df[ratio_cols] = per_class_df[ratio_cols].round(4)
display(per_class_df[show_cols])


### 📊 Confusion Matrix (원본 + 정규화)

- Confusion Matrix는 실제 라벨과 예측 라벨 간의 분포를 표로 표현.
- 정규화(%)를 적용하면 클래스별 비율을 직관적으로 비교 가능.
- 이후 시각화(heatmap)로 확장하기 전에, 우선 **DataFrame**으로 확인.

**출력 예시**
- `cm_raw_df` : 원본 개수 단위 Confusion Matrix
- `cm_norm_df` : 행 단위 정규화(%) Confusion Matrix


In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix

# True and predicted labels as integer Series
y_true = result_df["target"].astype(int)
y_pred = result_df["output"].astype(int)

# Index the matrix by every class seen in either column. With target-only labels,
# confusion_matrix silently drops samples predicted as an unseen class, and the
# row-normalised percentages then look better than they are.
labels = sorted(set(y_true.tolist()) | set(y_pred.tolist()))

# Rows are the true label, columns the predicted label
cm = confusion_matrix(y_true, y_pred, labels=labels)

# Raw counts
cm_raw_df = pd.DataFrame(cm, index=labels, columns=labels)
cm_raw_df.index.name = "True"
cm_raw_df.columns.name = "Predicted"

# Row-normalised percentages. A class that is only ever predicted has an all-zero row;
# clipping its total to 1 turns 0/0 into 0 instead of NaN plus a runtime warning.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = cm.astype("float") / row_totals.clip(min=1) * 100
cm_norm_df = pd.DataFrame(cm_norm.round(2), index=labels, columns=labels)
cm_norm_df.index.name = "True"
cm_norm_df.columns.name = "Predicted"

print("Confusion Matrix (Counts)")
display(cm_raw_df)

print("\nConfusion Matrix (Row-normalized %)")
display(cm_norm_df)


In [None]:
# Per-sample results and per-class metrics are flat tables; their positional index carries
# no information, so it is left out of the files.
result_df.to_csv(f'validation_result_{validation_count}.csv', index=False)
per_class_df.to_csv(f'validation_result_{validation_count}_all.csv', index=False)
# The confusion matrix stores its true-label names in the index. Writing it with
# index=False would discard them and leave the rows unlabeled.
cm_norm_df.to_csv(f'validation_result_cm_{validation_count}.csv', index=True)

### Training
``` python

# Wrap the training texts and labels in the same dataset class used for validation.
# NOTE(review): `train_dataset` is passed both as the id list and as the texts, so each
# item's 'pid' is the text itself -- confirm this is intended.
training_set = CustomDataset(train_dataset, train_dataset, train_labels, cur_tokenizer, max_len)
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0,
}
# The epoch loop iterates `training_loader`, which this snippet never created.
training_loader = DataLoader(training_set, **train_params)

def train(model, x, epoch=0, optimizer=None, device=None, loss_fn=None):
    """Run one training epoch over the loader ``x`` and report loss plus weighted metrics.

    Args:
        model: Callable as ``model(input_ids, attention_mask, token_type_ids)`` returning logits.
        x: Iterable of batches with keys ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``targets``.
        epoch: Epoch number, used only in the progress-bar label.
        optimizer: Optimizer stepped once per batch (required despite the ``None`` default).
        device: Device the batch tensors are moved to.
        loss_fn: Called as ``loss_fn(logits, targets)`` with int64 targets.

    Returns:
        ``(mean batch loss, accuracy, weighted precision, weighted recall, weighted F1,
        confusion matrix)`` computed over all samples seen in the epoch.
    """
    # precision_score / recall_score are not imported at file level, so bring them in here.
    from sklearn.metrics import precision_score, recall_score

    model.train()
    curr_loss = 0
    preds = []
    labels = []
    for data in tqdm(x, total=len(x), desc=f'Epoch {epoch}', leave=False):
        ids = data['input_ids'].to(device, dtype=torch.int64)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.int64)

        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        curr_loss += loss.item()
        # The running `correct` / `counts` tallies were never read, so they are gone;
        # accuracy is computed once from the collected labels below.
        preds.extend(outputs.argmax(dim=1).cpu().tolist())
        labels.extend(targets.cpu().tolist())

    loss_result = curr_loss / len(x)

    # zero_division=0 yields the same 0.0 as the default for classes that are never
    # predicted, without emitting an UndefinedMetricWarning every epoch.
    acc = accuracy_score(labels, preds)
    pre = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)
    f1 = f1_score(labels, preds, average='weighted', zero_division=0)
    cm = confusion_matrix(labels, preds)

    return loss_result, acc, pre, recall, f1, cm

# One pass over the training loader per epoch; each call returns that epoch's loss and metrics.
for epoch in tqdm(range(EPOCHS)):
    epoch_stats = train(
        bert_model,
        training_loader,
        epoch=epoch,
        optimizer=optimizer,
        device=device,
        loss_fn=loss_fn,
    )
    loss_result, acc, pre, recall, f1, cm = epoch_stats

# Save only the weights (state_dict), which is what setting_bert_model loads back.
checkpoint_path = f'{bert_model_output}.pth'
torch.save(bert_model.state_dict(), checkpoint_path)

```
