In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score, classification_report, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import tensorflow as tf
import torch

# from pykospacing import Spacing
# import re

import numpy as np
import pandas as pd
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the preprocessed train / dev / test splits from the local data directory.
train_df, dev_df, test_df = map(
    pd.read_csv,
    (
        r'./data/train_df_processed.csv',
        r'./data/dev_df_processed.csv',
        r'./data/test_df_processed.csv',
    ),
)

In [None]:
# Passing the labels as a pandas Series raised errors when the dataset was defined,
# so every column is converted to a plain Python list up front
# (the labels are cast to int before the conversion).
train_title_list, dev_title_list = (
    frame['text'].tolist() for frame in (train_df, dev_df)
)
train_comment_list, dev_comment_list = (
    frame['comments'].tolist() for frame in (train_df, dev_df)
)
train_label_list, dev_label_list = (
    frame['labels'].astype(int).tolist() for frame in (train_df, dev_df)
)

# Sanity check: both label containers should now be built-in lists.
for label_list in (train_label_list, dev_label_list):
    print(type(label_list))

In [None]:
# Checkpoint shared by the tokenizer and the sequence-classification model;
# the classification model is created with num_labels=3.
model_name = 'monologg/koelectra-base-v3-discriminator'
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Dataset class definition
class TitleCommentDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes "title [SEP] comment" pairs on access.

    The base class is spelled out as ``torch.utils.data.Dataset`` on purpose:
    the notebook's import cell binds the bare name ``Dataset`` twice (first
    from ``torch.utils.data``, then from ``datasets``), so the bare name refers
    to the Hugging Face class rather than the PyTorch one that a ``DataLoader``
    expects here.

    Args:
        titles: Sequence of title strings.
        comments: Sequence of comment strings, aligned with ``titles``.
        labels: Sequence of integer class labels, aligned with ``titles``.
        tokenizer: Object exposing ``encode_plus``; called with
            ``return_tensors='pt'``.
        max_length: Length every example is padded / truncated to.
    """

    def __init__(self, titles, comments, labels, tokenizer, max_length=256):
        self.titles = titles
        self.comments = comments
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples (driven by the label sequence)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one encoded example, or a stacked batch when ``idx`` is a list."""
        # The list branch is kept so callers that index with a list of
        # positions keep receiving a dict of stacked tensors.
        if isinstance(idx, list):
            return self.get_batch(idx)
        return self.get_single_item(idx)

    def get_single_item(self, idx):
        """Tokenize example ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``labels`` tensor of dtype ``torch.long``.
        """
        title = self.titles[idx]
        comment = self.comments[idx]
        label = self.labels[idx]

        # Join title and comment into one string, separated by the special token.
        text = f'{title} [SEP] {comment}'

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # encode_plus returns tensors with a leading batch axis of size 1;
            # flatten() drops it so batching can stack the examples.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }

    def get_batch(self, indices):
        """Encode every position in ``indices`` and stack the results per key."""
        items = [self.get_single_item(idx) for idx in indices]
        return {
            key: torch.stack([item[key] for item in items])
            for key in ('input_ids', 'attention_mask', 'labels')
        }

In [None]:
# Wrap the train / dev lists in datasets and create the data loaders.
# Both loaders use batches of 16 in the main process; only training is shuffled.
_loader_options = dict(batch_size=16, num_workers=0)

train_dataset = TitleCommentDataset(
    train_title_list, train_comment_list, train_label_list, tokenizer
)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)

val_dataset = TitleCommentDataset(
    dev_title_list, dev_comment_list, dev_label_list, tokenizer
)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_options)

In [None]:
# Training function definition
def train(model, train_loader, val_loader, epochs=3):
    """Fine-tune ``model`` with AdamW (lr=2e-5) and print validation metrics per epoch.

    Args:
        model: Module whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object exposing ``.loss`` and
            ``.logits``.
        train_loader: Iterable of dict batches holding ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        val_loader: Batches in the same format; must support ``len()``.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. One progress line is printed after each epoch. If the validation
        loader yields nothing, the metrics are reported as ``nan``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    num_val_batches = len(val_loader)

    for epoch in range(epochs):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()
                _, predicted = torch.max(outputs.logits, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        # Guard the averages so an empty validation loader cannot divide by zero.
        mean_val_loss = val_loss / num_val_batches if num_val_batches else float('nan')
        val_accuracy = correct / total if total else float('nan')
        # The denominator is the total epoch count, not the zero-based index.
        print(f'Epoch {epoch+1}/{epochs}, Val Loss: {mean_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')

In [None]:
# Train the model (epochs is left at the function's default).
train(model=model, train_loader=train_loader, val_loader=val_loader)

In [None]:
# Evaluation
def evaluate_model(model, val_loader, device):
    """Print accuracy and weighted F1 for ``model`` and plot its confusion matrix.

    Args:
        model: Module whose forward accepts ``input_ids`` and
            ``attention_mask`` and returns an object exposing ``.logits``.
        val_loader: Iterable of dict batches holding ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        None. Metrics are printed and the confusion matrix is shown with
        matplotlib.
    """
    class_labels = [0, 1, 2]

    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits, 1)

            all_preds.extend(predicted.cpu().tolist())
            # Labels are only compared on the host, so they never need to be
            # moved to the compute device.
            all_labels.extend(batch['labels'].tolist())

    # Accuracy
    accuracy = accuracy_score(all_labels, all_preds)
    print(f'Accuracy: {accuracy:.4f}')

    # F1 score
    f1 = f1_score(all_labels, all_preds, average='weighted')
    print(f'F1 Score: {f1:.4f}')

    # Confusion matrix and its plot. The label set is pinned so the matrix is
    # always 3x3 and matches display_labels even when a class is missing from
    # both the references and the predictions.
    cm = confusion_matrix(all_labels, all_preds, labels=class_labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)

    fig, ax = plt.subplots(figsize=(10, 10))
    disp.plot(ax=ax)
    ax.set_title('Confusion Matrix')
    plt.show()

# Run the evaluation on the dev loader, on the GPU when one is available.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
evaluate_model(model=model, val_loader=val_loader, device=device)

In [None]:
# Save the model
def make_tensors_contiguous(model):
    """Replace every parameter's data with a contiguous version, in place.

    Args:
        model: Object exposing ``parameters()``; each yielded parameter has its
            ``.data`` reassigned to ``.data.contiguous()``.
    """
    for weight in model.parameters():
        dense = weight.data.contiguous()
        weight.data = dense

# Make the parameters contiguous before saving, then write the model and the
# tokenizer into the same directory.
save_dir = './saved_model'
make_tensors_contiguous(model)
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Prediction function definition
def predict(title, comment):
    """Classify one title/comment pair with the notebook-level model and tokenizer.

    The pair is joined as "title [SEP] comment" and padded / truncated to 256
    tokens before the forward pass.

    Args:
        title: Title text.
        comment: Comment text.

    Returns:
        int: Index of the largest logit (0, 1 or 2 for the 3-label model
        loaded in this notebook).
    """
    target = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(target)
    model.eval()

    encoded = tokenizer.encode_plus(
        f"{title} [SEP] {comment}",
        add_special_tokens=True,
        max_length=256,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Inference only, so no gradients are tracked.
    with torch.no_grad():
        result = model(
            encoded['input_ids'].to(target),
            attention_mask=encoded['attention_mask'].to(target),
        )
        best = torch.max(result.logits, 1).indices

    return best.item()  # returns 0, 1 or 2
    
# Predict a label for every test row and store the results in a new column.
# The predictions are collected in a list and assigned in one step, so the
# column gets an integer dtype instead of an object column seeded with None,
# and the assignment is positional (it does not rely on the index being unique).
# tqdm displays the progress.
test_df['predicted_label'] = [
    predict(title, comment)
    for title, comment in tqdm(
        zip(test_df['text'], test_df['comments']),  # title column, comment column
        total=len(test_df),
    )
]

# Save the results to a CSV file.
test_df.to_csv('./data/save_results.csv', index=False)

print("예측이 완료되었고 결과가 저장되었습니다.")

In [None]:
# Reload the predictions that were just written to disk and display them.
result_df = pd.read_csv('./data/save_results.csv')
result_df

In [None]:
# Remove the text columns that are not needed in the final output file.
result_df = result_df.drop(columns=['text', 'processed_comments', 'processed_newstitle'])

In [None]:
# Rename the prediction column to 'label' for the output file.
result_df = result_df.rename({'predicted_label': 'label'}, axis='columns')

In [None]:
# Write the final result table to disk without the index column.
result_df.to_csv(path_or_buf='./data/results.csv', index=False)