In [None]:
# Mount Google Drive at /content/drive (Colab-only helper) so later cells can
# read and write files stored on the Drive.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder # 카테고리형 데이터를 수치형 데이터로 변환



In [None]:

cd /content/drive/MyDrive/AI/

In [None]:
!pip install transformers[torch]
!pip install accelerate -U

In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
from torch.nn import BCEWithLogitsLoss
from sklearn.metrics import mean_squared_error, accuracy_score
import numpy as np

# Load the training corpus (CP949-encoded CSV).
data = pd.read_csv('감성대화말뭉치.csv', encoding='cp949')

# Drop rows with a missing sentence or emotion. pandas reads missing cells as
# float NaN, which the tokenizer rejects (it expects strings) and which cannot
# be mixed with the string emotion labels during label encoding.
rows_before = len(data)
data = data.dropna(subset=['문장', '감정']).reset_index(drop=True)
if len(data) < rows_before:
    print(f"Dropped {rows_before - len(data)} rows with a missing sentence or emotion")

# Preprocessing: plain Python lists of sentences and their emotion names.
# astype(str) guards against cells that pandas parsed as numbers.
sentences = data['문장'].astype(str).tolist()
emotion_names = data['감정'].tolist()

# Encode the emotions as indicator vectors. Each row carries exactly one
# emotion, so it is wrapped in a one-element list for MultiLabelBinarizer.
mlb = MultiLabelBinarizer()
encoded_labels = mlb.fit_transform([[name] for name in emotion_names])

# Load the tokenizer and a classification model with one output per emotion.
tokenizer = BertTokenizer.from_pretrained('klue/bert-base')
model = BertForSequenceClassification.from_pretrained('klue/bert-base', num_labels=len(mlb.classes_))

# Tokenize the whole corpus up front (padded, truncated to 128 tokens) and
# turn the indicator matrix into a tensor aligned row-by-row with `inputs`.
inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True, max_length=128)
labels = torch.tensor(encoded_labels)

# Dataset wrapper around the tokenized inputs and their label rows
class EmotionDataset(Dataset):
    """Pairs every tokenizer output field with the label at the same index.

    Args:
        inputs: Mapping from field name to an indexable collection with one
            entry per example (here: the tokenizer output).
        labels: Indexable collection with one label row per example.
    """

    def __init__(self, inputs, labels):
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        """Number of examples, taken from the label collection."""
        n_examples = len(self.labels)
        return n_examples

    def __getitem__(self, idx):
        """Return a dict with every input field at ``idx`` plus ``'labels'``."""
        sample = {}
        for field, values in self.inputs.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Training hyperparameters and output location, named so they can be tuned in one place.
NUM_EPOCHS = 3
BATCH_SIZE = 8
LEARNING_RATE = 5e-5
MODEL_DIR = "./emotion_model"

dataset = EmotionDataset(inputs, labels)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Training setup.
# torch.optim.AdamW is used instead of transformers.AdamW, which is deprecated
# in favour of the PyTorch implementation. eps and weight_decay are set to the
# defaults of the transformers class so the optimizer stays comparable.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Loss: independent sigmoid + binary cross-entropy per emotion output.
criterion = BCEWithLogitsLoss()

# Training loop
model.train()
for epoch in range(NUM_EPOCHS):
    # Sums are weighted by batch size so that a short final batch does not
    # skew the epoch averages.
    loss_sum = 0.0
    mse_sum = 0.0
    accuracy_sum = 0.0
    samples_seen = 0
    for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{NUM_EPOCHS}"):
        optimizer.zero_grad()
        # Batch-local names: the module-level `inputs` / `labels` (the full
        # tokenized corpus) must not be overwritten by per-batch tensors.
        batch_inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        batch_labels = batch['labels'].to(device).float()  # BCE needs float targets
        outputs = model(**batch_inputs)
        loss = criterion(outputs.logits, batch_labels)
        loss.backward()
        optimizer.step()

        # Training metrics for this batch (computed on the pre-update forward pass).
        batch_len = batch_labels.size(0)
        predictions = torch.sigmoid(outputs.logits.detach()).cpu().numpy()
        true_labels = batch_labels.cpu().numpy()
        predicted_labels = (predictions > 0.5).astype(int)

        loss_sum += loss.item() * batch_len
        mse_sum += mean_squared_error(true_labels, predictions) * batch_len
        # accuracy_score on indicator matrices counts a row as correct only
        # when every output matches.
        accuracy_sum += accuracy_score(true_labels, predicted_labels) * batch_len
        samples_seen += batch_len

    avg_loss = loss_sum / samples_seen
    avg_mse = mse_sum / samples_seen
    # RMSE is the root of the epoch MSE, not the mean of per-batch roots.
    avg_rmse = np.sqrt(avg_mse)
    avg_accuracy = accuracy_sum / samples_seen

    print(f"Epoch {epoch + 1}")
    print(f"Loss: {avg_loss}")
    print(f"MSE: {avg_mse}")
    print(f"RMSE: {avg_rmse}")
    print(f"Accuracy: {avg_accuracy}")

# Save the fine-tuned model together with its tokenizer.
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

# Reload from disk so inference runs on exactly what was saved.
model = BertForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
model.to(device)

# Inference helper
def predict(sentences, batch_size=32):
    """Return per-class sigmoid probabilities for one or more sentences.

    Uses the module-level ``model``, ``tokenizer`` and ``device``. The input is
    tokenized and scored ``batch_size`` sentences at a time, so a large list
    never has to fit into a single forward pass.

    Args:
        sentences: A single string or an iterable of strings.
        batch_size: Maximum number of sentences per forward pass.

    Returns:
        numpy.ndarray with one row per sentence and one column per model
        output; an array with zero rows when ``sentences`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Accept a bare string as a one-sentence batch; materialize other
    # iterables so they can be sliced.
    if isinstance(sentences, str):
        sentences = [sentences]
    else:
        sentences = list(sentences)

    model.eval()
    if not sentences:
        return torch.zeros((0, model.config.num_labels)).numpy()

    chunks = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        encoded = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
        with torch.no_grad():
            logits = model(**encoded).logits
        # Move each chunk to the CPU right away so device memory is released
        # before the next batch is processed.
        chunks.append(torch.sigmoid(logits).cpu())
    return torch.cat(chunks).numpy()

# Load the sentences to label (CP949-encoded CSV).
new_data = pd.read_csv('감성대화말뭉치_Validation.csv', encoding='cp949')
# Skip rows without a sentence: missing cells are read as NaN, which the
# tokenizer cannot process.
new_sentences = new_data['문장'].dropna().astype(str).tolist()

# Score every sentence.
predictions = predict(new_sentences)

# Decode the probabilities into emotion names. Every training row carried
# exactly one emotion, so a row where no output clears the 0.5 threshold falls
# back to its highest-scoring class instead of producing a blank prediction.
label_mask = predictions > 0.5
unlabeled_rows = np.flatnonzero(~label_mask.any(axis=1))
label_mask[unlabeled_rows, predictions[unlabeled_rows].argmax(axis=1)] = True
predicted_labels = mlb.inverse_transform(label_mask)

# Collect the sentences and their space-joined emotion names in a DataFrame.
results_df = pd.DataFrame({'문장': new_sentences, '예측된 감정': [' '.join(emotions) for emotions in predicted_labels]})

# Write the results as UTF-8 with BOM so spreadsheet tools detect the encoding.
results_df.to_csv('./prediction_results.csv', index=False, encoding='utf-8-sig')

In [None]:

# Show the first five prediction rows as a quick sanity check.
results_df.head(n=5)