In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from datetime import datetime

In [None]:
# 데이터 전처리



# 1. Load the spreadsheet and keep only the two columns used for training.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
file_path = '/Users/kyunghwanoh/Project/test/korean_emotional.xlsx'
data = pd.read_excel(file_path)
# Drop every row where the sentence or its emotion label is missing.
data = data[['Sentence', 'Emotion']].dropna()

# Label encoding: replace the 'Emotion' column with integer class ids.
# label_encoder is kept around so predictions can be decoded with inverse_transform.
label_encoder = LabelEncoder()
data['Emotion'] = label_encoder.fit_transform(data['Emotion'])

# Train/validation split: 80/20 with a fixed seed so the split is reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['Sentence'], data['Emotion'], test_size=0.2, random_state=42
)



In [None]:

# 2. Load the Hugging Face tokenizer for the multilingual cased BERT checkpoint
#    (the same checkpoint name is used for the model below).
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# 3. 데이터셋 변환 함수
def convert_to_hf_dataset(texts, labels, tokenizer, max_length=128):
    """Tokenize ``texts`` and bundle the encodings with ``labels`` as a Dataset.

    Args:
        texts: List of input sentences.
        labels: List of class ids, aligned one-to-one with ``texts``.
        tokenizer: Callable tokenizer whose result exposes ``"input_ids"`` and
            ``"attention_mask"``.
        max_length: Maximum sequence length forwarded to the tokenizer.

    Returns:
        A ``datasets.Dataset`` with the columns ``input_ids``,
        ``attention_mask`` and ``labels``.
    """
    # Truncation and padding are both enabled, so the tokenizer is asked to
    # cap sequences at max_length and pad the shorter ones.
    encodings = tokenizer(
        texts,
        max_length=max_length,
        padding=True,
        truncation=True,
    )

    # Keep only the two encoding fields the model consumes, then attach labels.
    columns = {key: encodings[key] for key in ("input_ids", "attention_mask")}
    columns["labels"] = labels
    return Dataset.from_dict(columns)

# Convert the train and validation splits to Hugging Face Datasets
# .tolist() turns the pandas Series from train_test_split into plain Python lists
# before they are handed to the tokenizer / Dataset.from_dict.
train_dataset = convert_to_hf_dataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
val_dataset = convert_to_hf_dataset(val_texts.tolist(), val_labels.tolist(), tokenizer)




In [None]:

# 4. Load the pretrained model with a classification head sized to the number
#    of distinct emotion classes found by label_encoder.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=len(label_encoder.classes_)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration


# Today's date formatted as YYYYMMDD; used to name the output directory.
today_date = datetime.now().strftime("%Y%m%d")

# Training batch size per device (the evaluation batch size is set separately below).
batch_size = 32
# TrainingArguments setup.
# NOTE(review): output_dir is class_<YYYYMMDD>, while the loading cell further down
# reads from '.../weights/class_241117' — confirm which naming is intended.
training_args = TrainingArguments(
    output_dir=f'/Users/kyunghwanoh/Project/notebook/weights/class_{today_date}',  # checkpoint directory, one per run date
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,  # keep at most 2 saved checkpoints
    logging_steps=10,
    report_to="none"  # disable external experiment loggers such as wandb
)

# 6. Build the Trainer from the model, the training arguments and the two datasets.
# No compute_metrics is passed here, so only what the Trainer reports by default is shown.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)


In [6]:
# Fine-tune the model using the Trainer configured above.
trainer.train()


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [7]:
#예측 및 활용
def predict(text, model, tokenizer, encoder=None):
    """Predict the emotion label(s) for ``text`` with a sequence classifier.

    Args:
        text: A single string, or a list of strings classified as one batch.
        model: Callable model whose output exposes ``.logits`` with the class
            scores on the last dimension; must provide ``parameters()`` and
            ``eval()``.
        tokenizer: Callable returning a mapping of tensors that has a
            ``.to(device)`` method.
        encoder: Object with ``inverse_transform`` used to decode class ids.
            Defaults to the notebook-level ``label_encoder``.

    Returns:
        Whatever ``encoder.inverse_transform`` returns for the predicted class
        ids — one entry per input text.
    """
    if encoder is None:
        encoder = label_encoder

    # Send the inputs to wherever the model actually lives instead of assuming
    # "mps"; a device mismatch would otherwise fail on CUDA/CPU machines.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    ).to(device)

    # Inference mode: disable dropout and skip gradient tracking so results are
    # deterministic and no autograd graph is kept in memory.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # argmax over the class dimension (softmax is monotonic, so it is not
    # needed to pick the winner); one id per row keeps batched input correct.
    predicted_ids = outputs.logits.argmax(dim=-1).tolist()
    return encoder.inverse_transform(predicted_ids)

# Example: read one sentence from the user and print its predicted emotion.
new_text = input()
predicted_emotion = predict(new_text, model, tokenizer)
print(f"예측 감정: {predicted_emotion}")


 ㅊㅊ


예측 감정: ['슬픔']


---
# 학습된 마지막 모델 불러오기

In [8]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertForSequenceClassification, BertTokenizer
import torch
print(torch.backends.mps.is_available())  # should print True
print(torch.backends.mps.is_built())     # should print True


True
True


In [9]:
# 1. Load and preprocess the data (same steps as the training section above).
# This is repeated here so label_encoder exists for decoding predictions.
# NOTE(review): the encoder is re-fitted rather than restored — it only matches the
# trained model if the spreadsheet's label set is unchanged since training; confirm.
file_path = '/Users/kyunghwanoh/Project/test/korean_emotional.xlsx'
data = pd.read_excel(file_path)
# Drop every row where the sentence or its emotion label is missing.
data = data[['Sentence', 'Emotion']].dropna()

# Label encoding: replace the 'Emotion' column with integer class ids.
label_encoder = LabelEncoder()
data['Emotion'] = label_encoder.fit_transform(data['Emotion'])

# Train/validation split: 80/20 with the same fixed seed as in training.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['Sentence'], data['Emotion'], test_size=0.2, random_state=42
)

# 2. 체크포인트 로드 함수
def get_latest_checkpoint(checkpoint_dir):
    """Return the path of the most recent checkpoint directory.

    Only sub-directories whose name starts with ``"checkpoint"`` are considered.
    Directories named ``checkpoint-<step>`` are ranked by their step number, so
    the result does not change if a folder is copied or touched; modification
    time is used only as a tie-breaker and for names without a step number.

    Args:
        checkpoint_dir: Directory that contains the checkpoint folders.

    Returns:
        Path (``checkpoint_dir`` joined with the folder name) of the latest
        checkpoint.

    Raises:
        ValueError: If ``checkpoint_dir`` contains no checkpoint directory.
    """
    import re  # local import: `re` is not among the notebook's imports

    step_pattern = re.compile(r"^checkpoint-(\d+)$")

    def _recency(path):
        # Sort key: training step first, modification time second.
        match = step_pattern.match(os.path.basename(path))
        step = int(match.group(1)) if match else -1
        return (step, os.path.getmtime(path))

    checkpoints = [
        os.path.join(checkpoint_dir, name)
        for name in os.listdir(checkpoint_dir)
        # Skip stray files such as "checkpoint.txt": a model cannot be loaded from them.
        if name.startswith("checkpoint") and os.path.isdir(os.path.join(checkpoint_dir, name))
    ]
    if not checkpoints:
        raise ValueError("체크포인트가 없습니다.")
    return max(checkpoints, key=_recency)

# 3. Locate the checkpoint to load.
# NOTE(review): the training cell names its output directory with "%Y%m%d"
# (class_YYYYMMDD), whereas this path is class_241117 — confirm this is the intended run.
model_checkpoint_dir = '/Users/kyunghwanoh/Project/notebook/weights/class_241117'
latest_checkpoint = get_latest_checkpoint(model_checkpoint_dir)
print(f"가장 최근 체크포인트: {latest_checkpoint}")

# Device selection: prefer MPS, then CUDA, otherwise fall back to CPU.
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"사용 장치: {device}")

# Load the fine-tuned weights from the checkpoint onto the selected device.
# The tokenizer is loaded from the base 'bert-base-multilingual-cased' name, not from the checkpoint.
load_model = BertForSequenceClassification.from_pretrained(latest_checkpoint).to(device)
load_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')



가장 최근 체크포인트: /Users/kyunghwanoh/Project/notebook/weights/class_241117/checkpoint-965
사용 장치: mps




In [11]:
# 4. 예측 함수
def predict(text, load_model, load_tokenizer, encoder=None):
    """Predict the emotion label(s) for ``text`` with a loaded checkpoint.

    Args:
        text: A single string, or a list of strings classified as one batch.
        load_model: Callable model whose output exposes ``.logits`` with the
            class scores on the last dimension; must provide ``parameters()``
            and ``eval()``.
        load_tokenizer: Callable returning a mapping of tensors that has a
            ``.to(device)`` method.
        encoder: Object with ``inverse_transform`` used to decode class ids.
            Defaults to the notebook-level ``label_encoder``.

    Returns:
        Whatever ``encoder.inverse_transform`` returns for the predicted class
        ids — one entry per input text.
    """
    if encoder is None:
        encoder = label_encoder

    # Follow the model's own device rather than a hardcoded "mps", so the
    # CUDA/CPU fallbacks chosen when the model was loaded keep working.
    first_param = next(load_model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    load_model.eval()
    inputs = load_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    ).to(device)

    # No gradients are needed for inference; skipping them saves memory.
    with torch.no_grad():
        outputs = load_model(**inputs)

    # argmax over the class dimension (softmax is monotonic, so it is not
    # needed to pick the winner); one id per row keeps batched input correct.
    predicted_ids = outputs.logits.argmax(dim=-1).tolist()
    return encoder.inverse_transform(predicted_ids)

# 5. Read a sentence from the user, run inference with the loaded checkpoint, and print the label.
new_text = input("입력 텍스트: ")
predicted_emotion = predict(new_text, load_model, load_tokenizer)
print(f"예측 감정: {predicted_emotion}")

입력 텍스트:   진짜 짜증


예측 감정: ['분노']
