In [None]:
import pandas as pd

# Absolute path of the merged training CSV.
file_path = "/content/train_data_merge_0604.csv"

# Parse the file as UTF-8 into the working DataFrame used by later cells.
df = pd.read_csv(
    file_path,
    encoding="utf-8",
)

# Plain-text preview of the first five rows.
print(df.head(5))


In [None]:
!pip install -U datasets
!pip install -U transformers

In [3]:
# 1. 라이브러리 로딩
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset


In [None]:
import pandas as pd

# 1. Drop rows that miss any field needed to build an example.
#    The explicit copy guarantees the column assignments below write to an
#    independent frame (avoids pandas' SettingWithCopyWarning).
df = df.dropna(subset=["context", "response", "label"]).copy()

# 2. Normalise the labels and map them to integers.
#    astype(str) keeps the .str accessor working even if the column was not
#    parsed as text; strip/capitalize unify spelling ('non-sarcasm' -> 'Non-sarcasm').
df["label"] = df["label"].astype(str).str.strip().str.capitalize()
df = df[df["label"].isin(["Sarcasm", "Non-sarcasm"])].copy()
df["label"] = df["label"].map({"Non-sarcasm": 0, "Sarcasm": 1})

# 3. Build the prompt text for every row.
#    Iterating the two columns directly yields the same strings as a row-wise
#    apply, but also works when the filtered frame is empty.
df["text"] = [
    f"다음 상황을 읽고, 이어지는 발언이 풍자(Sarcasm)인지 아닌지 분류하세요.\n상황: {context}\n발언: {response}"
    for context, response in zip(df["context"], df["response"])
]

# 4. Show a few random examples (at most 3, fewer if the frame is smaller).
print(df[["text", "label"]].sample(min(3, len(df))).to_string(index=False))


In [5]:
# 5. Train/validation split: 80/20, stratified on the label.
#    A fixed random_state makes the split identical on every run, so the
#    validation examples used for evaluation later are reproducible.
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)


In [None]:
# 6. Load the tokenizer and the sequence-classification model from one checkpoint.
model_name = "monologg/kobert"

# trust_remote_code=True lets the tokenizer code shipped with the checkpoint run.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

# Two output classes, matching the 0/1 labels produced during preprocessing.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

In [7]:
# Make sure a padding token exists before batching.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    # Register '[PAD]' and grow the embedding matrix to the new vocabulary size.
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Keep the model config in sync with the tokenizer's padding id.
model.config.pad_token_id = tokenizer.pad_token_id

In [8]:
# 7. Dataset definition
class SarcasmDataset(Dataset):
    """Tokenised (text, label) pairs exposed as a map-style torch Dataset.

    Args:
        texts: Sequence of input strings; tokenised in a single call.
        labels: Sequence of labels, one per text.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True,
            padding="max_length", max_length=max_len)``. It must return a
            mapping from field name to a per-example sequence.
        max_len: ``max_length`` value forwarded to the tokenizer (default 128).

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Materialise both inputs as lists so positional indexing in
        # __getitem__ is well defined (e.g. for a pandas Series whose index
        # does not start at 0).
        texts = list(texts)
        labels = list(labels)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.encodings = tokenizer(texts, truncation=True, padding="max_length", max_length=max_len)
        self.labels = labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under "labels"."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

In [9]:
# 8. Build the training and validation datasets with the shared tokenizer
#    (training split first, then validation split).
train_dataset, val_dataset = (
    SarcasmDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)

In [None]:
# 9. Trainer configuration
training_args = TrainingArguments(
    # Where run artefacts go; checkpoint saving itself is switched off.
    output_dir="./results",
    save_strategy="no",
    # Optimisation settings: a single epoch with weight decay.
    num_train_epochs=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Logging: every 10 steps, reported to Weights & Biases.
    logging_steps=10,
    logging_dir='./logs',
    report_to="wandb",  # or "none"
    run_name="kobert-kocosa-run",
)

# NOTE(review): recent transformers releases document `processing_class` as
# the replacement for the `tokenizer` argument — confirm against the
# installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Fine-tune the model with the Trainer configured in the previous cell.
# Left as the bare last expression so the notebook displays its return value.
trainer.train()

In [None]:
###########################################
# 9-1. Hugging Face login

# A WRITE token is needed for the uploads further down. The token must not be
# written into the notebook: it would be saved with the file and exposed on the
# shell command line. It is read from the HF_TOKEN environment variable
# instead; when that variable is unset, login() asks for the token interactively.
import os

from huggingface_hub import login

login(token=os.environ.get("HF_TOKEN"))

In [13]:
from transformers import AutoTokenizer
import json
import os
import shutil

# Load the KoBERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained("monologg/kobert", trust_remote_code=True)

# Create the output directory.
save_dir = "./kobert_sarcasm_tokenizer"
os.makedirs(save_dir, exist_ok=True)

# Copy the tokenizer's vocabulary file under its own file name.
# For this checkpoint `vocab_file` is the SentencePiece .model file (see the
# later export cell), so storing it as "vocab.txt" would mislabel a binary
# model as a plain-text vocabulary.
shutil.copyfile(
    tokenizer.vocab_file,
    os.path.join(save_dir, os.path.basename(tokenizer.vocab_file)),
)

# If the tokenizer also exposes a plain-text vocabulary path, ship that file
# as vocab.txt. NOTE(review): the `vocab_txt` attribute name is an assumption
# about the remote tokenizer class — it is skipped silently when absent.
vocab_txt_path = getattr(tokenizer, "vocab_txt", None)
if vocab_txt_path and os.path.isfile(vocab_txt_path):
    shutil.copyfile(vocab_txt_path, os.path.join(save_dir, "vocab.txt"))

# Write the config files with json.dump so they are always valid JSON.
special_tokens = {
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]",
}
tokenizer_config = {"do_lower_case": False, **special_tokens}

with open(os.path.join(save_dir, "tokenizer_config.json"), "w", encoding="utf-8") as f:
    json.dump(tokenizer_config, f)

with open(os.path.join(save_dir, "special_tokens_map.json"), "w", encoding="utf-8") as f:
    json.dump(special_tokens, f)


In [None]:
from huggingface_hub import HfApi

# Push everything in save_dir to the model repository.
api = HfApi()
api.upload_folder(
    repo_id="tlttlto/sktBERT",
    repo_type="model",
    folder_path=save_dir,
    path_in_repo="",  # upload to the repository root
)

In [None]:
from transformers import AutoTokenizer

# Reload the KoBERT tokenizer and print the path of its vocabulary file.
tok = AutoTokenizer.from_pretrained(
    "monologg/kobert",
    trust_remote_code=True,
)
print(tok.vocab_file)


In [16]:
import shutil
import os

# Path of the tokenizer's vocabulary file (the .model file).
model_file = tok.vocab_file

# Output directory.
save_dir = "./kobert_sarcasm_tokenizer"
os.makedirs(save_dir, exist_ok=True)

# Copy the SentencePiece model (the essential file).
sentencepiece_target = os.path.join(save_dir, "tokenizer_78b3253a26.model")
shutil.copyfile(model_file, sentencepiece_target)

# Write the two JSON config files, in the same order as before.
for json_name, json_text in (
    ("tokenizer_config.json", '{"tokenizer_class": "KoBertTokenizer"}'),
    (
        "special_tokens_map.json",
        '{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}',
    ),
):
    with open(os.path.join(save_dir, json_name), "w", encoding="utf-8") as f:
        f.write(json_text)


In [None]:
import os

from huggingface_hub import HfApi, login

# Authenticate without embedding the token in the notebook: read it from the
# HF_TOKEN environment variable; when it is unset, login() asks for it
# interactively.
login(token=os.environ.get("HF_TOKEN"))

# Upload the contents of save_dir to the root of the model repository.
api = HfApi()
api.upload_folder(
    folder_path=save_dir,
    repo_id="tlttlto/sktBERT",
    path_in_repo="",
    repo_type="model"
)


In [18]:
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report, accuracy_score
import torch

# Dataset class used for evaluation
class SarcasmDataset(Dataset):
    """Tokenised (text, label) pairs exposed as a map-style torch Dataset.

    Args:
        texts: Sequence of input strings; tokenised in a single call.
        labels: Sequence of labels, one per text.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True,
            padding="max_length", max_length=max_len)``. It must return a
            mapping from field name to a per-example sequence.
        max_len: ``max_length`` value forwarded to the tokenizer (default 128).

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Materialise both inputs as lists so positional indexing in
        # __getitem__ is well defined (e.g. for a pandas Series whose index
        # does not start at 0).
        texts = list(texts)
        labels = list(labels)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.encodings = tokenizer(texts, truncation=True, padding="max_length", max_length=max_len)
        self.labels = labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under "labels"."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

In [None]:
from transformers import BertTokenizer

# Load a tokenizer for the base checkpoint again.
# NOTE(review): this is a BertTokenizer, while training used the tokenizer
# returned by AutoTokenizer with trust_remote_code=True — confirm that both
# tokenize text identically before relying on the saved copy.
tokenizer = BertTokenizer.from_pretrained(
    "monologg/kobert",
    do_lower_case=False,
)

# Store it in the same directory the fine-tuned model is written to.
tokenizer.save_pretrained("./finetuned_model")


In [20]:
# Save the current `model` to ./finetuned_model; the following cells reload it
# from this directory with from_pretrained.
model.save_pretrained("./finetuned_model")


In [21]:
from transformers import AutoModelForSequenceClassification
from transformers import BertTokenizer

# Reload both artefacts from the directory written by the two preceding cells.
tokenizer = BertTokenizer.from_pretrained(
    "./finetuned_model",
)
model = AutoModelForSequenceClassification.from_pretrained(
    "./finetuned_model",
)


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report, accuracy_score

# 1. Directory holding the fine-tuned model.
model_path = "./finetuned_model"

# 2. Load the tokenizer and the fine-tuned model.
#    Evaluation must tokenize text exactly as training did. Training used the
#    checkpoint's own tokenizer (AutoTokenizer with trust_remote_code=True,
#    backed by a SentencePiece model); a plain BertTokenizer splits text
#    differently and would feed the model token ids it was not trained on.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("monologg/kobert", trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()

In [23]:
# 3. Dataset class used for evaluation
class SarcasmDataset(Dataset):
    """Tokenised (text, label) pairs exposed as a map-style torch Dataset.

    Args:
        texts: Sequence of input strings; tokenised in a single call.
        labels: Sequence of labels, one per text.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True,
            padding="max_length", max_length=max_len)``. It must return a
            mapping from field name to a per-example sequence.
        max_len: ``max_length`` value forwarded to the tokenizer (default 128).

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Materialise both inputs as lists so positional indexing in
        # __getitem__ is well defined (e.g. for a pandas Series whose index
        # does not start at 0).
        texts = list(texts)
        labels = list(labels)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.encodings = tokenizer(texts, truncation=True, padding="max_length", max_length=max_len)
        self.labels = labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under "labels"."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

In [24]:
# 4. Wrap the validation split (val_texts / val_labels from the earlier split
#    cell) in a dataset and iterate it in batches of 16.
eval_dataset = SarcasmDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
)
eval_loader = DataLoader(dataset=eval_dataset, batch_size=16)


In [25]:
from tqdm import tqdm  # progress bar

# Run inference on the GPU when one is available. The model and every input
# batch must live on the same device, so both are moved explicitly.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference mode (e.g. dropout disabled)

all_preds = []
all_labels = []

# No gradients are needed for evaluation; tqdm wraps the loader for progress.
with torch.no_grad():
    for batch in tqdm(eval_loader, desc="평가 진행 중"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"]  # only collected, so it can stay on the CPU

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit for each example.
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.tolist())
        all_labels.extend(labels.tolist())


평가 진행 중: 100%|██████████| 50/50 [04:49<00:00,  5.80s/it]


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Print the evaluation results: header first, then the per-class report
# formatted with four decimal places.
print("분류 리포트:\n")
report_text = classification_report(all_labels, all_preds, digits=4)
print(report_text)

In [None]:
from collections import Counter

# How often each class was predicted.
print("예측 결과 분포:", Counter(all_preds))

# Show up to 20 examples. Slicing plus zip stops at the shortest sequence, so
# a validation set with fewer than 20 rows no longer raises IndexError.
max_examples = 20
examples = zip(val_texts[:max_examples], val_labels[:max_examples], all_preds[:max_examples])
for i, (text, gold_label, pred_label) in enumerate(examples):
    print(f"\n[예시 {i+1}]")
    print("문장:", text)
    print("실제 라벨:", gold_label)
    print("예측 라벨:", pred_label)
