In [1]:
!pip install google



In [3]:
#코랩 - 구글드라이브 연동 코드
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#mecab
!apt-get update
!apt-get install g++ openjdk-8-jdk -y
!pip install konlpy
!pip install mecab-python
!apt-get install curl -y
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)


Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [72.6 kB]
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,824 kB]
Get:13 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,688 kB]
Get:14

In [5]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [6]:
from konlpy.tag import Mecab
mecab = Mecab()

In [8]:
# ✅ 코랩에서 실행할 ABSA 학습 코드 템플릿
# 목적: 미리 정의된 aspect term이 주어졌을 때 해당 문장에서의 감성(긍/부정/중립) 분류

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

# ✅ 1. 데이터 로드: (aspect, sentence, label) 형식의 CSV 파일을 불러옴
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/absa_train.csv")  # 'aspect', 'sentence', 'label' 컬럼 필요

# ✅ 라벨별로 원하는 수량만 샘플링
df_pos = df[df["sentiment"] == "positive"].sample(n=200, random_state=42)
df_neg = df[df["sentiment"] == "negative"].sample(n=200, random_state=42, replace=True)
df_neu = df[df["sentiment"] == "neutral"].sample(n=100, random_state=42, replace=True)

# ✅ 병합 후 섞기 (옵션)
df_sampled = pd.concat([df_pos, df_neg, df_neu]).sample(frac=1, random_state=42).reset_index(drop=True)


# ✅ 2. 라벨 인코딩
label2id = {"positive": 0, "neutral": 1, "negative": 2}
id2label = {v: k for k, v in label2id.items()}
df["labels"] = df["sentiment"].map(label2id)

# ✅ 3. 입력 형식: "[ASPECT] [SEP] 문장" 구조로 만들기
df["text"] = df["aspect"] + " [SEP] " + df["sentence"]

# ✅ 4. Dataset으로 변환
dataset = Dataset.from_pandas(df[["text", "labels"]])


# ✅ 5. Tokenizer & Model 선택 (한국어 지원 모델)
model_name = "beomi/KcBERT-base"  # 또는 'beomi/KcBERT-base', 'snunlp/KR-BERT'
tokenizer = BertTokenizer.from_pretrained(model_name)

def tokenize(example):
    tokenized = tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)
    tokenized["labels"] = example["labels"]
    return tokenized

tokenized_dataset = dataset.map(tokenize)

# ✅ 6. 모델 로드
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3, id2label=id2label, label2id=label2id)

# ✅ 7. 학습 설정
training_args = TrainingArguments(
    output_dir="./absa_output",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    save_total_limit=1,
    load_best_model_at_end=True,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"]
)

# ✅ 8. 학습 실행
trainer.train()

# ✅ 9. 예측 함수
from transformers import TextClassificationPipeline
pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=False)

def predict_sentiment(aspect, sentence):
    text = f"{aspect} [SEP] {sentence}"
    result = pipeline(text)[0]
    return result['label']



Map:   0%|          | 0/1808 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/KcBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.2535,0.128118


Device set to use cuda:0


positive




In [27]:
# ✅ 예시
print(predict_sentiment("색 조합", "한번 교환했는데 새로 온 db20은 불량화소가 있고 ㅜ ㅜ ㅜ"))  # 예: 'negative'

positive


In [12]:
df_sampled

Unnamed: 0,sentence,aspect,sentiment
0,-_-; 색 조합이 저게 뭐냐?,색 조합,negative
1,아이들과 워터파크 놀러가서 사용하기 딱 좋은 백,백,positive
2,-_-; 색 조합이 저게 뭐냐?,색 조합,negative
3,벌써 몇 통째 쓰고 있는 #아이오페 #시카크림,아이오페 #시카크림,positive
4,천연재료를 이용하여 신생아부터 민감한 피부의 어른까지 온가족이 사용가능한 #천연라이...,천연재료,positive
...,...,...,...
495,첫째도 둘째도 발진없이 잘사용했던 기저귀 ㅎㅎㅎ,기저귀,positive
496,한번 교환했는데 새로 온 UD20은 불량화소가 있고 ㅜ ㅜ ㅜ,UD20,negative
497,전에 작동 안되었던 자막 검색 후 등록 기능이 똑같이 작동 안 된다!!!,자막 검색 후 등록 기능,negative
498,파운데이션팩트라해서다크닝이 있지는 않을까...걱정했는데 노노노!!,파운데이션팩트,neutral
