In [None]:
import huggingface_hub
# Authenticate this session with the Hugging Face Hub. The credentials are
# needed further down, where the fine-tuned model and tokenizer are pushed
# with push_to_hub.
# NOTE(review): a later cell calls notebook_login() as well; one of the two
# logins is presumably redundant — confirm and keep a single one.
huggingface_hub.login()

In [None]:
import pandas as pd
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset, ClassLabel, Sequence

In [None]:
# The training CSV lives on Google Drive, so Drive has to be mounted before
# the file can be read. Mounting here (instead of only in a later cell) keeps
# the notebook runnable top-to-bottom on a fresh kernel.
from google.colab import drive

drive.mount('/content/drive')

# Token-level NER data: one row per token, with at least the columns
# sentence_id, token_id and tag (those are the columns used downstream).
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ner_dataset_motsunabe.csv")

In [None]:
# Collect, for every sentence_id, the list of token ids and the list of tags.
grouped = df.groupby("sentence_id")
input_ids = grouped["token_id"].apply(list).tolist()
labels = grouped["tag"].apply(list).tolist()

# Build the tag <-> integer-id vocabularies. Sorting the distinct tags makes
# the id assignment deterministic across runs.
unique_labels = sorted({tag for sentence_tags in labels for tag in sentence_tags})
label2id = {tag: idx for idx, tag in enumerate(unique_labels)}
id2label = dict(enumerate(unique_labels))

# Same nesting as `labels`, with every tag replaced by its integer id.
labels_ids = [
    [label2id[tag] for tag in sentence_tags]
    for sentence_tags in labels
]

In [None]:
# Build the Hugging Face Dataset from the first MAX_SENTENCES sentences.
# NOTE(review): 111 is taken as-is from the original cell; the reason for this
# particular cut-off is not visible here — confirm.
MAX_SENTENCES = 111

dataset = Dataset.from_dict(
    {
        "input_ids": input_ids[:MAX_SENTENCES],
        "labels": labels_ids[:MAX_SENTENCES],
    }
)

# No hold-out split is made: the training set is the full dataset object.
train_dataset = dataset

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the same KoELECTRA checkpoint that is fine-tuned below; it is
# handed to the data collator and the Trainer, and reused at inference time.
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

In [None]:
# Label metadata handed to the model: the number of classes plus both
# directions of the tag <-> id mapping.
label_config = {
    "num_labels": len(unique_labels),
    "id2label": id2label,
    "label2id": label2id,
}

# Token-classification model initialised from the pretrained KoELECTRA
# discriminator checkpoint.
model = AutoModelForTokenClassification.from_pretrained(
    "monologg/koelectra-base-v3-discriminator",
    **label_config,
)

# Collator used by the Trainer to assemble batches for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Exploratory cell: prints the documentation of TrainingArguments.
# NOTE(review): this looks like leftover exploration and produces a very long
# output; consider removing it before sharing the notebook.
help(TrainingArguments)

In [None]:
# Hyper-parameters for the fine-tuning run.
args = TrainingArguments(
    output_dir="./ner_kc_electra",
    eval_strategy="epoch",  # run evaluation once per epoch
    save_strategy="no",  # do not write intermediate checkpoints
    logging_strategy="epoch",  # log once per epoch
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    # NOTE(review): presumably has no effect in this configuration, because
    # checkpoints are disabled and load_best_model_at_end is not set — confirm.
    metric_for_best_model="eval_loss",
)

In [None]:
# Wire the model, hyper-parameters, data and collator into a Trainer.
# NOTE(review): eval_dataset is the very same `dataset` object that is used
# for training (train_dataset = dataset), so the reported eval loss measures
# fit on the training data, not generalisation.
# NOTE(review): newer transformers releases appear to prefer
# `processing_class=` over `tokenizer=` — check against the installed version.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [None]:
# Run the fine-tuning loop configured in `args` (15 epochs, evaluated and
# logged once per epoch).
trainer.train()

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the trained model can be saved
# there. The CSV-loading cell earlier in the notebook reads from the same
# mount point, so Drive also has to be available before that cell runs.
drive.mount('/content/drive')

In [None]:
# Destination folder on the mounted Google Drive.
save_path_drive = "/content/drive/MyDrive/ner_kc_electra_final"

# Save the model and the tokenizer into the same folder so that it can be
# reloaded later as one self-contained checkpoint directory.
trainer.save_model(save_path_drive)
tokenizer.save_pretrained(save_path_drive)

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook before pushing the model.
# NOTE(review): the first cell of the notebook already calls
# huggingface_hub.login(); this second login is presumably redundant — confirm.
notebook_login()

In [None]:
# Target repository on the Hugging Face Hub.
repo_name = "5gidong/kcelectra_review_ner"

# Upload the fine-tuned model and its tokenizer to the same repository, so
# both can be restored from this single repo id.
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

In [None]:
# Reload the published model and tokenizer from the Hub. This rebinds the
# module-level `model` and `tokenizer` names that the inference helpers below
# read, so inference runs against the uploaded artefacts.
model = AutoModelForTokenClassification.from_pretrained("5gidong/kcelectra_review_ner")
tokenizer = AutoTokenizer.from_pretrained("5gidong/kcelectra_review_ner")

In [None]:
import re

def clean_text(text):
  """Split raw review text into cleaned sentences.

  Processing steps, applied to every non-blank line of ``text``:

  1. Runs of ``!`` are replaced by a single ``.``.
  2. The line is split into sentences at whitespace that follows a ``.``.
  3. Every character other than Hangul syllables, digits, whitespace and the
     punctuation ``. ? " ' “ ”`` is removed.
  4. The sentence is stripped, and dropped if nothing is left.

  Args:
    text: Raw text; may contain several lines and several sentences per line.

  Returns:
    list[str]: Cleaned, non-empty sentences in their original order.
  """
  sentences = []
  for line in text.split('\n'):
    line = line.strip()
    if not line:
      continue

    # Treat exclamation marks as sentence terminators by turning them into
    # periods, then split after each period.
    dotted = re.sub(r'!+', '.', line)
    for candidate in re.split(r'(?<=\.)\s+', dotted):
      # Keep only Hangul, digits, whitespace and . ? " ' “ ”
      cleaned = re.sub(r'[^가-힣0-9\s\.\?\"\'“”]', '', candidate)
      # Strip AFTER filtering: removing characters can expose leading or
      # trailing spaces, or leave a sentence that consists of spaces only
      # (e.g. an all-Latin sentence such as "hello world").
      cleaned = cleaned.strip()
      if cleaned:
        sentences.append(cleaned)
  return sentences

In [None]:
import torch

In [None]:
def get_menu(text):
  """Extract menu mentions from a review text with the fine-tuned NER model.

  The text is split into cleaned sentences with ``clean_text``; all sentences
  are tagged in one batch by the module-level ``model`` / ``tokenizer``. A
  span starts at a token whose predicted label has ``B`` as first and ``M`` as
  third character (presumably a ``B-M...`` tag — only these two characters
  are checked) and continues over following tokens labelled ``I`` / ``M`` in
  the same positions.

  Args:
    text: Raw review text; may contain several sentences.

  Returns:
    list[str]: Decoded menu spans over all sentences, in order of appearance.
    Empty when the text has no usable sentence or no span is predicted.
  """
  sentences = clean_text(text)
  if not sentences:
    # Nothing survived cleaning, so there is nothing to tag.
    return []

  # padding=True is needed to batch sentences of different lengths into one
  # tensor; without it only equal-length batches can be tensorised.
  inputs = tokenizer(
      sentences,
      return_tensors="pt",
      padding=True,
      truncation=True,
      is_split_into_words=False,
  )
  # Keep the inputs on the same device as the model (CPU or GPU).
  inputs = inputs.to(model.device)

  model.eval()
  with torch.no_grad():
    logits = model(**inputs).logits  # (batch, seq_len, num_labels)
  predictions = torch.argmax(logits, dim=2)

  # Read the label names from the loaded model itself, so inference does not
  # depend on the `id2label` global created by the training cells.
  labels_by_id = model.config.id2label
  # Structural / padding tokens can never be part of an entity.
  non_content = {tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token}

  results = []
  for ids_row, pred_row in zip(inputs["input_ids"].tolist(), predictions.tolist()):
    tokens = tokenizer.convert_ids_to_tokens(ids_row)
    span = []  # tokens of the entity currently being collected
    for tok, pred in zip(tokens, pred_row):
      lab = labels_by_id[pred]
      is_content = tok not in non_content
      # Slices instead of indexing so short labels such as "O" are safe.
      begins = is_content and lab[:1] == 'B' and lab[2:3] == 'M'
      continues = is_content and lab[:1] == 'I' and lab[2:3] == 'M'

      if begins:
        if span:
          results.append(tokenizer.convert_tokens_to_string(span))
        span = [tok]
      elif span:
        if continues:
          span.append(tok)
        else:
          results.append(tokenizer.convert_tokens_to_string(span))
          span = []
    # Flush an entity that runs up to the very end of the sequence.
    if span:
      results.append(tokenizer.convert_tokens_to_string(span))
  return results


In [None]:
def get_amb(text):
  """Extract ambience mentions from a review text with the fine-tuned NER model.

  The text is split into cleaned sentences with ``clean_text``; all sentences
  are tagged in one batch by the module-level ``model`` / ``tokenizer``. A
  span starts at a token whose predicted label has ``B`` as first and ``A`` as
  third character (presumably a ``B-A...`` tag — only these two characters
  are checked) and continues over following tokens labelled ``I`` / ``A`` in
  the same positions.

  Args:
    text: Raw review text; may contain several sentences.

  Returns:
    list[str]: Decoded ambience spans over all sentences, in order of
    appearance. Empty when the text has no usable sentence or no span is
    predicted.
  """
  sentences = clean_text(text)
  if not sentences:
    # Nothing survived cleaning, so there is nothing to tag.
    return []

  # padding=True is needed to batch sentences of different lengths into one
  # tensor; without it only equal-length batches can be tensorised.
  inputs = tokenizer(
      sentences,
      return_tensors="pt",
      padding=True,
      truncation=True,
      is_split_into_words=False,
  )
  # Keep the inputs on the same device as the model (CPU or GPU).
  inputs = inputs.to(model.device)

  model.eval()
  with torch.no_grad():
    logits = model(**inputs).logits  # (batch, seq_len, num_labels)
  predictions = torch.argmax(logits, dim=2)

  # Read the label names from the loaded model itself, so inference does not
  # depend on the `id2label` global created by the training cells.
  labels_by_id = model.config.id2label
  # Structural / padding tokens can never be part of an entity.
  non_content = {tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token}

  results = []
  for ids_row, pred_row in zip(inputs["input_ids"].tolist(), predictions.tolist()):
    tokens = tokenizer.convert_ids_to_tokens(ids_row)
    span = []  # tokens of the entity currently being collected
    for tok, pred in zip(tokens, pred_row):
      lab = labels_by_id[pred]
      is_content = tok not in non_content
      # Slices instead of indexing so short labels such as "O" are safe.
      begins = is_content and lab[:1] == 'B' and lab[2:3] == 'A'
      continues = is_content and lab[:1] == 'I' and lab[2:3] == 'A'

      if begins:
        if span:
          results.append(tokenizer.convert_tokens_to_string(span))
        span = [tok]
      elif span:
        if continues:
          span.append(tok)
        else:
          results.append(tokenizer.convert_tokens_to_string(span))
          span = []
    # Flush an entity that runs up to the very end of the sequence.
    if span:
      results.append(tokenizer.convert_tokens_to_string(span))
  return results


In [None]:
def get_exp(text):
  """Extract experience mentions from a review text with the fine-tuned NER model.

  The text is split into cleaned sentences with ``clean_text``; all sentences
  are tagged in one batch by the module-level ``model`` / ``tokenizer``. A
  span starts at a token whose predicted label has ``B`` as first and ``E`` as
  third character (presumably a ``B-E...`` tag — only these two characters
  are checked) and continues over following tokens labelled ``I`` / ``E`` in
  the same positions.

  Args:
    text: Raw review text; may contain several sentences.

  Returns:
    list[str]: Decoded experience spans over all sentences, in order of
    appearance. Empty when the text has no usable sentence or no span is
    predicted.
  """
  sentences = clean_text(text)
  if not sentences:
    # Nothing survived cleaning, so there is nothing to tag.
    return []

  # padding=True is needed to batch sentences of different lengths into one
  # tensor; without it only equal-length batches can be tensorised.
  inputs = tokenizer(
      sentences,
      return_tensors="pt",
      padding=True,
      truncation=True,
      is_split_into_words=False,
  )
  # Keep the inputs on the same device as the model (CPU or GPU).
  inputs = inputs.to(model.device)

  model.eval()
  with torch.no_grad():
    logits = model(**inputs).logits  # (batch, seq_len, num_labels)
  predictions = torch.argmax(logits, dim=2)

  # Read the label names from the loaded model itself, so inference does not
  # depend on the `id2label` global created by the training cells.
  labels_by_id = model.config.id2label
  # Structural / padding tokens can never be part of an entity.
  non_content = {tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token}

  results = []
  for ids_row, pred_row in zip(inputs["input_ids"].tolist(), predictions.tolist()):
    tokens = tokenizer.convert_ids_to_tokens(ids_row)
    span = []  # tokens of the entity currently being collected
    for tok, pred in zip(tokens, pred_row):
      lab = labels_by_id[pred]
      is_content = tok not in non_content
      # Slices instead of indexing so short labels such as "O" are safe.
      begins = is_content and lab[:1] == 'B' and lab[2:3] == 'E'
      continues = is_content and lab[:1] == 'I' and lab[2:3] == 'E'

      if begins:
        if span:
          results.append(tokenizer.convert_tokens_to_string(span))
        span = [tok]
      elif span:
        if continues:
          span.append(tok)
        else:
          results.append(tokenizer.convert_tokens_to_string(span))
          span = []
    # Flush an entity that runs up to the very end of the sequence.
    if span:
      results.append(tokenizer.convert_tokens_to_string(span))
  return results


In [None]:
def get_pp(text):
  """Extract party-size (people) mentions from a review text with the NER model.

  The text is split into cleaned sentences with ``clean_text``; all sentences
  are tagged in one batch by the module-level ``model`` / ``tokenizer``. A
  span starts at a token whose predicted label has ``B`` as first and ``P`` as
  third character (presumably a ``B-P...`` tag — only these two characters
  are checked) and continues over following tokens labelled ``I`` / ``P`` in
  the same positions.

  Args:
    text: Raw review text; may contain several sentences.

  Returns:
    list[str]: Decoded people spans over all sentences, in order of
    appearance. Empty when the text has no usable sentence or no span is
    predicted.
  """
  sentences = clean_text(text)
  if not sentences:
    # Nothing survived cleaning, so there is nothing to tag.
    return []

  # padding=True is needed to batch sentences of different lengths into one
  # tensor; without it only equal-length batches can be tensorised.
  inputs = tokenizer(
      sentences,
      return_tensors="pt",
      padding=True,
      truncation=True,
      is_split_into_words=False,
  )
  # Keep the inputs on the same device as the model (CPU or GPU).
  inputs = inputs.to(model.device)

  model.eval()
  with torch.no_grad():
    logits = model(**inputs).logits  # (batch, seq_len, num_labels)
  predictions = torch.argmax(logits, dim=2)

  # Read the label names from the loaded model itself, so inference does not
  # depend on the `id2label` global created by the training cells.
  labels_by_id = model.config.id2label
  # Structural / padding tokens can never be part of an entity.
  non_content = {tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token}

  results = []
  for ids_row, pred_row in zip(inputs["input_ids"].tolist(), predictions.tolist()):
    tokens = tokenizer.convert_ids_to_tokens(ids_row)
    span = []  # tokens of the entity currently being collected
    for tok, pred in zip(tokens, pred_row):
      lab = labels_by_id[pred]
      is_content = tok not in non_content
      # Slices instead of indexing so short labels such as "O" are safe.
      begins = is_content and lab[:1] == 'B' and lab[2:3] == 'P'
      continues = is_content and lab[:1] == 'I' and lab[2:3] == 'P'

      if begins:
        if span:
          results.append(tokenizer.convert_tokens_to_string(span))
        span = [tok]
      elif span:
        if continues:
          span.append(tok)
        else:
          results.append(tokenizer.convert_tokens_to_string(span))
          span = []
    # Flush an entity that runs up to the very end of the sequence.
    if span:
      results.append(tokenizer.convert_tokens_to_string(span))
  return results


In [None]:
# Sample review sentences used to eyeball the extractors.
sentences = [
    "아, 매운 명란젓이 코스에 포함되어 있어서 밥을 추가했어요.",
    "모츠나베를 먹기 전에 소고기 식초 모츠와 냉두부를 선택했습니다.",
    "센마이도 주문했는데, 개인적으로는 조금 더 크게 썰어주면 좋겠다고 생각했습니다.",
    "다른 가게보다 조금 비싼데도 불구하고, 거의 젊은 손님들로 가득 차 있었던 것은 데이트에 사용하기 좋기 때문일까요?",
    "가장 마음에 들었던 것은 챤폰면입니다.",
]

# Run each extractor over every sample; each result list is aligned with
# `sentences` by position.
menu_result = [get_menu(t) for t in sentences]
amb_result = [get_amb(t) for t in sentences]
exp_result = [get_exp(t) for t in sentences]
pp_result = [get_pp(t) for t in sentences]

# Print every sentence followed by what was found for each category.
for sent, menus, ambiences, experiences, people in zip(
    sentences, menu_result, amb_result, exp_result, pp_result
):
  print("\n", sent)
  print("메뉴: ", menus)
  print("분위기: ", ambiences)
  print("경험: ", experiences)
  print("인원: ", people)