In [None]:
# Mount Google Drive into the Colab filesystem. Later cells read their
# input files from, and write their outputs to, paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Necessary Package Calls

In [None]:
!pip install transformers torch
!pip install datasets



In [None]:
!pip install faiss-cpu # install the CPU version of faiss

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m75.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


### Open CSV

In [None]:
# open data
import numpy as np
import pandas as pd

In [None]:
# Load the defamation-statute table. The CSV was written together with its
# pandas index, which would otherwise come back as a stray "Unnamed: 0"
# column; index_col=0 restores it as the row index instead.
df = pd.read_csv('/content/drive/MyDrive/AI/defamation_laws.csv', index_col=0)
# Independent copy of the loaded frame.
df_copy = df.copy()
# Preview only the first rows rather than dumping the whole frame.
df_copy.head()

Unnamed: 0,법률,조항,내용
0,형법,제307조(명예훼손),① 공연히 사실을 적시하여 사람의 명예를 훼손한 자는 2년 이하의 징역이나 금고 또...
1,형법,제308조(사자의 명예훼손),공연히 허위의 사실을 적시하여 사자의 명예를 훼손한 자는 2년 이하의 징역이나 금고...
2,형법,제309조(출판물 등에 의한 명예훼손),"① 사람을 비방할 목적으로 신문, 잡지 또는 라디오 기타 출판물에 의하여 제307조..."
3,정보통신망법,제70조제1항(인터넷 명예훼손 - 사실 적시),사람을 비방할 목적으로 정보통신망을 통해 공공연하게 사실을 드러내어 다른 사람의 명...
4,정보통신망법,제70조제2항(인터넷 명예훼손 - 허위 사실 적시),사람을 비방할 목적으로 정보통신망을 통해 공공연하게 거짓의 사실을 드러내어 다른 사...


### Model Download

In [None]:
# Interactive Hugging Face login: prompts for an access token (input is not
# echoed) and asks whether to store it as a git credential.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) 

Token is valid (permission: fineG

In [None]:
import os
import re
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from huggingface_hub import login
from transformers.models.t5.tokenization_t5 import T5Tokenizer  # Import T5Tokenizer directly
from datasets import Dataset

# Never hardcode an access token in a notebook: it leaks through version
# control and every shared copy. Reuse HF_TOKEN when the environment already
# provides it; otherwise prompt for it without echoing the input.
from getpass import getpass

hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
os.environ["HF_TOKEN"] = hf_token
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
# Hub identifier of the checkpoint whose tokenizer is loaded here.
model_name = "wisenut-nlp-team/kot5-small"

# Download the tokenizer for that checkpoint (cached after the first download).
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sanity-check output. In the recorded run `vocab_file` printed None, so this
# line does not reveal where the tokenizer files are stored.
print(f"Tokenizer vocab file at: {tokenizer.vocab_file}")
# Names of the inputs the tokenizer produces for the model.
print(f"Tokenizer model input names: {tokenizer.model_input_names}")

Tokenizer vocab file at: None
Tokenizer model input names: ['input_ids', 'attention_mask']


In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset


def preprocess_data(df):
    """Turn the statute table into prompt-formatted training text.

    Each row is rendered as a three-field block (law, article, content)
    terminated by a blank line. The same string is used as both the model
    input ("text") and the training target ("labels").

    Args:
        df: DataFrame with the columns "법률", "조항" and "내용". Any other
            columns are ignored.

    Returns:
        A dict with two equally long lists of strings, "text" and "labels",
        suitable for `datasets.Dataset.from_dict`. The two lists are
        separate objects holding identical contents.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Iterate the three columns in lockstep; this avoids the per-row Series
    # construction that `DataFrame.iterrows()` performs.
    texts = [
        f"<법률>: {law}\n<조항>: {article}\n<내용>: {content}\n\n"
        for law, article, content in zip(df["법률"], df["조항"], df["내용"])
    ]
    # Copy the list so callers can modify one column without touching the other.
    return {"text": texts, "labels": list(texts)}

# Convert the statute table into prompt-formatted text columns.
formatted_texts = preprocess_data(df)

# Wrap the column dictionary in a Hugging Face `Dataset`.
dataset = Dataset.from_dict(formatted_texts)

# Inspect the first record.
print(dataset[0])

{'text': '<법률>: 형법\n<조항>: 제307조(명예훼손)\n<내용>: ① 공연히 사실을 적시하여 사람의 명예를 훼손한 자는 2년 이하의 징역이나 금고 또는 500만원 이하의 벌금에 처한다. ② 공연히 허위의 사실을 적시하여 사람의 명예를 훼손한 자는 5년 이하의 징역, 10년 이하의 자격정지 또는 1천만원 이하의 벌금에 처한다.\n\n', 'labels': '<법률>: 형법\n<조항>: 제307조(명예훼손)\n<내용>: ① 공연히 사실을 적시하여 사람의 명예를 훼손한 자는 2년 이하의 징역이나 금고 또는 500만원 이하의 벌금에 처한다. ② 공연히 허위의 사실을 적시하여 사람의 명예를 훼손한 자는 5년 이하의 징역, 10년 이하의 자격정지 또는 1천만원 이하의 벌금에 처한다.\n\n'}


In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq # Import DataCollatorForSeq2Seq
from datasets import Dataset


# Load the model.
model_name = "wisenut-nlp-team/kot5-small"
# T5 checkpoints are loaded through AutoModelForSeq2SeqLM rather than
# AutoModelForCausalLM.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize a batch of source/target texts for seq2seq training.

    Args:
        examples: Batch mapping with a "text" list (model inputs) and a
            "labels" list (target texts), as supplied by
            `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output, truncated and padded to 128 tokens, with the
        target token ids stored under "labels". Padding positions in the
        labels are replaced by -100.
    """
    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager; the target token ids are returned under the "labels" key.
    model_inputs = tokenizer(
        examples["text"],
        text_target=examples["labels"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    # Padding must not contribute to the training loss. -100 is the label
    # value the loss function ignores, so swap every pad id for it.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs

# Tokenize the dataset in batches. The raw string columns are dropped; the
# token-id "labels" returned by tokenize_function are kept.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text", "labels"])

# Batch collation with the seq2seq collator; the model is passed in alongside
# the tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training configuration. Only the output directory is set here, so every
# other option keeps its library default.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/kosaul_finetuned",
    # Further training arguments can be added here.
)

# Trainer setup. No evaluation dataset is provided in this cell.
trainer = Trainer(
    model=model, # the seq2seq model loaded earlier in this cell
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator # batches are assembled by the seq2seq collator
    # Further Trainer arguments can be added here.
)

# Start training. The recorded run finished after 3 epochs (3 optimizer steps).
trainer.train()

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


TrainOutput(global_step=3, training_loss=18.202295939127605, metrics={'train_runtime': 2.7859, 'train_samples_per_second': 5.384, 'train_steps_per_second': 1.077, 'total_flos': 507531755520.0, 'train_loss': 18.202295939127605, 'epoch': 3.0})

### FAISS 활용

In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd

# Load the case-law (precedent) data.
# NOTE(review): the recorded run failed here with FileNotFoundError. The other
# CSV in this notebook is read from MyDrive/AI/, so this path may be missing
# the "AI" folder — confirm the actual location of legal_cases.csv.
df_cases = pd.read_csv("/content/drive/MyDrive/legal_cases.csv")

# Load the Korean sentence-embedding model.
embedding_model = SentenceTransformer("snunlp/KR-SBERT-V40K-klueNLI-augSTS")

# Encode all case texts in a single batched call instead of one `encode`
# call per row; the model batches internally, which is much faster.
embeddings = embedding_model.encode(
    df_cases["내용"].tolist(),
    convert_to_numpy=True,
    show_progress_bar=True,
)
# FAISS expects a C-contiguous float32 matrix with one row per vector.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# Keep the per-row vectors on the frame, as before.
df_cases["embedding"] = list(embeddings)

# Exact (brute-force) L2 index sized to the embedding width.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

# Add the case embeddings to the index.
index.add(embeddings)

# Persist the FAISS index to Drive.
faiss.write_index(index, "/content/drive/MyDrive/faiss_legal_cases.index")


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/legal_cases.csv'

In [None]:
# Tokenize the dataset with the seq2seq `tokenize_function` defined in the
# first training cell. That function also returns token-id labels, which the
# T5 model needs to compute a loss, so it is reused here instead of being
# shadowed by a labels-free variant. Both raw string columns are dropped so
# that no untokenized text reaches the collator.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text", "labels"])

# Batch collation. The model is an encoder-decoder, so the seq2seq collator is
# used (as in the first training cell) rather than the causal-LM collator.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training configuration.
# NOTE(review): fp16 is kept as in the original cell; T5 checkpoints are often
# reported to be numerically unstable in fp16 — confirm, or consider bf16 if
# the GPU supports it.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/kosaul_finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="/content/logs",
    logging_steps=10,
    num_train_epochs=3,
    per_device_train_batch_size=1,  # small batches to limit GPU memory use
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,  # accumulate gradients over 16 batches per optimizer step
    weight_decay=0.01,
    save_total_limit=2,
    fp16=True,  # half precision for lower memory usage
    push_to_hub=False,
    gradient_checkpointing=True,  # trade extra compute for lower activation memory
)

# Trainer setup.
# NOTE(review): the evaluation set is the training set itself, so the reported
# eval loss says nothing about generalization.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Make sure the model sits on the device selected by the training arguments.
model.to(training_args.device)

# Start training.
trainer.train()

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

  trainer = Trainer(


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 22.16 GiB of which 145.38 MiB is free. Process 11965 has 22.01 GiB memory in use. Of the allocated memory 21.83 GiB is allocated by PyTorch, and 1.25 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)