# 1. 메세지 내 링크 포함 여부 Classification

In [None]:
!pip install transformers datasets



In [None]:
import pandas as pd
from datasets import Dataset

# CSV 로드
df = pd.read_csv("/content/스팸_데이터셋_최종.csv")
df = df.dropna(subset=["message", "label"])
df["message"] = df["message"].astype(str)
df["label"] = df["label"].astype(int)

# HF Dataset 변환
dataset = Dataset.from_pandas(df[["message", "label"]])
dataset = dataset.train_test_split(test_size=0.1)
train_ds = dataset["train"]
val_ds = dataset["test"]

In [None]:
from transformers import AutoTokenizer

model_ckpt = "beomi/KcELECTRA-base"  # 한국어 분류 모델
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def preprocess(example):
    return tokenizer(example["message"], truncation=True, padding="max_length", max_length=256)

train_ds = train_ds.map(preprocess)
val_ds = val_ds.map(preprocess)


Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
!pip install --upgrade transformers



In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Assuming model_ckpt, train_ds, val_ds, and tokenizer are defined elsewhere

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=2)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=-1)
    # Ensure labels are integers for f1_score if they aren't already
    # (often necessary depending on how datasets are loaded)
    labels = labels.astype(int)
    preds = preds.astype(int)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average='binary') # Specify average for binary classification
    }

args = TrainingArguments(
    output_dir="./link_classifier",
    eval_strategy="steps",        # <--- Renamed from evaluation_strategy
    logging_strategy="steps",       # <--- Renamed from logging_strategy
    logging_steps=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none"
    # You might need metric_for_best_model if load_best_model_at_end=True
    # e.g., metric_for_best_model="f1",
    # greater_is_better=True
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Before starting training, it's good practice to handle the warning:
# "Some weights of ElectraForSequenceClassification were not initialized..."
# This warning is normal when adding a new classification head.
# Training will initialize these weights.

print("Starting training...")
trainer.train()
print("Training finished.")

# Optional: Evaluate after training
# eval_results = trainer.evaluate()
# print(f"Evaluation results: {eval_results}")

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training...


Step,Training Loss,Validation Loss,Accuracy,F1
100,0.2797,0.222203,0.944,0.958944
200,0.1915,0.20657,0.944,0.95679
300,0.1283,0.106736,0.972,0.979228
400,0.0859,0.110905,0.976,0.981873
500,0.0888,0.079261,0.976,0.982036
600,0.0609,0.15962,0.966,0.974125
700,0.0548,0.083342,0.976,0.982036
800,0.0413,0.083663,0.982,0.986547
900,0.0337,0.1453,0.976,0.981928
1000,0.0286,0.113363,0.976,0.981982


Training finished.


In [None]:
# 1. 모델과 토크나이저 저장
trainer.save_model("/content/link_classifier_best")
tokenizer.save_pretrained("/content/link_classifier_best")

# 2. zip으로 압축
import shutil
shutil.make_archive("/content/link_classifier_best", 'zip', "/content/link_classifier_best")

'/content/link_classifier_best.zip'

# 2. 링크 추출

In [None]:
!pip install pandas datasets transformers torch accelerate evaluate rouge_score sentencepiece

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12=

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import evaluate # ROUGE 계산을 위해 evaluate 라이브러리 사용


In [None]:
try:
    df = pd.read_csv("/content/스팸_데이터셋_최종.csv")
    print(f"CSV 로드 완료. 총 행 수: {len(df)}")

    # 레이블 1, restored_link가 유효한 데이터만 필터링
    df_filtered = df[(df["label"] == 1) & (df["restored_link"].notna()) & (df["restored_link"].str.strip() != "")]
    print(f"필터링 후 행 수: {len(df_filtered)}")

    if len(df_filtered) == 0:
        raise ValueError("필터링 후 학습 데이터가 없습니다. CSV 파일의 'label' 및 'restored_link' 열을 확인하세요.")

    # 타입 변환 (오류 방지 위해 명시적 처리)
    df_filtered["message"] = df_filtered["message"].astype(str)
    df_filtered["restored_link"] = df_filtered["restored_link"].astype(str)

    # 데이터셋 생성
    dataset = Dataset.from_pandas(df_filtered[["message", "restored_link"]])

    # 훈련/검증 데이터셋 분할
    dataset = dataset.train_test_split(test_size=0.1, seed=42) # 재현성을 위해 seed 추가
    train_ds = dataset["train"]
    val_ds = dataset["test"]

    print(f"훈련 데이터셋 크기: {len(train_ds)}")
    print(f"검증 데이터셋 크기: {len(val_ds)}")

except FileNotFoundError:
    print("오류: '/content/최종_데이터셋.csv' 파일을 찾을 수 없습니다.")
    # 여기서 스크립트 실행을 중단하거나 기본값으로 진행하도록 처리할 수 있습니다.
    exit()
except KeyError as e:
    print(f"오류: CSV 파일에 필요한 열({e})이 없습니다.")
    exit()
except ValueError as e:
    print(f"오류: {e}")
    exit()

CSV 로드 완료. 총 행 수: 5000
필터링 후 행 수: 3250
훈련 데이터셋 크기: 2925
검증 데이터셋 크기: 325


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["message"] = df_filtered["message"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["restored_link"] = df_filtered["restored_link"].astype(str)


In [None]:
model_ckpt = "google/flan-t5-base"
try:
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
except Exception as e:
    print(f"모델 또는 토크나이저 로드 중 오류 발생: {e}")
    exit()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def preprocess(example):
    prompt = f"다음 문자에서 도박 사이트 링크를 복원하세요:\n{example['message']}\n복원된 링크:"
    # max_length는 데이터 특성에 맞게 조절 가능
    # padding="longest"는 배치 내 가장 긴 시퀀스에 맞춰 패딩하여 효율적일 수 있음
    tokenized_example = tokenizer(
        prompt,
        text_target=example["restored_link"],
        padding="max_length", # 또는 "longest"
        truncation=True,
        max_length=256 # 필요시 조절
    )
    return tokenized_example

In [None]:
# --- 4. 데이터셋 매핑 ---
def preprocess(batch): # 인자 이름을 batch로 변경하여 가독성 향상
    # 배치 내의 각 메시지에 대해 프롬프트 생성
    prompts = [f"다음 문자에서 도박 사이트 링크를 복원하세요:\n{msg}\n복원된 링크:" for msg in batch['message']]

    # 토크나이저에 프롬프트 리스트와 복원된 링크 리스트를 직접 전달
    tokenized_batch = tokenizer(
        prompts, # 프롬프트 리스트
        text_target=batch["restored_link"], # 복원된 링크 리스트
        padding="max_length",
        truncation=True,
        max_length=256
    )
    # 토크나이저는 이미 {'input_ids': [[], [], ...], 'attention_mask': [[], [], ...], 'labels': [[], [], ...]} 형태의
    # 배치 결과를 반환하므로 그대로 반환하면 됨
    return tokenized_batch

try:
    # map 함수에 batched=True와 수정된 preprocess 함수 적용
    # remove_columns를 사용하여 원본 'message', 'restored_link' 열 제거 (선택 사항이지만 권장)
    train_ds = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
    val_ds = val_ds.map(preprocess, batched=True, remove_columns=val_ds.column_names)

    # 형식 설정 (set_format은 map 이후에 적용하는 것이 일반적)
    # 이미 map에서 필요한 컬럼만 반환했으므로 columns 인자 지정 불필요
    train_ds.set_format(type="torch")
    val_ds.set_format(type="torch")

    print("데이터셋 매핑 및 형식 설정 완료.")
    print("훈련 데이터셋 샘플:", train_ds[0]) # 매핑 결과 확인

except Exception as e:
    print(f"데이터셋 매핑 중 오류 발생: {e}")
    # 오류 발생 시 스크립트 중단 또는 다른 처리
    exit()



Map:   0%|          | 0/2925 [00:00<?, ? examples/s]

Map:   0%|          | 0/325 [00:00<?, ? examples/s]

데이터셋 매핑 및 형식 설정 완료.
훈련 데이터셋 샘플: {'input_ids': tensor([    3,     2,     3,     2,     3,     2,     3,     2,     3,     2,
            3,     2,    10,   784,  4853,  2688,   908,   784, 15805,     2,
          908,     2,     3, 15757,     2,  1220,     2,     5,     2,  5948,
            7,  1303,     7,    15,    32,     5,    35,    29,     5,   157,
           52,     3,     2,     3,     2,    10,     1,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,    

In [None]:
# --- 5. 데이터 콜레이터 정의 ---
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

In [None]:
import numpy as np
import evaluate

# ROUGE 평가 지표 로드
rouge_metric = evaluate.load("rouge")

# 모델 로드 시 사용했던 토크나이저 변수(tokenizer)가 이 함수 범위에서 접근 가능해야 합니다.
# 만약 그렇지 않다면, tokenizer를 함수의 인자로 전달하거나 전역 변수로 사용해야 합니다.
# 예시에서는 tokenizer가 이미 로드되어 있다고 가정합니다.

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # predictions: 모델이 생성한 토큰 ID (NumPy array)
    # labels: 참조 토큰 ID (NumPy array)

    # --- 디버깅 출력 (선택 사항) ---
    print("\n--- Inside compute_metrics (Before Processing) ---")
    print(f"Predictions shape: {predictions.shape}, dtype: {predictions.dtype}, min: {np.min(predictions)}, max: {np.max(predictions)}")
    print(f"Labels shape: {labels.shape}, dtype: {labels.dtype}, min: {np.min(labels)}, max: {np.max(labels)}")
    # --- End Debug Print ---

    try:
        # !!!!! 중요 수정 !!!!!
        # predictions 배열에서도 -100 값을 pad_token_id로 변경
        predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)

        # 레이블의 패딩(-100)을 실제 pad_token_id로 변경
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

        # --- 디버깅 출력 (선택 사항) ---
        print("\n--- Inside compute_metrics (After Processing) ---")
        print(f"Processed Predictions min: {np.min(predictions)}, max: {np.max(predictions)}")
        print(f"Processed Labels min: {np.min(labels)}, max: {np.max(labels)}")
        # --- End Debug Print ---

        # 디코딩 (skip_special_tokens=True는 패딩 등 특수 토큰 제거)
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # 후처리 (공백 제거)
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [label.strip() for label in decoded_labels]

        # ROUGE 점수 계산 (빈 문자열 처리 포함)
        filtered_preds = [p for p, l in zip(decoded_preds, decoded_labels) if p and l]
        filtered_labels = [l for p, l in zip(decoded_preds, decoded_labels) if p and l]

        if not filtered_preds: # 리스트가 비어있으면 0점 반환
            rouge_results = {key: 0.0 for key in ["rouge1", "rouge2", "rougeL", "rougeLsum"]}
        else:
            rouge_results = rouge_metric.compute(
                predictions=filtered_preds,
                references=filtered_labels,
                use_stemmer=True # 한국어의 경우 stemmer 사용 여부 재고 필요할 수 있음
            )

        # 정확도 (Exact Match) 계산
        acc = np.mean([p == l for p, l in zip(decoded_preds, decoded_labels)])

        # 결과 딕셔너리
        result = {"accuracy": acc}
        result.update(rouge_results) # ROUGE 결과 추가

        # 생성된 길이 정보 추가 (디버깅에 유용)
        # pad_token_id가 0이므로, 0이 아닌 토큰 수를 세면 길이가 됨
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
        result["gen_len"] = np.mean(prediction_lens)

        return {k: round(v, 4) for k, v in result.items()} # 로깅 편의를 위해 반올림

    except OverflowError as oe:
        print(f"!!! OverflowError during decoding: {oe} !!!")
        # 여전히 오류 발생 시 추가 디버깅 정보 출력
        print("Sample Processed Predictions (first 10):", predictions[0, :10])
        print("Sample Processed Labels (first 10):", labels[0, :10])
        raise oe
    except Exception as e:
        print(f"!!! Error during metrics computation: {e} !!!")
        # 기타 오류 발생 시
        print("Sample Processed Predictions (first 10):", predictions[0, :10])
        print("Sample Processed Labels (first 10):", labels[0, :10])
        raise e



Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# --- 7. 학습 인자 정의 (수정됨) ---
args = Seq2SeqTrainingArguments(
    output_dir="./link_restorer_stable", # 출력 디렉토리 변경
    per_device_train_batch_size=32,      # 배치 크기 줄임 (메모리 부족 시 더 줄이기)
    per_device_eval_batch_size=32,       # 배치 크기 줄임
    gradient_accumulation_steps=4,       # 유효 배치 크기 유지 (16*4=64)
    learning_rate=2e-5,                  # 학습률 감소
    num_train_epochs=30,                  # 에포크 수 (필요시 조절)
    eval_strategy="steps",               # "evaluation_strategy"를 "eval_strategy"로 변경
    eval_steps=50,                      # 평가 빈도 (데이터셋 크기에 따라 조절)
    logging_steps=50,                    # 로깅 빈도
    predict_with_generate=True,          # 생성 기반 평가 활성화 (필수)
    save_strategy="steps",               # 단계별 저장
    save_steps=250,                      # 저장 빈도 (eval_steps와 맞추는 것이 일반적)
    save_total_limit=2,                  # 최대 체크포인트 저장 수
    fp16=False,                           # FP16 비활성화 (안정성 우선)
    bf16=False, # Ampere 이상 GPU + 최신 torch 사용 시 FP16 대신 고려
    report_to="none",                    # 리포팅 대상 (필요시 "wandb" 등 설정)
    max_grad_norm=1.0,                   # Gradient Clipping 적용
    lr_scheduler_type="linear",          # Linear 학습률 스케줄러 사용
    warmup_steps=100,                    # Warmup 단계 수 (데이터셋 크기에 따라 조절)
    load_best_model_at_end=True,         # 최적 모델 로드
    metric_for_best_model="accuracy",    # 최적 모델 선정 기준 (또는 "rougeL")
    seed=42,                             # 재현성을 위한 시드 고정
    generation_max_length=128,
)

In [None]:
# --- 8. 트레이너 정의 ---
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics # 수정된 평가 함수 사용
)

  trainer = Seq2SeqTrainer(


In [None]:
# --- 9. 학습 시작 ---
print("학습을 시작합니다...")
try:
    trainer.train()
    print("학습 완료!")
except Exception as e:
    print(f"학습 중 오류 발생: {e}")

학습을 시작합니다...


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Accuracy,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
50,37.5801,26.171736,0.0062,0.1347,0.0933,0.1345,0.1349,75.3231
100,13.6361,4.329117,0.0,0.223,0.1705,0.2236,0.2219,45.1538
150,3.0468,0.578946,0.0,0.4,0.0,0.4,0.4,0.0123
200,0.7104,0.105887,0.0862,0.567,0.3933,0.5688,0.5682,9.08
250,0.1609,0.04176,0.5508,0.8077,0.7086,0.8071,0.8077,10.6523
300,0.064,0.030927,0.5877,0.8311,0.717,0.8327,0.8323,10.5938
350,0.0439,0.026402,0.6185,0.8523,0.7458,0.8534,0.8537,10.5138
400,0.0362,0.023835,0.6523,0.8587,0.7602,0.8595,0.8592,10.9969
450,0.0319,0.0223,0.6892,0.8706,0.7768,0.8721,0.8717,11.4338
500,0.0293,0.021318,0.6954,0.8769,0.7806,0.8777,0.8771,11.3138



--- Inside compute_metrics (Before Processing) ---
Predictions shape: (325, 128), dtype: int64, min: 0, max: 31032
Labels shape: (325, 256), dtype: int64, min: 0, max: 28803

--- Inside compute_metrics (After Processing) ---
Processed Predictions min: 0, max: 31032
Processed Labels min: 0, max: 28803

--- Inside compute_metrics (Before Processing) ---
Predictions shape: (325, 128), dtype: int64, min: 0, max: 32110
Labels shape: (325, 256), dtype: int64, min: 0, max: 28803

--- Inside compute_metrics (After Processing) ---
Processed Predictions min: 0, max: 32110
Processed Labels min: 0, max: 28803

--- Inside compute_metrics (Before Processing) ---
Predictions shape: (325, 128), dtype: int64, min: 0, max: 5339
Labels shape: (325, 256), dtype: int64, min: 0, max: 28803

--- Inside compute_metrics (After Processing) ---
Processed Predictions min: 0, max: 5339
Processed Labels min: 0, max: 28803

--- Inside compute_metrics (Before Processing) ---
Predictions shape: (325, 128), dtype: int

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


학습 완료!


In [None]:
# --- 10. 결과 확인 및 저장 (선택적) ---
print("최종 평가 결과:")
eval_results = trainer.evaluate()
print(eval_results)

최종 평가 결과:



--- Inside compute_metrics (Before Processing) ---
Predictions shape: (325, 128), dtype: int64, min: -100, max: 28803
Labels shape: (325, 256), dtype: int64, min: 0, max: 28803

--- Inside compute_metrics (After Processing) ---
Processed Predictions min: 0, max: 28803
Processed Labels min: 0, max: 28803
{'eval_loss': 0.021317772567272186, 'eval_accuracy': 0.6954, 'eval_rouge1': 0.8769, 'eval_rouge2': 0.7806, 'eval_rougeL': 0.8777, 'eval_rougeLsum': 0.8771, 'eval_gen_len': 11.3138, 'eval_runtime': 13.3159, 'eval_samples_per_second': 24.407, 'eval_steps_per_second': 0.826, 'epoch': 30.0}


In [None]:
# 모델 저장
trainer.save_model("./link_restorer_stable/final_model")
tokenizer.save_pretrained("./link_restorer_stable/final_model")
print("최종 모델과 토크나이저가 './link_restorer_stable/final_model'에 저장되었습니다.")

최종 모델과 토크나이저가 './link_restorer_stable/final_model'에 저장되었습니다.


In [None]:
# 1. 모델과 토크나이저 저장
trainer.save_model("./link_restorer_stable/final_model")
tokenizer.save_pretrained("./link_restorer_stable/final_model")

# 2. zip으로 압축
import shutil
shutil.make_archive("./link_restorer_stable/final_model", 'zip', "./link_restorer_stable/final_model")

'/content/link_restorer_stable/final_model.zip'

In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 모델과 토크나이저 로드
model_path = "/content/link_restorer_stable/final_model"  # 학습된 모델 경로
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)  # 모델을 GPU로 이동

def predict_link(input_text):
    """
    입력 메시지에서 복원된 링크를 예측하는 함수.
    모델이 링크 하나만 정확하게 출력하도록 유도하며, 결과에서 실제 링크만 정제해서 반환합니다.
    """
    # 명확한 지시 포함한 프롬프트
    prompt = (
        f"다음 문자에서 도박 사이트 링크를 복원하세요.\n"
        f"반드시 하나의 링크만 정확하게 출력하세요. \n"
        f"링크 시작은 'http://'로 구성되어 있어야 합니다."
        f"문자: {input_text}\n복원된 링크:"
    )

    # 토큰화 및 디바이스 이동
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # 모델 예측
    outputs = model.generate(
        **inputs,
        max_new_tokens=32,
        num_beams=4,
        early_stopping=True
    )

    # 결과 디코딩
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    # 링크만 추출 (http/https 또는 도메인 패턴)
    import re
    match = re.search(r'(https?://[^\s]+|[a-zA-Z0-9\-]+\.[a-z]{2,}[^\s]*)', decoded_output)
    restored_link = match.group(0) if match else decoded_output  # 없으면 그대로 반환

    return restored_link


In [None]:
example_input = "[국제발신] 슈퍼스타 [ abit.ly/슈퍼스타 ] 신 𝟒𝟎 컴 𝟒 규 %  프 % 입력 :  82𝓣𝓥 무 제 한 [𝟙억] 보증"
print(predict_link(example_input))

abit.ly/
