<a href="https://colab.research.google.com/github/raymondkang4837/2048/blob/master/T5_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### IMPORT


In [3]:
import pandas as pd

from sklearn.model_selection import train_test_split

from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import torch

from tqdm import tqdm

### DATA LOAD & Pre-Processing

In [4]:
train = pd.read_csv('/content/train.csv')

In [5]:
# 입력 텍스트와 타켓 텍스트를 구성하는 함수
def make_input(row):
    sentences = [ row[f"sentence_{i}"] for i in range(4)]
    input_text = '문장을 순서대로 정렬하세요:' + '</s>'.join(sentences)
    answer = [row[f'answer_{i}'] for i in range(4)]
    target_text = ''.join(map(str, answer)) # ex) '0 3 1 2'
    return { 'input' : input_text, 'target' : target_text}


In [6]:
# 데이터셋 가공 및 분할
inputs = train.apply(make_input, axis=1).tolist()
train_data, valid_data = train_test_split(inputs, test_size= 0.2, random_state= 42)

train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))
valid_dataset = Dataset.from_pandas(pd.DataFrame(valid_data))

### Dataset.from_pandas() :

 - Hugging Face의 Dataset 객체로 변환

 - 학습 파이프라인에서 쓰기 좋게 바꿔주는 작업

### Model Load

In [7]:
# 토크나이저 및 모델 로딩
model_name = 't5-small'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

### Train

In [8]:
# 토크나이징 함수 정의
def tokenize(example):
    model_inputs = tokenizer(example['input'], max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(example['target'], max_length=16, truncation=True, padding='max_length')
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# 데이터셋 토크나이징
tokenized_train = train_dataset.map(tokenize, batched=True)
tokenized_valid = valid_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/5880 [00:00<?, ? examples/s]

Map:   0%|          | 0/1471 [00:00<?, ? examples/s]

In [10]:
# 학습 설정
training_args = TrainingArguments(
    output_dir = './result',
    learning_rate = 3e-5,
    per_device_train_batch_size = 10,
    num_train_epochs = 1,
)

# Trainer 정의 및 학습
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_train,
    eval_dataset = tokenized_valid,
)

trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mraymondkang4837[0m ([33mraymondkang4837-s[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,16.7712
20,12.766
30,9.5202
40,6.3161
50,3.5053
60,2.1339
70,1.7824
80,1.5953
90,1.4328
100,1.3027


TrainOutput(global_step=735, training_loss=1.1716843408792197, metrics={'train_runtime': 341.0081, 'train_samples_per_second': 17.243, 'train_steps_per_second': 2.155, 'total_flos': 795809792655360.0, 'train_loss': 1.1716843408792197, 'epoch': 1.0})

# 새 섹션

In [11]:
tokenizer.save_pretrained('./results')
model.save_pretrained('./results')

### Inference

In [12]:
# 모델 로드
model_dir = "./results"
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [13]:
# 테스트 데이터
test = pd.read_csv("./test.csv")
sentences = test[[f"sentence_{i}" for i in range(4)]].values.tolist()

# 추론 함수
def predict_order(sent_list):
    input_text = "문장을 순서대로 정렬하세요: " + " </s> ".join(sent_list)
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding="longest",
        max_length=512
    ).to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_length=16,
            do_sample=True,
            temperature=0.2,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    decoded = tokenizer.decode(output[0], skip_special_tokens=True)

    try:
        order = list(map(int, decoded.strip().split()))
        return order
    except:
        return [0, 1, 2, 3]

# 예측
predictions = []
for sent_group in tqdm(sentences, desc="Predicting"):
    pred = predict_order(sent_group)
    predictions.append(pred)

Predicting: 100%|██████████| 1780/1780 [04:26<00:00,  6.67it/s]


### Submission

In [14]:
# sample_submission 불러오기
sample_submission = pd.read_csv("./sample_submission.csv")

# 예측 결과 적용
for i in range(4):
    sample_submission[f"answer_{i}"] = [
        pred[i] if len(pred) == 4 else i for pred in predictions
    ]

# 저장
sample_submission.to_csv("baseline_submission.csv", index=False)