In [1]:
import tensorflow as tf
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments
from datasets import Dataset
import sentencepiece as spm
import argparse
import pandas as pd
import re
import transformers
import numpy as np
import evaluate

print(transformers.__version__)

4.27.1


In [2]:
train_data = pd.read_table("./ratings_train.txt")
test_data = pd.read_table("./ratings_test.txt")

In [3]:
train_data = train_data.dropna(axis=0)
test_data= test_data.dropna(axis=0)

train_sentences = train_data["document"]
train_label = list(train_data["label"])

test_sentences = test_data["document"]
test_label = list(test_data["label"])

In [4]:
def preprocess(sentence):
    sentence = re.sub(r"[^\uAC00-\uD7A30-9a-zA-Z\s.,?!\'\"]", "", sentence)
    sentence = re.sub("ㅋ+", "ㅋㅋ", sentence)
    sentence = re.sub("ㅎ+", "ㅎㅎ", sentence)
    sentence = re.sub("ㅜ+", "ㅜ", sentence)
    sentence = re.sub("ㅠ+", "ㅠ", sentence)
    sentence = re.sub("\"+", "\"", sentence)
    sentence = re.sub(r"\.+", "..", sentence)
    sentence = re.sub("!+", "!", sentence)
    sentence = re.sub("\?+", "?", sentence)
    sentence.lower().strip()
    
    return sentence

In [5]:
train_corpus = []
train_labels = []
for i, sentence in enumerate(train_sentences):
    train_corpus.append(preprocess(sentence))
    train_labels.append(train_label[i])

In [6]:
test_corpus = []
test_labels = []
for i, sentence in enumerate(test_sentences):
    test_corpus.append(preprocess(sentence))
    test_labels.append(test_label[i])

In [7]:
train_data = pd.DataFrame([i for i in zip(train_corpus, train_labels)], columns = ["text", "labels"])
test_data = pd.DataFrame([i for i in zip(test_corpus, test_labels)], columns = ["text", "labels"])

In [8]:
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

In [9]:
train_dataset = train_dataset.class_encode_column("labels")
test_dataset = test_dataset.class_encode_column("labels")

Stringifying the column:   0%|          | 0/149995 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/149995 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/49997 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/49997 [00:00<?, ? examples/s]

In [10]:
model =  AutoModelForSequenceClassification.from_pretrained("klue/bert-base", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

Downloading (…)okenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [11]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding=False)

In [12]:
train_datasets = train_dataset.map(tokenize_function, batched=True)
test_datasets = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/149995 [00:00<?, ? examples/s]

Map:   0%|          | 0/49997 [00:00<?, ? examples/s]

In [13]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [27]:
training_args = TrainingArguments(
    output_dir="./nsmc",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

In [28]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_datasets,
    eval_dataset=test_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [29]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2423,0.252106,0.900774
2,0.1905,0.247572,0.904354
3,0.1249,0.314475,0.904734
4,0.0917,0.376837,0.903634
5,0.06,0.477918,0.902954


TrainOutput(global_step=23440, training_loss=0.14751171284568187, metrics={'train_runtime': 10848.1234, 'train_samples_per_second': 69.134, 'train_steps_per_second': 2.161, 'total_flos': 2.818518700000836e+16, 'train_loss': 0.14751171284568187, 'epoch': 5.0})

In [26]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()