In [None]:
# !wandb login

In [None]:
# import wandb
# wandb.init(project="dacon-roberta-small")

In [None]:
# Shell escape: print the current GPU status before claiming a device.
!nvidia-smi

In [None]:
# Leftover from an interactive session: 27928 was a process id that was valid only at
# that moment (presumably a stale job seen in the nvidia-smi output above - confirm).
# Running it unconditionally on "Run All" would SIGKILL whatever process happens to own
# that PID now, so it is disabled. To free the GPU, look up the current PID and run the
# command by hand.
# !kill -9 27928

In [None]:
import os
# Process-wide settings; they must be in place before torch/transformers touch CUDA.
os.environ.update(
    {
        # Number the GPUs by PCI bus id and expose only device 0 to this process.
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "0",
        # Switch off Weights & Biases reporting for this run.
        "WANDB_DISABLED": "true",
    }
)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, logging
import torch
import torch.nn as nn

# `logging` here is transformers.logging (imported above): raise its verbosity to INFO.
logging.set_verbosity_info()

# Options

In [None]:
# Maps each Korean label-column name to the English suffix used in the dataset file names.
category_dict = {
    "유형": "type",
    "극성": "polarity",
    "시제": "tense",
    "확실성": "certainty",
}

# Label column trained in this run; must be one of the keys of category_dict.
category = "유형"
# English counterpart of `category` (type / polarity / tense / certainty).
english_category = category_dict[category]

# Hugging Face checkpoint used for both the tokenizer and the classifier.
pretrained_model_name_or_path = "kykim/electra-kor-base"

# Prepare Dataset

In [None]:
from datasets import load_dataset

# Load the per-category CSV files; the validation file is exposed as the "test" split.
ds = load_dataset(
    "csv",
    data_files={
        "train": f"dataset/train_data_{english_category}.csv",
        "test": f"dataset/validation_data_{english_category}.csv",
    },
)

In [None]:
from datasets import ClassLabel

# Distinct label strings of the selected category, taken from the training split.
# They are sorted because iterating a bare set of strings yields an arbitrary order that
# can change between interpreter sessions, which would silently change the label -> id
# mapping from one run to the next.
names = sorted(set(ds["train"][category]))
num_labels = len(names)
cl = ClassLabel(num_classes=num_labels, names=names)
# Integer class id -> label string, in the same order as the ClassLabel feature.
id2label = dict(enumerate(cl.names))

# Re-encode the category column of every split as ClassLabel ids.
ds = ds.cast_column(category, cl)

# Model

In [None]:
# Tokenizer and classification model are loaded from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path,
    num_labels=num_labels,
    id2label=id2label,
    # Supply the inverse mapping as well so that config.label2id agrees with id2label
    # (the config stores the two mappings separately).
    label2id={label: idx for idx, label in id2label.items()},
)

# Preprocess

In [None]:
# Every column of the training split except the model inputs and the label column
# is dropped during tokenization.
remove_columns = [
    column
    for column in ds["train"].features
    if column not in {"input_ids", "token_type_ids", "attention_mask", category}
]
remove_columns

In [None]:
def tokenize_function(batch):
    """Tokenize the "문장" (sentence) column of a batch.

    Sequences are padded with ``padding="max_length"`` and over-long ones are
    truncated, so every encoded example has the same length.
    """
    return tokenizer(batch["문장"], truncation=True, padding="max_length")


# Tokenize all splits in batches and drop the columns the model does not consume.
ds = ds.map(tokenize_function, remove_columns=remove_columns, batched=True)

In [None]:
# Return torch tensors and rename the label column to "labels".
ds = ds.with_format("torch").rename_column(category, "labels")

# Metrics

In [None]:
class ConfiguredMetric:
    """Wrap a metric so that fixed arguments are applied on every ``compute`` call.

    This lets a metric that needs extra arguments (for example
    ``average='weighted'``) be used by a caller that only passes predictions
    and references. All other calls are forwarded to the wrapped metric.
    """

    def __init__(self, metric, *metric_args, **metric_kwargs):
        """Store the wrapped ``metric`` and the arguments to add on ``compute``.

        Args:
            metric: Object exposing ``add``, ``add_batch``, ``compute``,
                ``name`` and ``_feature_names``.
            *metric_args: Positional arguments appended to every ``compute`` call.
            **metric_kwargs: Keyword arguments added to every ``compute`` call.
        """
        self.metric = metric
        self.metric_args = metric_args
        self.metric_kwargs = metric_kwargs

    def add(self, *args, **kwargs):
        """Forward to ``metric.add`` unchanged."""
        return self.metric.add(*args, **kwargs)

    def add_batch(self, *args, **kwargs):
        """Forward to ``metric.add_batch`` unchanged."""
        return self.metric.add_batch(*args, **kwargs)

    def compute(self, *args, **kwargs):
        """Call ``metric.compute`` with the configured arguments added.

        Call-time positional arguments come first, followed by the configured
        ones. Keyword arguments given at call time take precedence over the
        configured keyword arguments of the same name.
        """
        # Merge into one dict rather than unpacking both mappings in the call:
        # double unpacking raises TypeError as soon as a key appears in both.
        merged_kwargs = {**self.metric_kwargs, **kwargs}
        return self.metric.compute(*args, *self.metric_args, **merged_kwargs)

    @property
    def name(self):
        """Name of the wrapped metric."""
        return self.metric.name

    def _feature_names(self):
        """Forward to the wrapped metric's ``_feature_names``."""
        return self.metric._feature_names()

In [None]:
import evaluate
import numpy as np

# Accuracy plus weighted-average precision / recall / F1, evaluated together.
metrics = evaluate.combine(
    [evaluate.load('accuracy')]
    + [
        ConfiguredMetric(evaluate.load(metric_name), average='weighted')
        for metric_name in ('precision', 'recall', 'f1')
    ]
)

def compute_metrics(eval_pred):
    """Score one evaluation pass with the combined ``metrics``.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    argmax of the logits over the last axis.
    """
    scores, gold = eval_pred
    return metrics.compute(predictions=np.argmax(scores, axis=-1), references=gold)

# Trainer

In [None]:
from datetime import datetime
# Run label "[<category>] <YYYY-mm-dd_HHMMSS>", used below for the output directory
# and the run name.
now = datetime.now()
name = f"[{category}] {now:%Y-%m-%d_%H%M%S}"
name

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    # Output locations and run identity
    output_dir=f'./results/{name}',
    logging_dir='./logs',
    run_name=name,
    logging_steps=50,
    # Optimisation schedule
    num_train_epochs=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    fp16=True,
    # Batching: 8 examples per device, gradients accumulated over 8 steps
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=64,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with the best "f1"
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [None]:
from transformers import DataCollatorWithPadding

# Trains on the "train" split, evaluates on the "test" split and scores each
# evaluation with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
)

In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

# Inference

In [None]:
import datasets
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.notebook import tqdm

# The prediction file doubles as the accumulator: each per-category run adds one column.
tds = load_dataset(
    "csv",
    data_files={"predict": "data/result.csv"}
)

# If this category was already predicted in an earlier run, drop the stale column first;
# add_column below refuses to add a column whose name already exists.
if category in tds["predict"].column_names:
    tds["predict"] = tds["predict"].remove_columns(category)

pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)
sentences = KeyDataset(tds["predict"], "문장")

# truncation=True mirrors the training-time tokenization so that over-long sentences are
# cut instead of exceeding the model's maximum input length; batch_size (same value as
# per_device_eval_batch_size) batches the forward passes instead of running one
# sentence at a time.
result = [
    out["label"]
    for out in tqdm(pipe(sentences, batch_size=64, truncation=True), total=len(sentences))
]

tds["predict"] = tds["predict"].add_column(name=category, column=result)

In [None]:
import pandas as pd

# Export through the Dataset's own converter instead of handing the Dataset to the
# DataFrame constructor, which walks it row by row in Python.
df = tds["predict"].to_pandas()
# Write the predictions back to the file they were read from, so the next
# per-category run sees the columns added so far.
df.to_csv("data/result.csv", encoding="UTF-8", index=False)

In [None]:
# Display which columns (and therefore which category predictions) df currently holds.
df.columns

In [None]:
# Write the submission only when all four category columns are present in df;
# otherwise do nothing.
if all(column in df.columns for column in ("유형", "극성", "시제", "확실성")):
    # Final label is the four category predictions joined with "-".
    df["label"] = df["유형"] + "-" + df["극성"] + "-" + df["시제"] + "-" + df["확실성"]
    df = df[["ID", "label"]]
    df.to_csv("submission.csv", encoding="UTF-8", index=False)