In [1]:
import random
import logging
from IPython.display import display, HTML
import os

import numpy as np
import pandas as pd
import datasets
from datasets import load_dataset, load_metric, ClassLabel, Sequence
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
def seed_everything(seed: int = 17):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore
    
seed_everything(17)

In [3]:
model_checkpoint = "klue/bert-base"
batch_size = 64
task = "nli"
MODEL_P = "models/klue-bert-base-augmented.pth"
RANDOM_SEED = 17

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [5]:
dataset = pd.read_csv("data/train_data.csv",index_col=False)
test = pd.read_csv("data/test_data.csv",index_col=False)

In [25]:
for i in range(len(dataset['title'])):
    if "..." in dataset['title'][i]:
        dataset['title'][i] = dataset['title'][i].replace("."," ")
        
for i in range(len(test['title'])):
    if "..." in test['title'][i]:
        test['title'][i] = test['title'][i].replace("."," ")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['title'][i] = dataset['title'][i].replace("."," ")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['title'][i] = test['title'][i].replace("."," ")


In [26]:
dataset_train, dataset_val = train_test_split(dataset,test_size = 0.2,random_state = RANDOM_SEED)

In [27]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_key, label_key, bert_tokenizer):
        
        self.sentences = [ bert_tokenizer(i,truncation=True,return_token_type_ids=False) for i in dataset[sent_key] ]
        
        if not label_key == None:
            self.mode = "train"
        else:
            self.mode = "test"
            
        if self.mode == "train":
            self.labels = [np.int64(i) for i in dataset[label_key]]
        else:
            self.labels = [np.int64(0) for i in dataset[sent_key]]

    def __getitem__(self, i):
        if self.mode == "train":
            self.sentences[i]["label"] = self.labels[i]
            return self.sentences[i]
#             return ( self.sentences[i] , self.labels[i] )
        else:
            return self.sentences[i]

    def __len__(self):
        return (len(self.labels))


In [28]:
data_train = BERTDataset(dataset_train, "title", "topic_idx", tokenizer)
data_val = BERTDataset(dataset_val, "title", "topic_idx", tokenizer)
data_test = BERTDataset(test, "title", None, tokenizer)

In [29]:
num_labels = 7
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

loading configuration file https://huggingface.co/klue/bert-base/resolve/main/config.json from cache at C:\Users\or7l0/.cache\huggingface\transformers\fbd0b2ef898c4653902683fea8cc0dd99bf43f0e082645b913cda3b92429d1bb.7cee10e8ea7ffa278f8be4b141000263f2b18795e5ef5e025352b2af6851f8fb
Model config BertConfig {
  "architectures": [
    "BertForPretraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_

In [30]:
metric = load_metric("glue", "qnli")

In [31]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [32]:
metric_name = "accuracy"

args = TrainingArguments(
    "test-nli",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [33]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=data_train,
    eval_dataset=data_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [34]:
trainer.train()

***** Running training *****
  Num examples = 36523
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 2855


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4572,0.327837,0.888074
2,0.275,0.324671,0.886759
3,0.2214,0.346445,0.882817
4,0.1788,0.365311,0.887197
5,0.1366,0.383788,0.88665


***** Running Evaluation *****
  Num examples = 9131
  Batch size = 64
Saving model checkpoint to test-nli\checkpoint-571
Configuration saved in test-nli\checkpoint-571\config.json
Model weights saved in test-nli\checkpoint-571\pytorch_model.bin
tokenizer config file saved in test-nli\checkpoint-571\tokenizer_config.json
Special tokens file saved in test-nli\checkpoint-571\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 9131
  Batch size = 64
Saving model checkpoint to test-nli\checkpoint-1142
Configuration saved in test-nli\checkpoint-1142\config.json
Model weights saved in test-nli\checkpoint-1142\pytorch_model.bin
tokenizer config file saved in test-nli\checkpoint-1142\tokenizer_config.json
Special tokens file saved in test-nli\checkpoint-1142\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 9131
  Batch size = 64
Saving model checkpoint to test-nli\checkpoint-1713
Configuration saved in test-nli\checkpoint-1713\config.json
Model weight

TrainOutput(global_step=2855, training_loss=0.2369660639721124, metrics={'train_runtime': 515.8269, 'train_samples_per_second': 354.024, 'train_steps_per_second': 5.535, 'total_flos': 2742463069747872.0, 'train_loss': 0.2369660639721124, 'epoch': 5.0})

In [35]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 9131
  Batch size = 64


{'eval_loss': 0.327837198972702,
 'eval_accuracy': 0.8880735954440916,
 'eval_runtime': 6.564,
 'eval_samples_per_second': 1391.069,
 'eval_steps_per_second': 21.785,
 'epoch': 5.0}

In [16]:
pred = trainer.predict(data_test)
pred = pred[0]
pred = np.argmax(pred,1)
submission = pd.read_csv('data/sample_submission.csv')
submission['topic_idx'] = pred
submission.to_csv("results/klue-bert-hyperparameter-tuning.csv",index=False)

***** Running Prediction *****
  Num examples = 9131
  Batch size = 64


In [21]:
test

Unnamed: 0,index,title
0,45654,유튜브 내달 2일까지 크리에이터 지원 공간 운영
1,45655,어버이날 맑다가 흐려져…남부지방 옅은 황사
2,45656,내년부터 국가RD 평가 때 논문건수는 반영 않는다
3,45657,김명자 신임 과총 회장 원로와 젊은 과학자 지혜 모을 것
4,45658,회색인간 작가 김동식 양심고백 등 새 소설집 2권 출간
...,...,...
9126,54780,인천 오후 3시35분 대설주의보…눈 3.1cm 쌓여
9127,54781,노래방에서 지인 성추행 외교부 사무관 불구속 입건종합
9128,54782,40년 전 부마항쟁 부산 시위 사진 2점 최초 공개
9129,54783,게시판 아리랑TV 아프리카개발은행 총회 개회식 생중계


In [24]:
for idx, sentence in enumerate(test["title"]):
    if "..." in sentence:
#         print(idx, sentence)
        
        if pred[idx] != 5:
            print(idx, pred[idx])

2422 2
3801 6
3963 2
6396 2
7213 6


In [22]:
pred[33]

5