# packages

In [1]:
import numpy as np

from tqdm.notebook import tqdm

import torch ## version >= 1.8.2
import torch.nn as nn

import pytorch_lightning as pl ## version == 1.4.9

import datasets ## version == 2.1.0

from transformers import AutoTokenizer, AutoModel ## version == 4.12.3

In [2]:
RANDOM_SEED = 42
pl.seed_everything(RANDOM_SEED)

Global seed set to 42


42

# data

In [3]:
from datasets import load_dataset

dataset = load_dataset("searle-j/kote")

No config specified, defaulting to: kote/dichotomized
Reusing dataset kote (/home/ubuntu/.cache/huggingface/datasets/searle-j___kote/dichotomized/0.0.0/9e18d6e4c5fb5b54c412810da99dfa5e5ece83c40924ee5eb3f41ce5b4d5b436)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
## check the shape.

dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'text', 'labels'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['ID', 'text', 'labels'],
        num_rows: 5000
    })
    validation: Dataset({
        features: ['ID', 'text', 'labels'],
        num_rows: 5000
    })
})

In [5]:
## check one sample in the train set.

dataset["train"][25597]

{'ID': '32521',
 'text': '구슬픈 봄날 저녁 무렵, 도시의 뒤섞여 있는 건축과 건축의 그림자를 찾아서 커다란 군중 속에 휩쓸려 가는 것은 얼마나 즐거운 일인가. <우울한 고양이_하기와라 사쿠타로>15',
 'labels': [2, 4, 5, 13, 14, 15, 16, 27, 28, 38, 40, 42]}

In [6]:
## convert the integer labels into multi-hot form (44-dimensional).

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(dataset["train"]["labels"])
test_labels = mlb.fit_transform(dataset["test"]["labels"])
val_labels = mlb.fit_transform(dataset["validation"]["labels"])

print("train_labels shape ::: {}".format(train_labels.shape))
print("test_labels shape :::: {}".format(test_labels.shape))
print("val_labels shape ::::: {}".format(val_labels.shape))
print("\ncool..!!")

train_labels shape ::: (40000, 44)
test_labels shape :::: (5000, 44)
val_labels shape ::::: (5000, 44)

cool..!!


In [7]:
## extract the texts, since we will use a custom datset not the huggingface dataset.

train_texts = dataset["train"]["text"]
test_texts = dataset["test"]["text"]
val_texts = dataset["validation"]["text"]

In [8]:
## and the label names...

LABELS = dataset["train"].features["labels"].feature.names

# tokenization

In [9]:
## download the pretrained tokenizer from huggingface.

MODEL_NAME = "beomi/KcELECTRA-base" # <-- Thank you!
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [10]:
## let us mask and switch some tokens in the train set for a better performance.

In [11]:
def token_masking(encoding, prob):
    for i, token in enumerate(encoding["input_ids"][0]):
        if token not in [0,1,2,3]: # 0 ~ 3, [PAD], [UNK], [CLS], and [SEP], respectively.
            if np.random.uniform(0,1) < prob:
                encoding["input_ids"][0][i] = 4 # 4 is [MASK]
                
    return encoding

In [12]:
def token_switching(encoding, prob):
    for i, token in enumerate(encoding["input_ids"][0]):
        if token not in [0,1,2,3,4]: # 0 ~ 4, [PAD], [UNK], [CLS], [SEP], and [MASK], respectively.
            if np.random.uniform(0,1) < prob:
                encoding["input_ids"][0][i] = np.random.choice(np.arange(5,tokenizer.vocab_size), 1)[0]
                
    return encoding

In [13]:
def mask_and_switch(encoding, prob=0.1):
    encoding = token_masking(encoding, prob/2)
    encoding = token_switching(encoding, prob/2)
    
    return encoding

# custom dataset

In [14]:
from torch.utils.data import Dataset

In [15]:
## maximum token lengths

MAX_LENGTH = 512

In [16]:
## define our dataset...!

class KOTEDataset(Dataset):
    
    def __init__(self, texts, labels, tokenizer, max_length:int=MAX_LENGTH,
                would_you_like_some_mask_and_switch:bool=False):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mask = would_you_like_some_mask_and_switch
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx:int):
        text = self.texts[idx]
        labels = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            return_token_type_ids=False,
        )
        
        if self.mask:
            encoding = mask_and_switch(encoding, prob=0.1)
        else:
            pass
        
        return dict(
          input_ids=encoding["input_ids"].flatten(),
          attention_mask=encoding["attention_mask"].flatten(),
          labels=torch.FloatTensor(labels), ## must be a float tensor.
        )

In [17]:
## create the datasets.

train_dataset = KOTEDataset(train_texts, train_labels, tokenizer=tokenizer, would_you_like_some_mask_and_switch=True)
test_dataset = KOTEDataset(test_texts, test_labels, tokenizer=tokenizer)
val_dataset = KOTEDataset(val_texts, val_labels, tokenizer=tokenizer)

# modeling

In [18]:
## download the pretrained electra model.

electra = AutoModel.from_pretrained(MODEL_NAME, return_dict=True)

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
## we will use the default arguments, except for the last gelu for classification.

electra.config

ElectraConfig {
  "_name_or_path": "beomi/KcELECTRA-base",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "tokenizer_class": "BertTokenizer",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50135
}

# dataloader with pl

In [20]:
from torch.utils.data import DataLoader

In [21]:
class KOTEDataModule(pl.LightningDataModule):
    
    def __init__(self, train_dataset, test_dataset, val_dataset, batch_size=32):
        super().__init__()
        self.batch_size = batch_size
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.val_dataset = val_dataset

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=6, ## choose a befitting number depending on your environment.
        )
    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            num_workers=6, ## choose a befitting number depending on your environment.
        )
    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=6, ## choose a befitting number depending on your environment.
        )

In [22]:
BATCH_SIZE = 32 ## about 28 ~ 30 Gb memory required, if my memory serves me right.

data_module = KOTEDataModule(
  train_dataset,
  test_dataset,
  val_dataset,
  batch_size=BATCH_SIZE,
)

# model

In [23]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [24]:
INITIAL_LR = 2e-5

In [25]:
class KOTETagger(pl.LightningModule):
    
    def __init__(self, n_training_steps=None, n_warmup_steps=None, gamma_for_expLR=None):
        super().__init__()
        self.electra = electra
        self.classifier = nn.Linear(self.electra.config.hidden_size, 44) ## the label dimension == 44 <-- what an ominous number for asians though... <-- I didn't intend it!
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        
        ## the loss
        self.criterion = nn.BCELoss()
        
    def forward(self, input_ids, attention_mask, labels=None):
        output = self.electra(input_ids, attention_mask=attention_mask)
        output = output.last_hidden_state[:,0,:] ## [CLS] of the last hidden state
        output = self.classifier(output)
        output = torch.sigmoid(output)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        
        torch.cuda.empty_cache()
        
        return loss, output
    
    def step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self.forward(input_ids, attention_mask, labels)

        preds = outputs

        y_true = list(labels.detach().cpu())
        y_pred = list(preds.detach().cpu())

        return {"loss": loss, "y_true": y_true, "y_pred": y_pred}
    
    def training_step(self, batch, batch_idx):
        return self.step(batch, batch_idx)

    def validation_step(self, batch, batch_idx):
        return self.step(batch, batch_idx)
    
    def epoch_end(self, outputs, state="train"):
        loss = torch.tensor(0, dtype=torch.float)
        for out in outputs:
            loss += out["loss"].detach().cpu()
        loss = loss / len(outputs)

        y_true = []
        y_pred = []
        for out in outputs:
            y_true += out["y_true"]
            y_pred += out["y_pred"]

        self.log(state + "_loss", float(loss), on_epoch=True, prog_bar=True)
        print(f"[Epoch {self.trainer.current_epoch} {state.upper()}] Loss: {loss}")
        return {"loss": loss}
    
    def training_epoch_end(self, outputs):
        self.epoch_end(outputs, state="train")

    def validation_epoch_end(self, outputs):
        self.epoch_end(outputs, state="val")
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=INITIAL_LR)
        
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps
        )
        
        return dict(
          optimizer=optimizer,
          lr_scheduler=dict(
            scheduler=scheduler,
            interval="step"
          )
        )

In [26]:
## determine the schedule for our optimizer

N_EPOCHS = 10

steps_per_epoch = len(train_dataset) // BATCH_SIZE
TOTAL_STEPS = steps_per_epoch * N_EPOCHS
WARMUP_STEPS = TOTAL_STEPS // 5
WARMUP_STEPS, TOTAL_STEPS

(2500, 12500)

In [27]:
## define the model.

model = KOTETagger(
    n_warmup_steps=WARMUP_STEPS,
    n_training_steps=TOTAL_STEPS,
)

# training

In [28]:
## set a logger and some stuffs...

from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

## the check point
checkpoint_callback = ModelCheckpoint(
    ###dirpath="YOUR DIRECTORY PATH",
    filename="epoch{epoch}-val_loss{val_loss:.4f}",
    monitor="val_loss",
    save_top_k=1,
    mode="min",
    auto_insert_metric_name=False,
)

## for early stopping
early_stopping_callback = EarlyStopping(monitor="val_loss", patience=5, min_delta=0.00)

## the logger
logger = TensorBoardLogger("YOUR_FOLDER_NAME", name="ONE_MORE_FOLDER_NAME")

In [29]:
## trainer

N_EPOCHS = 15 ## redefine the number of the epochs, just to make sure there is no more room to improve.

trainer = pl.Trainer(
    logger=logger,
    callbacks=[checkpoint_callback, early_stopping_callback],
    max_epochs=N_EPOCHS,
    gpus=[2], ## GPU number
    progress_bar_refresh_rate=5
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [30]:
## about 4 ~ 5 hours to reach the optimum...

trainer.fit(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name       | Type         | Params
--------------------------------------------
0 | electra    | ElectraModel | 123 M 
1 | classifier | Linear       | 33.8 K
2 | criterion  | BCELoss      | 0     
--------------------------------------------
123 M     Trainable params
0         Non-trainable params
123 M     Total params
495.953   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


[Epoch 0 VAL] Loss: 0.7097086906433105


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

[Epoch 0 VAL] Loss: 0.3745824098587036
[Epoch 0 TRAIN] Loss: 0.4992274045944214


Validating: 0it [00:00, ?it/s]

[Epoch 1 VAL] Loss: 0.3168054521083832
[Epoch 1 TRAIN] Loss: 0.34891679883003235


Validating: 0it [00:00, ?it/s]

[Epoch 3 VAL] Loss: 0.2815024256706238
[Epoch 3 TRAIN] Loss: 0.2872730791568756


Validating: 0it [00:00, ?it/s]

[Epoch 4 VAL] Loss: 0.27795523405075073
[Epoch 4 TRAIN] Loss: 0.27514782547950745


Validating: 0it [00:00, ?it/s]

[Epoch 5 VAL] Loss: 0.27747949957847595
[Epoch 5 TRAIN] Loss: 0.2677864134311676


Validating: 0it [00:00, ?it/s]

[Epoch 6 VAL] Loss: 0.27661705017089844
[Epoch 6 TRAIN] Loss: 0.261890709400177


Validating: 0it [00:00, ?it/s]

[Epoch 7 VAL] Loss: 0.2767089307308197
[Epoch 7 TRAIN] Loss: 0.25691649317741394


Validating: 0it [00:00, ?it/s]

[Epoch 8 VAL] Loss: 0.2769763767719269
[Epoch 8 TRAIN] Loss: 0.25327250361442566


Validating: 0it [00:00, ?it/s]

[Epoch 9 VAL] Loss: 0.2765798568725586
[Epoch 9 TRAIN] Loss: 0.2511317729949951


Validating: 0it [00:00, ?it/s]

[Epoch 10 VAL] Loss: 0.2765798568725586
[Epoch 10 TRAIN] Loss: 0.2502129375934601


Validating: 0it [00:00, ?it/s]

[Epoch 11 VAL] Loss: 0.2765798568725586
[Epoch 11 TRAIN] Loss: 0.25066524744033813


Validating: 0it [00:00, ?it/s]

[Epoch 12 VAL] Loss: 0.2765798568725586
[Epoch 12 TRAIN] Loss: 0.25033998489379883


Validating: 0it [00:00, ?it/s]

[Epoch 13 VAL] Loss: 0.2765798568725586
[Epoch 13 TRAIN] Loss: 0.2501673996448517


Validating: 0it [00:00, ?it/s]

[Epoch 14 VAL] Loss: 0.2765798568725586
[Epoch 14 TRAIN] Loss: 0.25027579069137573


In [31]:
torch.cuda.empty_cache()

# prediction

In [32]:
from glob import glob

par_dir = './YOUR_FOLDER_NAME/ONE_MORE_FOLDER_NAME/version_0/checkpoints/'
best_ckpt = sorted(glob(par_dir + '*.ckpt'))[-1]
best_ckpt

'./YOUR_FOLDER_NAME/ONE_MORE_FOLDER_NAME/version_0/checkpoints/epoch9-val_loss0.2766.ckpt'

In [33]:
gruesome_mind_reader = KOTETagger.load_from_checkpoint(best_ckpt)

In [34]:
gruesome_mind_reader.eval()
gruesome_mind_reader.freeze()

In [35]:
THRESHOLD = 0.3

sample_text = "고니요? 제가 아는 타짜 중에 최고였어요..."
encoding = tokenizer.encode_plus(
  sample_text,
  add_special_tokens=True,
  max_length=512,
  return_token_type_ids=False,
  padding="max_length",
  return_attention_mask=True,
  return_tensors="pt",
)

_, predictions = gruesome_mind_reader(encoding["input_ids"], encoding["attention_mask"])
predictions = predictions.flatten().numpy()
for l,p in zip(LABELS, predictions):
    if p < THRESHOLD:
        continue
    print(f"{l}: {p}")

환영/호의: 0.8828433752059937
감동/감탄: 0.975392758846283
고마움: 0.320080429315567
존경: 0.7944819927215576
기대감: 0.722480297088623
뿌듯함: 0.4538567066192627
신기함/관심: 0.8110296726226807
아껴주는: 0.8170523643493652
즐거움/신남: 0.7045897841453552
흐뭇함(귀여움/예쁨): 0.6821712255477905
놀람: 0.4573516249656677
행복: 0.6389814019203186
기쁨: 0.7407119870185852
안심/신뢰: 0.5029057860374451


# evaluation

In [36]:
## test

DEVICE = torch.device("cuda:2" if torch.cuda.is_available() else "cpu") ## set the GPU number!
gruesome_mind_reader = gruesome_mind_reader.to(DEVICE)

predictions = []
labels = []

for item in tqdm(test_dataset):
    _, pred = gruesome_mind_reader(
        item["input_ids"].unsqueeze(dim=0).to(DEVICE),
        item["attention_mask"].unsqueeze(dim=0).to(DEVICE)
        )
    predictions.append(pred.flatten())
    labels.append(item["labels"].round().int())

predictions = torch.stack(predictions).detach().cpu()
labels = torch.stack(labels).detach().cpu()

  0%|          | 0/5000 [00:00<?, ?it/s]

In [37]:
from torchmetrics.functional import accuracy, f1, auroc

THRESHOLD = 0.3
accuracy(predictions, labels, threshold=THRESHOLD)

tensor(0.8642)

In [38]:
## we should check the roc scores, since KOTE is imbalanced..!

macro_auroc = []
print("AUROC per tag")
for i, name in enumerate(LABELS):
    try:
        tag_auroc = auroc(predictions[:, i], labels[:, i], pos_label=1)
        macro_auroc.append(tag_auroc)
        print(f"{i}:: {str(name)}: {tag_auroc}")
    except:
        pass
    
print()
print("MACRO_AVG :: {}".format(np.array(macro_auroc).mean()))

AUROC per tag
0:: 불평/불만: 0.9365819692611694
1:: 환영/호의: 0.8934606909751892
2:: 감동/감탄: 0.9294263124465942
3:: 지긋지긋: 0.8327970504760742
4:: 고마움: 0.9178640842437744
5:: 슬픔: 0.9033969044685364
6:: 화남/분노: 0.9369299411773682
7:: 존경: 0.9155579209327698
8:: 기대감: 0.8818700909614563
9:: 우쭐댐/무시함: 0.8310950994491577
10:: 안타까움/실망: 0.8795443177223206
11:: 비장함: 0.860582172870636
12:: 의심/불신: 0.8715171217918396
13:: 뿌듯함: 0.8662175536155701
14:: 편안/쾌적: 0.8772338628768921
15:: 신기함/관심: 0.8687111139297485
16:: 아껴주는: 0.8906732797622681
17:: 부끄러움: 0.7513707876205444
18:: 공포/무서움: 0.8868585228919983
19:: 절망: 0.8480616807937622
20:: 한심함: 0.8787931203842163
21:: 역겨움/징그러움: 0.8980967402458191
22:: 짜증: 0.9233975410461426
23:: 어이없음: 0.8891122341156006
24:: 없음: 0.8734162449836731
25:: 패배/자기혐오: 0.8482953310012817
26:: 귀찮음: 0.8192627429962158
27:: 힘듦/지침: 0.8519691824913025
28:: 즐거움/신남: 0.933516263961792
29:: 깨달음: 0.8232670426368713
30:: 죄책감: 0.8641752600669861
31:: 증오/혐오: 0.9314982891082764
32:: 흐뭇함(귀여움/예쁨): 0.921582579

In [39]:
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = predictions.numpy()
y_true = labels.numpy()
upper, lower = 1, 0
y_pred = np.where(y_pred > THRESHOLD, upper, lower)
print(classification_report(
  y_true,
  y_pred,
  target_names=LABELS,
  zero_division=0
))

              precision    recall  f1-score   support

       불평/불만       0.79      0.89      0.84      2113
       환영/호의       0.55      0.82      0.66      1109
       감동/감탄       0.67      0.86      0.76      1323
        지긋지긋       0.47      0.57      0.51       816
         고마움       0.56      0.71      0.62       637
          슬픔       0.59      0.64      0.61       545
       화남/분노       0.74      0.86      0.79      1538
          존경       0.51      0.69      0.59       460
         기대감       0.58      0.81      0.68      1359
     우쭐댐/무시함       0.44      0.50      0.47       743
     안타까움/실망       0.69      0.88      0.77      2185
         비장함       0.47      0.46      0.46       416
       의심/불신       0.62      0.77      0.69      1539
         뿌듯함       0.43      0.56      0.49       602
       편안/쾌적       0.45      0.51      0.48       458
      신기함/관심       0.57      0.77      0.66      1346
        아껴주는       0.56      0.70      0.63       897
        부끄러움       0.33    

In [40]:
## computation of some vector is impossible if it is a zero vector with zero variance. --> just turn off error signs

from sklearn.metrics import matthews_corrcoef as MCC
import warnings

with warnings.catch_warnings():
    warnings.simplefilter("error")
    
    totalCorr = 0
    totalLen = 0
    for i in range(10_000):
        try:
            totalCorr += MCC(y_pred[i], y_true[i])
            totalLen += 1
        except:
            pass
        
print('computed # ::: {}'.format(totalLen))
print('MCC  :::::::::  {}'.format(totalCorr/totalLen))

computed # ::: 5000
MCC  :::::::::  0.588452811767147
