In [1]:
import numpy as np

import torch ## version >= 1.8.2

from transformers import AutoTokenizer, AutoModelForSequenceClassification ## version == 4.12.3

import datasets ## version == 2.1.0

# data

In [2]:
from datasets import load_dataset

# Name the configuration explicitly. Without it `load_dataset` falls back to the
# "dichotomized" config and logs a "No config specified" notice on every load.
dataset = load_dataset("searle-j/kote", name="dichotomized")

No config specified, defaulting to: kote/dichotomized
Reusing dataset kote (C:\Users\wuju9\.cache\huggingface\datasets\searle-j___kote\dichotomized\0.0.0\9e18d6e4c5fb5b54c412810da99dfa5e5ece83c40924ee5eb3f41ce5b4d5b436)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'text', 'labels'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['ID', 'text', 'labels'],
        num_rows: 5000
    })
    validation: Dataset({
        features: ['ID', 'text', 'labels'],
        num_rows: 5000
    })
})

In [4]:
# check a sample in the train set.
dataset["train"][25597]

{'ID': '32521',
 'text': '구슬픈 봄날 저녁 무렵, 도시의 뒤섞여 있는 건축과 건축의 그림자를 찾아서 커다란 군중 속에 휩쓸려 가는 것은 얼마나 즐거운 일인가. <우울한 고양이_하기와라 사쿠타로>15',
 'labels': [2, 4, 5, 13, 14, 15, 16, 27, 28, 38, 40, 42]}

In [5]:
# get multi-hot labels (44-dimensional).

from sklearn.preprocessing import MultiLabelBinarizer

# Fit the binarizer ONCE, on the complete label-id range, and only *transform*
# each split. Calling fit_transform separately on test/validation derives the
# columns from whatever labels happen to occur in that split, so a label that is
# absent from one split would silently shift or drop columns relative to train.
NUM_LABELS = len(dataset["train"].features["labels"].feature.names)
mlb = MultiLabelBinarizer(classes=list(range(NUM_LABELS)))
mlb.fit(dataset["train"]["labels"])
train_labels = mlb.transform(dataset["train"]["labels"])
test_labels = mlb.transform(dataset["test"]["labels"])
val_labels = mlb.transform(dataset["validation"]["labels"])

# column j must stand for label id j in every split.
assert train_labels.shape[1] == test_labels.shape[1] == val_labels.shape[1] == NUM_LABELS

print("train_labels shape ::: {}".format(train_labels.shape))
print("test_labels shape :::: {}".format(test_labels.shape))
print("val_labels shape ::::: {}".format(val_labels.shape))
print("\ngood!")

train_labels shape ::: (40000, 44)
test_labels shape :::: (5000, 44)
val_labels shape ::::: (5000, 44)

good!


In [6]:
# add the multi-hot labels

# attach each split's multi-hot matrix as a new "binary_labels" column.
for split_name, split_matrix in (
    ("train", train_labels),
    ("test", test_labels),
    ("validation", val_labels),
):
    dataset[split_name] = dataset[split_name].add_column("binary_labels", split_matrix.tolist())

dataset["train"].features.keys()

dict_keys(['ID', 'text', 'labels', 'binary_labels'])

# tokenization

In [7]:
# download the pretrained tokenizer from huggingface.

MODEL_NAME = 'beomi/KcELECTRA-base' # <-- Thank you!
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

### sample encoding

In [8]:
# let us see how a sample text is converted into a torch tensor...
# Index the row first and then pick the field: dataset["train"]["text"] would
# materialize the entire text column just to read a single element.
sample_text = dataset["train"][25597]["text"]
sample_text

'구슬픈 봄날 저녁 무렵, 도시의 뒤섞여 있는 건축과 건축의 그림자를 찾아서 커다란 군중 속에 휩쓸려 가는 것은 얼마나 즐거운 일인가. <우울한 고양이_하기와라 사쿠타로>15'

In [9]:
# Encode the sample with the same padding/truncation settings that `_preprocess`
# uses for the real data: pad up to max_length and cut anything longer.
encoding = tokenizer.encode_plus(
    sample_text,
    add_special_tokens=True,
    max_length=512,
    truncation=True,  # max_length alone does not shorten over-long texts
    return_token_type_ids=False,
    padding="max_length",
    return_attention_mask=True,
    return_tensors='pt',
)

encoding.keys() # it contains token ids and attention mask

dict_keys(['input_ids', 'attention_mask'])

In [10]:
encoding # in more details

{'input_ids': tensor([[    2, 32585,  4924, 38506, 14080, 28312,    16, 11678,  4041, 47287,
          8039, 27495,  4169, 27495,  4041, 29273,  4023, 17006, 28265, 32332,
         17207, 44807,  9107,  8853,  8176, 29295, 21047,    18, 25000, 14999,
          4069, 17374,    65,  9936, 10584, 43960, 37229, 25001,  9154,     3,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [11]:
encoding["input_ids"].shape, encoding["attention_mask"].shape
# because we have set 'max_length' as 512.

(torch.Size([1, 512]), torch.Size([1, 512]))

In [12]:
# we can convert those input ids back into the tokens.

print(tokenizer.convert_ids_to_tokens(encoding["input_ids"].squeeze()))

['[CLS]', '구슬', '##픈', '봄날', '저녁', '무렵', ',', '도시', '##의', '뒤섞여', '있는', '건축', '##과', '건축', '##의', '그림자', '##를', '찾아서', '커다란', '군중', '속에', '휩쓸려', '가는', '것은', '얼마나', '즐거운', '일인가', '.', '<', '우울', '##한', '고양이', '_', '하기', '##와라', '사쿠', '##타로', '>', '15', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '

### token switching and token masking for the training instances

In [13]:
# for better performance, let us switch and mask some tokens in the train set.

In [14]:
def token_masking(encoding, prob):
    """Randomly overwrite ordinary tokens of the first sequence with ``[MASK]``.

    Every position of ``encoding['input_ids'][0]`` whose id is not one of the
    special ids 0-3 is replaced by id 4 with independent probability ``prob``.
    The encoding is modified in place and returned.

    Args:
        encoding: mapping whose ``'input_ids'`` entry is indexable by row;
            only the row at index 0 is processed.
        prob: per-token masking probability.

    Returns:
        The same ``encoding`` object that was passed in.
    """
    untouchable = (0, 1, 2, 3)  # [PAD], [UNK], [CLS], [SEP]
    mask_id = 4                 # '[MASK]'
    row = encoding['input_ids'][0]
    for position in range(len(row)):
        if row[position] in untouchable:
            continue
        # one random draw per eligible token, in sequence order.
        if np.random.uniform(0, 1) < prob:
            row[position] = mask_id

    return encoding

In [15]:
def token_switching(encoding, prob):
    """Randomly replace ordinary tokens of the first sequence with random vocabulary ids.

    Every position of ``encoding['input_ids'][0]`` whose id is not one of the
    special ids 0-4 is, with independent probability ``prob``, overwritten by an
    id drawn uniformly from ``[5, tokenizer.vocab_size)``. The encoding is
    modified in place and returned.

    Args:
        encoding: mapping whose ``'input_ids'`` entry is indexable by row;
            only the row at index 0 is processed.
        prob: per-token replacement probability.

    Returns:
        The same ``encoding`` object that was passed in.
    """
    special_ids = (0, 1, 2, 3, 4)  # [PAD], [UNK], [CLS], [SEP], [MASK]
    row = encoding['input_ids'][0]
    for i, token in enumerate(row):
        if token in special_ids:
            continue
        if np.random.uniform(0, 1) < prob:
            # Draw a single id directly instead of building a vocabulary-sized
            # array for every replaced token; int() keeps plain Python ints in
            # list-based encodings rather than numpy scalars.
            row[i] = int(np.random.randint(5, tokenizer.vocab_size))

    return encoding

In [16]:
def mask_and_switch(encoding, prob:float=0.1):
    """Apply token masking and then token switching, each with half of ``prob``.

    Args:
        encoding: the encoding handed to ``token_masking`` and ``token_switching``.
        prob: total corruption budget; each of the two steps receives ``prob/2``.

    Returns:
        Whatever ``token_switching`` returns for the masked encoding.
    """
    half = prob/2
    # masking runs first, so the switching step sees the already-masked ids.
    return token_switching(token_masking(encoding, half), half)

In [17]:
# note that some tokens in the sample text have changed...

encoding = mask_and_switch(encoding, prob=0.1)
print(tokenizer.convert_ids_to_tokens(encoding["input_ids"].squeeze()))

['[CLS]', '구슬', '관대', '봄날', '저녁', '무렵', ',', '도시', '##의', '뒤섞여', '있는', '건축', '##과', '건축', '##의', '그림자', '##를', '찾아서', '커다란', '군중', '[MASK]', '휩쓸려', '가는', '것은', '얼마나', '즐거운', '일인가', '.', '<', '가드', '##한', '고양이', '_', '하기', '##와라', '사쿠', '##타로', '>', '15', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'

### real encoding for real data

In [18]:
# add split indicator columns, since we don't want to change the tokens in the test and val sets.

for split_name in ("train", "test", "validation"):
    # tag every row with the name of the split it belongs to.
    split_rows = dataset[split_name]
    dataset[split_name] = split_rows.add_column("split", [split_name]*len(split_rows))

In [19]:
def _preprocess(instance, prob:float=0.1, max_length:int=512):
    # encoding
    encoded = tokenizer(
        instance["text"],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    # token masking and token switching
    if instance["split"]=="train":
        encoded = mask_and_switch(encoded, prob=prob)
    else:
        pass
    # add the binary labels
    encoded["binary_labels"] = instance["binary_labels"]
    
    return encoded

In [20]:
from datasets import Features, Value, Sequence

# Explicit output schema for `dataset.map`: the original columns plus the
# tokenizer outputs (input_ids, attention_mask, token_type_ids).
features = Features({
    "ID": Value("string"),
    "text": Value("string"),
    "input_ids": Sequence(Value("int64")),
    "attention_mask": Sequence(Value("int64")),
    "token_type_ids": Sequence(Value("int64")),
    "labels": Sequence(Value("int64")),
    "binary_labels": Sequence(Value("float32")), # <-- pyarrow does not preserve the dtype, so it has to be forced here.
    "split": Value("string")
})
# batched=True: `_preprocess` is handed `batch_size` rows at a time, so every
# field of `instance` is a list rather than a single value.
tokenized_dataset = dataset.map(_preprocess, features=features, batched=True, batch_size=2) # Use a batch size that fits your memory;
                                                                                            # 28 ~ 30 Gb required for batch size 32
# expose only the model inputs and the multi-hot targets, as torch tensors.
tokenized_dataset = tokenized_dataset.with_format(type="torch", columns=["input_ids","attention_mask","token_type_ids","binary_labels"])

  0%|          | 0/20000 [00:00<?, ?ba/s]

  0%|          | 0/2500 [00:00<?, ?ba/s]

  0%|          | 0/2500 [00:00<?, ?ba/s]

In [21]:
# drop the raw columns, then expose the multi-hot targets under the name "labels".
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(["ID","text","labels"])
    .rename_column("binary_labels","labels")
)

In [22]:
tokenized_dataset["train"]["labels"].dtype

torch.float32

# model

In [23]:
# first, let us get the label names from the original dataset.

LABELS = dataset["train"].features["labels"].feature.names
LABELS

['불평/불만',
 '환영/호의',
 '감동/감탄',
 '지긋지긋',
 '고마움',
 '슬픔',
 '화남/분노',
 '존경',
 '기대감',
 '우쭐댐/무시함',
 '안타까움/실망',
 '비장함',
 '의심/불신',
 '뿌듯함',
 '편안/쾌적',
 '신기함/관심',
 '아껴주는',
 '부끄러움',
 '공포/무서움',
 '절망',
 '한심함',
 '역겨움/징그러움',
 '짜증',
 '어이없음',
 '없음',
 '패배/자기혐오',
 '귀찮음',
 '힘듦/지침',
 '즐거움/신남',
 '깨달음',
 '죄책감',
 '증오/혐오',
 '흐뭇함(귀여움/예쁨)',
 '당황/난처',
 '경악',
 '부담/안_내킴',
 '서러움',
 '재미없음',
 '불쌍함/연민',
 '놀람',
 '행복',
 '불안/걱정',
 '기쁨',
 '안심/신뢰']

In [24]:
# load a pretrained model.

# Sequence-classification model on top of the pretrained KcELECTRA checkpoint,
# configured for multi-label classification with one output per entry in LABELS.
# As the load message below shows, the classifier weights are newly initialized,
# i.e. the classification head has to be trained.
MODEL_NAME = 'beomi/KcELECTRA-base'
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    problem_type="multi_label_classification",
    num_labels=len(LABELS),
)

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.bias', 'classifier.ou

In [25]:
# check the model configurations.
model.config
# see that id2label and label2id are not defined yet (only generic LABEL_n names)? ---> then let us define them now.

ElectraConfig {
  "_name_or_path": "beomi/KcELECTRA-base",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABEL_27",
    "28": "LABEL_28",
    "29": "LABEL_29",
    "30": "LABEL_30",
    "31": "LABEL_31",
    "32": "LABEL_

In [27]:
## Define the names of the labels in the model configuration.
# index -> name, and the inverse name -> index, both in LABELS order.
id2label = dict(enumerate(LABELS))
label2id = {name: idx for idx, name in id2label.items()}
model.config.id2label = id2label
model.config.label2id = label2id

In [28]:
from transformers import TrainingArguments

#BATCH_SIZE_per_device = 2  <--  activate this line if you are rich and can afford more than one GPU.
# Upper bound on the number of training epochs (the Trainer below also
# registers an early-stopping callback).
EPOCHS = 15

# Evaluation and checkpointing both happen once per epoch, and the best
# checkpoint is reloaded when training finishes.
# NOTE(review): the per-device batch-size lines are commented out, so the
# TrainingArguments defaults apply — confirm they match the scheduler settings.
args = TrainingArguments(
    output_dir="kote_output",
    evaluation_strategy="epoch",
#    per_device_train_batch_size=BATCH_SIZE_per_device,
#    per_device_eval_batch_size=BATCH_SIZE_per_device,
    num_train_epochs=EPOCHS,
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [29]:
from transformers import Trainer, AdamW, get_linear_schedule_with_warmup, EarlyStoppingCallback#, DataCollatorWithPadding

#data_collator = DataCollatorWithPadding(tokenizer=tokenizer)  <--  We have already padded the texts. Use the collator if you prefer it.
import math

optimizer = AdamW(model.parameters(), lr=2e-5)

# Derive the schedule length from the actual run instead of hard-coding it.
# A fixed num_training_steps=12_500 is unrelated to EPOCHS, the dataset size and
# the batch size; once training passes that step the linear schedule keeps the
# learning rate at 0 and the remaining epochs learn nothing.
batches_per_epoch = math.ceil(len(tokenized_dataset["train"]) / args.train_batch_size)
updates_per_epoch = max(batches_per_epoch // args.gradient_accumulation_steps, 1)
num_training_steps = math.ceil(args.num_train_epochs * updates_per_epoch)
# keep the original 2,500 warm-up steps, but never more than the whole run.
num_warmup_steps = min(2_500, num_training_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

metric = datasets.load_metric("matthews_correlation")
def compute_metrics(eval_pred):
    """Matthews correlation over all (example, label) decisions.

    The model is a multi-label classifier: each logit is an independent yes/no
    decision, so taking ``argmax`` over the label axis (single-label logic) is
    wrong and also yields shape (N,) against (N, num_labels) references. Each
    logit is thresholded instead, and predictions and references are flattened
    so the metric receives two equally long 1-D integer sequences.

    Args:
        eval_pred: ``(logits, labels)`` pair; both are expected to have shape
            (num_examples, num_labels), with ``labels`` multi-hot.

    Returns:
        The dict produced by ``metric.compute``.
    """
    logits, labels = eval_pred
    # sigmoid(logit) > 0.5 is equivalent to logit > 0, so no sigmoid is needed.
    predictions = (np.asarray(logits) > 0).astype(int)
    references = np.asarray(labels).astype(int)
    return metric.compute(predictions=predictions.ravel(), references=references.ravel())

# Wire everything together: the model, the training arguments, the train and
# validation splits, the custom optimizer/scheduler pair and the metric function.
# The callback stops training after 5 evaluations without improvement.
# NOTE(review): presumably the monitored quantity is the evaluation loss, since
# no metric_for_best_model is set in the TrainingArguments — confirm.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    optimizers=(optimizer,scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=0.0)],
    compute_metrics=compute_metrics,
#    data_collator=data_collator,
)



Downloading builder script:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

In [None]:
trainer.train()