In [1]:
import torch, datasets, evaluate
import pandas as pd
import numpy as np
from transformers import FunnelTokenizerFast, FunnelModel, Trainer, DataCollatorWithPadding, TrainingArguments, FunnelForSequenceClassification
from time import time, localtime
from tqdm import tqdm_notebook

# Use the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"device: {device}")
# Print the HIP version recorded in this torch build (the captured output shows None).
print(torch.version.hip)

device: cuda
None


In [2]:
# Tokenizer and classification model are loaded from the same pretrained checkpoint.
tokenizer_funnel = FunnelTokenizerFast.from_pretrained(
    "kykim/funnel-kor-base",
)
# num_labels=2 matches the two-class ('negative' / 'positive') label feature of the dataset.
model_funnel = FunnelForSequenceClassification.from_pretrained(
    "kykim/funnel-kor-base",
    num_labels=2,
)
# Switch to training mode, then move the model to the selected device; both calls
# return the module itself, so the final expression still displays the model.
model_funnel.train().to(device)

Some weights of the model checkpoint at kykim/funnel-kor-base were not used when initializing FunnelForSequenceClassification: ['decoder.layers.1.ffn.linear_2.bias', 'decoder.layers.0.attention.v_head.bias', 'decoder.layers.0.ffn.linear_1.weight', 'decoder.layers.1.ffn.layer_norm.weight', 'decoder.layers.1.attention.v_head.weight', 'decoder.layers.0.attention.k_head.weight', 'decoder.layers.1.attention.r_r_bias', 'decoder.layers.1.ffn.linear_1.bias', 'decoder.layers.1.ffn.linear_1.weight', 'decoder.layers.1.attention.post_proj.weight', 'decoder.layers.0.ffn.linear_1.bias', 'decoder.layers.0.ffn.layer_norm.bias', 'decoder.layers.0.ffn.linear_2.bias', 'decoder.layers.0.ffn.layer_norm.weight', 'decoder.layers.1.attention.r_w_bias', 'decoder.layers.0.attention.q_head.weight', 'decoder.layers.1.ffn.layer_norm.bias', 'decoder.layers.1.attention.layer_norm.weight', 'decoder.layers.0.attention.r_s_bias', 'decoder.layers.1.attention.k_head.weight', 'decoder.layers.1.ffn.linear_2.weight', 'decod

FunnelForSequenceClassification(
  (funnel): FunnelBaseModel(
    (embeddings): FunnelEmbeddings(
      (word_embeddings): Embedding(42000, 768)
      (layer_norm): LayerNorm((768,), eps=1e-09, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): FunnelEncoder(
      (attention_structure): FunnelAttentionStructure(
        (sin_dropout): Dropout(p=0.1, inplace=False)
        (cos_dropout): Dropout(p=0.1, inplace=False)
      )
      (blocks): ModuleList(
        (0-2): 3 x ModuleList(
          (0-5): 6 x FunnelLayer(
            (attention): FunnelRelMultiheadAttention(
              (hidden_dropout): Dropout(p=0.1, inplace=False)
              (attention_dropout): Dropout(p=0.1, inplace=False)
              (q_head): Linear(in_features=768, out_features=768, bias=False)
              (k_head): Linear(in_features=768, out_features=768, bias=True)
              (v_head): Linear(in_features=768, out_features=768, bias=True)
              (post_pro

In [3]:
# Load the "nsmc" dataset; the bare name on the last line makes the notebook
# display the resulting DatasetDict (train / test splits).
raw_data = datasets.load_dataset(path="nsmc")
raw_data

Using custom data configuration default
Found cached dataset nsmc (C:/Users/sizzf/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})

In [4]:
# Show the first training example and the split's feature schema, one per line.
print(raw_data['train'][0], raw_data['train'].features, sep="\n")

{'id': '9976970', 'document': '아 더빙.. 진짜 짜증나네요 목소리', 'label': 0}
{'id': Value(dtype='string', id=None), 'document': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['negative', 'positive'], id=None)}


In [5]:
def tokenizer_function(examples):
    """Tokenize the "document" field of a batch of examples.

    Args:
        examples: A batch mapping that provides the texts under the
            "document" key (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch (the mapped dataset gains
        ``input_ids``, ``token_type_ids`` and ``attention_mask`` columns).
    """
    # truncation=True caps every encoded sequence at the tokenizer's maximum
    # length, so an unusually long document cannot exceed what the model expects.
    # Padding is intentionally not done here; it is left to the data collator.
    return tokenizer_funnel(examples["document"], truncation=True)

In [6]:
# Tokenize every split in batches, then report the resulting columns and the
# tokenizer's maximum sequence length.
tokenized_datasets = raw_data.map(tokenizer_function, batched=True)
print(tokenized_datasets, tokenizer_funnel.model_max_length, sep="\n")

Loading cached processed dataset at C:/Users/sizzf/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3\cache-67863f3023fd0586.arrow
Loading cached processed dataset at C:/Users/sizzf/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3\cache-d49b264e64756d71.arrow


DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})
512


In [7]:
# Number of examples per device in each training step.
batch_size = 32

# Pads the examples of each batch with the Funnel tokenizer when batches are assembled.
collator = DataCollatorWithPadding(tokenizer=tokenizer_funnel, padding=True)

# Training configuration: outputs go to "test_trainer", a checkpoint is saved every epoch.
training_args = TrainingArguments(
    output_dir="test_trainer",
    learning_rate=3e-5,
    num_train_epochs=15,
    per_device_train_batch_size=batch_size,
    no_cuda=False,
    save_strategy="epoch",
)

# Metric object used by compute_metrics.
metric = evaluate.load("precision")

def compute_metrics(eval_pred):
    """Score model outputs with the module-level ``metric``.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the predictions are
            reduced with an argmax over axis 1 to obtain predicted class ids.

    Returns:
        Whatever ``metric.compute`` returns for the predicted ids and labels.
    """
    scores, gold_labels = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=1),
        references=gold_labels,
    )

In [8]:
# Convenience aliases for the tokenized train and test splits.
train_samples, test_samples = tokenized_datasets["train"], tokenized_datasets["test"]

In [None]:
# Wire the model, training configuration, data splits, collator and metric
# callback into a Trainer, then start fine-tuning.
trainer = Trainer(
    model=model_funnel,
    args=training_args,
    data_collator=collator,
    train_dataset=train_samples,
    eval_dataset=test_samples,
    tokenizer=tokenizer_funnel,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Run evaluation through the Trainer built in the earlier cell (constructed with
# eval_dataset=test_samples and compute_metrics=compute_metrics). As the cell's
# last expression, the returned value is displayed by the notebook.
trainer.evaluate()

In [None]:
# Build an output directory name from the current local time
# (month-day-hour-minute-second, not zero-padded).
lt = localtime(time())
save_dir = f"./FunnelLM-{lt.tm_mon}-{lt.tm_mday}-{lt.tm_hour}-{lt.tm_min}-{lt.tm_sec}"

# Store the tokenizer files alongside the weights so the directory is
# self-contained and both pieces can later be loaded from the same path.
tokenizer_funnel.save_pretrained(save_dir)
# Saved last so the cell's final expression is the same call as before.
model_funnel.save_pretrained(save_dir)

Configuration saved in ./FunnelLM-1-11-9-13-5\config.json
Model weights saved in ./FunnelLM-1-11-9-13-5\pytorch_model.bin
