In [11]:
import tensorflow as tf
import numpy as np
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset, Dataset
import sentencepiece as spm
import argparse
import pandas as pd
import re
import transformers
from transformers import pipeline

print(transformers.__version__)

4.11.3


In [2]:
train_data = pd.read_table("./ratings_test.txt")
test_data = pd.read_table("./ratings_train.txt")

In [3]:
train_data.drop_duplicates(["document"])
test_data.drop_duplicates(["document"])
train_data = train_data.dropna(axis=0)
test_data = test_data.dropna(axis=0)

In [4]:
train_data["document"] = "[CLS]" + train_data["document"] + "[SEP]"
test_data["document"] = "[CLS]" + test_data["document"] + "[SEP]"

In [5]:
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

In [6]:
train_dataset = train_dataset.class_encode_column("label")
test_dataset = test_dataset.class_encode_column("label")

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/150 [00:00<?, ?ba/s]

  0%|          | 0/150 [00:00<?, ?ba/s]

  0%|          | 0/15 [00:00<?, ?ba/s]

In [7]:
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

Downloading:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [8]:
def tokenize_function(example):
    return tokenizer(example["document"], truncation=True)

In [9]:
train_datasets = train_dataset.map(tokenize_function, batched=True)
test_datasets = test_dataset.map(tokenize_function, batched=True)

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/150 [00:00<?, ?ba/s]

In [10]:
train_datasets

Dataset({
    features: ['__index_level_0__', 'attention_mask', 'document', 'id', 'input_ids', 'label', 'token_type_ids'],
    num_rows: 49997
})

In [14]:
feature_extractor = pipeline("text-classification", model="klue/bert-base")

Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

In [24]:
from tqdm import tqdm

features = []
for data in tqdm(train_dataset):
    features.append(feature_extractor(data["document"]))

100%|██████████| 49997/49997 [53:41<00:00, 15.52it/s] 


In [73]:
def get_acc(features):
    score = 0
    for i, featrue in tqdm(enumerate(features)):
        f_label = int(featrue[0]["label"][-1])
        d_label = test_datasets["label"][i]
        if f_label == d_label: score += 1
    return score / len(features)

In [75]:
acc = get_acc(features)

49997it [38:40, 21.55it/s]


In [76]:
print("accuracy : {}".format(acc))

accuracy : 0.49946996819809186
