In [None]:
# If you are working on Colab, uncomment the lines below.
# !pip install transformers

# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Project root directory. Later cells build file names with plain string
# concatenation (PATH + "Dataset/..."), so the trailing slash is required.
PATH = "C:/Users/withus/Desktop/sabin/oss/"

In [3]:
# Load the tab-separated training and validation splits from the Dataset
# folder under PATH.
train = pd.read_csv(f"{PATH}Dataset/unsmile_train_v1.0.tsv", sep="\t")
valid = pd.read_csv(f"{PATH}Dataset/unsmile_valid_v1.0.tsv", sep="\t")

In [4]:
# Preview the first rows of the training frame (sentence text plus one 0/1 column per category).
train.head()

Unnamed: 0,문장,여성/가족,남성,성소수자,인종/국적,연령,지역,종교,기타 혐오,악플/욕설,clean,개인지칭
0,일안하는 시간은 쉬고싶어서 그런게 아닐까,0,0,0,0,0,0,0,0,0,1,0
1,아동성범죄와 페도버는 기록바 끊어져 영원히 고통 받는다. 무슬림 50퍼 근친이다. ...,0,0,0,0,0,0,1,0,0,0,0
2,루나 솔로앨범 나왔을 때부터 머모 기운 있었음 ㅇㅇ Keep o doin 진짜 띵...,0,0,0,0,0,0,0,0,0,1,0
3,홍팍에도 어버이연합인가 보내요 뭐 이런뎃글 있는데 이거 어버이연합측에 신고하면 그쪽...,0,0,0,0,0,0,0,0,0,1,0
4,아놔 왜 여기 댓들은 다 여자들이 김치녀라고 먼저 불렸다! 여자들은 더 심하게 그런...,1,0,0,0,0,0,0,0,0,0,0


In [5]:
# (rows, columns) of the training and validation splits.
train.shape, valid.shape

((15005, 12), (3737, 12))

In [6]:
# Checkpoint identifier shared by the tokenizer here and the classifier loaded later.
MODEL_NAME = "beomi/KcELECTRA-base"
# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading: 100%|██████████| 288/288 [00:00<00:00, 288kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading: 100%|██████████| 504/504 [00:00<00:00, 504kB/s]
Downloading: 100%|██████████| 396k/396k [00:00<00:00, 487kB/s]  
Downloading: 100%|██████████| 124/124 [00:00<00:00, 124kB/s]


In [7]:
# Encode every training sentence in a single call and get PyTorch tensors back.
# Sentences are padded to a common length and truncated at 512 tokens.
tokenized_train = tokenizer(
    train["문장"].tolist(),
    add_special_tokens=True,
    truncation=True,
    max_length=512,
    padding=True,
    return_tensors="pt",
)

In [8]:
# List the feature names produced by the tokenizer.
tokenized_train.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [9]:
# Inspect the first encoded training sentence: the encoding object itself,
# then its tokens, token ids and attention mask.
first_encoding = tokenized_train[0]
print(first_encoding)
for field in ("tokens", "ids", "attention_mask"):
    print(getattr(first_encoding, field))

Encoding(num_tokens=113, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['[CLS]', '일', '##안하는', '시간', '##은', '쉬고', '##싶어', '##서', '그런게', '아닐까', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'

In [10]:
# Encode every validation sentence with the same tokenizer settings as the
# training split (padding, truncation at 512 tokens, PyTorch tensors).
tokenized_valid = tokenizer(
    valid["문장"].tolist(),
    add_special_tokens=True,
    truncation=True,
    max_length=512,
    padding=True,
    return_tensors="pt",
)

In [11]:
class myDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with one label per example.

    Args:
        encoding: Mapping from feature name (e.g. ``input_ids``) to a
            container indexable by example position, such as the tokenizer
            output built in this notebook.
        label: Indexable sequence of labels aligned with ``encoding``.
    """

    def __init__(self, encoding, label):
        self.encoding = encoding
        self.label = label

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a tensor that does not share storage with it."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a UserWarning on every item;
            # clone().detach() is the copy PyTorch recommends instead.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return the features of example ``idx`` plus its label under ``"label"``."""
        item = {key: self._as_tensor(value[idx]) for key, value in self.encoding.items()}
        item["label"] = self._as_tensor(self.label[idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.label)

In [12]:
# Binary target: the `clean` column (1 = clean comment, 0 = anything else).
train_label = train["clean"].to_numpy()
valid_label = valid["clean"].to_numpy()

# Pair each tokenized split with its labels.
train_dataset = myDataset(encoding=tokenized_train, label=train_label)
valid_dataset = myDataset(encoding=tokenized_valid, label=valid_label)

In [13]:
# Sanity check: look at the tensors returned for the second training example.
train_dataset[1]

  item = {key: torch.tensor(value[idx]) for key, value in self.encoding.items()}


{'input_ids': tensor([    2, 10756,  4078,  9099,  4331,  3682,  4017, 14646, 11202,  4293,
         20933,  4248, 10400,  9191, 14542,    18, 31157,  8619,  4167,   365,
          4716,  7984,    18,  8229, 11564,  8005,    43,  5358, 16104, 11464,
          9185,  4192, 15977,     9,   820,    18,     3,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [14]:
# Load the checkpoint with a sequence-classification head on top.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# Move the model to the selected device; as the last expression this also displays the architecture.
model.to(device)

Downloading: 100%|██████████| 498M/498M [00:14<00:00, 34.9MB/s] 
Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base an

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(50135, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [15]:
# Training configuration. Output and log directories are derived from PATH
# (the project root defined earlier in the notebook) so that changing PATH
# moves every artefact together instead of leaving stale absolute paths.
training_args = TrainingArguments(
    output_dir=f"{PATH}output_binary",   # checkpoints are written here
    num_train_epochs=10,
    per_device_eval_batch_size=64,
    per_device_train_batch_size=8,
    logging_dir=f"{PATH}log_binary",     # training logs are written here
    save_steps=1000,                     # checkpoint every 1000 optimisation steps
    save_total_limit=2,                  # keep only the two most recent checkpoints
)

In [16]:
def compute_metrics(preds):
    """Compute accuracy, precision, recall and F1 for an evaluation run.

    Args:
        preds: Object exposing ``predictions`` (per-class scores, argmax is
            taken over the last axis) and ``label_ids`` (gold labels).

    Returns:
        Dict with the keys ``acc``, ``precision``, ``recall`` and ``f1``.

    Note:
        ``average="binary"`` reports scores for scikit-learn's positive class
        only (label 1 by default). With labels built from the ``clean``
        column, that is the clean class, not the malicious one.
    """
    y_true = preds.label_ids
    y_pred = preds.predictions.argmax(-1)
    prf = precision_recall_fscore_support(y_true, y_pred, average="binary")
    # zip stops after three names, so the trailing "support" entry is dropped.
    metrics = dict(zip(("precision", "recall", "f1"), prf))
    return {"acc": accuracy_score(y_true, y_pred), **metrics}

# Wire the model, training configuration, both dataset splits and the metric
# function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the classifier using the configuration in training_args.
trainer.train()

In [18]:
# Score the model on the validation split; the result includes the compute_metrics values.
trainer.evaluate(eval_dataset=valid_dataset)

***** Running Evaluation *****
  Num examples = 3737
  Batch size = 64
  item = {key: torch.tensor(value[idx]) for key, value in self.encoding.items()}
100%|██████████| 59/59 [00:04<00:00, 14.04it/s]


{'eval_loss': 0.9311850666999817,
 'eval_acc': 0.886004816697886,
 'eval_precision': 0.8033373063170441,
 'eval_recall': 0.720855614973262,
 'eval_f1': 0.7598647125140925,
 'eval_runtime': 4.4966,
 'eval_samples_per_second': 831.064,
 'eval_steps_per_second': 13.121,
 'epoch': 10.0}

In [None]:
def sentence_predict(sent):
    """Classify a single sentence as a malicious or a clean comment.

    Args:
        sent: The sentence to classify.

    Returns:
        ``"악성댓글"`` when the predicted class index is 0, otherwise
        ``"정상댓글"``.

    Raises:
        ValueError: If the input yields more than one prediction (e.g. a list
            of several sentences was passed).
    """
    model.eval()  # inference mode: disables dropout

    tokenized_sent = tokenizer(
        sent,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
        add_special_tokens=True,
    )

    # Follow the device the model is actually on rather than the notebook-level
    # `device` variable, so inputs and weights can never end up on different devices.
    tokenized_sent = tokenized_sent.to(model.device)

    with torch.no_grad():
        output = model(**tokenized_sent)

    # output[0] holds the logits; argmax over the class axis gives the label index.
    predicted = output[0].argmax(dim=-1)
    if predicted.numel() != 1:
        raise ValueError("sentence_predict expects exactly one sentence")

    # Compare a plain Python int instead of relying on the truthiness of an array.
    if predicted.item() == 0:
        return "악성댓글"
    return "정상댓글"

# Interactive demo: classify each typed sentence until the user enters "0".
prompt = "문장을 입력하세요: "
sent = input(prompt)
while sent != "0":
    print(sentence_predict(sent))
    print("-"*50)
    sent = input(prompt)