In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer , AutoModel
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

train_data = pd.read_csv("./Dataset/unsmile_train_v1.0.tsv", sep="\t")
valid_data = pd.read_csv("./Dataset/unsmile_valid_v1.0.tsv", sep="\t")

cols = train_data.columns.tolist()
cols.remove("문장")
cols.remove("clean")

MODEL_NAME = "beomi/KcELECTRA-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

MAX_LEN = 128
tokenized_train = tokenizer(
                    list(train_data["문장"]), 
                    return_tensors="pt", 
                    padding=True, 
                    truncation=True, 
                    max_length=MAX_LEN, 
                    add_special_tokens=True
                )

tokenized_valid = tokenizer(
                    list(valid_data["문장"]), 
                    return_tensors="pt", 
                    padding=True, 
                    truncation=True, 
                    max_length=MAX_LEN, 
                    add_special_tokens=True
                )

In [2]:

class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe["문장"]
        self.targets = self.data[cols].values
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'labels': torch.tensor(self.targets[index], dtype=torch.float)
        }
    


training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(valid_data, tokenizer, MAX_LEN)

In [3]:
train_params = {'batch_size': 8,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': 16,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [4]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = AutoModelForSequenceClassification.from_pretrained("beomi/KcELECTRA-base",num_labels=10)
        

    def forward(self, input_ids, attention_mask,token_type_ids):
        output = self.l1(input_ids=input_ids, attention_mask=attention_mask,token_type_ids=token_type_ids)
        return output

model = BERTClass()
model.to(device)

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.out_proj.weight', 'classifi

BERTClass(
  (l1): ElectraForSequenceClassification(
    (electra): ElectraModel(
      (embeddings): ElectraEmbeddings(
        (word_embeddings): Embedding(50135, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): ElectraEncoder(
        (layer): ModuleList(
          (0): ElectraLayer(
            (attention): ElectraAttention(
              (self): ElectraSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): ElectraSelfOutput(
                (dense): Linear(in_features=768, out_fe

In [5]:
training_args = TrainingArguments(
    output_dir="./output_1",
    num_train_epochs=10,
    per_device_eval_batch_size=16,
    per_device_train_batch_size=8,
    logging_dir="./log",
    save_steps=1000,
    save_total_limit=2,
)

def compute_metrics(preds):
    labels = preds.label_ids
    preds = preds.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {
        "acc": acc, 
        "precision": precision, 
        "recall": recall,
        "f1": f1,
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_set,
    eval_dataset=testing_set,
    
)

In [6]:

def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

LEARNING_RATE = 5e-5
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [7]:
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['labels'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs['logits'], targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
for epoch in range(5):
    train(epoch)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
1it [00:00,  1.74it/s]

Epoch: 0, Loss:  0.6956549882888794


1876it [02:14, 13.92it/s]
2it [00:00, 14.15it/s]

Epoch: 1, Loss:  0.12894748151302338


1876it [02:14, 13.96it/s]
2it [00:00, 14.27it/s]

Epoch: 2, Loss:  0.025834811851382256


1876it [02:14, 13.95it/s]
2it [00:00, 14.22it/s]

Epoch: 3, Loss:  0.035010017454624176


1876it [02:14, 13.94it/s]
2it [00:00, 14.20it/s]

Epoch: 4, Loss:  0.023657796904444695


1876it [02:14, 13.94it/s]


In [8]:
# trainer.train()

In [9]:
def validation(testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['labels'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs["logits"]).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [10]:
outputs, targets = validation(testing_loader)
final_outputs = np.array(outputs) >=0.5


234it [00:08, 26.76it/s]


In [11]:
from sklearn import metrics

def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)


val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

Hamming Score = 0.7619302470787619
Hamming Loss = 0.0356970832218357


In [50]:
def sentence_classification(sentence):
    
    tokenized_sent = tokenizer(
        sentence,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
        add_special_tokens=True,
    )
    
    tokenized_sent.to(device)
    
    model.eval()
    with torch.no_grad():
        output = model(**tokenized_sent)
        
    outputs = output['logits'].detach().cpu().numpy()
    final_outputs = np.array(outputs) >=0.5
    
    return (pd.DataFrame(final_outputs,columns=cols))
    
sentence_classification("그만하쟈")

Unnamed: 0,여성/가족,남성,성소수자,인종/국적,연령,지역,종교,기타 혐오,악플/욕설,개인지칭
0,False,False,False,False,False,False,False,False,False,False


In [31]:
train_data[train_data["문장"].str.contains("조국")]

Unnamed: 0,문장,여성/가족,남성,성소수자,인종/국적,연령,지역,종교,기타 혐오,악플/욕설,clean,개인지칭
56,조국 수사는 이제 막바지. 기껏해봐야 와이프가 표장장관련 의혹이 전부. 국민을 위한...,0,0,0,0,0,0,0,1,0,0,1
860,천조국같았으면 바로 총알받이이넫 아숩 ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ무슬림시발세키덜,0,0,0,0,0,0,1,0,0,0,1
953,조팔 저게 18년도 선진국에서 일어나는 일이라니 믿을수가없노 내조국 좆되기전에 무슬...,0,1,0,1,0,0,0,0,0,0,1
1387,도대체 무슨 겉멋이 들어서 다문화 우대 개지랄들을 하는지 모르겠다 특히 십짱개새끼들...,0,0,0,1,0,0,0,0,0,0,1
3529,갓조국 미투운동 응원합니다 충성충성^^7 한녀는 제외,1,0,0,0,0,0,0,0,0,0,1
3552,ㅋㅋㅋㅋ 한국은 대놓고 성차별하네 ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ천조국 게이...,0,0,0,1,0,0,0,0,0,0,1
4393,내가 이래사 갓조국으로 이민온거임. 이새끼들은 진짜 불합리한거보면 절대 가만히 안 ...,0,0,0,1,0,0,0,0,0,0,1
4758,탈북자 새끼들 다 북한으로 돌려보내야한다 얘들은 자기 조국을 배신한 변절자다참고로 ...,0,0,0,1,0,0,0,0,0,0,1
5710,ㅋㅋㅋ없는증거도 만들어내겠군 조국때처럼,0,0,0,0,0,0,0,0,1,0,0
5760,인권팔이하는 애들아~~조국 아들딸때문에 피해본 학생들의 인권은 뭐라 이야기할래?,0,0,0,0,0,0,0,1,0,0,1


In [None]:
train_data