# KoBERT finetuning

In [None]:
# Install the runtime dependencies for this notebook.
# Only transformers is pinned to an exact version; the other packages resolve to
# whatever release is current at install time.
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3.0.2
!pip install torch

In [None]:
# Install the KoBERT package straight from the master branch of its GitHub repository
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm

In [None]:
from kobert import get_tokenizer
from kobert import get_pytorch_kobert_model

In [None]:
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [None]:
# Use the first GPU when CUDA is available; otherwise fall back to the CPU so the
# notebook still runs on CPU-only runtimes instead of failing when tensors are moved.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

NameError: ignored

In [None]:
# Check the available GPUs: print the device count, then the name of each device
n_devices = torch.cuda.device_count()
print(n_devices)

for i in range(n_devices):
    print(torch.cuda.get_device_name(i))

1
Tesla P100-PCIE-16GB


In [None]:
# Load the pretrained KoBERT PyTorch model and its vocabulary; files are cached under ".cache"
bertmodel, vocab = get_pytorch_kobert_model(cachedir=".cache")

/content/.cache/kobert_v1.zip[██████████████████████████████████████████████████]
/content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece[██████████████████████████████████████████████████]


In [None]:
# Mount Google Drive so the files under /content/drive can be accessed
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
# Load the emotion dataset from Drive; its 'Sentence' and 'Emotion' columns are used below
feeling_df = pd.read_excel('/content/drive/MyDrive/NLP/dataset/한국어_단발성_대화_데이터셋.xlsx')

In [None]:
# List the distinct emotion labels present in the dataset
feeling_df['Emotion'].unique()

array(['공포', '놀람', '분노', '슬픔', '중립', '행복', '혐오'], dtype=object)

In [None]:
# Encode each emotion name as an integer class id:
# fear=0, surprise=1, anger=2, sadness=3, neutral=4, happiness=5, disgust=6
emotion_to_id = {"공포": 0, "놀람": 1, "분노": 2, "슬픔": 3, "중립": 4, "행복": 5, "혐오": 6}
for emotion_name, emotion_id in emotion_to_id.items():
    feeling_df.loc[feeling_df['Emotion'] == emotion_name, 'Emotion'] = emotion_id

In [None]:
# Pair every sentence with its label rendered as a string: [sentence, 'label']
df_list = [
    [sentence, str(emotion_id)]
    for sentence, emotion_id in zip(feeling_df['Sentence'], feeling_df['Emotion'])
]


In [None]:
# Preview only the first few [sentence, label] pairs; displaying the whole list
# would dump every sample into the cell output.
df_list[:5]

In [None]:
# Spot-check samples spread across the list, plus the last one
# NOTE(review): the fixed indices assume the list holds more than 30000 entries.
print(df_list[0])
print(df_list[6000])
print(df_list[12000])
print(df_list[18000])
print(df_list[24000])
print(df_list[30000])
print(df_list[-1])

['언니 동생으로 부르는게 맞는 일인가요..??', '0']
['기술적으로도 아직도 해체해서 다시 완벽히 돌려놓는게 어려운데 해체를한다고?', '1']
['당연히 그렇게 해야지 우리나라도 판매를 중단하라', '2']
['그거들은 뒤부터 미치겠어요...', '3']
['최악의 상황중 그나마 나은 방법이네. 기분은 잡치겠지만', '4']
['  요리하는것이 숙제하는것처럼 힘든저에게 용기나게 해주시고 할수 있을것같은 희망을 주셔서감사합니다!!', '5']
['와이프도 그렇고 댓글 다 볼텐데 이휘재 좀 하차 하라고 전해주세요', '6']


In [None]:
# Display the frame after the emotion names were replaced by integer ids
feeling_df

Unnamed: 0,Sentence,Emotion,Unnamed: 2,Unnamed: 3,Unnamed: 4,공포,5468
0,언니 동생으로 부르는게 맞는 일인가요..??,0,,,,놀람,5898.0
1,그냥 내 느낌일뿐겠지?,0,,,,분노,5665.0
2,아직너무초기라서 그런거죠?,0,,,,슬픔,5267.0
3,유치원버스 사고 낫다던데,0,,,,중립,4830.0
4,근데 원래이런거맞나요,0,,,,행복,6037.0
...,...,...,...,...,...,...,...
38589,솔직히 예보 제대로 못하는 데 세금이라도 아끼게 그냥 폐지해라..,6,,,,,
38590,재미가 없으니 망하지,6,,,,,
38591,공장 도시락 비우생적임 아르바이트했는데 화장실가성 손도 않씯고 재료 담고 바닥 떨어...,6,,,,,
38592,코딱지 만한 나라에서 지들끼리 피터지게 싸우는 센징 클래스 ㅉㅉㅉ,6,,,,,


In [None]:
# Keep only the sentence and label columns
# NOTE(review): `df` is not referenced by the later cells visible here (the split uses df_list) — confirm whether it is still needed.
df = feeling_df[['Sentence', 'Emotion']]

In [None]:
# Split the samples into train and test sets (25% held out, fixed random seed)
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df_list, test_size=0.25, random_state=42)

In [None]:
# Dataset class that turns raw samples into the inputs expected by the BERT model
class BERTDataset(Dataset):
    """Pre-tokenized dataset of (sentence, label) records.

    Each record of ``dataset`` is indexed with ``sent_idx`` to obtain the
    sentence and with ``label_idx`` to obtain the label. All sentences are
    passed through ``nlp.data.BERTSentenceTransform`` once, at construction.

    Args:
        dataset: Iterable of indexable records.
        sent_idx: Position of the sentence inside a record.
        label_idx: Position of the label inside a record.
        bert_tokenizer: Tokenizer handed to the sentence transform.
        max_len: Forwarded as ``max_seq_length``.
        pad: Forwarded as ``pad``.
        pair: Forwarded as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # The transform receives a one-element list because pair inputs are not used here
        self.sentences = []
        for record in dataset:
            self.sentences.append(to_bert_input([record[sent_idx]]))

        # Labels are stored as 32-bit integers
        self.labels = []
        for record in dataset:
            self.labels.append(np.int32(record[label_idx]))

    def __getitem__(self, i):
        """Return the transformed sentence tuple with the label appended as last item."""
        features = self.sentences[i]
        return features + (self.labels[i],)

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)


In [None]:
## Setting parameters
max_len = 64  # passed to BERTDataset as the maximum sequence length
batch_size = 64  # samples per DataLoader batch
warmup_ratio = 0.1  # fraction of the total training steps used for warmup
num_epochs = 10  # passes over the training set
max_grad_norm = 1  # gradient-norm clipping threshold
log_interval = 200  # print training progress every this many batches
learning_rate =  5e-5  # learning rate given to AdamW

In [None]:
# Build the SentencePiece-based BERT tokenizer over the KoBERT vocabulary (no lower-casing)
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [None]:
# Transform both splits: sentence at index 0, label at index 1, pad=True, pair=False
data_train  = BERTDataset(df_train, 0, 1, tok, max_len, True, False)
data_test  = BERTDataset(df_test, 0, 1, tok, max_len, True, False)

In [None]:
# Inspect the first transformed sample of each split
print(data_train[0])
print(data_test[0])

(array([   2, 5191,  703,  517,  463,  517,    5,  517,    5,  632,  517,
         54, 3480, 6579, 2043, 7127,  517,   46,  517,   46,  517,   46,
          3,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1], dtype=int32), array(23, dtype=int32), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32), 3)
(array([   2, 3223, 6553, 5357, 5495, 6003, 6116,    3,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
    

In [None]:
# Batch both datasets, each loaded with 5 worker processes
# NOTE(review): neither loader passes shuffle=True, so the training batches come in the same order every epoch — confirm this is intended.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

  cpuset_checked))


# KoBERT 학습모델 만들기


In [None]:
class BERTClassifier(nn.Module):
    """Classification head on top of a BERT encoder.

    The encoder's pooled output is optionally passed through dropout and then
    through a single linear layer that produces one logit per class.

    Args:
        bert: Encoder module. It is called with the keyword arguments
            ``input_ids``, ``token_type_ids`` and ``attention_mask`` and must
            return a 2-tuple whose second item is the pooled output.
        hidden_size: Width of the pooled output fed to the linear layer.
        num_classes: Number of output logits.
        dr_rate: Dropout probability applied to the pooled output; a falsy
            value (``None`` or ``0``) disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=7,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with 1.0 on the first ``valid_length[i]`` positions of row i.

        Args:
            token_ids: 2-D tensor of token ids; only its shape and device are used.
            valid_length: One length per row of ``token_ids`` (tensor or array-like).

        Returns:
            Float tensor shaped like ``token_ids`` on the same device.
        """
        # Compare every position index with the row's valid length in one vectorized
        # operation instead of filling the mask row by row in a Python loop.
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        positions = torch.arange(token_ids.size(1), device=token_ids.device).unsqueeze(0)
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the class logits for a batch.

        Args:
            token_ids: Token id tensor passed to the encoder as ``input_ids``.
            valid_length: Number of non-padding tokens per row.
            segment_ids: Segment ids passed to the encoder as ``token_type_ids``.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        else:
            out = pooler
        return self.classifier(out)

In [None]:
# Wrap KoBERT with the classification head (dropout 0.5) and move it to the selected device
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

In [None]:
# Parameter groups for the optimizer: weight decay 0.01 applies to every parameter
# except those whose name contains 'bias' or 'LayerNorm.weight', which get no decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [None]:
# AdamW over the grouped parameters; cross-entropy loss on the class logits
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()



In [None]:
# Total number of optimizer steps across all epochs, and how many of them are warmup steps
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [None]:
# Cosine learning-rate schedule with an initial warmup phase
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [None]:
def calc_accuracy(X,Y):
    """Return the fraction of rows of X whose highest-scoring column equals Y.

    Args:
        X: 2-D tensor of per-class scores, one row per sample.
        Y: 1-D tensor of target class indices.

    Returns:
        Accuracy over the batch as a NumPy floating-point scalar.
    """
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    batch_len = predicted.size()[0]
    return n_correct / batch_len

# KoBERT 모델 학습시키기

In [None]:
# Fine-tune for num_epochs epochs; after each epoch, measure accuracy on the test split.
# Accuracies are running sums of per-batch accuracy divided by the number of batches seen.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # --- training pass ---
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        # Clip the gradient norm before the optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # --- evaluation pass ---
    model.eval()
    # No gradients are needed for evaluation; skipping autograd bookkeeping saves memory and time.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))



  cpuset_checked))


  0%|          | 0/453 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.9877619743347168 train acc 0.171875
epoch 1 batch id 201 loss 1.5820517539978027 train acc 0.23041044776119404
epoch 1 batch id 401 loss 1.292946457862854 train acc 0.343944825436409
epoch 1 train acc 0.36063904038436567


  0%|          | 0/151 [00:00<?, ?it/s]

epoch 1 test acc 0.49034920259494524


  0%|          | 0/453 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 1.3655612468719482 train acc 0.46875
epoch 2 batch id 201 loss 1.0858924388885498 train acc 0.5118159203980099
epoch 2 batch id 401 loss 1.2146068811416626 train acc 0.5369778678304239
epoch 2 train acc 0.542413322945072


  0%|          | 0/151 [00:00<?, ?it/s]

epoch 2 test acc 0.511521827273956


  0%|          | 0/453 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 1.1471161842346191 train acc 0.546875
epoch 3 batch id 201 loss 0.8502593636512756 train acc 0.6131840796019901
epoch 3 batch id 401 loss 1.0162063837051392 train acc 0.6355985037406484
epoch 3 train acc 0.6422096156343332


  0%|          | 0/151 [00:00<?, ?it/s]

epoch 3 test acc 0.5166639917556427


  0%|          | 0/453 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.9775562286376953 train acc 0.65625
epoch 4 batch id 201 loss 0.5280645489692688 train acc 0.7002487562189055
epoch 4 batch id 401 loss 0.8976680040359497 train acc 0.718555174563591
epoch 4 train acc 0.7226557427606805


  0%|          | 0/151 [00:00<?, ?it/s]

epoch 4 test acc 0.5179922962562508


  0%|          | 0/453 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.7512392401695251 train acc 0.734375
epoch 5 batch id 201 loss 0.3706977963447571 train acc 0.7677238805970149
epoch 5 batch id 401 loss 0.5711212158203125 train acc 0.7868220074812967
epoch 5 train acc 0.788915400597325


  0%|          | 0/151 [00:00<?, ?it/s]

epoch 5 test acc 0.5242727057710501


  0%|          | 0/453 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 0.6819859743118286 train acc 0.796875
epoch 6 batch id 201 loss 0.4219885468482971 train acc 0.8352767412935324
epoch 6 batch id 401 loss 0.46675848960876465 train acc 0.8494389027431422
epoch 6 train acc 0.8514417770419426


  0%|          | 0/151 [00:00<?, ?it/s]

epoch 6 test acc 0.529860454115421


  0%|          | 0/453 [00:00<?, ?it/s]

epoch 7 batch id 1 loss 0.3560352623462677 train acc 0.90625
epoch 7 batch id 201 loss 0.1596393883228302 train acc 0.8858830845771144
epoch 7 batch id 401 loss 0.3736632466316223 train acc 0.8966645885286783
epoch 7 train acc 0.8992825607064018


  0%|          | 0/151 [00:00<?, ?it/s]

epoch 7 test acc 0.5331315887282065


  0%|          | 0/453 [00:00<?, ?it/s]

epoch 8 batch id 1 loss 0.3180430829524994 train acc 0.90625
epoch 8 batch id 201 loss 0.13568897545337677 train acc 0.9246735074626866
epoch 8 batch id 401 loss 0.34789973497390747 train acc 0.9302914588528678
epoch 8 train acc 0.9314983443708609


  0%|          | 0/151 [00:00<?, ?it/s]

epoch 8 test acc 0.5397224287065819


  0%|          | 0/453 [00:00<?, ?it/s]

epoch 9 batch id 1 loss 0.28774505853652954 train acc 0.921875
epoch 9 batch id 201 loss 0.12426026910543442 train acc 0.9502487562189055
epoch 9 batch id 401 loss 0.27470704913139343 train acc 0.9529691396508728
epoch 9 train acc 0.9535389072847682


  0%|          | 0/151 [00:00<?, ?it/s]

epoch 9 test acc 0.5393803216650899


  0%|          | 0/453 [00:00<?, ?it/s]

epoch 10 batch id 1 loss 0.26601070165634155 train acc 0.921875
epoch 10 batch id 201 loss 0.07547002285718918 train acc 0.9600435323383084
epoch 10 batch id 401 loss 0.23179516196250916 train acc 0.9620480049875312
epoch 10 train acc 0.9620584988962473


  0%|          | 0/151 [00:00<?, ?it/s]

epoch 10 test acc 0.5389664143803217


In [None]:
# Directory on the mounted Drive used for the model checkpoint
PATH = '/content/drive/MyDrive/NLP/dataset/'

In [None]:
# Save a checkpoint holding the epoch number plus the model and optimizer state
# NOTE(review): the epoch (10) is hard-coded here and in the file name rather than derived from num_epochs.
torch.save({
  'epoch' : 10,
  'model_state_dict': model.state_dict(),
  'optimizer_state_dict': optimizer.state_dict()
  }, PATH + 'model_{}.tar'.format(10) )

In [None]:
# Reload the checkpoint dict and restore the model and optimizer state.
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU runtime can also be restored on a CPU-only one.
checkpoint = torch.load(PATH + 'model_10.tar', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# KoBERT 모델 예측

In [None]:
def predict(predict_sentence):
    """Classify one sentence into one of the seven emotion labels.

    The sentence is wrapped in a one-sample BERTDataset, run through the
    notebook-level ``model`` in evaluation mode, and the class with the highest
    logit is mapped back to its emotion name.

    Args:
        predict_sentence: Text to classify.

    Returns:
        The predicted emotion name (one of "공포", "놀람", "분노", "슬픔",
        "중립", "행복", "혐오").
    """
    # Position i holds the emotion name that was encoded as class id i for training
    emotion_names = ["공포", "놀람", "분노", "슬픔", "중립", "행복", "혐오"]

    # The dummy label '0' only satisfies BERTDataset's (sentence, label) layout
    another_test = BERTDataset([[predict_sentence, '0']], 0, 1, tok, max_len, True, False)
    # A single sample is loaded in-process; starting worker processes on every call is pure overhead
    single_loader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=0)

    model.eval()

    # Inference only: no gradients are needed
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in single_loader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)

            out = model(token_ids, valid_length, segment_ids)

            logits = out[0].detach().cpu().numpy()
            return emotion_names[int(np.argmax(logits))]


In [None]:
 # Load the three datasets that will be labelled with the emotion classifier
 women_policy = pd.read_csv('/content/drive/MyDrive/NLP/dataset/women_policy_labelword.csv')
 minister = pd.read_csv('/content/drive/MyDrive/NLP/dataset/minister.csv')
 repeal = pd.read_csv('/content/drive/MyDrive/NLP/dataset/repeal.csv')

In [None]:
def clean(text):
    """Normalize a raw text before it is passed to the classifiers.

    The following are removed, in this order: newlines (replaced by a space),
    HTML entities, https URLs, stand-alone Hangul jamo, runs of two or more
    digits, and @mentions.

    Args:
        text: Any object; it is converted with ``str`` first.

    Returns:
        The cleaned string.
    """
    text = str(text)
    # Newlines become single spaces
    text = re.sub('\n', ' ', text)
    # HTML entities such as &gt; &lt; &amp; — the trailing ';' is removed as well
    text = re.sub('[&]+[a-z]+;?', '', text)
    # https URLs
    text = re.sub('https://[A-Za-z0-9./]*', '', text)
    # Stand-alone Hangul jamo (e.g. ㅋㅋ, ㅜㅜ)
    text = re.sub('([ㄱ-ㅎㅏ-ㅣ])+', '', text)
    # Numbers of two or more digits are removed as a whole, so no stray digit is
    # left behind for odd-length numbers; a lone digit is kept (e.g. 1인가족).
    text = re.sub('[0-9]{2,}', '', text)
    # @mentions
    text = re.sub('@[A-Za-z0-9./]*', '', text)
    return text

In [None]:
import re

In [None]:
# Add a cleaned copy of each frame's text column
women_policy['text_re'] = women_policy['text'].apply(clean)
minister['text_re'] = minister['text'].apply(clean)
repeal['text_re'] = repeal['text'].apply(clean)

In [None]:
# Predict an emotion label for every cleaned text (repeal is handled in a later cell)
women_policy['multi_label'] = women_policy['text_re'].apply(predict)
minister['multi_label'] = minister['text_re'].apply(predict)
# repeal['multi_label'] = repeal['text_re'].apply(predict)

In [None]:
# Persist the emotion-labelled frames to Drive
# NOTE(review): these are written as "*_multilabel2.csv", while a later cell reads "*_multilabel.csv" — confirm which files are current.
women_policy.to_csv('/content/drive/MyDrive/NLP/dataset/women_policy_multilabel2.csv', index=False)
minister.to_csv('/content/drive/MyDrive/NLP/dataset/minister_multilabel2.csv', index=False)

In [None]:
# Predict an emotion label for every cleaned repeal text
repeal['multi_label'] = repeal['text_re'].apply(predict)

In [None]:
# Only the repeal frame is written in this cell; the two lines below are disabled
# women_policy.to_csv('/content/drive/MyDrive/NLP/dataset/women_policy_multilabel2.csv', index=False)
# minister.to_csv('/content/drive/MyDrive/NLP/dataset/minister_multilabel2.csv', index=False)
repeal.to_csv('/content/drive/MyDrive/NLP/dataset/repeal_multilabel2.csv', index=False)

# 긍부정 분석

In [30]:
import os
import re
import platform

import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
from PIL import Image
from wordcloud import WordCloud, STOPWORDS

from tqdm import tqdm
from collections import Counter
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) 

In [31]:
# Mount Google Drive for this part of the notebook
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
 # Load the emotion-labelled frames from Drive
 # NOTE(review): these paths end in "_multilabel.csv", whereas the save cells earlier write "_multilabel2.csv" — confirm the intended files.
 women_policy = pd.read_csv('/content/drive/MyDrive/NLP/dataset/women_policy_multilabel.csv')
 minister = pd.read_csv('/content/drive/MyDrive/NLP/dataset/minister_multilabel.csv')
 repeal = pd.read_csv('/content/drive/MyDrive/NLP/dataset/repeal_multilabel.csv')

In [None]:
import re
import urllib.request
from tensorflow.keras.models import load_model

In [None]:
# Download the NSMC train and test rating files from GitHub into the working directory
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x7ff526378950>)

In [None]:
# Read both downloaded files into DataFrames
train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')

In [None]:
# Drop training rows whose 'document' text duplicates an earlier row (modifies train_data in place)
train_data.drop_duplicates(subset=['document'], inplace=True)

In [None]:
# Keep only Hangul characters, strip leading spaces, and turn documents that end up
# empty into NaN so they can be dropped afterwards.
# regex=True is passed explicitly: the patterns are regular expressions, and pandas
# treats them as literal strings when the flag defaults to False.
# The '' -> NaN step assigns the result back instead of calling replace(inplace=True)
# on a column selection, which is not guaranteed to update the frame.
# NOTE(review): the first pattern has no space inside its character class, so it also
# removes every space and the '^ +' step has nothing left to strip — confirm whether
# spaces were meant to be kept.
train_data['document'] = train_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣]", "", regex=True)
train_data['document'] = train_data['document'].str.replace('^ +', "", regex=True)
train_data['document'] = train_data['document'].replace('', np.nan)

test_data['document'] = test_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣]", "", regex=True)
test_data['document'] = test_data['document'].str.replace('^ +', "", regex=True)
test_data['document'] = test_data['document'].replace('', np.nan)

In [None]:
# Drop rows that contain NaN values
train_data = train_data.dropna()
test_data = test_data.dropna()

In [None]:
# NOTE(review): unpinned install — this can replace the transformers==3.0.2 release installed at the top of the notebook; confirm the intended version.
!pip install transformers

In [None]:
import transformers
from transformers import BertTokenizerFast
from transformers import TextClassificationPipeline
from transformers import TFBertForSequenceClassification

In [None]:
# Tokenizer for the klue/bert-base checkpoint
# NOTE(review): this rebinds `tokenizer`, which an earlier cell assigned from get_tokenizer().
tokenizer = BertTokenizerFast.from_pretrained("klue/bert-base")

In [None]:
# Convert the text and label columns to plain Python lists
X_train_list = train_data['document'].tolist()
X_test_list = test_data['document'].tolist()
y_train = train_data['label'].tolist() # sentiment label (positive / negative)
y_test = test_data['label'].tolist()

In [None]:
# Tokenize both splits with truncation and padding enabled
X_train = tokenizer(X_train_list, truncation=True, padding=True)
X_test = tokenizer(X_test_list, truncation=True, padding=True)

In [None]:
import tensorflow as tf
# Wrap the encodings and labels as tf.data datasets of (features dict, label) pairs
train_dataset = tf.data.Dataset.from_tensor_slices((dict(X_train), y_train))
val_dataset = tf.data.Dataset.from_tensor_slices((dict(X_test), y_test))

In [None]:
from transformers import TFBertForSequenceClassification
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Adam optimizer for the TensorFlow model
# NOTE(review): this rebinds `optimizer`, which earlier held the PyTorch AdamW instance.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

In [None]:
# Load klue/bert-base from its PyTorch weights with a 2-label head, then compile it
# with a from-logits sparse cross-entropy loss and an accuracy metric.
# NOTE(review): this rebinds `model`; predict() reads the notebook-level `model`, so it
# would no longer use the KoBERT classifier once this cell has run.
model = TFBertForSequenceClassification.from_pretrained("klue/bert-base", num_labels=2, from_pt=True)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
# Stop training once validation accuracy has not improved by at least 0.001 for 2 epochs
callback_earlystop = EarlyStopping(
    monitor='val_accuracy',
    min_delta=0.001,
    patience=2)

# The datasets are already batched (32 for training, 64 for validation), so no
# batch_size argument is passed to fit(): it must not be given for tf.data inputs.
model.fit(train_dataset.shuffle(10000).batch(32), epochs=5,
          validation_data = val_dataset.shuffle(10000).batch(64),
          callbacks = [callback_earlystop])

In [None]:
# Save the fine-tuned model and its tokenizer to a local directory
model.save_pretrained('nsmc_model/bert-base')
tokenizer.save_pretrained('nsmc_model/bert-base')

In [None]:
# Reload the saved tokenizer and model from disk
loaded_tokenizer = BertTokenizerFast.from_pretrained('nsmc_model/bert-base')
loaded_model = TFBertForSequenceClassification.from_pretrained('nsmc_model/bert-base')

# Wrap them in a TensorFlow text-classification pipeline (return_all_scores disabled)
text_classifier = TextClassificationPipeline(
    tokenizer=loaded_tokenizer,
    model=loaded_model,
    framework='tf',
    return_all_scores=False
)

In [None]:
# # label : 0 neg
# # label : 1 pos

# labeling = []
# for i in range(len(women_policy)):
#   label_list = text_classifier(women_policy['text_re'][i])[0]
#   labeling.append(label_list)

# labeling = pd.DataFrame(labeling)

# women_policy['label'] = labeling['label']
# women_policy['score'] = labeling['score']

# women_policy['label_word'] = women_policy['label'].replace('LABEL_1','P').replace('LABEL_0','N')

In [None]:
# # label : 0 neg
# # label : 1 pos

# labeling = []
# for i in range(len(minister)):
#   label_list = text_classifier(minister['text_re'][i])[0]
#   labeling.append(label_list)

# labeling = pd.DataFrame(labeling)

# minister['label'] = labeling['label']
# minister['score'] = labeling['score']

# minister['label_word'] = minister['label'].replace('LABEL_1','P').replace('LABEL_0','N')

In [None]:
# Number of rows in the repeal dataset
len(repeal)

In [None]:
# label : 0 neg
# label : 1 pos

# Run the sentiment pipeline on every cleaned text, in row order.
labeling = []
for text in repeal['text_re']:
  try:
    label_list = text_classifier(text)[0]
  except Exception:
    # Best effort: a text the pipeline cannot process gets missing values (real NaN,
    # so isna()/dropna() recognise it) instead of aborting the whole run.
    label_list = {'label': np.nan, 'score': np.nan}
  labeling.append(label_list)

# Reuse repeal's index so the results line up row by row regardless of its index labels
labeling = pd.DataFrame(labeling, columns=['label', 'score'], index=repeal.index)

repeal['label'] = labeling['label']
repeal['score'] = labeling['score']

# Short form of the label: P for LABEL_1 (positive), N for LABEL_0 (negative)
repeal['label_word'] = repeal['label'].replace('LABEL_1','P').replace('LABEL_0','N')

In [None]:
# repeal = repeal.dropna()
# repeal = repeal.reset_index(drop=True)

In [None]:
# Only the repeal frame is written in this cell; the two lines below are disabled
# women_policy.to_csv('/content/drive/MyDrive/NLP/dataset/women_policy_final.csv', index=False)
# minister.to_csv('/content/drive/MyDrive/NLP/dataset/minister_final.csv', index=False)
repeal.to_csv('/content/drive/MyDrive/NLP/dataset/repeal_final.csv', index=False)

In [None]:
import pandas as pd

In [None]:
# Read the saved file back to inspect what was written
pd.read_csv('/content/drive/MyDrive/NLP/dataset/repeal_final.csv')

# 스마일게이트 혐오표현

In [None]:
# !pip install transformers
# Install a pinned release of the datasets library
!pip install datasets==1.17.0

In [34]:
# Load the smilegate-ai/kor_unsmile dataset
# NOTE(review): `dataset` is not referenced by the later cells visible here — confirm whether it is needed.
from datasets import load_dataset
dataset = load_dataset('smilegate-ai/kor_unsmile')

Using custom data configuration smilegate-ai--kor_unsmile-1dba960877497f9f
Reusing dataset parquet (/root/.cache/huggingface/datasets/parquet/smilegate-ai--kor_unsmile-1dba960877497f9f/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121)


  0%|          | 0/2 [00:00<?, ?it/s]

In [35]:
# Label names for the kor_unsmile classifier
# NOTE(review): the same list is defined again as `label` in a later cell, and making_df reads `label`, not this name.
unsmile_labels = ["여성/가족","남성","성소수자","인종/국적","연령","지역","종교","기타 혐오","악플/욕설","clean"]

In [37]:
import torch
from transformers import TextClassificationPipeline, BertForSequenceClassification, AutoTokenizer

model_name = 'smilegate-ai/kor_unsmile'

# Pretrained hate-speech classifier and its tokenizer
model = BertForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Multi-label scoring: every label gets its own sigmoid score.
# The pipeline runs on GPU 0 when CUDA is available and on the CPU (-1) otherwise,
# so the cell also works on CPU-only runtimes.
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,     # cpu: -1, gpu: gpu number
    return_all_scores=True,
    function_to_apply='sigmoid'
    )

In [None]:
# import pandas as pd
# import numpy as np

# women_policy = pd.read_csv('/content/drive/MyDrive/NLP/dataset/women_policy_final.csv')
# minister = pd.read_csv('/content/drive/MyDrive/NLP/dataset/minister_final.csv')
# repeal = pd.read_csv('/content/drive/MyDrive/NLP/dataset/repeal_final.csv')

In [38]:
# Column names for the per-label scores (read by making_df)
label =  ['여성/가족', '남성', '성소수자', '인종/국적', '연령', '지역', '종교', '기타 혐오', '악플/욕설', 'clean']

In [39]:
def predict_smilegate(columns):
    """Run the notebook-level ``pipe`` on every value of a text column.

    Args:
        columns: pandas Series of texts; values are cast to ``str`` first.

    Returns:
        A flat list holding, in order, every item returned by ``pipe`` for
        each text.
    """
    return [result for text in columns.astype(str) for result in pipe(text)]

In [40]:
def predict_preprocessing(test_list):
    """Extract the 'score' values from nested prediction dicts and round them.

    Args:
        test_list: Sequence whose items are sequences of dicts with a
            'score' key.

    Returns:
        NumPy array of the scores rounded to 2 decimals, one row per item.
    """
    rows = []
    for per_text in test_list:
        rows.append([entry['score'] for entry in per_text])
    return np.round(rows, 2)

In [45]:
def making_df (score, original_df, name):
  """Append the per-label scores to a frame, save it as CSV and return it.

  Args:
    score: 2-D array-like with one row per row of ``original_df`` and one
      column per entry of the notebook-level ``label`` list.
    original_df: Frame the score columns are appended to.
    name: Dataset name used in the output file name ("<name>_sm.csv").

  Returns:
    The combined DataFrame that was written to disk.
  """
  # Give the score frame the same index as original_df: concat(axis=1) aligns on index
  # labels, so a non-default index would otherwise scatter the scores across NaN rows.
  label_df = pd.DataFrame(score, columns=label, index=original_df.index)
  df = pd.concat([original_df, label_df], axis=1)
  df.to_csv(f'/content/drive/MyDrive/NLP/dataset/{name}_sm.csv', index=False)

  return df


In [46]:
# Score every women_policy text, round the scores, then append them as columns and save the CSV
labeling = predict_smilegate(women_policy['text_re'])
score = predict_preprocessing(labeling)
making_df(score, women_policy, 'women_policy')



Unnamed: 0,datetime,id,text,username,tokenized,label,score,label_word,text_re,multi_label,여성/가족,남성,성소수자,인종/국적,연령,지역,종교,기타 혐오,악플/욕설,clean
0,2022-02-10 01:57:58+00:00,1491592245379678208,대통령을 본 이래 가장 강한 발언이 나왔음 각하 충성충성충성 민주당의 여성정책 노동...,cue3u,"['대통령', '이래', '가장', '강한', '발언', '나왔', '각하', '충...",LABEL_1,0.993853,P,대통령을 본 이래 가장 강한 발언이 나왔음 각하 충성충성충성 민주당의 여성정책 노동...,행복,0.09,0.01,0.01,0.02,0.01,0.01,0.01,0.02,0.12,0.71
1,2022-02-10 03:03:04+00:00,1491608626779557890,문대통령은 지지하고 이재명은 존나 싫어서 뽑을 사람 없어가지고 괴롭네 어쩌네 하는 ...,Unkn0wnG0ldf1sh,"['대통령', '지지', '이재명', '존나', '사람', '괴롭', '진짜', '...",LABEL_1,0.786990,P,문대통령은 지지하고 이재명은 존나 싫어서 뽑을 사람 없어가지고 괴롭네 어쩌네 하는 ...,혐오,0.08,0.01,0.01,0.01,0.01,0.01,0.01,0.02,0.49,0.41
2,2022-02-10 03:06:08+00:00,1491609398858248192,"언제는 피해자 언급과 사과를 안 했으니 민주당에 돌아서겠다 하더니, 이제는 살인자가...",remmremmr,"['언제', '피해자', '언급', '사과', '했으니', '민주당', '돌아서',...",LABEL_0,0.535426,N,"언제는 피해자 언급과 사과를 안 했으니 민주당에 돌아서겠다 하더니, 이제는 살인자가...",혐오,0.11,0.01,0.02,0.01,0.00,0.01,0.01,0.03,0.54,0.39
3,2022-02-10 05:03:53+00:00,1491639032224583681,신남성연대 배인규랑 하등 다를바 없는 놈을 당 공식 직위에 앉혀놓은 안철수를 여성 ...,dthebi,"['신남', '연대', '배인규', '하등', '다를', '공식', '직위', '앉...",LABEL_0,0.811699,N,신남성연대 배인규랑 하등 다를바 없는 놈을 당 공식 직위에 앉혀놓은 안철수를 여성 ...,혐오,0.17,0.04,0.02,0.02,0.01,0.01,0.00,0.07,0.30,0.17
4,2022-02-10 07:20:30+00:00,1491673415773683713,여가부 폐지하겠다는 말은 빈대 잡겠다고 초가삼간 태우는 소리와 다를 바 없습니다. ...,UBzop1,"['폐지', '빈대', '초가삼간', '태우', '다를', '여성', '정책', '...",LABEL_1,0.961402,P,여가부 폐지하겠다는 말은 빈대 잡겠다고 초가삼간 태우는 소리와 다를 바 없습니다. ...,중립,0.38,0.01,0.03,0.02,0.01,0.01,0.00,0.03,0.11,0.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,2022-03-08 16:33:58+00:00,1501234781530382337,출구조사하시는분들 꼭 출구조사 참가해주시고 지지이유에 꼭 여성정책이라고 말씀해주세요...,YAnPy_inthere,"['출구', '조사', '출구조사', '참가', '이유', '여성', '정책', '...",LABEL_1,0.970889,P,출구조사하시는분들 꼭 출구조사 참가해주시고 지지이유에 꼭 여성정책이라고 말씀해주세요...,중립,0.10,0.02,0.02,0.01,0.01,0.01,0.01,0.01,0.07,0.85
1124,2022-03-08 16:57:04+00:00,1501240596077236224,"'윤석열 후보가 대통령이 되고 이준석 국민의힘 대표가 여당 대표가 된다면, 정말로 ...",JungJaHyun,"['윤석열', '후보', '대통령', '이준석', '국민', '대표', '여당', ...",LABEL_1,0.882403,P,"'윤석열 후보가 대통령이 되고 이준석 국민의힘 대표가 여당 대표가 된다면, 정말로 ...",중립,0.04,0.01,0.02,0.01,0.01,0.01,0.01,0.01,0.09,0.88
1125,2022-03-08 22:16:05+00:00,1501320879464144899,여성의날에 여성정책을 관장하는 정부부처에 대해 얘기한 게 뭐가 문제? 문제는 '여가...,boktheseon,"['여성', '여성', '정책', '관장', '정부', '부처', '대해', '얘기...",LABEL_0,0.788480,N,여성의날에 여성정책을 관장하는 정부부처에 대해 얘기한 게 뭐가 문제? 문제는 '여가...,혐오,0.57,0.01,0.02,0.02,0.01,0.01,0.01,0.04,0.14,0.27
1126,2022-03-08 22:25:14+00:00,1501323180069175301,…? 윤석열 여성 정책이 있어야 비교를 하지,dthebi,"['윤석열', '여성', '정책', '비교']",LABEL_0,0.756631,N,…? 윤석열 여성 정책이 있어야 비교를 하지,중립,0.09,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.07,0.84


In [47]:
# Score every minister text, round the scores, then append them as columns and save the CSV
labeling = predict_smilegate(minister['text_re'])
score = predict_preprocessing(labeling)
making_df(score, minister, 'minister')



Unnamed: 0,datetime,id,text,username,text_re,multi_label,여성/가족,남성,성소수자,인종/국적,연령,지역,종교,기타 혐오,악플/욕설,clean
0,2022-02-10 01:54:58+00:00,1491591490899902466,"윤석열의 메시지를 보면 \n정치보복, 중국혐오, 북한선제타격, 백신패스, 여가부폐지...",mungryang,"윤석열의 메시지를 보면 정치보복, 중국혐오, 북한선제타격, 백신패스, 여가부폐지,...",혐오,0.18,0.01,0.02,0.65,0.01,0.01,0.01,0.15,0.18,0.09
1,2022-02-10 02:00:14+00:00,1491592815926968324,이재명 과거나 과연 이 공약을 지킬 지 여부를 떠나서...(사실 떠나야하나 내적갈등...,LSrgBqhSH0I04UP,이재명 과거나 과연 이 공약을 지킬 지 여부를 떠나서...(사실 떠나야하나 내적갈등...,중립,0.60,0.02,0.02,0.03,0.01,0.01,0.01,0.04,0.09,0.34
2,2022-02-10 02:27:07+00:00,1491599581721030659,여가부 폐지하자는 새끼들인데...\n소 귀에 경 읽어주고 계신 이용수 할머니;;;\...,SEOJH3180,여가부 폐지하자는 새끼들인데... 소 귀에 경 읽어주고 계신 이용수 할머니;;; 🤦...,슬픔,0.68,0.01,0.02,0.03,0.02,0.01,0.01,0.13,0.24,0.08
3,2022-02-10 02:32:35+00:00,1491600955636592640,위안부 문제 지원할거라고 당사자한테 면담한다고 가서 여가부 폐지한다고 씨부리고 오는...,beatdead22,위안부 문제 지원할거라고 당사자한테 면담한다고 가서 여가부 폐지한다고 씨부리고 오는...,혐오,0.04,0.01,0.01,0.04,0.01,0.01,0.01,0.06,0.74,0.07
4,2022-02-10 02:34:28+00:00,1491601429483896832,이용수 할머니 “여가부 폐지 말라” 호소에도 이준석 “입장 변화 없어”@newsvo...,newsvop,이용수 할머니 “여가부 폐지 말라” 호소에도 이준석 “입장 변화 없어”,혐오,0.22,0.01,0.01,0.02,0.01,0.01,0.00,0.03,0.18,0.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3577,2022-03-08 23:07:43+00:00,1501333872868569090,일부 찢 지지자들이 자꾸 윤이 남녀 갈라치기 한다고 조장하는데 윤은 세금만 축내는 ...,loveis86,일부 찢 지지자들이 자꾸 윤이 남녀 갈라치기 한다고 조장하는데 윤은 세금만 축내는 ...,중립,0.74,0.02,0.01,0.04,0.02,0.02,0.02,0.16,0.22,0.06
3578,2022-03-08 23:29:20+00:00,1501339311458844676,내 부모세대 중 여가부가 어떤 일을 하는지 알기보다 그 빌어먹을 보soo 너튜브 +...,Snowl_00,내 부모세대 중 여가부가 어떤 일을 하는지 알기보다 그 빌어먹을 보soo 너튜브 +...,혐오,0.81,0.01,0.01,0.03,0.01,0.02,0.04,0.05,0.08,0.08
3579,2022-03-08 23:31:55+00:00,1501339965204021251,아침부터 정치얘기 나와서 존나 동생이랑 싸우다가 (역시나 여가부 폐지한다고 2번찍은...,0o040o0,아침부터 정치얘기 나와서 존나 동생이랑 싸우다가 (역시나 여가부 폐지한다고 2번찍은...,혐오,0.59,0.01,0.01,0.04,0.01,0.03,0.01,0.11,0.39,0.04
3580,2022-03-08 23:35:00+00:00,1501340740806320133,"나는 무고죄 강화 , 여가부 폐지 , 의료 민영화를 외치는 인간이 대통령 하는 거 ...",affectionate524,"나는 무고죄 강화 , 여가부 폐지 , 의료 민영화를 외치는 인간이 대통령 하는 거 ...",분노,0.66,0.02,0.01,0.04,0.01,0.02,0.01,0.10,0.37,0.04


In [48]:
# Score every repeal text, round the scores, then append them as columns and save the CSV
labeling = predict_smilegate(repeal['text_re'])
score = predict_preprocessing(labeling)
making_df(score, repeal, 'repeal')



Unnamed: 0,datetime,id,text,username,text_re,multi_label,여성/가족,남성,성소수자,인종/국적,연령,지역,종교,기타 혐오,악플/욕설,clean
0,2022-02-10 00:10:08+00:00,1491565107343675396,"표절 아니면 무식, 교육 정책은 관심도 없는 윤석열\n\n윤석열 후보의 무식이 상상...",yookihong,"표절 아니면 무식, 교육 정책은 관심도 없는 윤석열 윤석열 후보의 무식이 상상을 ...",혐오,0.01,0.01,0.03,0.01,0.01,0.01,0.01,0.10,0.54,0.22
1,2022-02-10 00:10:53+00:00,1491565298310336515,"면세점 구매한도, 43년만에 폐지..'부가세 면제' 희귀 의약품 범위 확대 | 다음...",KKobukiMelon,"면세점 구매한도, 년만에 폐지..'부가세 면제' 희귀 의약품 범위 확대 | 다음뉴스",중립,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.10,0.89
2,2022-02-10 00:12:49+00:00,1491565785692524546,"하루 10만명 확진 영국의 '위드코로나'\n실내 마스크, 코로나19 패스 없어져.....",JeongJ15,"하루 만명 확진 영국의 '위드코로나' 실내 마스크, 코로나 패스 없어져..확진자 자...",중립,0.01,0.01,0.01,0.21,0.01,0.01,0.01,0.01,0.12,0.67
3,2022-02-10 00:17:57+00:00,1491567077701734400,"독일, 백신패스 폐지...폴란드·체코, 다음 달 방역규제 완전 철폐 (출처 : YT...",prosumer1005,"독일, 백신패스 폐지...폴란드·체코, 다음 달 방역규제 완전 철폐 (출처 : YT...",중립,0.01,0.01,0.02,0.18,0.01,0.01,0.01,0.01,0.07,0.70
4,2022-02-10 00:18:22+00:00,1491567179585802240,[백신패스 폐지국가] \n독일🇩🇪/스웨덴🇸🇪/덴마크🇩🇰/이스라엘🇮🇱/미국🇺🇲/폴란드...,Green_Frog_Day,[백신패스 폐지국가] 독일🇩🇪/스웨덴🇸🇪/덴마크🇩🇰/이스라엘🇮🇱/미국🇺🇲/폴란드🇵...,중립,0.02,0.01,0.02,0.74,0.01,0.01,0.03,0.03,0.08,0.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13025,2022-03-08 23:35:00+00:00,1501340740806320133,"나는 무고죄 강화 , 여가부 폐지 , 의료 민영화를 외치는 인간이 대통령 하는 거 ...",affectionate524,"나는 무고죄 강화 , 여가부 폐지 , 의료 민영화를 외치는 인간이 대통령 하는 거 ...",분노,0.66,0.02,0.01,0.04,0.01,0.02,0.01,0.10,0.37,0.04
13026,2022-03-08 23:48:38+00:00,1501344168378912769,세계 여성의 날에 다시 여성가족부 폐지와 무고죄 처벌 강화를 SNS에 재공유하는 ...,jinsungjoon,세계 여성의 날에 다시 여성가족부 폐지와 무고죄 처벌 강화를 SNS에 재공유하는 ...,혐오,0.36,0.01,0.02,0.02,0.00,0.01,0.01,0.05,0.18,0.26
13027,2022-03-08 23:50:15+00:00,1501344577252470787,노후를 생각하면 이재명이죠! 투표해야 가능합니다. (3월 9일 수요일 오전 6시 ~...,GinAgadaHong,노후를 생각하면 이재명이죠! 투표해야 가능합니다. (3월 9일 수요일 오전 6시 ~...,중립,0.01,0.01,0.01,0.02,0.01,0.01,0.01,0.01,0.08,0.82
13028,2022-03-08 23:55:19+00:00,1501345852450770945,3월9일 서울 유권자 수. 833만명.\n부산. 울산. 경남. 전체 인구가 800만...,kangminhyeok87,3월9일 서울 유권자 수. 3만명. 부산. 울산. 경남. 전체 인구가 0만명이다. ...,중립,0.16,0.01,0.01,0.03,0.01,0.74,0.01,0.07,0.07,0.09
