In [1]:
!pip install transformers
!pip install pyarrow
!pip install nlp
!pip install datasets --upgrade





In [6]:
from typing import Dict

import matplotlib.pyplot as plt
import nlp
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset
import transformers
from transformers import (ElectraForSequenceClassification,
                          ElectraTokenizerFast, EvalPrediction, InputFeatures,
                          Trainer, TrainingArguments, glue_compute_metrics,
                         AutoModelForSequenceClassification)

transformers.__version__

'4.28.0'

------ Dataset from https://github.com/smilegate-ai/korean_unsmile_dataset -------

In [3]:
from datasets import load_dataset
datasets = load_dataset('smilegate-ai/kor_unsmile')

print(datasets)

# Look at the labels
# print("Training set labels: {}".format(datasets["train"]))
# print("Validation set labels: {}".format(datasets["valid"]))
# print("Test set labels: {}".format(datasets["test"]))

DatasetDict({
    train: Dataset({
        features: ['문장', '여성/가족', '남성', '성소수자', '인종/국적', '연령', '지역', '종교', '기타 혐오', '악플/욕설', 'clean', '개인지칭', 'labels'],
        num_rows: 15005
    })
    valid: Dataset({
        features: ['문장', '여성/가족', '남성', '성소수자', '인종/국적', '연령', '지역', '종교', '기타 혐오', '악플/욕설', 'clean', '개인지칭', 'labels'],
        num_rows: 3737
    })
})


In [4]:
# Explore the dataset
train_df = pd.DataFrame({"sentence": datasets["train"]['문장'],
                   "label": datasets["train"]['labels']})
pd.options.display.max_colwidth = 0
train_df.head()

Unnamed: 0,sentence,label
0,일안하는 시간은 쉬고싶어서 그런게 아닐까,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
1,아동성범죄와 페도버는 기록바 끊어져 영원히 고통 받는다. 무슬림 50퍼 근친이다. 10 16까지 IQ 떨어지고 출산 위험은 400% 다.,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
2,"루나 솔로앨범 나왔을 때부터 머모 기운 있었음 ㅇㅇ Keep o doin 진짜 띵곡임 꼭 들어보셈""","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
3,홍팍에도 어버이연합인가 보내요 뭐 이런뎃글 있는데 이거 어버이연합측에 신고하면 그쪽에서 고소 가능하냐?,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
4,아놔 왜 여기 댓들은 다 여자들이 김치녀라고 먼저 불렸다! 여자들은 더 심하게 그런다! 이렇게 2개로 싸우냐.. 내가 볼 땐 둘다 나쁜 말 그 이상도 이하도 아닌데,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [8]:
from transformers import TextClassificationPipeline, BertForSequenceClassification, AutoTokenizer
model_checkpoint = 'beomi/KcBERT-Base'

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)  # 토크나이져 가져오기

pipe = TextClassificationPipeline(
    model = model,
    tokenizer = tokenizer,
    device = 0,   # cpu: -1, gpu: gpu number
    return_all_scores = True,
    function_to_apply = 'softmax'  # sigmoid
)

for result in pipe("이래서 여자는 게임을 하면 안된다")[0]:
    print(result)

Some weights of the model checkpoint at beomi/KcBERT-Base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

{'label': 'LABEL_0', 'score': 0.49474993348121643}
{'label': 'LABEL_1', 'score': 0.5052500367164612}


-----------------------------------------------------

In [9]:
class TrainerDataset(Dataset):
    def __init__(self, inputs, targets, tokenizer):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer

        # Tokenize the input
        self.tokenized_inputs = tokenizer(inputs, padding=True)   

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return InputFeatures(
            input_ids=self.tokenized_inputs['input_ids'][idx],
            token_type_ids=self.tokenized_inputs['token_type_ids'][idx],
            attention_mask=self.tokenized_inputs['attention_mask'][idx],
            label=self.targets[idx])

In [10]:
train_dataset = TrainerDataset(datasets["train"]["문장"],
                               datasets["train"]["labels"], tokenizer)
eval_dataset = TrainerDataset(datasets["valid"]["문장"],
                              datasets["valid"]["labels"], tokenizer)

In [11]:
# Set seed for reproducibility
np.random.seed(123)
torch.manual_seed(123)

training_args = TrainingArguments(
    output_dir="./models/model_electra",
    num_train_epochs=3,  # 1 (1 epoch gives slightly lower accuracy)
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=32,
#     evaluate_during_training=True,     
    dataloader_drop_last=True,  # Make sure all batches are of equal size
)


def compute_metrics(p: EvalPrediction) -> Dict:
    preds = np.argmax(p.predictions, axis=1)
    # The choice of a dataset (task_name) implies metric
    return glue_compute_metrics(
        task_name="sst-2",
        preds=preds,
        labels=p.label_ids)


# Instantiate the Trainer class
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics)

In [12]:
trainer.train()



In [None]:
# model_result = trainer.evaluate()
# print("Accuracy: {}".format(model_result["eval_acc"]))

In [13]:
text = "개슬람녀 다 필요없고 니 엄마만 있으면 된다"
# true_label = 1

[x for x in datasets["valid"] if x["문장"] == text]

[{'문장': '개슬람녀 다 필요없고 니 엄마만 있으면 된다',
  '여성/가족': 0,
  '남성': 0,
  '성소수자': 0,
  '인종/국적': 1,
  '연령': 0,
  '지역': 0,
  '종교': 1,
  '기타 혐오': 0,
  '악플/욕설': 0,
  'clean': 0,
  '개인지칭': 0,
  'labels': [0, 0, 0, 1, 0, 0, 1, 0, 0, 0]}]

In [None]:
from transformers import BertForSequenceClassification, AutoTokenizer

model_name = 'beomi/KcBERT-Base'

# Load the pre-trained BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Assuming you saved the model using model.save_pretrained
# Load the model from the saved directory
loaded_model = BertForSequenceClassification.from_pretrained('./models/model_electra/checkpoint-1000')

# Now you can create a TextClassificationPipeline with the loaded model
pipe = TextClassificationPipeline(
    model=loaded_model,
    tokenizer=tokenizer,
    device=0,  # Adjust the device as needed (cpu: -1, gpu: gpu number)
    return_all_scores=True,
    function_to_apply='softmax'  # Change to 'sigmoid' if needed
)

# Example input text for inference
input_text = "한글 문장을 입력하여 감정을 분류해 봅시다."

# Perform inference
result = pipe(input_text)

# Print or use the result as needed
print(result)

In [None]:
input_text = "그게 표준하긴 한데 한국말이 좋같이 들리잖아요."  # 좆

# Perform inference
result = pipe(input_text)


dict_result = { x['label']:x['score'] for x in result[0]}
sorted(dict_result.items(), key=lambda x : x[1],reverse=True)

In [None]:
input_text2 = "그게 표준하긴 한데 한국말이 좆같이 들리잖아요."

# Perform inference
result = pipe(input_text2)


dict_result2 = { x['label']:x['score'] for x in result[0]}
sorted(dict_result2.items(), key=lambda x : x[1],reverse=True)

In [None]:
input_text3 = "미애들 개새키야. 이 ㅅㅂ놈아. 미친 새끼야. 니 말투 그렇게 하면 누가 ㅅㅂ놈아 믿겠냐.."

# Perform inference
result = pipe(input_text3)


dict_result2 = { x['label']:x['score'] for x in result[0]}
sorted(dict_result2.items(), key=lambda x : x[1],reverse=True)

### Lime 패키지 활용부분 자동화

In [None]:
from lime import lime_tabular, lime_text
from lime.lime_text import LimeTextExplainer
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc

import random
import sklearn
from sklearn.pipeline import make_pipeline
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [None]:
def generate_explanation(df, idx, c, class_names, num_features=6):
    # LIME 설명 인스턴스 생성
    explainer = lime_text.LimeTextExplainer(class_names=class_names)    
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(train_data)
    X_test = vectorizer.transform(test_data)
    c = make_pipeline(vectorizer, rf)
    
    # LIME 설명 생성
    exp = explainer.explain_instance(df.text.values[idx], c.predict_proba, num_features=num_features)
    
    print('Document id:', idx)
    print('Probability(욕설) =', c.predict_proba([df.text.values[idx]])[0, 1])
    print('True class:', class_names[df.label.values[idx]])

    # 시각화
    fig = exp.as_pyplot_figure()
    plt.show()

    # 노트북에 텍스트로 표시
    exp.show_in_notebook(text=True)

# 사용 예시
idx = 100  # 원하는 인덱스로 설정
generate_explanation(combined_df, idx, c, class_names, num_features=6)