In [1]:
import os
import re
import json
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, TFBertModel
import warnings

In [2]:
def data_import(data_name):
    data_path = os.getenv('HOME')+'/aiffel/project_data/dlthon/'+data_name
    imported_data = pd.read_csv(data_path)
    return imported_data

def cleaning_sentence(sentence):
        sentence = sentence.lower()
        sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
        sentence = re.sub(r'\([^)]*\)', '', sentence)
        sentence = re.sub(r'[" "]+', " ", sentence)
        sentence = re.sub("[^가-힣a-zA-Z0-9\.\?\!,]+", " ", sentence)
        sentence = re.sub(r'[\n\r]+', ' ', sentence)
        sentence = sentence.strip()
        return sentence

def preprocess_sentence(data_list):
    retrun_list = []
    for sentence_frame in data_list:
         befor_df = {}
         conv_data = []
         class_data = []
         class_name = sentence_frame['class'][0]
         for sentence in sentence_frame['conversation']:
             cleaned_sentence = cleaning_sentence(sentence)
             conv_data.append(cleaned_sentence)
             class_data.append(class_name)
         return_df = pd.DataFrame({'class' : class_data, 'conversation': conv_data})
         retrun_list.append(return_df)
    return retrun_list

def random_deletion(text, prob=0.2):
    words = text.split()
    if len(words) == 1:
        return text
    return ' '.join([word for word in words if random.random() > prob])

def random_swap(text, n=1):
    words = text.split()
    for _ in range(n):
        idx1, idx2 = random.sample(range(len(words)), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)

def data_aug(data_list, prob, n):
    len_data = []

    for data in data_list:
        len_data.append(len(data))

    max_len_of = max(len_data)

    return_data = []

    for data_set in data_list:
        if len(data_set) != max_len_of:
            conver_data = []
            class_Data = []
            return_df = {}
            aug_len = max_len_of - len(data_set)
            class_name = data_set['class'][0]
            for i in range(aug_len): class_Data.append(class_name)
            for i in range(aug_len):
                choice_num = random.random()
                random_seq = data_set['conversation'].sample(1).iloc[0]
                if choice_num >= 0.5:
                    output_seq = random_deletion(random_seq, prob)
                    conver_data.append(output_seq)
                else:
                    output_seq = random_swap(random_seq, n)
                    conver_data.append(output_seq)
                    
            retrun_df = pd.DataFrame({'class':class_Data, 'conversation':conver_data})
        else:
            retrun_df = 0

        return_data.append(retrun_df)

    final_list = []

    for auged_data, real_data in zip(return_data, data_list):
        if isinstance(auged_data, pd.DataFrame):
            real_data = pd.concat([real_data, auged_data])
            real_data.reset_index(drop=True, inplace=True)
            final_list.append(real_data)
        else:
            final_list.append(real_data)
    
    return final_list

In [3]:
def cal_len(train, test, rate):
    con_data = np.concatenate((train, test), axis = 0)
    seg_len = []
    spl_len = []
    for i in con_data:
        single_seg_len = len(i)
        seg_len.append(single_seg_len)
    for i in con_data:
        single_spl_len = len(i.split())
        spl_len.append(single_spl_len)
    print('spl len is : ', spl_len[int(len(spl_len)*rate)])
    print('seg len is : ', seg_len[int(len(seg_len)*rate)])

In [4]:
train_data = data_import('train.csv')
nomal_data = data_import('nomal_data.csv')

In [5]:
train_data = train_data.drop(train_data.columns[0], axis=1)
threat_data = train_data[train_data['class'] == '협박 대화']
extort_data = train_data[train_data['class'] == '갈취 대화']
co_bully_data = train_data[train_data['class'] == '직장 내 괴롭힘 대화']
bully_data = train_data[train_data['class'] == '기타 괴롭힘 대화']

threat_data.reset_index(drop=True, inplace=True)
extort_data.reset_index(drop=True, inplace=True)
co_bully_data.reset_index(drop=True, inplace=True)
bully_data.reset_index(drop=True, inplace=True)

k = []
for i in range(nomal_data.shape[0]): k.append('일반 대화')
nomal_data['class'] = k
nomal_data = nomal_data.rename(columns={'0':'conversation'})
nomal_data = nomal_data[['class', 'conversation']]
nomal_data = nomal_data.sample(2000)
nomal_data.reset_index(drop=True, inplace=True)

In [6]:
data_list = [nomal_data, threat_data, extort_data, co_bully_data, bully_data]
auged_data_set = data_aug(data_list, 0.2, 2)

In [7]:
preprocessed_data_set = preprocess_sentence(auged_data_set)

In [8]:
data_set = pd.concat(preprocessed_data_set)

In [9]:
for i in range(10):
    data_set = data_set.sample(frac=1)
    data_set.reset_index(drop=True, inplace=True)

data_set

Unnamed: 0,class,conversation
0,협박 대화,당신 ! 오늘 티내지마 ! ! 내가 못배워먹어서 고작 이딴 일이나 하는거지 죄송합니...
1,갈취 대화,야 너 일로와봐 네 . ? 언니가 버스비가 없어서그런데 돈좀 줄래 ? 아니다 그냥 ...
2,협박 대화,오랜만이야 . 그러게 너는 행복하게 살고있네 ? 나는 너때문에 지금까지 불행하게 살...
3,협박 대화,김사장 이건 상도덕이 아니지 박사장 미안하네 그래도 거리도 있고 아니 한동네 안에서...
4,직장 내 괴롭힘 대화,김대리 네 부장님 김대리 요새 얼굴이 그게뭐야 ? 제 얼굴이 왜요 ? 거울을 보고는...
...,...,...
9995,갈취 대화,승민아 나 담배사야되는데 만원만 줘 오늘 나 돈 에이 만원도 사람이 어딨냐 좀 줘봐...
9996,직장 내 괴롭힘 대화,야 그래픽 작업 퀄리티가 이 따위야 ? 죄송합니다 . 아오 진짜 왜 이런 쓰래기들을...
9997,직장 내 괴롭힘 대화,지수씨 끝나고 우리 팀 회식할까요 ? 다들 괜찮다는데 네 ? 아 . 죄송합니다 . ...
9998,협박 대화,죽고 그런 환장했어 ? 왜 이렇게 말을 한 들어 ? 죄송해요 . 한 번만 봐주세요 ...


In [10]:
convert = LabelEncoder()
data_set['class'] = convert.fit_transform(data_set['class'])

data_set

Unnamed: 0,class,conversation
0,4,당신 ! 오늘 티내지마 ! ! 내가 못배워먹어서 고작 이딴 일이나 하는거지 죄송합니...
1,0,야 너 일로와봐 네 . ? 언니가 버스비가 없어서그런데 돈좀 줄래 ? 아니다 그냥 ...
2,4,오랜만이야 . 그러게 너는 행복하게 살고있네 ? 나는 너때문에 지금까지 불행하게 살...
3,4,김사장 이건 상도덕이 아니지 박사장 미안하네 그래도 거리도 있고 아니 한동네 안에서...
4,3,김대리 네 부장님 김대리 요새 얼굴이 그게뭐야 ? 제 얼굴이 왜요 ? 거울을 보고는...
...,...,...
9995,0,승민아 나 담배사야되는데 만원만 줘 오늘 나 돈 에이 만원도 사람이 어딨냐 좀 줘봐...
9996,3,야 그래픽 작업 퀄리티가 이 따위야 ? 죄송합니다 . 아오 진짜 왜 이런 쓰래기들을...
9997,3,지수씨 끝나고 우리 팀 회식할까요 ? 다들 괜찮다는데 네 ? 아 . 죄송합니다 . ...
9998,4,죽고 그런 환장했어 ? 왜 이렇게 말을 한 들어 ? 죄송해요 . 한 번만 봐주세요 ...


In [11]:
convert.classes_

array(['갈취 대화', '기타 괴롭힘 대화', '일반 대화', '직장 내 괴롭힘 대화', '협박 대화'],
      dtype=object)

In [12]:
train_data = np.array(data_set['conversation'][:int(0.8*len(data_set))], dtype = str)
train_label = np.array(data_set['class'][:int(0.8*len(data_set))], dtype = 'int32')
test_data = np.array(data_set['conversation'][int(0.8*len(data_set)):], dtype = str)
test_label = np.array(data_set['class'][int(0.8*len(data_set)):], dtype = 'int32')

In [13]:
cal_len(train_data, test_data, 0.8)

spl len is :  14
seg len is :  51


In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = TFBertModel.from_pretrained("bert-base-multilingual-cased")

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [15]:
def bert_encode(datas, sent_max_length):
    input_ids = []
    attention_masks = []
    
    for sent in datas:
        encoded = tokenizer.encode_plus(sent,
                                        add_special_tokens = True,
                                        max_length = sent_max_length,
                                        pad_to_max_length = True,
                                        truncation = True)
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)

In [16]:
train_input_ids, train_attention_mask = bert_encode(train_data, 200)



In [17]:
train_input_ids

array([[   101,   9067,  25387, ...,  16985,    106,    102],
       [   101,   9538,   9004, ...,      0,      0,      0],
       [   101,   9580, 118856, ...,  11664,   9049,    102],
       ...,
       [   101,   9730,  44321, ...,      0,      0,      0],
       [   101,   9408,  26737, ...,      0,      0,      0],
       [   101,   9095,  24017, ...,      0,      0,      0]])

In [18]:
train_label

array([4, 0, 4, ..., 2, 4, 0], dtype=int32)

In [19]:
from tensorflow.keras.optimizers import Adam

In [20]:
def create_model(bert_model):
    input_ids = tf.keras.Input(shape=(200,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.Input(shape=(200,), dtype=tf.int32, name="attention_mask")
    
    output = bert_model([input_ids, attention_mask])
    output = output[1]
    output = tf.keras.layers.Dense(32, activation='relu')(output)
    output = tf.keras.layers.Dropout(0.2)(output)
    output = tf.keras.layers.Dense(5, activation='softmax')(output)
    
    model = tf.keras.Model(inputs = [input_ids, attention_mask], outputs = output)
    model.compile(Adam(learning_rate=0.0001),loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [21]:
use_bert_model = create_model(model)
use_bert_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 200)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 200)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 177853440   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
dense (Dense)                   (None, 32)           24608       tf_bert_model[0][1]          

In [22]:
tf.keras.backend.clear_session()

In [23]:
history = use_bert_model.fit([train_input_ids, train_attention_mask], train_label, validation_split=0.2, epochs = 10, batch_size=16)

Epoch 1/10
 58/400 [===>..........................] - ETA: 3:38 - loss: 1.3969 - accuracy: 0.3287

KeyboardInterrupt: 