In [1]:
import os
import re
import json
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, TFAutoModel
import warnings

In [2]:
def data_import(data_name):
    data_path = os.getenv('HOME')+'/aiffel/project_data/dlthon/'+data_name
    imported_data = pd.read_csv(data_path)
    return imported_data

def cleaning_sentence(sentence):
        sentence = sentence.lower()
        sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
        sentence = re.sub(r'\([^)]*\)', '', sentence)
        sentence = re.sub(r'[" "]+', " ", sentence)
        sentence = re.sub("[^가-힣a-zA-Z0-9\.\?\!,]+", " ", sentence)
        sentence = re.sub(r'[\n\r]+', ' ', sentence)
        sentence = sentence.strip()
        return sentence

def preprocess_sentence(data_list):
    retrun_list = []
    for sentence_frame in data_list:
         befor_df = {}
         conv_data = []
         class_data = []
         class_name = sentence_frame['class'][0]
         for sentence in sentence_frame['conversation']:
             cleaned_sentence = cleaning_sentence(sentence)
             conv_data.append(cleaned_sentence)
             class_data.append(class_name)
         return_df = pd.DataFrame({'class' : class_data, 'conversation': conv_data})
         retrun_list.append(return_df)
    return retrun_list

def random_deletion(text, prob=0.2):
    words = text.split()
    if len(words) == 1:
        return text
    return ' '.join([word for word in words if random.random() > prob])

def random_swap(text, n=1):
    words = text.split()
    for _ in range(n):
        idx1, idx2 = random.sample(range(len(words)), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)

def data_aug(data_list, prob, n):
    len_data = []

    for data in data_list:
        len_data.append(len(data))

    max_len_of = max(len_data)

    return_data = []

    for data_set in data_list:
        if len(data_set) != max_len_of:
            conver_data = []
            class_Data = []
            return_df = {}
            aug_len = max_len_of - len(data_set)
            class_name = data_set['class'][0]
            for i in range(aug_len): class_Data.append(class_name)
            for i in range(aug_len):
                choice_num = random.random()
                random_seq = data_set['conversation'].sample(1).iloc[0]
                if choice_num >= 0.5:
                    output_seq = random_deletion(random_seq, prob)
                    conver_data.append(output_seq)
                else:
                    output_seq = random_swap(random_seq, n)
                    conver_data.append(output_seq)
                    
            retrun_df = pd.DataFrame({'class':class_Data, 'conversation':conver_data})
        else:
            retrun_df = 0

        return_data.append(retrun_df)

    final_list = []

    for auged_data, real_data in zip(return_data, data_list):
        if isinstance(auged_data, pd.DataFrame):
            real_data = pd.concat([real_data, auged_data])
            real_data.reset_index(drop=True, inplace=True)
            final_list.append(real_data)
        else:
            final_list.append(real_data)
    
    return final_list

In [3]:
def cal_len(train, test, rate):
    con_data = np.concatenate((train, test), axis = 0)
    seg_len = []
    spl_len = []
    for i in con_data:
        single_seg_len = len(i)
        seg_len.append(single_seg_len)
    for i in con_data:
        single_spl_len = len(i.split())
        spl_len.append(single_spl_len)
    print('spl len is : ', spl_len[int(len(spl_len)*rate)])
    print('seg len is : ', seg_len[int(len(seg_len)*rate)])

In [4]:
train_data = data_import('train.csv')
nomal_data = data_import('nomal_data.csv')

In [5]:
train_data = train_data.drop(train_data.columns[0], axis=1)
threat_data = train_data[train_data['class'] == '협박 대화']
extort_data = train_data[train_data['class'] == '갈취 대화']
co_bully_data = train_data[train_data['class'] == '직장 내 괴롭힘 대화']
bully_data = train_data[train_data['class'] == '기타 괴롭힘 대화']

threat_data.reset_index(drop=True, inplace=True)
extort_data.reset_index(drop=True, inplace=True)
co_bully_data.reset_index(drop=True, inplace=True)
bully_data.reset_index(drop=True, inplace=True)

k = []
for i in range(nomal_data.shape[0]): k.append('일반 대화')
nomal_data['class'] = k
nomal_data = nomal_data.rename(columns={'0':'conversation'})
nomal_data = nomal_data[['class', 'conversation']]
nomal_data = nomal_data.sample(2000)
nomal_data.reset_index(drop=True, inplace=True)

In [6]:
data_list = [nomal_data, threat_data, extort_data, co_bully_data, bully_data]
auged_data_set = data_aug(data_list, 0.2, 2)

In [7]:
preprocessed_data_set = preprocess_sentence(auged_data_set)

In [8]:
data_set = pd.concat(preprocessed_data_set)

In [9]:
for i in range(10):
    data_set = data_set.sample(frac=1)
    data_set.reset_index(drop=True, inplace=True)

data_set

Unnamed: 0,class,conversation
0,직장 내 괴롭힘 대화,저 이번에 휴가 갔다와도 쉬다가 오고 쉬고싶어 ? 그건 아닌데 . 다른 직원들은 이...
1,협박 대화,저기요 할아버지 여기 버스 안이기도 한데 환기도 잘 안되니까 마스크 좀 써주세요 ....
2,협박 대화,밤에 그만 좀 쿵쿵거려요 저번에도 말씀드렸잖아요 네 ? 저희 아이는 밤에 조용히 잠...
3,기타 괴롭힘 대화,쟤 약간 고릴라 닮지 않았어 ? 풉 그러게 우가우가 야 ! 고릴라 고릴라 흉내좀 내...
4,직장 내 괴롭힘 대화,박 대리 맨날 지각 하냐 ? 죄송합니다 . 하지만 1분 늦었어요 1분은 시간 아니야...
...,...,...
9995,일반 대화,"이건 무슨 오일이에요 ? 어디에다 바르는 거죠 ? 코코넛오일이구요 , 바디나 페이스..."
9996,일반 대화,"네 , 그 다음으로 어떤 케이크가 잘나가요 ? 그 다음으로 지금 다 초코케이크라 블..."
9997,협박 대화,다 죽여버릴꺼야 ! ! ! 가스통 이거 터트리고 다 같이 죽는거야 ! ! ! 선생님...
9998,직장 내 괴롭힘 대화,주 등산 ! 안 가도 되니까 불참자 편히 말해줘요 . 저 . 과장님 저 이번에 . ...


In [10]:
convert = LabelEncoder()
data_set['class'] = convert.fit_transform(data_set['class'])

data_set

Unnamed: 0,class,conversation
0,3,저 이번에 휴가 갔다와도 쉬다가 오고 쉬고싶어 ? 그건 아닌데 . 다른 직원들은 이...
1,4,저기요 할아버지 여기 버스 안이기도 한데 환기도 잘 안되니까 마스크 좀 써주세요 ....
2,4,밤에 그만 좀 쿵쿵거려요 저번에도 말씀드렸잖아요 네 ? 저희 아이는 밤에 조용히 잠...
3,1,쟤 약간 고릴라 닮지 않았어 ? 풉 그러게 우가우가 야 ! 고릴라 고릴라 흉내좀 내...
4,3,박 대리 맨날 지각 하냐 ? 죄송합니다 . 하지만 1분 늦었어요 1분은 시간 아니야...
...,...,...
9995,2,"이건 무슨 오일이에요 ? 어디에다 바르는 거죠 ? 코코넛오일이구요 , 바디나 페이스..."
9996,2,"네 , 그 다음으로 어떤 케이크가 잘나가요 ? 그 다음으로 지금 다 초코케이크라 블..."
9997,4,다 죽여버릴꺼야 ! ! ! 가스통 이거 터트리고 다 같이 죽는거야 ! ! ! 선생님...
9998,3,주 등산 ! 안 가도 되니까 불참자 편히 말해줘요 . 저 . 과장님 저 이번에 . ...


In [11]:
convert.classes_

array(['갈취 대화', '기타 괴롭힘 대화', '일반 대화', '직장 내 괴롭힘 대화', '협박 대화'],
      dtype=object)

In [12]:
train_data = np.array(data_set['conversation'][:int(0.8*len(data_set))], dtype = str)
train_label = np.array(data_set['class'][:int(0.8*len(data_set))], dtype = 'int32')
test_data = np.array(data_set['conversation'][int(0.8*len(data_set)):], dtype = str)
test_label = np.array(data_set['class'][int(0.8*len(data_set)):], dtype = 'int32')

In [13]:
cal_len(train_data, test_data, 0.8)

spl len is :  145
seg len is :  460


In [14]:
bert_model = TFAutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [15]:
def bert_encode(datas, sent_max_length):
    input_ids = []
    attention_masks = []
    
    for sent in datas:
        encoded = tokenizer.encode_plus(sent,
                                        add_special_tokens = True,
                                        max_length = sent_max_length,
                                        padding='max_length',
                                        truncation = True,
                                        return_attention_mask=True)
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)

In [16]:
train_input_ids, train_attention_mask = bert_encode(train_data, 200)

In [17]:
train_input_ids

array([[     0,  13913, 165880, ...,      1,      1,      1],
       [     0,  13913,   1503, ...,   8048,  15313,      2],
       [     0,  76799,    480, ...,      1,      1,      1],
       ...,
       [     0,  52338,   6685, ...,      1,      1,      1],
       [     0,  52367,   1504, ...,   4253,      6,      2],
       [     0,   7592, 123642, ...,   1190,   4253,      2]])

In [18]:
train_label

array([3, 4, 4, ..., 1, 1, 4], dtype=int32)

In [19]:
from tensorflow.keras.optimizers import Adam

In [20]:
def create_model(bert_model):
    input_ids = tf.keras.Input(shape=(200,), dtype=tf.int32)
    attention_mask = tf.keras.Input(shape=(200,), dtype=tf.int32)
    
    output = bert_model([input_ids, attention_mask])
    output = output.last_hidden_state[:,0,:]
    output = tf.keras.layers.Dense(32, activation='relu')(output)
    output = tf.keras.layers.Dropout(0.2)(output)
    output = tf.keras.layers.Dense(5, activation='softmax')(output)
    
    model = tf.keras.Model(inputs = [input_ids, attention_mask], outputs = output)
    model.compile(Adam(learning_rate=0.0001),loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [21]:
use_bert_model = create_model(bert_model)
use_bert_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 117653760   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
tf.__operators__.getitem (Slici (None, 384)          0           tf_bert_model[0][13]         

In [22]:
tf.keras.backend.clear_session()

In [23]:
history = use_bert_model.fit([train_input_ids, train_attention_mask], train_label, validation_split=0.2, epochs = 10, batch_size=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
test_input_ids, test_attention_mask = bert_encode(test_data, 200)

In [31]:
train_input_ids.shape

(8000, 200)

In [30]:
test_input_ids.shape

(2000, 200)

In [32]:
use_bert_model.evaluate([test_input_ids, test_attention_mask], test_label)



[0.2998300790786743, 0.9369999766349792]

In [33]:
final_data = data_import('test.csv')

In [36]:
final_data = np.array(final_data['text'], dtype = str)

In [38]:
final_data.shape

(500,)

In [39]:
final_input_ids, final_attention_mask = bert_encode(final_data, 200)

In [40]:
final_input_ids.shape

(500, 200)

In [41]:
predictions  = use_bert_model.predict([final_input_ids, final_attention_mask])

In [42]:
predicted_classes = np.argmax(predictions, axis=1)

In [43]:
predicted_classes

array([0, 3, 3, 1, 3, 4, 4, 0, 3, 0, 4, 1, 3, 3, 3, 1, 4, 1, 4, 1, 4, 1,
       1, 0, 0, 3, 1, 1, 3, 1, 0, 4, 1, 0, 4, 1, 1, 4, 4, 0, 4, 0, 3, 0,
       1, 3, 4, 1, 1, 4, 0, 1, 0, 3, 3, 1, 1, 1, 1, 1, 0, 1, 4, 1, 3, 4,
       1, 1, 2, 0, 1, 3, 1, 3, 4, 3, 3, 0, 3, 1, 1, 4, 2, 4, 1, 3, 4, 1,
       0, 3, 1, 3, 4, 1, 1, 3, 1, 3, 4, 3, 0, 0, 1, 4, 3, 3, 3, 1, 1, 4,
       1, 4, 4, 3, 0, 0, 1, 3, 1, 0, 1, 1, 0, 0, 4, 0, 3, 0, 0, 1, 1, 1,
       3, 1, 4, 4, 1, 4, 1, 1, 1, 1, 0, 1, 3, 4, 4, 4, 4, 0, 4, 1, 2, 2,
       1, 1, 4, 3, 1, 1, 4, 1, 3, 4, 1, 1, 3, 1, 1, 1, 4, 3, 3, 1, 1, 1,
       3, 3, 0, 1, 3, 1, 0, 3, 4, 1, 1, 4, 0, 1, 4, 1, 0, 3, 0, 4, 1, 0,
       1, 1, 3, 1, 1, 3, 4, 1, 3, 1, 3, 3, 4, 0, 1, 3, 0, 1, 1, 1, 4, 4,
       0, 3, 3, 1, 4, 1, 0, 4, 0, 1, 3, 4, 1, 1, 0, 1, 1, 0, 4, 1, 1, 1,
       1, 3, 1, 0, 3, 1, 3, 1, 4, 3, 4, 3, 3, 0, 1, 3, 0, 3, 0, 4, 4, 1,
       3, 1, 1, 4, 1, 1, 0, 1, 4, 3, 0, 3, 3, 1, 3, 3, 0, 4, 1, 3, 3, 3,
       0, 3, 4, 1, 0, 0, 1, 1, 3, 1, 1, 1, 0, 4, 3,

In [44]:
predicted_classes_df = pd.DataFrame(predicted_classes)

In [45]:
predicted_classes_df

Unnamed: 0,0
0,0
1,3
2,3
3,1
4,3
...,...
495,3
496,3
497,0
498,4


In [47]:
label_map = {0: '갈취 대화', 1: '기타 괴롭힘 대화', 2: '일반 대화', 3: '직장 내 괴롭힘 대화', 4: '협박 대화'}

predicted_classes_df['encoded_label'] = predicted_classes_df[0].map(label_map)

predicted_classes_df.head()

Unnamed: 0,0,encoded_label
0,0,갈취 대화
1,3,직장 내 괴롭힘 대화
2,3,직장 내 괴롭힘 대화
3,1,기타 괴롭힘 대화
4,3,직장 내 괴롭힘 대화


In [49]:
predicted_classes_df.shape

(500, 2)

In [50]:
sub_data = data_import('submission.csv')

In [51]:
sub_data.shape

(500, 2)

In [53]:
sub_data.head()

Unnamed: 0,file_name,class
0,t_000,
1,t_001,
2,t_002,
3,t_003,
4,t_004,


In [54]:
id_col = sub_data['file_name']

In [56]:
id_col

0      t_000
1      t_001
2      t_002
3      t_003
4      t_004
       ...  
495    t_495
496    t_496
497    t_497
498    t_498
499    t_499
Name: file_name, Length: 500, dtype: object

In [57]:
label_map_for_sub = {'협박 대화': 0, '갈취 대화': 1, '직장 내 괴롭힘 대화': 2, '기타 괴롭힘 대화': 3, '일반 대화': 4}

predicted_classes_df['class'] = predicted_classes_df['encoded_label'].map(label_map_for_sub)

predicted_classes_df.head()

Unnamed: 0,0,encoded_label,class
0,0,갈취 대화,1
1,3,직장 내 괴롭힘 대화,2
2,3,직장 내 괴롭힘 대화,2
3,1,기타 괴롭힘 대화,3
4,3,직장 내 괴롭힘 대화,2


In [58]:
classis = predicted_classes_df['class']

In [61]:
id_col = list(id_col)
classis = list(classis)

In [63]:
sub_dict = {"idx" : id_col, "class" : classis}

In [65]:
submission = pd.DataFrame(sub_dict)

In [66]:
submission.head()

Unnamed: 0,idx,class
0,t_000,1
1,t_001,2
2,t_002,2
3,t_003,3
4,t_004,2


In [67]:
submission.to_csv(os.getenv('HOME')+"/aiffel/project_data/dlthon/submission.csv", index=False)