In [None]:
import os
import re

import pandas as pd
import numpy as np

import tensorflow as tf
from konlpy.tag import Mecab
from collections import Counter
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import platform

### Step 1: 데이터 불러오기

In [None]:
# Load the training set from CSV and show its last rows as a quick sanity check.
train_data_path ="~/data/train.csv"
train_data = pd.read_csv(train_data_path)
train_data.tail()

In [None]:
# Column dtypes of the loaded training frame.
train_data.dtypes

In [None]:
# Integer id assigned to each conversation class label.
label_mapping = {
    '협박 대화': 0,
    '갈취 대화': 1,
    '직장 내 괴롭힘 대화': 2,
    '기타 괴롭힘 대화': 3
    # Map each label string to an integer like this.
}
# Replace the label strings in 'class' with their ids and cast the column to int.
train_data['class'] = train_data['class'].replace(label_mapping).astype('int')

In [None]:
# Display the training frame (pandas truncates long frames in the rendered output).
train_data

In [None]:
# Re-inspect the column dtypes; 'class' is expected to be an integer column after astype('int').
train_data.dtypes

In [None]:
# Read the test set from JSON, transpose it, and write it to test_data.csv in the
# current working directory (overwriting any existing file).
# NOTE(review): presumably the JSON is keyed by sample id, so the transpose yields
# one row per sample — confirm against the file.
test_data_path ="~/data/test.json"
test_data = pd.read_json(test_data_path)
test_data = test_data.transpose()
test_data.to_csv("test_data.csv", mode="w")

In [None]:
# Reload the CSV written to ./test_data.csv, using its first column as the index,
# and show the last rows.
test_data = pd.read_csv('./test_data.csv', index_col= 0)
test_data.tail()

In [None]:
# sample_answersheet_path ="~/data/sample_answersheet.json"
# sample_answersheet_data = pd.read_json(sample_answersheet_path)
# sample_answersheet_data.tail()

1. 데이터 살펴보기

In [None]:
# Per-class count of non-null values in each remaining column.
train_data.groupby('class').count()

2. 데이터 중복 제거 

In [None]:
# Shows the frame with duplicate rows dropped. The result is not assigned,
# so train_data itself is left unchanged by this cell.
train_data.drop_duplicates()

In [None]:
# Unique conversations, kept in first-seen order.
# Building the list through set() produced an arbitrary order that can change
# between kernel restarts (string hashing is randomized), which made positional
# lookups and slices of this list irreproducible.
cleaned_corpus_cov = train_data.conversation.drop_duplicates().tolist()
print("Data Size:", len(cleaned_corpus_cov))

In [None]:
# Peek at one sample; cleaned_corpus_cov is already a list, so it is indexed directly.
cleaned_corpus_cov[30]

In [None]:
# Peek at one sample; cleaned_corpus_cov is already a list, so it is indexed directly.
cleaned_corpus_cov[50]

In [None]:
# Peek at one sample; cleaned_corpus_cov is already a list, so it is indexed directly.
cleaned_corpus_cov[51]

3. 데이터 분포 보기

In [None]:
# Unique conversations in first-seen order. set() would give an order that can
# change between kernel restarts, making later slices of this list irreproducible.
cleaned_corpus_cov = train_data.conversation.drop_duplicates().tolist()
print("Data Size:", len(cleaned_corpus_cov))

# Pick a font that can render Korean labels on the current platform.
if platform.system() == 'Windows':
    plt.rc('font', family='Malgun Gothic')
elif platform.system() == 'Darwin':
    plt.rc('font', family='AppleGothic')
else:
    plt.rc('font', family='NanumGothic')  # On Linux, make sure the NanumGothic font is installed.

# Make the minus sign render correctly on plot axes.
plt.rc('axes', unicode_minus=False)

# Character length of every unique conversation.
lengths = [len(sen) for sen in cleaned_corpus_cov]

# Derive the statistics from the data itself; a fixed 999 sentinel for the minimum
# reported a wrong value whenever every sample was longer than 999 characters.
min_len = min(lengths)
max_len = max(lengths)
sum_len = sum(lengths)

print("문장의 최단 길이:", min_len)
print("문장의 최장 길이:", max_len)
print("문장의 평균 길이:", sum_len // len(cleaned_corpus_cov))

# sentence_length[i] is the number of samples with length i + 1.
# bincount index 0 (empty strings) is dropped so that an empty sample is no longer
# counted in the last bin through a -1 index.
sentence_length = np.bincount(lengths, minlength=max_len + 1)[1:]

# Bars sit at the real length values 1..max_len (previously shifted left by one).
plt.bar(range(1, max_len + 1), sentence_length, width=1.0)
plt.title("Sentence Length Distribution")
plt.xlabel(xlabel='글자 수')
plt.ylabel(ylabel='빈도 수')
plt.show()

In [None]:
# Character-length statistics computed directly from train_data.conversation.
conv_lengths = train_data.conversation.str.len()
ax = conv_lengths.hist()

max_length = conv_lengths.max()
min_length = conv_lengths.min()
mean_length = conv_lengths.mean()

ax.set_xlabel(xlabel='글자 수')
ax.set_ylabel(ylabel='빈도 수')

print("최대길이:", max_length)
print("최소길이:", min_length)
print("평균길이:", mean_length)

중복을 제거하나 안 하나 길이 분포에는 별 차이가 없다. 바로 데이터 정제 단계로 넘어간다.

In [None]:
# Number of entries in the de-duplicated conversation list.
len(cleaned_corpus_cov)

In [None]:
# Keep only the first 400 entries of the corpus list and display them.
# NOTE(review): 400 presumably matches the number of rows in the test set that this
# subset is later paired with in train_test_split — confirm.
cleaned_400 = cleaned_corpus_cov[:400]
cleaned_400

### Step 2: 데이터 정제

In [None]:
def preprocess_sentence(sentence, s_token=True, e_token=True):
    """Normalize one raw sentence into whitespace-separated, whitelisted text.

    The input is converted with ``str()`` and then, in order:
      1. each of ``? . ! ,`` is surrounded by spaces so it becomes its own token,
      2. runs of spaces / double quotes collapse to a single space,
      3. every run of characters other than ASCII letters, digits, ``? . ! ,``,
         Hangul syllables and Hangul jamo is replaced by a single space,
      4. leading and trailing whitespace is stripped.

    Args:
        sentence: Raw text (any object; it is passed through ``str()``).
        s_token: Accepted for interface compatibility; currently not used.
        e_token: Accepted for interface compatibility; currently not used.

    Returns:
        The cleaned string.
    """
    text = str(sentence)

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r"([?.!,])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!,가-힣ㄱ-ㅎㅏ-ㅣ0-9]+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()

In [None]:
# Clean every unique training conversation.
pre_data = [preprocess_sentence(sen) for sen in cleaned_corpus_cov]
# Preview a few cleaned samples instead of dumping the whole corpus into the cell output.
print(pre_data[:3])
print(len(pre_data))
print(type(pre_data))

In [None]:
# Clean the 400-sample subset.
pre_400 = [preprocess_sentence(sen) for sen in cleaned_400]
# Preview a few cleaned samples instead of dumping the whole list into the cell output.
print(pre_400[:3])
print(len(pre_400))
print(type(pre_400))

In [None]:
# Clean the 'text' column of the test set.
pre_test_data = [preprocess_sentence(sen) for sen in test_data.text]
# Preview a few cleaned samples instead of dumping the whole list into the cell output.
print(pre_test_data[:3])
print(len(pre_test_data))
print(type(pre_test_data))

In [None]:
# # Convert to an index -> word lookup
# index_to_word = { index+3 : word for word, index in word_index.items() }

# for index, token in enumerate(("<pad>", "<sos>", "<unk>")):

# index_to_word[index] = token

In [None]:
def tokenize2(corpus, maxlen=500, padding='pre'):
    """Morpheme-tokenize a corpus with Mecab and turn it into a padded id matrix.

    Each sentence is split into morphemes with ``Mecab().morphs`` and re-joined
    with single spaces. A fresh Keras ``Tokenizer`` (``filters=''``, so no
    characters are stripped) is fitted on that corpus, and the resulting id
    sequences are padded to a fixed length.

    Args:
        corpus: Iterable of sentence strings.
        maxlen: Length every sequence is padded or truncated to. Defaults to 500,
            the value that was previously hard-coded.
        padding: 'pre' or 'post', forwarded to ``pad_sequences``. Defaults to
            'pre', the value that was previously hard-coded.

    Returns:
        (tensor, tokenizer): the padded integer array produced by
        ``pad_sequences`` and the fitted tokenizer.
    """
    mecab = Mecab()
    # The Tokenizer splits on whitespace, so joining morphemes with spaces makes
    # each morpheme its own vocabulary entry.
    morph_corpus = [' '.join(mecab.morphs(sen)) for sen in corpus]

    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(morph_corpus)
    tensor = tokenizer.texts_to_sequences(morph_corpus)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(
        tensor, padding=padding, maxlen=maxlen)

    return tensor, tokenizer

In [None]:
# Each corpus gets its own tokenizer, and therefore its own vocabulary.
conv_tensor, conv_tokenizer = tokenize2(pre_data)
print(f'Converasation Vocab Size: {len(conv_tokenizer.index_word)}')
conv400_tensor, conv400_tokenizer = tokenize2(pre_400)
# Report the 400-sample tokenizer's vocabulary; this line previously printed
# conv_tokenizer's size a second time.
print(f'Converasation_400 Vocab Size: {len(conv400_tokenizer.index_word)}')
test_tensor, test_tokenizer = tokenize2(pre_test_data)
print(f'Text Vocab Size: {len(test_tokenizer.index_word)}')

In [None]:
# train_test_split is already imported in the notebook's first cell.
# A fixed random_state makes the split identical on every Restart & Run All.
# NOTE(review): the targets passed here are the tokenized test texts (test_tensor),
# not train_data['class'], and both arrays must have the same number of rows —
# confirm this pairing is intended.
x_train, x_test, y_train, y_test = train_test_split(
    conv400_tensor, test_tensor, test_size=0.2, random_state=42)

len(x_train), len(x_test), len(y_train), len(y_test)

### Step 3: 모델 설계

1. BahdanauAttention 

In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention over a sequence of encoder states.

    Both the encoder states and the decoder state are projected to ``units``
    dimensions, summed, passed through tanh and reduced to one score per
    encoder position.
    """

    def __init__(self, units):
        super().__init__()
        self.w_dec = tf.keras.layers.Dense(units)
        self.w_enc = tf.keras.layers.Dense(units)
        self.w_com = tf.keras.layers.Dense(1)

    def call(self, h_enc, h_dec):
        """Return ``(context_vec, attn)`` for one decoder state.

        Args:
            h_enc: Encoder states, [batch x length x units].
            h_dec: Decoder state, [batch x units].

        Returns:
            context_vec: Attention-weighted sum over the length axis.
            attn: Softmax weights over the length axis (axis=1).
        """
        enc_proj = self.w_enc(h_enc)
        # Insert a length axis so the decoder projection broadcasts over positions.
        dec_proj = self.w_dec(tf.expand_dims(h_dec, 1))

        score = self.w_com(tf.nn.tanh(dec_proj + enc_proj))
        attn = tf.nn.softmax(score, axis=1)

        # NOTE(review): the weights are applied to the *projected* encoder states,
        # not to the raw h_enc passed in — confirm this is intended.
        context_vec = tf.reduce_sum(attn * enc_proj, axis=1)

        return context_vec, attn

print("슝~")

In [None]:
class Encoder(tf.keras.Model):
    """Embedding -> GRU -> Dropout encoder that returns the full state sequence."""

    def __init__(self, vocab_size, embedding_dim, enc_units):
        super().__init__()

        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # return_sequences=True: one hidden state per input position.
        self.gru = tf.keras.layers.GRU(enc_units, return_sequences=True)

        self.dropout = tf.keras.layers.Dropout(rate=0.2)

    def call(self, x):
        """Encode token ids ``x`` and return the per-position GRU outputs after dropout."""
        embedded = self.embedding(x)
        hidden_seq = self.gru(embedded)
        return self.dropout(hidden_seq)

In [None]:
class Decoder(tf.keras.Model):
    """Single-step attention decoder: attention -> embedding -> GRU -> Dense logits."""

    def __init__(self, vocab_size, embedding_dim, dec_units):
        super().__init__()
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(dec_units, return_sequences=True, return_state=True)
        self.fc = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.dec_units)

        self.dropout = tf.keras.layers.Dropout(rate=0.2)

    def call(self, x, h_dec, enc_out):
        """Run one decoding step.

        Args:
            x: Input token ids for this step.
            h_dec: Decoder state used as the attention query.
            enc_out: Encoder output sequence attended over.

        Returns:
            (logits, h_dec, attn): vocabulary logits flattened to 2-D, the GRU's
            final state for this call, and the attention weights.
        """
        context_vec, attn = self.attention(enc_out, h_dec)

        embedded = self.embedding(x)
        # Prepend nothing; the context is concatenated feature-wise to the embedding.
        gru_in = tf.concat([tf.expand_dims(context_vec, 1), embedded], axis=-1)

        # NOTE(review): the GRU is called without an initial state, so the incoming
        # h_dec only feeds the attention — confirm this is intended.
        seq_out, h_dec = self.gru(gru_in)
        seq_out = self.dropout(seq_out)

        flat = tf.reshape(seq_out, (-1, seq_out.shape[2]))
        logits = self.fc(flat)

        return logits, h_dec, attn

In [None]:
# Run this cell: builds the encoder/decoder and checks output shapes on random inputs.
BATCH_SIZE     = 64
# +1 presumably reserves index 0, which the padded tensors use as the pad value — TODO confirm.
SRC_VOCAB_SIZE = len(conv400_tokenizer.index_word) + 1
TGT_VOCAB_SIZE = len(test_tokenizer.index_word) + 1

# The same `units` value is used for both the encoder and the decoder GRU.
units         = 1024
embedding_dim = 512

encoder = Encoder(SRC_VOCAB_SIZE, embedding_dim, units)
decoder = Decoder(TGT_VOCAB_SIZE, embedding_dim, units)

# sample input
sequence_len = 30

# Random values stand in for a batch of token ids; only the output shapes matter here.
sample_enc = tf.random.uniform((BATCH_SIZE, sequence_len))
sample_output = encoder(sample_enc)

print ('Encoder Output:', sample_output.shape)

# Stand-in decoder state with one `units`-sized vector per batch element.
sample_state = tf.random.uniform((BATCH_SIZE, units))

# One decoding step on a single-token input per batch element.
sample_logits, h_dec, attn = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                     sample_state, sample_output)

print ('Decoder Output:', sample_logits.shape)
print ('Decoder Hidden State:', h_dec.shape)
print ('Attention:', attn.shape)

### Step 4: 훈련하기 

In [None]:
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per element so padding can be masked out below.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')


def loss_function(real, pred):
    """Cross-entropy between ``real`` ids and ``pred`` logits, with pad positions zeroed.

    Positions where ``real`` equals 0 contribute a loss of 0; the result is the
    mean over all positions (masked ones included in the denominator).
    """
    per_token = loss_object(real, pred)
    not_pad = tf.cast(tf.math.not_equal(real, 0), dtype=per_token.dtype)

    return tf.reduce_mean(per_token * not_pad)

print("슝~")

In [None]:
@tf.function
def train_step(src, tgt, encoder, decoder, optimizer, dec_tok):
    """Run one teacher-forced optimization step on a single batch.

    Args:
        src: Source token ids for the batch.
        tgt: Target token ids for the batch; column ``t`` is the label at step ``t``.
        encoder: Encoder model called on ``src``.
        decoder: Decoder model called once per target position.
        optimizer: Optimizer used to apply the gradients.
        dec_tok: Tokenizer whose ``word_index`` must contain ``'<start>'``
            (a ``KeyError`` is raised otherwise).
            NOTE(review): confirm the tokenizer passed in was fitted on text that
            actually includes this token.

    Returns:
        The summed per-step loss divided by ``tgt.shape[1]``.
    """
    bsz = src.shape[0]
    loss = 0

    with tf.GradientTape() as tape:
        # training=True switches the models' Dropout layers on. In a custom training
        # loop Keras does not enable training mode by itself, so without the flag
        # dropout was never applied during training.
        enc_out = encoder(src, training=True)
        # The last encoder position serves as the first attention query.
        h_dec = enc_out[:, -1]

        # First decoder input: the '<start>' id for every batch element.
        dec_src = tf.expand_dims([dec_tok.word_index['<start>']]*bsz, 1)

        for t in range(1, tgt.shape[1]):
            pred, h_dec, _ = decoder(dec_src, h_dec, enc_out, training=True)

            loss += loss_function(tgt[:, t], pred)
            # Teacher forcing: the ground-truth token becomes the next input.
            dec_src = tf.expand_dims(tgt[:, t], 1)

    batch_loss = (loss / int(tgt.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables
    # Gradients are taken from the un-normalized summed loss.
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

print("슝~")

In [None]:
# Plain-text tqdm; this rebinds the `tqdm` name imported from tqdm.notebook in the first cell.
from tqdm import tqdm    # progress bar
import random

EPOCHS = 10

# Each epoch visits the training batches once, in a freshly shuffled order.
for epoch in range(EPOCHS):
    total_loss = 0
    
    # Start offsets of the batches; shuffling them changes batch order, not batch contents.
    idx_list = list(range(0, x_train.shape[0], BATCH_SIZE))
    random.shuffle(idx_list)
    t = tqdm(idx_list)    # progress bar over the batch offsets

    for (batch, idx) in enumerate(t):
        # The final slice may hold fewer than BATCH_SIZE rows.
        batch_loss = train_step(x_train[idx:idx+BATCH_SIZE],
                                y_train[idx:idx+BATCH_SIZE],
                                encoder,
                                decoder,
                                optimizer,
                                test_tokenizer)
    
        total_loss += batch_loss
        
        t.set_description_str('Epoch %2d' % (epoch + 1))    # label the bar with the 1-based epoch
        t.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (batch + 1)))    # running mean loss for this epoch