In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

### 데이터 처리

In [3]:
data = pd.read_csv('data/custom_train.csv')


In [4]:
label_dict = {
    '협박 대화': 0,
    '갈취 대화': 1,
    '직장 내 괴롭힘 대화': 2,
    '기타 괴롭힘 대화': 3,
    '일반 대화': 4
}

data['label'] = data['class'].map(label_dict)

In [5]:
import re

def preprocess_sentence(sentence):
    # 단어와 구두점(punctuation) 사이의 거리를 만듭니다.
    # 예를 들어서 "I am a student." => "I am a student ."와 같이
    # student와 온점 사이에 거리를 만듭니다.
    sentence = sentence.replace("\n", "")         # 구분자
    sentence = sentence.replace("\r", "")         # 구분자
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)

    # (a-z, A-Z,가-힣,0-9, ".", "?", "!", ",")를 제외한 모든 문자를 공백인 ' '로 대체합니다.
    sentence = re.sub(r"[^a-zA-Z가-힣0-9\.\?\!,]"," ",sentence)
    sentence = sentence.strip()
    return sentence

In [6]:
data['conversation'] = data['conversation'].apply(preprocess_sentence)

In [7]:
data_prepcocessed = data.copy()

In [38]:
# max len길이를 보기위해
total_data_text = list(data['conversation'])
# 텍스트데이터 문장길이의 리스트를 생성한 후
num_tokens = [len(tokens) for tokens in total_data_text]
num_tokens = np.array(num_tokens)
# 문장길이의 평균값, 최대값, 표준편차를 계산해 본다. 
print('문장길이 평균 : ', np.mean(num_tokens))
print('문장길이 최대 : ', np.max(num_tokens))
print('문장길이 표준편차 : ', np.std(num_tokens))

# 예를들어, 최대 길이를 (평균 + 2*표준편차)로 한다면,  
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
maxlen = int(max_tokens)
print('pad_sequences maxlen : ', maxlen)
print(f'전체 문장의 {np.sum(num_tokens < max_tokens) / len(num_tokens)}%가 maxlen 설정값 이내에 포함됩니다. ')

문장길이 평균 :  240.11326732673268
문장길이 최대 :  906
문장길이 표준편차 :  98.91159089852438
pad_sequences maxlen :  437
전체 문장의 0.96%가 maxlen 설정값 이내에 포함됩니다. 


In [9]:
# 파라미터

MAX_WORDS = 5000
MAX_LEN = 100

### 일반 토크나이저

In [None]:
tokenizer = Tokenizer(num_words=MAX_WORDS, lower=True)
tokenizer.fit_on_texts(data['conversation'].values)

X_train = tokenizer.texts_to_sequences(data['conversation'].values)
X_train = pad_sequences(X_train, maxlen=MAX_LEN)

### 서브워드 토크나이저

In [33]:
import tensorflow_datasets as tfds

# 질문과 답변 데이터셋에 대해서 Vocabulary 생성
tokenizer_sw = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(data['conversation'] , target_vocab_size=2**13)

In [35]:
START_TOKEN, END_TOKEN = [tokenizer_sw.vocab_size], [tokenizer_sw.vocab_size + 1]
VOCAB_SIZE = tokenizer_sw.vocab_size + 2

In [39]:
# 정수 인코딩, 최대 길이를 초과하는 샘플 제거, 패딩
def tokenize_and_filter(inputs):
    outputs = []
  
    for sentence in inputs:
        # 정수 인코딩 과정에서 시작 토큰과 종료 토큰을 추가
        sentence = START_TOKEN + tokenizer.encode(sentence) + END_TOKEN

        # 최대 길이 28 이하인 경우에만 데이터셋으로 허용
        if len(sentence) <= MAX_LENGTH :
            outputs.append(sentence)    
        
    # 최대 길이 28으로 모든 데이터셋을 패딩
    outputs = tf.keras.preprocessing.sequence.pad_sequences(
        outputs, maxlen=MAX_LENGTH, padding='post')

  
    return outputs

In [None]:
X_train_sw = tokenize_and_filter(data['conversation'])

In [10]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(data['label']) # label로 주어 올바른 순서로 훈련

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
아래는 서브워드
# X_train, X_test, y_train, y_test = train_test_split(X_train_sw, y_train, test_size=0.2, random_state=42)

y_train = tf.keras.utils.to_categorical(y_train, num_classes=len(np.unique(y_train)))
y_test = tf.keras.utils.to_categorical(y_test, num_classes=len(np.unique(y_test)))

### fasttext

In [11]:
# 파라미터
EPOCH_FT = 100
BUCKET_FT = 20000
NGRAM_FT = 2
DIM_FT = 100

In [12]:
def add_label(x):
    label = x['label']
    return f'''__label__{label}    {x['conversation']}'''
    

add_label_text = data_prepcocessed.apply(add_label, axis=1)


In [13]:
file_path = 'input_fasttext_conversation.txt'

with open(file_path, 'w', encoding='utf8') as f:
    f.write('\n'.join(add_label_text))

In [14]:
# ! pip install fasttext

In [15]:
import fasttext

model_ft = fasttext.train_supervised(input=file_path,
                                  epoch=EPOCH_FT,
                                  bucket = BUCKET_FT,
                                  lr = 1
                                  wordNgrams=NGRAM_FT,
                                  dim=DIM_FT,
                                  )

Read 0M words
Number of words:  72778
Number of labels: 5
Progress: 100.0% words/sec/thread: 1073848 lr:  0.000000 avg.loss:  0.027765 ETA:   0h 0m 0s


### BiLSTM

In [16]:
model_lstm = Sequential()
model_lstm.add(Embedding(MAX_WORDS, 128, input_length=MAX_LEN))
model_lstm.add(SpatialDropout1D(0.2))
model_lstm.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
model_lstm.add(Dense(64, activation='relu'))
model_lstm.add(Dense(len(np.unique(data['class'])), activation='softmax'))

model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_lstm.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 128)          640000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 100, 128)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 325       
Total params: 747,397
Trainable params: 747,397
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
EPOCHS_LSTM = 1
BATCH_SIZE_LSTM = 64

history = model_lstm.fit(X_train, y_train, 
                    epochs=EPOCHS_LSTM, 
                    batch_size=BATCH_SIZE_LSTM, 
                    validation_split=0.2, 
                    callbacks=[tf.keras.callbacks.EarlyStopping(
                        monitor='val_loss', patience=3, min_delta=0.0001)])



### tranformer

In [18]:
import transformer

In [19]:
# 파라미터
NUM_LAYERS = 12 # 인코더와 디코더의 층의 개수 
D_MODEL = 128 # 인코더와 디코더 내부의 입/출력의 고정 차원
NUM_HEADS = 4 # 멀티 헤드 어텐션에서의 헤드 수 
UNITS = 256 # 피드 포워드 신경망의 은닉층의 크기
DROPOUT = 0.1 # 드롭아웃의 비율

NUM_CLASSES = len(data['class'].unique())  #레이블 수
VOCAB_SIZE = MAX_WORDS #단어사전 크기
MAX_LENGTH = X_train.shape[1] # maxlen

In [20]:
model= transformer.transformer(
    vocab_size=VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    units=UNITS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    num_classes=NUM_CLASSES,
    dropout=DROPOUT)

In [22]:
EPOCHS = 1
BATCH_SIZE = 64

model.compile(optimizer='adam',
              loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(X_train,
                    y_train,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    validation_data=(X_test, y_test),
                    verbose=1)



## 앙상블

In [23]:
test = pd.read_csv('data/custom_test.csv')


In [24]:
test['conversation'] = test['conversation'].apply(preprocess_sentence)
tokenizer.fit_on_texts(test['conversation'].values)

test_input = tokenizer.texts_to_sequences(test['conversation'].values)
test_input = pad_sequences(test_input, maxlen=MAX_LEN_LSTM)

In [26]:
pred_ft =  model_ft.predict(list(test['conversation']), k=-1)
pred_lstm = model_lstm.predict(test_input)
pred_transformer = model.predict(test_input)

In [30]:
pred = pred_ft[1] + pred_lstm + pred_transformer

predicted_classes = np.argmax(pred, axis=1)

## 제출

In [31]:
sub = pd.read_csv("data/new_submission.csv")
sub['class']=predicted_classes
# sub.to_csv('data/sub.csv')

In [32]:
sub

Unnamed: 0,file_name,class
0,t_000,0
1,t_001,0
2,t_002,0
3,t_003,4
4,t_004,0
...,...,...
495,t_495,0
496,t_496,0
497,t_497,0
498,t_498,0
