In [4]:
import pandas as pd
import numpy as np

from collections import Counter
from konlpy.tag import Okt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import load_model

from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [5]:
df_train = pd.read_csv("korean_unsmile_dataset-main/unsmile_train_v1.0.tsv",delimiter='\t')
df_valid = pd.read_csv("korean_unsmile_dataset-main/unsmile_valid_v1.0.tsv",delimiter='\t')

In [6]:
df_train = df_train.drop(['여성/가족', '남성', '성소수자', '인종/국적', '연령', '지역', '종교', '기타 혐오','악플/욕설','개인지칭'], axis=1)
df_valid = df_valid.drop(['여성/가족', '남성', '성소수자', '인종/국적', '연령', '지역', '종교', '기타 혐오','악플/욕설','개인지칭'], axis=1)

In [7]:
df_train['target'] = df_train['clean'].map({1:0, 0:1})
df_valid['target'] = df_valid['clean'].map({1:0, 0:1})

In [8]:
okt = Okt()

In [9]:
print('OKT 형태소 분석 :',okt.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
print('OKT 품사 태깅 :',okt.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
print('OKT 명사 추출 :',okt.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))

OKT 형태소 분석 : ['열심히', '코딩', '한', '당신', ',', '연휴', '에는', '여행', '을', '가봐요']
OKT 품사 태깅 : [('열심히', 'Adverb'), ('코딩', 'Noun'), ('한', 'Josa'), ('당신', 'Noun'), (',', 'Punctuation'), ('연휴', 'Noun'), ('에는', 'Josa'), ('여행', 'Noun'), ('을', 'Josa'), ('가봐요', 'Verb')]
OKT 명사 추출 : ['코딩', '당신', '연휴', '여행']


In [10]:
def make_corpus(sentence):
    okt = Okt()
    raw_pos_tagged = okt.pos(sentence, norm=True, stem=True)
    word_cleaned = []

    for word in raw_pos_tagged:
        if word[1] in ['Noun', 'Adjective', 'Verb']:
            word_cleaned.append(word[0])
    return ' '.join(word_cleaned)

In [11]:
df_train['tokenized'] = df_train['문장'].map(make_corpus)
df_valid['tokenized'] = df_valid['문장'].map(make_corpus)

In [12]:
tfidf_vect = TfidfVectorizer(max_features=100)
tfidf_vect.fit(df_train.tokenized)
df_train_tfidf = tfidf_vect.transform(df_train.tokenized)
df_valid_tfidf = tfidf_vect.transform(df_valid.tokenized)

In [13]:
df_train_tfidf = df_train_tfidf.toarray()
df_valid_tfidf = df_valid_tfidf.toarray()

In [14]:
X_train = df_train_tfidf
X_test = df_valid_tfidf
y_train = df_train['target']
y_test = df_valid['target']

In [15]:
print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)

X_train shape:  (15005, 100)
X_test shape:  (3737, 100)
y_train shape:  (15005,)
y_test shape:  (3737,)


In [16]:
embedding_dim = 100
hidden_units = 128

model = Sequential()
model.add(Embedding(6, embedding_dim))
# model.add(Bidirectional(LSTM(hidden_units))) # Bidirectional LSTM을 사용
model.add(LSTM(hidden_units))
model.add(Dense(1, activation='sigmoid'))

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('best_model_old.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, y_train, epochs=5, callbacks=[es, mc], batch_size=256, validation_split=0.2)

Epoch 1/5

Epoch 00001: val_acc improved from -inf to 0.74375, saving model to best_model_old.h5
Epoch 2/5

Epoch 00002: val_acc did not improve from 0.74375
Epoch 3/5

Epoch 00003: val_acc did not improve from 0.74375
Epoch 4/5

Epoch 00004: val_acc did not improve from 0.74375
Epoch 5/5

Epoch 00005: val_acc did not improve from 0.74375


In [17]:
loaded_model = load_model('best_model_old.h5')

print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))


 테스트 정확도: 0.7498


In [18]:
embedding_dim = 100
hidden_units = 128

model = Sequential()
model.add(Embedding(6, embedding_dim))
model.add(Bidirectional(LSTM(hidden_units))) # Bidirectional LSTM을 사용
# model.add(LSTM(hidden_units))
model.add(Dense(1, activation='sigmoid'))

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('best_model_old.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, y_train, epochs=5, callbacks=[es, mc], batch_size=256, validation_split=0.2)

Epoch 1/5

Epoch 00001: val_acc improved from -inf to 0.74375, saving model to best_model_old.h5
Epoch 2/5

Epoch 00002: val_acc did not improve from 0.74375
Epoch 3/5

Epoch 00003: val_acc did not improve from 0.74375
Epoch 4/5

Epoch 00004: val_acc did not improve from 0.74375
Epoch 5/5

Epoch 00005: val_acc did not improve from 0.74375


In [19]:
loaded_model = load_model('best_model_old.h5')

print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))


 테스트 정확도: 0.7498
