In [4]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from keras.layers import Embedding, LSTM, Dense
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import Bidirectional, Attention
from keras.layers import Input
from keras.models import Sequential
from keras.models import Model
from konlpy.tag import Mecab

In [5]:
# Load the training and test splits of the ratings dataset.
# Both files are tab-separated, so read_csv is told to split on tabs.
train_path = 'C:/Users/ZAKAR/Documents/GitHub/AIFFEL/Exploration/Quest14_EX04/data/ratings_train.txt'
test_path = 'C:/Users/ZAKAR/Documents/GitHub/AIFFEL/Exploration/Quest14_EX04/data/ratings_test.txt'

train_data = pd.read_csv(train_path, sep='\t')
test_data = pd.read_csv(test_path, sep='\t')

# Preview the first rows of the training frame as the cell output.
train_data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [6]:
# Mecab morphological analyzer, pointed at a locally installed dictionary.
tokenizer = Mecab('C:/mecab/mecab-ko-dic')

# Tokens that load_data() drops after tokenization.
stopwords = [
    '의', '가', '이', '은', '들', '는',
    '좀', '잘', '걍', '과', '도', '를',
    '으로', '자', '에', '와', '한', '하다',
]

def load_data(train_data, test_data, num_words=None):
    """Tokenize labelled sentences and encode them as lists of vocabulary indices.

    Relies on the module-level ``tokenizer`` (for ``.morphs``) and ``stopwords``.

    Args:
        train_data: Iterable of ``(sentence, label)`` pairs used to build the
            vocabulary and the training set.
        test_data: Iterable of ``(sentence, label)`` pairs encoded with the
            vocabulary built from ``train_data``.
        num_words: Optional cap on the vocabulary size, counting the four
            reserved tokens. ``None`` keeps every token seen in training.

    Returns:
        ``(X_train, y_train, X_test, y_test, word_to_index)`` where the ``X``
        values are lists of index lists (not padded), the ``y`` values are
        lists of labels, and ``word_to_index`` maps each token to its index.
    """

    def _tokenize(samples):
        # Split each sentence into morphemes and drop stopwords; labels pass through.
        token_lists, labels = [], []
        for sentence, label in samples:
            tokens = [token for token in tokenizer.morphs(sentence) if token not in stopwords]
            token_lists.append(tokens)
            labels.append(label)
        return token_lists, labels

    X_train, y_train = _tokenize(train_data)
    X_test, y_test = _tokenize(test_data)

    # Count tokens directly from the nested lists. The previous
    # np.concatenate(...) raised ValueError when there were no training
    # sentences and needlessly converted Python strings to NumPy arrays.
    counter = Counter(token for tokens in X_train for token in tokens)
    if num_words is not None:
        # Four slots are reserved for the special tokens added below.
        vocabulary = [word for word, _ in counter.most_common(num_words - 4)]
    else:
        vocabulary = list(counter)

    # Regular words start at index 4; indices 0-3 are the reserved tokens.
    word_to_index = {word: index + 4 for index, word in enumerate(vocabulary)}
    word_to_index['<PAD>'] = 0
    word_to_index['<BOS>'] = 1
    word_to_index['<UNK>'] = 2
    word_to_index['<UNUSED>'] = 3

    # Out-of-vocabulary tokens fall back to the <UNK> index.
    unk_index = word_to_index['<UNK>']
    X_train = [[word_to_index.get(word, unk_index) for word in sentence] for sentence in X_train]
    X_test = [[word_to_index.get(word, unk_index) for word in sentence] for sentence in X_test]

    return X_train, y_train, X_test, y_test, word_to_index

# Small hand-written samples used to exercise load_data().
# NOTE(review): these assignments rebind train_data/test_data, replacing the
# DataFrames read from disk earlier in the notebook - confirm this is intended.
train_data = [("안녕하세요", 1), ("반갑습니다", 1), ("잘가요", 0), ("안녕히가세요", 0)]
test_data = [("안녕", 1), ("잘있어요", 0)]

# Vocabulary cap; previously defined but never handed to load_data().
num_words = 10000
X_train, y_train, X_test, y_test, word_to_index = load_data(train_data, test_data, num_words=num_words)

# Reverse lookup table: vocabulary index -> token.
index_to_word = {index: word for word, index in word_to_index.items()}

In [7]:
def get_encoded_sentence(sentence, word_to_index):
    """Encode a whitespace-separated sentence as a list of vocabulary indices.

    The result always starts with the ``<BOS>`` index. Words missing from
    ``word_to_index`` are replaced by the ``<UNK>`` index.
    """
    encoded = [word_to_index['<BOS>']]
    for word in sentence.split():
        if word in word_to_index:
            encoded.append(word_to_index[word])
        else:
            # Looked up only when needed, so a vocabulary without <UNK>
            # still works for fully known sentences.
            encoded.append(word_to_index['<UNK>'])
    return encoded

def get_encoded_sentences(sentences, word_to_index):
    """Encode every sentence in ``sentences`` with get_encoded_sentence()."""
    encoded_batch = []
    for text in sentences:
        encoded_batch.append(get_encoded_sentence(text, word_to_index))
    return encoded_batch

def get_decoded_sentence(encoded_sentence, index_to_word):
    """Turn a list of vocabulary indices back into a space-joined string.

    The first index is skipped (the encoder puts ``<BOS>`` there). Indices
    missing from ``index_to_word`` are rendered as the literal ``<UNK>``.
    """
    words = []
    for index in encoded_sentence[1:]:
        words.append(index_to_word.get(index, '<UNK>'))
    return ' '.join(words)

def get_decoded_sentences(encoded_sentences, index_to_word):
    """Decode every index list in ``encoded_sentences`` with get_decoded_sentence()."""
    decoded_batch = []
    for encoded in encoded_sentences:
        decoded_batch.append(get_decoded_sentence(encoded, index_to_word))
    return decoded_batch

In [None]:
vocab_size = len(word_to_index)

# `max_length` was used below without ever being assigned. Derive it from the
# longest encoded training sentence, but never go below the Conv1D kernel
# size: a 'valid' convolution cannot run on a shorter sequence.
conv_kernel_size = 5
longest_train_sentence = max((len(tokens) for tokens in X_train), default=0)
max_length = max(longest_train_sentence, conv_kernel_size)

# Model 1: embedding -> LSTM -> sigmoid classifier.
model1 = Sequential()
model1.add(Embedding(input_dim=vocab_size, output_dim=4, input_length=max_length))
model1.add(LSTM(units=128))
model1.add(Dense(units=1, activation='sigmoid'))

# Model 2: embedding -> 1D convolution -> global max pooling -> sigmoid classifier.
model2 = Sequential()
model2.add(Embedding(input_dim=vocab_size, output_dim=4, input_length=max_length))
model2.add(Conv1D(filters=128, kernel_size=conv_kernel_size, activation='relu'))
model2.add(GlobalMaxPooling1D())
model2.add(Dense(units=1, activation='sigmoid'))

# Model 3: embedding -> bidirectional LSTM -> self-attention -> pooling -> sigmoid.
# Attention must be called on a list [query, value], which a Sequential stack
# cannot express, so this model uses the functional API. The BiLSTM output is
# passed as both query and value (self-attention).
sequence_input = Input(shape=(max_length,))
embedded = Embedding(input_dim=vocab_size, output_dim=4)(sequence_input)
encoded = Bidirectional(LSTM(units=64, return_sequences=True))(embedded)
attended = Attention()([encoded, encoded])
# Collapse the time axis so the final Dense layer emits one score per sample.
pooled = GlobalMaxPooling1D()(attended)
prediction = Dense(units=1, activation='sigmoid')(pooled)
model3 = Model(inputs=sequence_input, outputs=prediction)

In [None]:
def _to_padded_array(sequences, length):
    """Return a (len(sequences), length) int32 array from ragged index lists.

    Shorter sequences are left-padded with 0 (the <PAD> index); longer ones
    keep only their last ``length`` entries.
    """
    padded = np.zeros((len(sequences), length), dtype='int32')
    for row, sequence in enumerate(sequences):
        kept = sequence[-length:]
        if kept:
            padded[row, length - len(kept):] = kept
    return padded


# load_data() returns plain Python lists of differing lengths, which fit()
# cannot batch. Pad/truncate to the models' input length and convert the
# labels to arrays as well.
X_train_padded = _to_padded_array(X_train, max_length)
X_test_padded = _to_padded_array(X_test, max_length)
y_train_array = np.asarray(y_train)
y_test_array = np.asarray(y_test)

model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

epochs = 10
batch_size = 32

# Train the three models in turn; the test split is used as validation data.
model1.fit(X_train_padded, y_train_array, epochs=epochs, batch_size=batch_size,
           validation_data=(X_test_padded, y_test_array))
model2.fit(X_train_padded, y_train_array, epochs=epochs, batch_size=batch_size,
           validation_data=(X_test_padded, y_test_array))
model3.fit(X_train_padded, y_train_array, epochs=epochs, batch_size=batch_size,
           validation_data=(X_test_padded, y_test_array))

In [None]:
def plot_history(model, title_prefix):
    """Plot training/validation loss and accuracy curves for a fitted model.

    Args:
        model: A model whose ``history.history`` dict holds the keys
            'loss', 'val_loss', 'accuracy' and 'val_accuracy'.
        title_prefix: Text placed before ' - Loss' / ' - Accuracy' in the
            subplot titles.
    """
    history = model.history.history
    fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 4))

    loss_ax.plot(history['loss'], label='Training Loss')
    loss_ax.plot(history['val_loss'], label='Validation Loss')
    loss_ax.set(title=f'{title_prefix} - Loss', xlabel='Epoch', ylabel='Loss')
    loss_ax.legend()

    accuracy_ax.plot(history['accuracy'], label='Training Accuracy')
    accuracy_ax.plot(history['val_accuracy'], label='Validation Accuracy')
    accuracy_ax.set(title=f'{title_prefix} - Accuracy', xlabel='Epoch', ylabel='Accuracy')
    accuracy_ax.legend()

    fig.tight_layout()
    plt.show()


# One figure per model, replacing three copies of the same plotting code.
for model_number, trained_model in enumerate((model1, model2, model3), start=1):
    plot_history(trained_model, f'Model {model_number}')

In [None]:
# No layer in model1 was given the name 'embedding_layer', so
# get_layer('embedding_layer') cannot find it. The Embedding layer is the
# first layer added to model1, so take it by position instead.
embedding_weights = model1.layers[0].get_weights()[0]

print("Embedding Weights Shape:", embedding_weights.shape)

# Summary statistics computed over every entry of the embedding matrix.
avg_embedding = np.mean(embedding_weights)
std_embedding = np.std(embedding_weights)
print("Average Embedding Value:", avg_embedding)
print("Standard Deviation of Embedding:", std_embedding)

In [None]:
# Load a saved Word2Vec model from disk.
word2vec_file_path = '/data/word2vec_ko.model'
word2vec_model = Word2Vec.load(word2vec_file_path)

# Look up the vector stored for a single word.
keyed_vectors = word2vec_model.wv
vector = keyed_vectors['끝']