# GoingDeeper NLP 프로젝트: 멋진 단어사전 만들기

- vocab size = 8000


In [1]:
import tensorflow as tf
import numpy as np
import matplotlib as plt
import konlpy

print(tf.__version__)
print(np.__version__)
print(plt.__version__)
print(konlpy.__version__)

2.6.0
1.21.4
3.4.3
0.5.2


### 네이버 영화리뷰 감정 분석 문제  
  
- KoNLPy 형태소 분석기를 사용한 모델과 성능 비교하기  
- SentencePiece 모델의 model_type, vocab_size 등을 변경해 가면서 성능 개선 여부 확인하기  

In [2]:
import sentencepiece as spm
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from konlpy.tag import Mecab, Okt
import os
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score, f1_score


In [3]:
def eda(file_path):
    df = pd.read_csv(file_path, sep="\t").dropna()
    print("데이터 개수:", len(df))
    print("라벨 분포:")
    print(df['label'].value_counts())
    df['length'] = df['document'].apply(lambda x: len(str(x)))
    print("문장 길이 통계:")
    print(df['length'].describe(),'\n')
    return df

In [4]:
df_train = eda("./data/ratings_train.txt")
df_test = eda("./data/ratings_test.txt")

데이터 개수: 149995
라벨 분포:
0    75170
1    74825
Name: label, dtype: int64
문장 길이 통계:
count    149995.000000
mean         35.204527
std          29.531890
min           1.000000
25%          16.000000
50%          27.000000
75%          42.000000
max         146.000000
Name: length, dtype: float64 

데이터 개수: 49997
라벨 분포:
1    25171
0    24826
Name: label, dtype: int64
문장 길이 통계:
count    49997.000000
mean        35.320259
std         29.648310
min          1.000000
25%         16.000000
50%         27.000000
75%         43.000000
max        144.000000
Name: length, dtype: float64 



In [5]:
# 데이터 전처리
def preprocess_text(df):
    df['document'] = df['document'].astype(str).str.strip()
    df['document'] = df['document'].str.replace("[^가-힣0-9a-zA-Z\s]", "", regex=True)
    return df

df_train = preprocess_text(df_train)
df_test = preprocess_text(df_test)

In [6]:
df_train = df_train.drop_duplicates(subset=['document']).reset_index(drop=True)
df_test = df_test.drop_duplicates(subset=['document']).reset_index(drop=True)

In [7]:
print(len(df_train))
df_train.head()

143899


Unnamed: 0,id,document,label,length
0,9976970,아 더빙 진짜 짜증나네요 목소리,0,19
1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1,33
2,10265843,너무재밓었다그래서보는것을추천한다,0,17
3,9045019,교도소 이야기구먼 솔직히 재미는 없다평점 조정,0,29
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던...,1,61


In [8]:
print(len(df_test))
df_test.head()

48552


Unnamed: 0,id,document,label,length
0,6270596,굳,1,3
1,9274899,GDNTOPCLASSINTHECLUB,0,20
2,8544678,뭐야 이 평점들은 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0,38
3,6825595,지루하지는 않은데 완전 막장임 돈주고 보기에는,0,32
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데 왜 3D로 나와서 제 심기를 불편하게 하죠,0,49


In [9]:
# SentencePiece 모델 학습을 위한 데이터 준비
def prepare_sentencepiece_data(df, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        for text in df['document']:
            if isinstance(text, str) and text.strip():  # 빈 문자열 제외
                f.write(text + '\n')
prepare_sentencepiece_data(df_train, 'naver_corpus.txt')

In [10]:
# SentencePiece 토큰화 함수
def sp_tokenize(sp, corpus, maxlen):
    tensor = [sp.encode_as_ids(sen) for sen in corpus]
    tensor = pad_sequences(tensor, maxlen=maxlen, padding='post')
    return tensor

In [11]:
# SentencePiece 모델 학습 및 평가
def train_sentencepiece_model(model_type):
    spm.SentencePieceTrainer.Train(
        f'--input=naver_corpus.txt --model_prefix=sp_model5000 --vocab_size=5000 '
        f'--model_type={model_type} --character_coverage=1.0 --minloglevel=0'
    )
    sp = spm.SentencePieceProcessor()
    sp.load('sp_model.model')
    return sp

In [12]:
def tokenize_mecab(corpus):
    mecab = Mecab()
    return [mecab.morphs(sentence) for sentence in corpus]

In [13]:
def tokenize_okt(corpus):
    okt = Okt()
    return [okt.morphs(sentence) for sentence in corpus]

In [14]:
# 데이터 패딩 적용
def tokenizer_and_pad(tokenized_corpus, maxlen):
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(tokenized_corpus)
    tensor = tokenizer.texts_to_sequences(tokenized_corpus)
    tensor = pad_sequences(tensor, maxlen=maxlen, padding='post')
    return tensor

In [15]:
# 라벨 데이터 정의
y = np.array(df_train['label'].tolist())

In [16]:
def train_and_evaluate(X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42, shuffle=True)
    
    model = Sequential([
        Embedding(8000, 256, input_length=X.shape[1]),
        LSTM(128, return_sequences=True),
        LSTM(64),
        BatchNormalization(),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    
    optimizer = Adam(learning_rate=0.0005)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    
    history = model.fit(
        X_train, y_train,
        epochs=20,
        batch_size=64,
        validation_data=(X_val, y_val),
        callbacks=[early_stopping]
    )
    return model, history

In [17]:
# 실험 수행 및 비교
results = {}
epochs_completed = {}

In [18]:
# SentencePiece 비교 (bpe, unigram, word, char)
for model_type in ['bpe', 'unigram', 'word', 'char']:
    sp_model = train_sentencepiece_model(model_type)
    X_sp = sp_tokenize(sp_model, df_train['document'].tolist(), 65)
    model, history = train_and_evaluate(X_sp, y)
    results[f'SentencePiece-{model_type}'] = model.evaluate(X_sp, y)
    epochs_completed[f'SentencePiece-{model_type}'] = len(history.history['loss'])

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=naver_corpus.txt --model_prefix=sp_model5000 --vocab_size=5000 --model_type=bpe --character_coverage=1.0 --minloglevel=0
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: naver_corpus.txt
  input_format: 
  model_prefix: sp_model5000
  model_type: BPE
  vocab_size: 5000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [19]:
# Mecab 비교
mecab_tokens = tokenize_mecab(df_train['document'].tolist())
X_mecab = tokenizer_and_pad(mecab_tokens, 65)
model, history = train_and_evaluate(X_mecab, y)
results['Mecab'] = model.evaluate(X_mecab, y)
epochs_completed['Mecab'] = len(history.history['loss'])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


In [20]:
# Okt 비교
okt_tokens = tokenize_okt(df_train['document'].tolist())
X_okt = tokenizer_and_pad(okt_tokens, 65)
model, history = train_and_evaluate(X_okt, y)
results['Okt'] = model.evaluate(X_okt, y)
epochs_completed['Okt'] = len(history.history['loss'])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


In [21]:
# 비교 결과 출력
for method, (loss, accuracy) in results.items():
    print(f"{method}: Loss={loss:.4f}, Accuracy={accuracy:.4f}, Epochs Completed={epochs_completed[method]}")

SentencePiece-bpe: Loss=0.2940, Accuracy=0.8749, Epochs Completed=5
SentencePiece-unigram: Loss=0.2889, Accuracy=0.8739, Epochs Completed=5
SentencePiece-word: Loss=0.2538, Accuracy=0.9131, Epochs Completed=8
SentencePiece-char: Loss=0.2862, Accuracy=0.8767, Epochs Completed=5
Mecab: Loss=0.2446, Accuracy=0.9044, Epochs Completed=6
Okt: Loss=0.3101, Accuracy=0.8835, Epochs Completed=6


## 정리
| Tokenizer           | Loss  | Accuracy | Epochs |
|--------------------|-------|----------|------------------|
| SentencePiece-bpe  | 0.2940 | 0.8749   | 5                |
| SentencePiece-unigram | 0.2889 | 0.8739   | 5                |
| SentencePiece-word | 0.2538 | 0.9131   | 8                |
| SentencePiece-char | 0.2862 | 0.8767   | 5                |
| Mecab             | 0.2446 | 0.9044   | 6                |
| Okt               | 0.3101 | 0.8835   | 6                |



- 그전 vocab 사이즈에 따라 accuary가 달라지는 것을 보아. 데이터의 크기가 크면 클수록 형태소 분석 토큰화가 더 성능이 좋아질 것을 보임.
- 실험을 해봐야함.