In [1]:
# 1. 환경 설정 및 데이터 로드
import pandas as pd
from sklearn.model_selection import train_test_split

# 'book_reviews.csv'에는 컬럼 'review', 'genre'가 있습니다.
df = pd.read_csv('book_reviews.csv')
print(df.shape)      # (샘플 수, 2)
print(df.head())


(4657, 4)
   index                      title    genre   
0      0          Drowned Wednesday  fantasy  \
1      1              The Lost Hero  fantasy   
2      2  The Eyes of the Overworld  fantasy   
3      3            Magic's Promise  fantasy   
4      4             Taran Wanderer  fantasy   

                                             summary  
0   Drowned Wednesday is the first Trustee among ...  
1   As the book opens, Jason awakens on a school ...  
2   Cugel is easily persuaded by the merchant Fia...  
3   The book opens with Herald-Mage Vanyel return...  
4   Taran and Gurgi have returned to Caer Dallben...  


In [2]:
import re
# 2. 텍스트 전처리 및 토큰화
def preprocess_text(text):
    # 소문자 변환
    text = text.lower()

    # 한글, 영어, 숫자, 공백만 남기기
    text = re.sub(r'[^가-힣a-z0-9\s]', '', text)

    # 토큰화
    tokens = text.split()

    return tokens

df['tokens'] = df['summary'].apply(preprocess_text)
print("\n토큰화 결과:")
print(df[['summary', 'tokens']])


토큰화 결과:
                                                summary   
0      Drowned Wednesday is the first Trustee among ...  \
1      As the book opens, Jason awakens on a school ...   
2      Cugel is easily persuaded by the merchant Fia...   
3      The book opens with Herald-Mage Vanyel return...   
4      Taran and Gurgi have returned to Caer Dallben...   
...                                                 ...   
4652  Atticus O’Sullivan, last of the Druids, lives ...   
4653  Charlie Bucket's wonderful adventure begins wh...   
4654  "I live for the dream that my children will be...   
4655  Rose loves Dimitri, Dimitri might love Tasha, ...   
4656  The Prince of no value\nBrishen Khaskem, princ...   

                                                 tokens  
0     [drowned, wednesday, is, the, first, trustee, ...  
1     [as, the, book, opens, jason, awakens, on, a, ...  
2     [cugel, is, easily, persuaded, by, the, mercha...  
3     [the, book, opens, with, heraldmage, vanyel,

In [3]:
# 3. FastText 임베딩 로드
%pip install numpy==1.24.3 pandas==2.0.1 torch==2.0.1 scikit-learn gensim

import gensim.downloader as api
fasttext = api.load("fasttext-wiki-news-subwords-300")
print("\nFastText 벡터 차원:", fasttext.vector_size)


FastText 벡터 차원: 300


In [4]:
# 4. 단어 사전(vocab) 구축 및 시퀀스 변환
from collections import Counter
MAX_VOCAB_SIZE = 10000
all_tokens = [tok for tokens in df['tokens'] for tok in tokens]
vocab_count = Counter(all_tokens)

# 가장 빈도 높은 MAX_VOCAB_SIZE-1개의 단어를 뽑아 인덱스 매핑 생성
word2idx = {'<OOV>': 0}  # idx 0은 OOV용으로 예약
for word, count in vocab_count.most_common(MAX_VOCAB_SIZE - 1):
    word2idx[word] = len(word2idx)

def tokens_to_sequence(tokens, word2idx, maxlen=100):
    # 각 토큰을 인덱스로 변환, OOV는 0
    seq = [word2idx.get(token, 0) for token in tokens]

    # 길이 > maxlen이면 자르고, < maxlen이면 0으로 패딩
    if len(seq) > maxlen:
        seq = seq[:maxlen]
    else:
        seq = seq + [0] * (maxlen - len(seq))

    return seq

df['seq'] = df['tokens'].apply(lambda toks: tokens_to_sequence(toks, word2idx, maxlen=100))

In [5]:
# 5. 학습/검증 데이터 준비
import numpy as np
X = np.stack(df['seq'].values)              # (N, maxlen)
y = df['genre'].factorize()[0]              # [0,1,2,...]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [6]:
# 6. PyTorch 모델 구현
import torch
import torch.nn as nn

# 6-1) 임베딩 레이어 초기화
vocab_size = len(word2idx) + 1
embed_dim = fasttext.vector_size

embedding_matrix = torch.zeros(vocab_size, embed_dim)
# word2idx에 해당하는 fasttext 벡터로 embedding_matrix 채우기
for word, idx in word2idx.items():
    if word in fasttext.key_to_index and idx != 0:
        embedding_matrix[idx] = torch.FloatTensor(fasttext[word])

embedding_layer = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
# embedding_layer.weight.data에 embedding_matrix 할당
embedding_layer.weight.data.copy_(embedding_matrix)
# 임베딩 고정(freeze)
embedding_layer.weight.requires_grad = False

# 6-2) 분류기 클래스 정의
class VanillaFC(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes)
        )
    def forward(self, x):
        # 평균 임베딩 벡터 입력 → 예측 로직
        x = embedding_layer(x)  # (batch, seq_len, embed_dim)
        x = x.mean(dim=1)  # 시퀀스 차원에서 평균 (batch, embed_dim)
        return self.fc(x)

class RNNClassifier(nn.Module):
    def __init__(self, embedding_layer, hidden_dim, num_classes):
        super().__init__()
        # embedding_layer 복사
        self.embedding = embedding_layer
        # nn.RNN 정의
        self.rnn = nn.RNN(embedding_layer.embedding_dim, hidden_dim,
                          batch_first=True, num_layers=1)
        # 마지막 시점 hidden → fc
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        # 임베딩 적용
        x = self.embedding(x)  # (batch, seq_len, embed_dim)
        # RNN 처리
        _, hidden = self.rnn(x)  # hidden: (1, batch, hidden_dim)
        # 마지막 hidden state로 분류
        return self.fc(hidden.squeeze(0))

class LSTMClassifier(nn.Module):
    def __init__(self, embedding_layer, hidden_dim, num_classes):
        super().__init__()
        self.embedding = embedding_layer
        # nn.LSTM 정의
        self.lstm = nn.LSTM(embedding_layer.embedding_dim, hidden_dim,
                            batch_first=True, num_layers=1)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        # LSTM 처리
        _, (hidden, _) = self.lstm(x)
        # 마지막 hidden state로 분류
        return self.fc(hidden.squeeze(0))

class GRUClassifier(nn.Module):
    def __init__(self, embedding_layer, hidden_dim, num_classes):
        super().__init__()
        self.embedding = embedding_layer
        # nn.GRU 정의
        self.gru = nn.GRU(embedding_layer.embedding_dim, hidden_dim,
                          batch_first=True, num_layers=1)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        # GRU 처리
        _, hidden = self.gru(x)
        # 마지막 hidden state로 분류
        return self.fc(hidden.squeeze(0))

  embedding_matrix[idx] = torch.FloatTensor(fasttext[word])


In [7]:
# 7. 학습 및 평가 함수
from torch.utils.data import DataLoader, TensorDataset

def train(model, optimizer, criterion, loader, device):
    model.train()
    total_loss = 0
    for batch_x, batch_y in loader:
        # 배치 데이터를 디바이스로 이동
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        # 기울기 초기화
        optimizer.zero_grad()

        # 순전파
        outputs = model(batch_x)

        # loss 계산
        loss = criterion(outputs, batch_y)

        # 역전파
        loss.backward()

        # 가중치 업데이트
        optimizer.step()

        # 배치 손실 누적
        total_loss += loss.item()

    # 평균 손실 반환
    return total_loss / len(loader)

def evaluate(model, loader, device):
    model.eval()
    correct = 0
    total = 0

    # 기울기 계산 비활성화
    with torch.no_grad():
        for batch_x, batch_y in loader:
            # 배치 데이터를 디바이스로 이동
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            # 예측
            outputs = model(batch_x)

            # 최대 확률 클래스 선택
            _, predicted = torch.max(outputs, 1)

            # 전체 샘플 수 누적
            total += batch_y.size(0)

            # 정확히 예측한 샘플 수 누적
            correct += (predicted == batch_y).sum().item()

    # 정확도 반환
    accuracy = correct / total
    return accuracy

In [11]:
# 8. 모델 학습 및 성능 비교
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import torch.nn as nn

# (1) device, num_classes, hidden_dim 재정의
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_classes = len(set(y))    # y는 5번에서 분할된 전체 레이블
hidden_dim  = 64

# (2) embedding_layer도 device로 이동 (FC 모델에서 사용됨)
embedding_layer = embedding_layer.to(device)

# DataLoader 준비
batch_size = 32
train_ds = TensorDataset(
    torch.tensor(X_train, dtype=torch.long),
    torch.tensor(y_train, dtype=torch.long)
)
val_ds = TensorDataset(
    torch.tensor(X_test,  dtype=torch.long),
    torch.tensor(y_test,  dtype=torch.long)
)
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_ds,  batch_size=batch_size, shuffle=False)

# 학습 하이퍼파라미터
num_epochs    = 10
learning_rate = 1e-3

# 모델 정의
models = {
    'FC':   VanillaFC(input_dim=embed_dim, num_classes=num_classes),
    'RNN':  RNNClassifier(embedding_layer, hidden_dim, num_classes),
    'LSTM': LSTMClassifier(embedding_layer, hidden_dim, num_classes),
    'GRU':  GRUClassifier(embedding_layer, hidden_dim, num_classes),
}

# 학습/검증 루프
for name, model in models.items():
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    print(f"\n>> Training {name} model")
    for epoch in range(1, num_epochs+1):
        train_loss = train(model, optimizer, criterion, train_loader, device)
        val_acc    = evaluate(model, val_loader, device)
        print(f"Epoch {epoch:02d} — train_loss: {train_loss:.4f}, val_acc: {val_acc:.4f}")

    final_acc = evaluate(model, val_loader, device)
    print(f"[{name}] Final Validation Accuracy: {final_acc:.4f}")



>> Training FC model
Epoch 01 — train_loss: 2.0989, val_acc: 0.2200
Epoch 02 — train_loss: 2.0078, val_acc: 0.2736
Epoch 03 — train_loss: 1.9669, val_acc: 0.2908
Epoch 04 — train_loss: 1.9054, val_acc: 0.3187
Epoch 05 — train_loss: 1.8416, val_acc: 0.3423
Epoch 06 — train_loss: 1.7729, val_acc: 0.4099
Epoch 07 — train_loss: 1.7138, val_acc: 0.4088
Epoch 08 — train_loss: 1.6556, val_acc: 0.4496
Epoch 09 — train_loss: 1.6071, val_acc: 0.4496
Epoch 10 — train_loss: 1.5579, val_acc: 0.4678
[FC] Final Validation Accuracy: 0.4678

>> Training RNN model
Epoch 01 — train_loss: 2.0626, val_acc: 0.2200
Epoch 02 — train_loss: 2.0220, val_acc: 0.2264
Epoch 03 — train_loss: 2.0083, val_acc: 0.2264
Epoch 04 — train_loss: 1.9845, val_acc: 0.2414
Epoch 05 — train_loss: 1.9563, val_acc: 0.2436
Epoch 06 — train_loss: 1.9278, val_acc: 0.2242
Epoch 07 — train_loss: 1.8994, val_acc: 0.2586
Epoch 08 — train_loss: 1.8909, val_acc: 0.2511
Epoch 09 — train_loss: 1.8209, val_acc: 0.2564
Epoch 10 — train_loss: 