In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
import torch.nn as nn
import torch.optim as optim

# 데이터 로드 및 전처리
data = pd.read_csv('/content/train.csv')
sentences = data['text'].values
labels = data['label'].values
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
X_train, X_val, y_train, y_val = train_test_split(sentences, labels, test_size=0.2, random_state=42)
tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(X_train), specials=["<pad>", "<unk>"])
vocab.set_default_index(vocab["<unk>"])

class TextDataset(Dataset):
    def __init__(self, texts, labels, vocab, tokenizer):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokens = self.tokenizer(text)
        token_ids = [self.vocab[token] for token in tokens]
        return torch.tensor(token_ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)

def collate_batch(batch):
    text_list, label_list = [], []
    for _text, _label in batch:
        text_list.append(torch.tensor(_text, dtype=torch.long))
        label_list.append(torch.tensor(_label, dtype=torch.long))
    return pad_sequence(text_list, batch_first=True, padding_value=vocab["<pad>"]), torch.stack(label_list)

train_dataset = TextDataset(X_train, y_train, vocab, tokenizer)
val_dataset = TextDataset(X_val, y_val, vocab, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, lstm_units, num_classes, dropout_rate):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=vocab["<pad>"])
        self.lstm = nn.LSTM(embed_dim, lstm_units, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(lstm_units, num_classes)  # Fully Connected Layer

    def forward(self, x):
        x = self.embedding(x)  # x는 (batch_size, seq_length, embed_dim) 크기의 텐서
        x, (hidden, _) = self.lstm(x)  # hidden은 (1, batch_size, lstm_units) 크기의 텐서
        x = self.dropout(hidden[-1])  # hidden[-1]은 (batch_size, lstm_units) 크기의 텐서
        x = self.fc(x)  # x는 (batch_size, num_classes) 크기의 텐서
        return x

# 하이퍼파라미터 설정
embed_dim = 128
lstm_units = 128
dropout_rate = 0.3
learning_rate = 1e-3
num_epochs = 10

# 모델 초기화
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(vocab_size=len(vocab), embed_dim=embed_dim, lstm_units=lstm_units, num_classes=6, dropout_rate=dropout_rate).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# 모델 학습
model.train()
for epoch in range(num_epochs):
    for texts, labels in train_dataloader:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

# 모델 평가 함수
def evaluate_model(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for texts, labels in dataloader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = correct / total
    return accuracy

# 검증 데이터로 모델 평가
val_accuracy = evaluate_model(model, val_dataloader)
print(f'Validation Accuracy: {val_accuracy}')

# 모델 저장
torch.save(model.state_dict(), 'model_weights.pth')
torch.save(model, 'model.pth')

# 테스트 데이터 로드 및 전처리
test_data = pd.read_csv('/content/test.csv')
test_sentences = test_data['text'].values
test_labels = test_data['label'].values
test_labels = label_encoder.transform(test_labels)
test_dataset = TextDataset(test_sentences, test_labels, vocab, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)

# 테스트 데이터로 모델 평가
test_accuracy = evaluate_model(model, test_dataloader)
print(f'Test Accuracy: {test_accuracy}')

  text_list.append(torch.tensor(_text, dtype=torch.long))
  label_list.append(torch.tensor(_label, dtype=torch.long))


Epoch 1/10, Loss: 1.6563297510147095
Epoch 2/10, Loss: 1.6158767938613892
Epoch 3/10, Loss: 1.5254058837890625
Epoch 4/10, Loss: 1.4879822731018066
Epoch 5/10, Loss: 1.515523076057434
Epoch 6/10, Loss: 1.6789978742599487
Epoch 7/10, Loss: 1.3882092237472534
Epoch 8/10, Loss: 0.8263146281242371
Epoch 9/10, Loss: 0.6810016632080078
Epoch 10/10, Loss: 0.4650488495826721
Validation Accuracy: 0.6715625


  text_list.append(torch.tensor(_text, dtype=torch.long))
  label_list.append(torch.tensor(_label, dtype=torch.long))


Test Accuracy: 0.6995


In [12]:
def preprocess_text(text, vocab, tokenizer):
    tokens = tokenizer(text)
    token_ids = [vocab[token] for token in tokens]
    return torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)  # 배치 차원을 추가합니다.

def predict_sentence(model, text, vocab, tokenizer):
    model.eval()
    input_tensor = preprocess_text(text, vocab, tokenizer).to(device)
    with torch.no_grad():
        output = model(input_tensor)
    predicted_class = torch.argmax(output, dim=1).item()
    return predicted_class

# 예시 문장
example_sentence = "I am so scared of ghosts. I will never go there again."

# 예측
predicted_class = predict_sentence(model, example_sentence, vocab, tokenizer)
print(f'The predicted class for the input sentence is: {predicted_class}')

The predicted class for the input sentence is: 4
