# Text Classification with LSTM

Phân loại văn bản tiếng Việt sử dụng LSTM.

## Setup

In [None]:
# Cài đặt thư viện cần thiết
!pip install -r requirements.txt

# Import các module cần thiết
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Import các module từ src
from src.preprocess import preprocess_df
from src.evaluate import evaluate_model, plot_confusion_matrix, save_evaluation_results

## Load và tiền xử lý dữ liệu

In [None]:
# Load the crawled dataset and report its shape and label distribution
df = pd.read_csv('data/dataset.csv')
print("Kích thước dataset:", df.shape)
print("\nPhân bố nhãn:")
print(df['label'].value_counts())

# Preprocess the text with the project helper, reading the 'summary' and
# 'label' columns.
# NOTE(review): presumably X holds cleaned text strings and y the raw labels —
# confirm against src.preprocess.preprocess_df.
X, y = preprocess_df(df, text_col='summary', label_col='label')

# Encode labels as integers; `labels` keeps the original class names so
# predictions can be mapped back later
le = LabelEncoder()
y = le.fit_transform(y)
labels = le.classes_

# Stratified 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\nSố lượng classes:", len(labels))
print("Các nhãn:", labels)
print("\nKích thước tập train/test:", len(X_train), len(X_test))

## Chuẩn bị dữ liệu cho LSTM

In [None]:
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence

def yield_tokens(texts):
    """Lazily yield each text as a list of whitespace-separated tokens."""
    yield from (document.split() for document in texts)

# Build the vocabulary from the training texts only.
# '<unk>' and '<pad>' are registered as special tokens; tokens seen fewer
# than 2 times are left out of the vocabulary.
vocab = build_vocab_from_iterator(
    yield_tokens(X_train), 
    specials=['<unk>', '<pad>'],
    min_freq=2
)
# Any token missing from the vocabulary is looked up as '<unk>'
vocab.set_default_index(vocab['<unk>'])

def text_pipeline(text):
    """Convert a text into a 1-D tensor of vocabulary indices.

    The text is split on whitespace and every token is looked up in the
    notebook-level ``vocab``.

    The dtype is forced to ``torch.long``: for a text with no tokens,
    ``torch.tensor([])`` would otherwise default to float32, which is not a
    valid index dtype for ``nn.Embedding`` and can leak into a whole padded
    batch.
    """
    return torch.tensor(
        [vocab[token] for token in text.split()],
        dtype=torch.long,
    )

class NewsDataset(Dataset):
    """Map-style dataset pairing texts with integer class labels.

    Args:
        texts: Iterable of text strings (list, array, pandas Series, ...).
        labels: Integer class ids, one per text.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels):
        # Materialise into a plain list so __getitem__ always indexes by
        # position. A pandas Series coming out of train_test_split keeps its
        # original (shuffled) index, and ``series[i]`` would then look up by
        # label instead of position.
        self.texts = list(texts)
        self.labels = torch.tensor(labels, dtype=torch.long)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_index_tensor, label_tensor)`` for sample ``idx``."""
        return text_pipeline(self.texts[idx]), self.labels[idx]

def collate_batch(batch):
    """Collate ``(text_tensor, label)`` samples into one padded batch.

    Returns a tuple ``(texts, labels)`` where ``texts`` is padded with the
    '<pad>' index along the sequence dimension (batch first) and ``labels``
    is a tensor built from the per-sample labels.
    """
    sequences = [sample_text for sample_text, _ in batch]
    targets = [sample_label for _, sample_label in batch]

    pad_index = vocab['<pad>']
    padded = pad_sequence(sequences, batch_first=True, padding_value=pad_index)
    return padded, torch.tensor(targets)

# Create the datasets and DataLoaders
BATCH_SIZE = 32
train_dataset = NewsDataset(X_train, y_train)
test_dataset = NewsDataset(X_test, y_test)

# Training batches are reshuffled every epoch; collate_batch pads each batch
train_dataloader = DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True, 
    collate_fn=collate_batch
)
# Evaluation keeps the original sample order
test_dataloader = DataLoader(
    test_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=False, 
    collate_fn=collate_batch
)

print("Kích thước vocab:", len(vocab))

## Định nghĩa model LSTM

In [None]:
class LSTMClassifier(nn.Module):
    """Embedding -> (bi)LSTM -> linear classifier over padded token batches.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding dimension.
        hidden_dim: LSTM hidden size per direction.
        num_classes: Number of output logits.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability applied to the embeddings, between LSTM
            layers (only when ``num_layers > 1``) and before the linear layer.
        pad_idx: Index of the padding token. Defaults to ``vocab['<pad>']``
            from the notebook-level vocabulary when not given.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes,
                 num_layers=2, bidirectional=True, dropout=0.5, pad_idx=None):
        super().__init__()

        # Remember the padding index so forward() can recover true lengths
        self.pad_idx = vocab['<pad>'] if pad_idx is None else pad_idx
        self.bidirectional = bidirectional

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=self.pad_idx)

        # LSTM layers
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=num_layers,
            bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True
        )

        # Output layer
        lstm_out_dim = hidden_dim * 2 if bidirectional else hidden_dim
        self.fc = nn.Linear(lstm_out_dim, num_classes)

        # Dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return class logits of shape [batch_size, num_classes].

        Args:
            text: Long tensor of token indices, shape [batch_size, seq_len],
                right-padded with ``pad_idx``.
        """
        # embedded shape: [batch_size, seq_len, embed_dim]
        embedded = self.dropout(self.embedding(text))

        # True length of every sequence = number of non-pad tokens (assumes
        # padding only appears at the end, as pad_sequence produces).
        # Clamp to 1 because packing rejects zero-length sequences; lengths
        # must live on the CPU.
        lengths = (text != self.pad_idx).sum(dim=1).clamp(min=1).cpu()

        # Pack so the LSTM stops at each sequence's real end. Reading
        # lstm_out[:, -1, :] on a padded batch would instead summarise the
        # padding for every sequence shorter than the longest one.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        # h_n shape: [num_layers * num_directions, batch_size, hidden_dim]

        if self.bidirectional:
            # Last layer: forward state after the final real token and
            # backward state after reading back to the first token
            hidden = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            hidden = h_n[-1]
        # hidden shape: [batch_size, hidden_dim * num_directions]

        # Pass through linear layer
        return self.fc(self.dropout(hidden))

# Pick the device first so the model is on it before the optimizer is built
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialise the model
EMBED_DIM = 300
HIDDEN_DIM = 256
model = LSTMClassifier(
    vocab_size=len(vocab),
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    num_classes=len(labels),
    num_layers=2,
    bidirectional=True,
    dropout=0.5
)
model = model.to(device)

# Loss and optimizer; the optimizer is constructed after the move so it
# references the parameters in their final location
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

print("Using device:", device)

## Training

In [None]:
from tqdm.notebook import tqdm

def train_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Puts the model in training mode, performs a forward/backward/step cycle
    per batch and returns the mean of the per-batch losses.
    """
    model.train()
    running_loss = 0

    progress = tqdm(dataloader, desc='Training', leave=False)
    for inputs, targets in progress:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass
        logits = model(inputs)
        batch_loss = criterion(logits, targets)

        # Backward pass: clear stale gradients, backpropagate, update
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

def evaluate_epoch(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Returns:
        A tuple ``(mean_batch_loss, predictions, targets)`` where the last two
        are NumPy arrays of predicted and true class ids in loader order.
    """
    model.eval()
    running_loss = 0
    predicted, expected = [], []

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            logits = model(inputs)
            running_loss += criterion(logits, targets).item()

            # Highest-scoring class per sample
            batch_preds = logits.argmax(dim=1)
            predicted.extend(batch_preds.cpu().numpy())
            expected.extend(targets.cpu().numpy())

    mean_loss = running_loss / len(dataloader)
    return mean_loss, np.array(predicted), np.array(expected)

# Training loop
import os

NUM_EPOCHS = 10
best_val_loss = float('inf')

# torch.save does not create missing directories, so make sure the
# checkpoint folder exists before the first save
os.makedirs('models', exist_ok=True)

print("Starting training...")
for epoch in range(NUM_EPOCHS):
    # Train
    train_loss = train_epoch(model, train_dataloader, criterion, optimizer, device)

    # Evaluate
    # NOTE(review): the test split doubles as the validation set here, so the
    # "best" checkpoint is selected on test data — consider a separate
    # validation split.
    val_loss, val_preds, val_labels = evaluate_epoch(
        model, test_dataloader, criterion, device
    )

    # Print metrics
    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}:")
    print(f"Train loss: {train_loss:.4f}")
    print(f"Val loss: {val_loss:.4f}")

    # Evaluate and plot metrics
    # NOTE(review): evaluate_model is a project helper whose signature is not
    # visible here; it is handed a stand-in callable that ignores its input
    # and returns the precomputed predictions, with val_labels passed twice.
    # Confirm this matches what evaluate_model expects.
    evaluate_model(
        lambda x: val_preds,  # Dummy model that returns predictions
        val_labels,
        val_labels,
        labels=labels
    )

    # Save best model (lowest validation loss seen so far)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'models/lstm_classifier.pt')
        print("Saved best model!")

## Đánh giá model

In [None]:
# plt.show() below needs pyplot, which no other visible cell imports
import matplotlib.pyplot as plt

# Load best model: rebuild the same architecture, then restore the
# checkpointed weights
best_model = LSTMClassifier(
    vocab_size=len(vocab),
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    num_classes=len(labels),
    num_layers=2,
    bidirectional=True,
    dropout=0.5
).to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on GPU still loads on a CPU-only machine
best_model.load_state_dict(
    torch.load('models/lstm_classifier.pt', map_location=device)
)

# Get predictions
_, test_preds, test_labels = evaluate_epoch(
    best_model, test_dataloader, criterion, device
)

# Evaluate and save results
save_evaluation_results(
    test_labels,
    test_preds,
    labels=labels,
    save_dir='results'
)

# Show confusion matrix
plot_confusion_matrix(test_labels, test_preds, labels=labels)
plt.show()

## Thử nghiệm với văn bản mới

In [None]:
def predict_text(text, model, vocab, le, device):
    """Predict the label of a new piece of text.

    Args:
        text: Raw input text.
        model: Trained classifier taking a [1, seq_len] index tensor.
        vocab: Vocabulary mapping tokens to indices (used for the lookup).
        le: Fitted LabelEncoder used to map the class id back to its name.
        device: Device on which to run the model.

    Returns:
        The decoded label for the highest-scoring class.
    """
    # Preprocess with the project's cleaning helper
    from src.preprocess import clean_text
    tokens = clean_text(text).split()

    # Look tokens up in the vocabulary that was passed in. If cleaning left
    # no tokens, fall back to a single '<unk>' so the model never receives a
    # zero-length sequence.
    indices = [vocab[token] for token in tokens] or [vocab['<unk>']]
    inputs = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)

    # Run in eval mode so dropout is disabled, then restore the caller's mode
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(inputs)
            pred = outputs.argmax(dim=1).item()
    finally:
        model.train(was_training)

    return le.inverse_transform([pred])[0]

# Try the best model on a few hand-written Vietnamese sample sentences
test_texts = [
    "Chính phủ họp bàn về tình hình kinh tế xã hội",
    "Đội tuyển Việt Nam giành chiến thắng",
    "iPhone 15 ra mắt với nhiều tính năng mới",
]

print("\nThử nghiệm dự đoán:")
for text in test_texts:
    # Predict with the reloaded best checkpoint and print text + label
    pred = predict_text(text, best_model, vocab, le, device)
    print(f"\nText: {text}")
    print(f"Dự đoán: {pred}")