# TextCNN Model

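This notebook trains and evaluates a TextCNN text classifier with PyTorch: it builds a whitespace-token vocabulary from the training split, trains the model with a class-weighted cross-entropy loss, reports validation and test metrics, and saves the trained checkpoint.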


In [30]:
import sys
import os
# Add project root to Python path.
# The root is taken to be the parent of the current working directory, so the
# `src.*` imports below only resolve when the kernel's working directory is a
# direct subfolder of the project.
project_root = os.path.dirname(os.getcwd())
# Guard the insert so re-running this cell does not stack duplicate entries
# at the front of sys.path.
if sys.path[:1] != [project_root]:
    sys.path.insert(0, project_root)

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

from src.data_utils import load_text_classification_data
from src.text_preprocess import basic_clean
from src.model.text_cnn import TextCNN
from src.train_nn import train_epoch, eval_epoch
from src.evaluate import evaluate_classification

# Seed the global RNGs before any model or DataLoader is created so that weight
# initialization and batch shuffling repeat across "Restart & Run All".
# NOTE(review): some CUDA kernels can still be nondeterministic; see the PyTorch
# reproducibility notes if bit-exact reruns are required.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cuda


## 1. Load and Preprocess Data


In [31]:
# Read the three splits. Only the label maps returned with the training split
# are kept; the ones returned for val/test are discarded.
# NOTE(review): presumably every split shares the same label mapping - confirm in the loader.
train_texts, train_labels, label2id, id2label = load_text_classification_data('train')
val_texts, val_labels, _, _ = load_text_classification_data('val')
test_texts, test_labels, _, _ = load_text_classification_data('test')

# Run every document through the shared cleaning routine
train_texts_clean = list(map(basic_clean, train_texts))
val_texts_clean = list(map(basic_clean, val_texts))
test_texts_clean = list(map(basic_clean, test_texts))

# Split sizes
print(f"Training samples: {len(train_texts_clean)}")
print(f"Validation samples: {len(val_texts_clean)}")
print(f"Test samples: {len(test_texts_clean)}")

# Label encoding and per-class counts of the training labels
print(f"Label mapping: {label2id}")
print(f"Label distribution (train): {np.bincount(train_labels)}")


Training samples: 19782
Validation samples: 4239
Test samples: 4240
Label mapping: {'high': 0, 'low': 1, 'medium': 2}
Label distribution (train): [7698 4043 8041]


## 2. Build Vocabulary & Tokenizer


In [32]:
def build_vocab(texts, min_freq=1):
    """
    Build a token-to-ID vocabulary from texts using whitespace tokenization.

    IDs 0 and 1 are reserved for the special tokens <PAD> and <UNK>. Every
    other token whose corpus frequency is at least ``min_freq`` receives the
    next free ID, in order of first appearance in ``texts``.

    Args:
        texts: Iterable of text strings
        min_freq: Minimum frequency threshold (default: 1, includes all tokens)

    Returns:
        Dictionary mapping tokens to IDs
        Special tokens: <PAD>=0, <UNK>=1
    """
    # Count all tokens using whitespace tokenization: text.split()
    word_counts = Counter()
    for text in texts:
        word_counts.update(text.split())

    # Reserve the special tokens first: <PAD> = 0, <UNK> = 1
    vocab = {'<PAD>': 0, '<UNK>': 1}

    for word, count in word_counts.items():
        # A corpus token spelled exactly like a special token must not take
        # over its reserved ID, so anything already in the vocab is skipped.
        if count >= min_freq and word not in vocab:
            # IDs stay contiguous, so the current size is the next free ID
            vocab[word] = len(vocab)

    return vocab

# Fit the vocabulary on the training split only. min_freq=1 keeps every training
# token, so <UNK> is only ever produced for tokens unseen during training.
# NOTE(review): that also means the <UNK> embedding gets no gradient signal from
# the training set - consider min_freq > 1 if out-of-vocabulary tokens matter.
vocab = build_vocab(train_texts_clean, min_freq=1)
vocab_size = len(vocab)  # counts the two special tokens as well

print(f"Vocabulary size: {vocab_size}")
print(f"Special tokens: <PAD>={vocab['<PAD>']}, <UNK>={vocab['<UNK>']}")
# Preview the first ten (token, id) pairs in insertion order
print(f"Sample vocab items: {[(tok, vocab[tok]) for tok in list(vocab)[:10]]}")


Vocabulary size: 7289
Special tokens: <PAD>=0, <UNK>=1
Sample vocab items: [('<PAD>', 0), ('<UNK>', 1), ('enhance', 2), ('investment', 3), ('strategy', 4), ('with', 5), ('machine', 6), ('learning', 7), ('hello', 8), ('customer', 9)]


## 3. Create Dataset Class


In [33]:
class TextDataset(Dataset):
    """
    Dataset class for text classification.

    Each sample is encoded lazily on access: the text is split on whitespace,
    truncated to ``max_len`` tokens, mapped to IDs through ``vocab`` and
    right-padded to exactly ``max_len`` entries.

    Args:
        texts: Sequence of text strings
        labels: Sequence or array of label IDs, aligned with ``texts``
        vocab: Dictionary mapping tokens to IDs; must contain '<PAD>' and '<UNK>'
        max_len: Maximum sequence length (default: 256)

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length, or if
            ``max_len`` is not a positive number.
    """

    def __init__(self, texts, labels, vocab, max_len=256):
        # Fail fast: a length mismatch would otherwise surface as an IndexError
        # in the middle of an epoch (or silently ignore surplus labels).
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        # A non-positive max_len makes the slice/pad arithmetic in __getitem__
        # produce tensors of the wrong (and varying) length.
        if max_len < 1:
            raise ValueError(f"max_len must be positive, got {max_len}")

        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Get a single sample.

        Returns:
            Tuple of (padded_tensor, label)
            - padded_tensor: Tensor of shape (max_len,) with token IDs
            - label: Tensor with label ID
        """
        # Look the special IDs up once per sample instead of once per token
        pad_id = self.vocab['<PAD>']
        unk_id = self.vocab['<UNK>']

        # Tokenize using whitespace and truncate to max_len
        words = self.texts[idx].split()[:self.max_len]

        # Convert tokens to IDs, using <UNK> for tokens not in the vocabulary
        token_ids = [self.vocab.get(word, unk_id) for word in words]

        # Right-pad with <PAD> so every sample has exactly max_len entries
        token_ids.extend([pad_id] * (self.max_len - len(token_ids)))

        # Convert to tensors
        padded_tensor = torch.tensor(token_ids, dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)

        return padded_tensor, label_tensor


## 4. Create DataLoaders


In [34]:
# Sequence length and batch size shared by all three splits
max_len = 256
batch_size = 64

# Wrap each cleaned split; all of them encode with the training vocabulary
train_dataset = TextDataset(train_texts_clean, train_labels, vocab, max_len=max_len)
val_dataset = TextDataset(val_texts_clean, val_labels, vocab, max_len=max_len)
test_dataset = TextDataset(test_texts_clean, test_labels, vocab, max_len=max_len)

# Only the training loader shuffles; val/test keep a fixed order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")
print(f"Test batches: {len(test_loader)}")

# Sanity check: pull one encoded sample and inspect its shape and contents
sample_tensor, sample_label = train_dataset[0]
print(f"\nSample tensor shape: {sample_tensor.shape}")
print(f"Sample label: {sample_label}")
print(f"Sample tensor (first 10): {sample_tensor[:10]}")


Train batches: 310
Val batches: 67
Test batches: 67

Sample tensor shape: torch.Size([256])
Sample label: 0
Sample tensor (first 10): tensor([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11])


## 5. Initialize Model


In [35]:
# Initialize TextCNN model
embed_dim = 128
num_filters = 100
filter_sizes = [3, 4, 5]
dropout = 0.5
# Derive these from the data objects instead of hard-coding them, so the model
# cannot drift out of sync with the label mapping or the vocabulary.
num_classes = len(label2id)
padding_idx = vocab['<PAD>']

model = TextCNN(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_filters=num_filters,
    filter_sizes=filter_sizes,
    num_classes=num_classes,
    dropout=dropout,
    padding_idx=padding_idx,
).to(device)

# Total parameter count followed by the module tree
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"\nModel architecture:")
print(model)



Model parameters: 1,087,795

Model architecture:
TextCNN(
  (embedding): Embedding(7289, 128, padding_idx=0)
  (convs): ModuleList(
    (0): Conv1d(128, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(128, 100, kernel_size=(4,), stride=(1,))
    (2): Conv1d(128, 100, kernel_size=(5,), stride=(1,))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=3, bias=True)
)


## 6. Setup Training


In [36]:
# "balanced" class weights give the rarer classes a larger weight in the loss
classes = np.unique(train_labels)
class_weights = torch.tensor(
    compute_class_weight(class_weight="balanced", classes=classes, y=train_labels),
    dtype=torch.float,
).to(device)

print(f"Class weights: {class_weights}")
print(f"Class distribution: {np.bincount(train_labels)}")

# Training objective is class-weighted cross-entropy, optimized with Adam
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 5
print(f"\nTraining for {num_epochs} epochs...")


Class weights: tensor([0.8566, 1.6310, 0.8200], device='cuda:0')
Class distribution: [7698 4043 8041]

Training for 5 epochs...


## 7. Training Loop


In [37]:
# One optimization pass over the training set per epoch, followed by a
# validation pass. Both passes use the class-weighted `criterion`, so the
# validation loss printed here is the weighted loss.
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(train_loader, model, criterion, optimizer, device)

    # The last two values returned by eval_epoch are not needed during training
    val_loss, val_acc, _, _ = eval_epoch(val_loader, model, criterion, device)

    # Per-epoch summary; the trailing empty string yields the blank separator line
    print(
        f"Epoch {epoch+1}/{num_epochs}",
        f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}",
        f"  Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}",
        "",
        sep="\n",
    )


Epoch 1/5
  Train Loss: 1.1592, Train Acc: 0.3800
  Val Loss: 1.0463, Val Acc: 0.4999

Epoch 2/5
  Train Loss: 1.0465, Train Acc: 0.4499
  Val Loss: 1.0312, Val Acc: 0.5339

Epoch 3/5
  Train Loss: 0.9935, Train Acc: 0.5000
  Val Loss: 0.9560, Val Acc: 0.5041

Epoch 4/5
  Train Loss: 0.9244, Train Acc: 0.5493
  Val Loss: 0.9054, Val Acc: 0.5607

Epoch 5/5
  Train Loss: 0.8262, Train Acc: 0.6160
  Val Loss: 0.8648, Val Acc: 0.6110



## 8. Evaluate on Validation Set


In [38]:
# Score the validation split with a plain (unweighted) cross-entropy loss
eval_criterion = nn.CrossEntropyLoss()
val_loss, val_acc, val_pred, val_true = eval_epoch(val_loader, model, eval_criterion, device)

# Compute the classification metrics and print a summary
val_results = evaluate_classification(val_true, val_pred)
print(
    "Validation Results:",
    f"Accuracy: {val_results['accuracy']:.4f}",
    f"F1 Macro: {val_results['f1_macro']:.4f}",
    "\nClassification Report:",
    val_results['report'],
    sep="\n",
)


Validation Results:
Accuracy: 0.6110
F1 Macro: 0.6024

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.75      0.67      1615
           1       0.53      0.62      0.57       855
           2       0.69      0.48      0.57      1769

    accuracy                           0.61      4239
   macro avg       0.61      0.62      0.60      4239
weighted avg       0.62      0.61      0.61      4239



## 9. Evaluate on Test Set


In [39]:
# Held-out evaluation on the test split. Uses `eval_criterion`, the unweighted
# loss created in the validation cell, so that cell must have been run first.
test_loss, test_acc, test_pred, test_true = eval_epoch(test_loader, model, eval_criterion, device)

# Compute the classification metrics and print a summary
test_results = evaluate_classification(test_true, test_pred)
print(
    "Test Results:",
    f"Accuracy: {test_results['accuracy']:.4f}",
    f"F1 Macro: {test_results['f1_macro']:.4f}",
    "\nClassification Report:",
    test_results['report'],
    sep="\n",
)


Test Results:
Accuracy: 0.6035
F1 Macro: 0.5930

Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.74      0.67      1604
           1       0.50      0.62      0.55       876
           2       0.70      0.47      0.56      1760

    accuracy                           0.60      4240
   macro avg       0.60      0.61      0.59      4240
weighted avg       0.62      0.60      0.60      4240



## 10. Save Model


In [40]:
# Make sure the target directory exists (the path is relative to the working directory)
os.makedirs('../src/model', exist_ok=True)

# Constructor arguments used to build the model
model_config = {
    'vocab_size': vocab_size,
    'embed_dim': embed_dim,
    'num_filters': num_filters,
    'filter_sizes': filter_sizes,
    'num_classes': num_classes,
    'dropout': dropout,
    'padding_idx': padding_idx,
}

# Bundle the weights with the vocabulary and label maps in a single file
checkpoint = {
    'model_state_dict': model.state_dict(),
    'vocab': vocab,
    'label2id': label2id,
    'id2label': id2label,
    'model_config': model_config,
}
torch.save(checkpoint, '../src/model/textcnn.pt')

print("Model saved to ../src/model/textcnn.pt")


Model saved to ../src/model/textcnn.pt
