In this notebook, Pu practices his PyTorch skills by using a CNN to classify text data.

In [1]:
# Download the data
import string
import requests
import tarfile
import os
from nltk.tokenize import word_tokenize

# URL for the IMDB dataset
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
# Local path of the downloaded archive
download_path = '../data/imdb/aclImdb_v1.tar.gz'
# Directory whose existence is checked to decide whether the data is already extracted
extracted_path = '../data/imdb/aclImdb/'

# Preprocessing
# Maximum number of tokens kept per review; longer reviews are truncated to this length
max_len = 500


# We do normalization and tokenization on the input text, we also ensure the input text length is at most `max_len`
def preprocessing(raw_text):
    """Normalize and tokenize one raw review.

    The text is lowercased, every character listed in `string.punctuation`
    is removed, the result is split with `word_tokenize`, and at most
    `max_len` leading tokens are returned.
    """
    # Deletion table: each punctuation character maps to "removed"
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = raw_text.lower().translate(punctuation_table)
    tokens = word_tokenize(cleaned)
    return tokens[:max_len]


# Fetch and unpack the dataset unless the extracted directory already exists.
# Download and extraction are checked independently: an archive left behind by
# an earlier run is still unpacked instead of both steps being skipped.
if not os.path.exists(extracted_path):
    data_root = os.path.dirname(download_path)

    if not os.path.exists(download_path):
        # The data directory may not exist yet on a fresh checkout
        os.makedirs(data_root, exist_ok=True)

        # Download the dataset
        response = requests.get(url, stream=True)
        # Fail loudly on an HTTP error instead of saving the error page as the archive
        response.raise_for_status()

        # Stream into a temporary file and rename it only once complete, so an
        # interrupted download is never mistaken for a finished archive
        partial_path = download_path + '.part'
        with open(partial_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    file.write(chunk)
        os.replace(partial_path, download_path)

    # Unpack the dataset next to the archive
    with tarfile.open(download_path, 'r:gz') as tar:
        tar.extractall(path=data_root)

# load the data from the imdb dataset directories
def load_imdb_data(data_dir):
    """Read and preprocess every review under `data_dir/pos` and `data_dir/neg`.

    Returns a `(data, labels)` pair of parallel lists: `data` holds the token
    list produced by `preprocessing` for each file, and `labels` holds the
    matching `'pos'` or `'neg'` string. All positive reviews come first,
    followed by all negative ones, each in `os.listdir` order.
    """
    data, labels = [], []

    for sentiment in ('pos', 'neg'):
        review_dir = os.path.join(data_dir, sentiment)
        for review_name in os.listdir(review_dir):
            review_path = os.path.join(review_dir, review_name)
            with open(review_path, 'r', encoding='utf-8') as handle:
                raw_review = handle.read()
            data.append(preprocessing(raw_review))
            labels.append(sentiment)

    return data, labels

# Read both splits from the extracted dataset directory
train_data, train_labels = load_imdb_data('../data/imdb/aclImdb/train')
test_data, test_labels = load_imdb_data('../data/imdb/aclImdb/test')

# Preview: label and first ten tokens of the first five training reviews
for sample_tokens, sample_label in zip(train_data[:5], train_labels[:5]):
    print(f'{sample_label} {sample_tokens[:10]}...')


pos ['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the']...
pos ['homelessness', 'or', 'houselessness', 'as', 'george', 'carlin', 'stated', 'has', 'been', 'an']...
pos ['brilliant', 'overacting', 'by', 'lesley', 'ann', 'warren', 'best', 'dramatic', 'hobo', 'lady']...
pos ['this', 'is', 'easily', 'the', 'most', 'underrated', 'film', 'inn', 'the', 'brooks']...
pos ['this', 'is', 'not', 'the', 'typical', 'mel', 'brooks', 'film', 'it', 'was']...


In [2]:
# Create Dataset and DataLoader for the downloaded data
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from sklearn.preprocessing import LabelEncoder


# Number of samples per batch, used for both the train and the test DataLoader
batch_size = 4


class IMDBDataset(Dataset):
    """Dataset of fixed-length token-id sequences paired with integer labels.

    Args:
        data: Sequence of tokenized texts (each a list of string tokens).
        labels: Sequence of integer class labels, parallel to `data`.
        word_to_idx: Mapping from token to integer id; must contain the
            special keys `'<PAD>'` and `'<UNK>'`.
        max_len: Exact length of every encoded sequence.
    """

    def __init__(self, data, labels, word_to_idx, max_len):
        # Encode everything up front so __getitem__ only has to build tensors
        self.data = [self.preprocess(text, word_to_idx, max_len) for text in data]
        self.labels = labels

    def preprocess(self, text, word_to_idx, max_len):
        """Encode `text` as exactly `max_len` ids.

        Unknown tokens map to the `'<UNK>'` id. Texts shorter than `max_len`
        are right-padded with the `'<PAD>'` id; longer texts are truncated so
        that every sample has the same length and batches can be stacked.
        """
        unk_idx = word_to_idx['<UNK>']
        pad_idx = word_to_idx['<PAD>']
        res = [word_to_idx.get(word, unk_idx) for word in text[:max_len]]
        res.extend([pad_idx] * (max_len - len(res)))
        return res

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return `(token_ids, label)` as two `torch.long` tensors."""
        token_ids = torch.tensor(self.data[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return token_ids, label
    

# Build a vocabulary
def build_vocab(texts, max_size=10000):
    """Return up to `max_size` most frequent tokens, most frequent first.

    `texts` is an iterable of token sequences; token counts are accumulated
    across all of them.
    """
    token_counts = Counter()
    for tokens in texts:
        token_counts.update(tokens)

    vocabulary = []
    for token, _count in token_counts.most_common(max_size):
        vocabulary.append(token)
    return vocabulary


# Tokenization and building vocabulary
# Combined corpus, kept available for inspection; deliberately NOT used to
# build the vocabulary.
all_texts = train_data + test_data
# Build the vocabulary from the training split only, so that no token
# statistics from the test split leak into preprocessing. Test tokens that
# are absent from the training vocabulary are encoded as <UNK>.
vocab = build_vocab(train_data)
# Ids 0 and 1 are reserved for the special tokens, so real words start at 2
word_to_idx = {word: idx + 2 for idx, word in enumerate(vocab)}
word_to_idx['<PAD>'] = 0
word_to_idx['<UNK>'] = 1
vocab_size = len(vocab) + 2  # need to account for <PAD> and <UNK>

# Convert labels to integers using LabelEncoder
# The encoder is fitted on the training labels and reused unchanged for the test labels
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)

# Create Dataset
train_dataset = IMDBDataset(train_data, train_labels_encoded, word_to_idx, max_len)
test_dataset = IMDBDataset(test_data, test_labels_encoded, word_to_idx, max_len)

# Create DataLoaders
# Only the training loader shuffles; the test loader keeps dataset order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [3]:
# Model and training

import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from sklearn.metrics import precision_score, recall_score, f1_score


# Model
class TextCNN(nn.Module):
    """Convolutional text classifier with max-over-time pooling.

    Token ids are embedded, passed through one convolution per filter size
    (each spanning the full embedding width), max-pooled over the time axis,
    concatenated, regularized with dropout and projected to class logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size E of each token embedding.
        num_filters: Number F of output channels per convolution.
        filter_sizes: Iterable of window heights (in tokens), one conv each.
        num_classes: Number of output logits.
        dropout: Dropout probability applied before the final linear layer.
        padding_idx: Optional id whose embedding is kept at zero and receives
            no gradient (forwarded to `nn.Embedding`); `None` disables this.

    Note:
        The input sequence length T must be at least `max(filter_sizes)`,
        otherwise the widest convolution has no valid position.
    """

    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, num_classes,
                 dropout=0.5, padding_idx=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        # Kernel (fs, embedding_dim) covers the whole embedding width, so each
        # convolution slides only along the time axis
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, num_filters, (fs, embedding_dim)) for fs in filter_sizes]
        )
        self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map token ids of shape [B, T] to logits of shape [B, num_classes]."""
        x = self.embedding(x)  # [B, T, E]
        x = x.unsqueeze(1)  # [B, 1, T, E] (single input channel)
        x = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]  # each [B, F, T - fs + 1]
        x = [torch.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # each [B, F]
        x = torch.cat(x, 1)  # [B, F * len(filter_sizes)]
        x = self.dropout(x)  # [B, F * len(filter_sizes)]
        x = self.fc(x)  # [B, num_classes]
        return x

# Hyperparameters
embedding_dim = 50
num_filters = 32
filter_sizes = [2, 3, 4]
num_classes = 2
learning_rate = 0.001
epochs = 5

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the network and place it on the chosen device
model = TextCNN(
    vocab_size,
    embedding_dim,
    num_filters,
    filter_sizes,
    num_classes,
).to(device)

# Wrap the model for data-parallel execution when several GPUs are present
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

# Objective, optimizer and learning-rate schedule
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = StepLR(optimizer, step_size=2, gamma=0.9)

# Training
for epoch in range(epochs):
    # ---- One optimization pass over the training loader ----
    model.train()
    total_train_loss = 0.0
    for batch_texts, batch_labels in train_loader:
        batch_texts = batch_texts.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        output = model(batch_texts)
        loss = loss_fn(output, batch_labels)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

    # Advance the learning-rate schedule once per epoch
    scheduler.step()

    # ---- Evaluation on the test loader (no gradient tracking) ----
    model.eval()
    all_preds, all_labels = [], []
    total_test_loss = 0.0
    with torch.no_grad():
        for batch_texts, batch_labels in test_loader:
            batch_texts = batch_texts.to(device)
            batch_labels = batch_labels.to(device)

            output = model(batch_texts)
            loss = loss_fn(output, batch_labels)
            total_test_loss += loss.item()

            # Predicted class = index of the largest logit
            all_preds.extend(output.argmax(dim=1).cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())

    precision = precision_score(all_labels, all_preds, average='binary')
    recall = recall_score(all_labels, all_preds, average='binary')
    f1 = f1_score(all_labels, all_preds, average='binary')

    # Losses are reported as the mean over batches
    avg_train_loss = total_train_loss / len(train_loader)
    avg_test_loss = total_test_loss / len(test_loader)
    print(
        f"Epoch {epoch+1}/{epochs}, "
        f"Train Loss: {avg_train_loss:.4f}, "
        f"Test Loss: {avg_test_loss:.4f}, "
        f"Precision: {precision:.4f}, "
        f"Recall: {recall:.4f}, "
        f"F1: {f1:.4f}"
    )

Epoch 1/5, Train Loss: 0.6212, Test Loss: 0.4562, Precision: 0.7143, Recall: 0.9238, F1: 0.8057
Epoch 2/5, Train Loss: 0.4515, Test Loss: 0.3597, Precision: 0.8027, Recall: 0.9080, F1: 0.8521
Epoch 3/5, Train Loss: 0.3568, Test Loss: 0.3266, Precision: 0.8713, Recall: 0.8420, F1: 0.8564
Epoch 4/5, Train Loss: 0.3051, Test Loss: 0.3260, Precision: 0.8843, Recall: 0.8290, F1: 0.8558
Epoch 5/5, Train Loss: 0.2567, Test Loss: 0.3186, Precision: 0.8676, Recall: 0.8666, F1: 0.8671
