## Assignment 1. Neural Text Classification
## CS310 Natural Language Processing

**Total points**: 50

You should roughtly follow the structure of the notebook. Add additional cells if you feel needed. 

You can (and you should) re-use the code from Lab 2. 

Make sure your code is readable and well-structured.

### 0. Import Necessary Libraries

In [35]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import json
import re
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import jieba
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import SST2 # SST2 is the sentiment analysis dataset, binary

import time


### 1. Data Processing

In [54]:
class TextDataset(Dataset):
    def __init__(self, file_path, vocab):
        self.data = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                item = json.loads(line)
                # Tokenize and convert to indices
                tokens = [vocab[token] if token in vocab else vocab["<unk>"] for token in re.findall(r'[\u4e00-\u9fff]+', item['sentence'])]
                self.data.append((tokens, item['label'][0]))
                
        # Prepare for EmbeddingBag: concatenate all tokens and compute offsets
        self.tokens = [token for tokens, _ in self.data for token in tokens]
        self.offsets = [0] + [len(tokens) for tokens, _ in self.data]
        self.offsets = torch.tensor(self.offsets[:-1]).cumsum(dim=0)
        self.labels = [label for _, label in self.data]
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.tokens, self.offsets[idx], self.labels[idx]

def build_vocab(file_path):
    vocab = {"<PAD>": 0}  # Padding Token
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            item = json.loads(line)
            for token in re.findall(r'[\u4e00-\u9fff]+', item['sentence']):
                if token not in vocab:
                    vocab[token] = len(vocab)
    return vocab

def custom_collate_fn(batch):
    tokens, offsets, labels = [], [0], []  # Initialize offsets with 0 for the first sequence
    for tokens_batch, _, label in batch:
        labels.append(label)
        tokens.extend(tokens_batch)
        offsets.append(len(tokens))  # The next sequence starts after the current one ends
    # Convert lists to tensors
    tokens_tensor = torch.tensor(tokens, dtype=torch.long)
    offsets_tensor = torch.tensor(offsets[:-1], dtype=torch.long)  # Exclude the last offset
    labels_tensor = torch.tensor(labels, dtype=torch.long)
    return tokens_tensor, offsets_tensor, labels_tensor

tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    for text, _ in data_iter:
        yield tokenizer(text)


vocab = build_vocab_from_iterator(yield_tokens(iter(SST2(split='train'))), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])
# vocab = build_vocab('train.jsonl')
# Initialize dataset and dataloader
train_dataset = TextDataset('train.jsonl', vocab)
test_dataset=TextDataset('test.jsonl', vocab)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=custom_collate_fn)
test_loader=DataLoader(test_dataset, batch_size=8, shuffle=True, collate_fn=custom_collate_fn)

### 2. Build the Model

In [55]:



class BoWClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(BoWClassifier, self).__init__()
        self.embedding_bag = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Sequential(
            nn.Linear(embed_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes)
        )
        
    def forward(self, text, offsets):
        embedded = self.embedding_bag(text, offsets)
        return self.fc(embedded)


vocab_size = len(vocab)
embed_dim = 64
num_classes = 2
model = BoWClassifier(vocab_size, embed_dim, num_classes)
# Hyperparameters
EPOCHS = 10  # epoch
LR = 0.001  # learning rate
BATCH_SIZE = 8  # batch size for training
sparse_parameters = [params for params in model.embedding_bag.parameters()]
dense_parameters = [params for params in model.fc.parameters()]

optimizer_sparse = optim.SparseAdam(sparse_parameters, lr=LR)
optimizer_dense = optim.Adam(dense_parameters, lr=LR)

criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.SGD(model.parameters(), lr=LR)
sparse_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_sparse, 1.0, gamma=0.1)
dense_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_dense, 1.0, gamma=0.1)

# predictions = []
# true_labels = []
# with torch.no_grad():  # No gradients needed for evaluation
#     for tokens, offsets, labels in test_loader:
#         output = model(tokens, offsets)
#         pred_labels = output.argmax(dim=1)
#         predictions.extend(pred_labels.cpu().numpy())
#         true_labels.extend(labels.cpu().numpy())


### 3. Train and Evaluate

In [60]:
# def eval(model,test_loader):
#     model.eval()  # Set the model to evaluation mode

#     predictions = []
#     true_labels = []

#     with torch.no_grad():  # No gradients needed for evaluation
#         for tokens, offsets, labels in test_loader:
#             output = model(tokens, offsets)
#             pred_labels = output.argmax(dim=1)
#             predictions.extend(pred_labels.cpu().numpy())
#             true_labels.extend(labels.cpu().numpy())
#     accuracy = accuracy_score(true_labels, predictions)
#     precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')
#     print(f"Test Accuracy: {accuracy:.4f}")
#     print(f"Precision: {precision:.4f}")
#     print(f"Recall: {recall:.4f}")
#     print(f"F1 Score: {f1:.4f}")
def eval(model, test_loader):
    # model.eval()
    # total_acc, total_count = 0, 0

    # with torch.no_grad():
    #     for idx, (tokens, offsets, labels) in enumerate(dataloader):
    #         output = model(tokens, offsets)
    #         loss = criterion(output, labels)
    #         total_acc += (output.argmax(1) == labels).sum().item()
    #         total_count += labels.size(0)
    # return total_acc / total_count
    model.eval()  # Set the model to evaluation mode

    predictions = []
    true_labels = []

    with torch.no_grad():  # No gradients needed for evaluation
        for tokens, offsets, labels in test_loader:
            output = model(tokens, offsets)
            pred_labels = output.argmax(1)
            predictions.extend(pred_labels.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted', zero_division=0)
    # print(f"Test Accuracy: {accuracy:.4f}")
    # print(f"Precision: {precision:.4f}")
    # print(f"Recall: {recall:.4f}")
    # print(f"F1 Score: {f1:.4f}")
    return accuracy,precision,recall
# def train(model, train_loader, optimizer, criterion, epoch):
#     model.train()
#     log_interval=150
    
#     for epoch in range(epochs):
#         for idx, (tokens, offsets, labels) in enumerate(train_loader):
#         # for tokens, offsets, labels in train_loader:
#             optimizer_sparse.zero_grad()
#             optimizer_dense.zero_grad()
            
#             output = model(tokens, offsets)
#             loss = criterion(output, labels)
            
#             loss.backward()
            
#             optimizer_sparse.step()
#             optimizer_dense.step()
#             # print(idx)
#             if idx % log_interval == 0 and idx > 0:
#                 eval(model,test_loader)
def train(model, train_loader,optimizer_sparse,optimizer_dense, criterion, epoch: int):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 100
    start_time = time.time()

    for idx, (tokens, offsets, labels) in enumerate(train_loader):
        optimizer_sparse.zero_grad()
        optimizer_dense.zero_grad()
        output = model(tokens, offsets)
        try:
            loss = criterion(output, labels)
        except Exception:
            print('Error in loss calculation')
            print('output: ', output.size())
            print('labels: ', labels.size())
            # print('token_ids: ', token_ids)
            # print('offsets: ', offsets)
            raise
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer_sparse.step()
        optimizer_dense.step()

        total_acc += (output.argmax(1) == labels).sum().item()
        total_count += labels.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(train_loader), total_acc / total_count
                )
            )
            total_acc, total_count = 0, 0
            start_time = time.time()


# Run the training loop
total_accu = None
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    train(model, train_loader,optimizer_sparse,optimizer_dense, criterion, epoch)
    accuracy,precision,recall = eval(model, test_loader)

    if total_accu is not None and total_accu >= accuracy:
        sparse_scheduler.step()
        dense_scheduler.step()
    else:
        total_accu = accuracy

    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "accuracy {:8.3f} |precision {:8.3f}|recall {:8.3f}".format(
            epoch, time.time() - epoch_start_time, accuracy,precision,recall
        )
    )
    print("-" * 59)

torch.save(model.state_dict(), 'model.pth')

| epoch   1 |   100/ 1585 batches | accuracy    0.702
| epoch   1 |   200/ 1585 batches | accuracy    0.709
| epoch   1 |   300/ 1585 batches | accuracy    0.714
| epoch   1 |   400/ 1585 batches | accuracy    0.730
| epoch   1 |   500/ 1585 batches | accuracy    0.715
| epoch   1 |   600/ 1585 batches | accuracy    0.734
| epoch   1 |   700/ 1585 batches | accuracy    0.706
| epoch   1 |   800/ 1585 batches | accuracy    0.736
| epoch   1 |   900/ 1585 batches | accuracy    0.696
| epoch   1 |  1000/ 1585 batches | accuracy    0.686
| epoch   1 |  1100/ 1585 batches | accuracy    0.724
| epoch   1 |  1200/ 1585 batches | accuracy    0.713
| epoch   1 |  1300/ 1585 batches | accuracy    0.699
| epoch   1 |  1400/ 1585 batches | accuracy    0.721
| epoch   1 |  1500/ 1585 batches | accuracy    0.724
-----------------------------------------------------------
| end of epoch   1 | time: 91.31s | accuracy    0.739 |precision    0.546|recall    0.739
----------------------------------------

In [None]:
model = BoWClassifier(vocab_size, embed_dim, num_classes)
model.load_state_dict(torch.load('model.pth'))


Test Accuracy: 0.7389
Precision: 0.5459
Recall: 0.7389
F1 Score: 0.6279


  _warn_prf(average, modifier, msg_start, len(result))


### 4. Explore Word Segmentation

In [None]:


# Custom Dataset Class
class TextDataset_jieba(Dataset):
    def __init__(self, file_path, vocab):
        self.data = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                item = json.loads(line)
                # Segment with Jieba
                tokens = [vocab.get(token, 0) for token in jieba.cut(item['sentence'])]
                self.data.append((tokens, item['label'][0]))

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return torch.tensor(self.data[idx][0], dtype=torch.long), torch.tensor(self.data[idx][1], dtype=torch.long)

# Vocabulary Building Function
def build_vocab_jieba(file_path):
    vocab = {"<PAD>": 0}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            item = json.loads(line)
            for token in jieba.cut(item['sentence']):
                if token not in vocab:
                    vocab[token] = len(vocab)
    return vocab

# Model Definition
class BoWClassifier_jieba(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(BoWClassifier_jieba, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Sequential(
            nn.Linear(embed_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes)
        )
        
    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

def custom_collate_fn_jieba(batch):
    labels = []
    tokens = []
    offsets = [0]  # Start with 0 offset

    for data, label in batch:
        labels.append(label)
        tokens.extend(data)  # Flatten all tokens
        offsets.append(len(tokens))  # Mark the end of the current sequence

    # Convert to tensors
    tokens_tensor = torch.tensor(tokens, dtype=torch.long)
    offsets_tensor = torch.tensor(offsets[:-1], dtype=torch.long)  # Exclude the last cumulative length
    labels_tensor = torch.tensor(labels, dtype=torch.long)

    return tokens_tensor, offsets_tensor, labels_tensor


# Prepare Data
vocab_jieba = build_vocab_jieba('train.jsonl')
train_dataset_jieba = TextDataset_jieba('train.jsonl', vocab_jieba)
train_loader_jieba = DataLoader(train_dataset_jieba, batch_size=32, shuffle=True, collate_fn=custom_collate_fn_jieba)
test_dataset_jieba = TextDataset_jieba('test.jsonl', vocab_jieba)
test_loader_jieba = DataLoader(test_dataset_jieba, batch_size=32, shuffle=False, collate_fn=custom_collate_fn_jieba)

# Initialize Model
model_jieba = BoWClassifier_jieba(len(vocab_jieba), embed_dim=100, num_classes=2)
# optimizer_jieba = optim.Adam(model_jieba.parameters())
criterion_jieba = nn.CrossEntropyLoss()


# Assuming you have model_jieba defined
sparse_parameters_jieba = list(filter(lambda p: p.requires_grad, model_jieba.embedding.parameters()))
dense_parameters_jieba = list(filter(lambda p: p.requires_grad, model_jieba.fc.parameters()))

optimizer_sparse_jieba = optim.SparseAdam(sparse_parameters_jieba, lr=0.001)
optimizer_dense_jieba = optim.Adam(dense_parameters_jieba, lr=0.001)
epochs=1
for epoch in range(epochs):
    model_jieba.train()
    for tokens, offsets, labels in train_loader_jieba:
        # Zero gradients for both optimizers
        optimizer_sparse_jieba.zero_grad()
        optimizer_dense_jieba.zero_grad()
        
        # Forward pass
        output = model_jieba(tokens, offsets)
        loss = criterion_jieba(output, labels)
        
        # Backward pass and optimize
        loss.backward()
        optimizer_sparse_jieba.step()
        optimizer_dense_jieba.step()


torch.save(model_jieba.state_dict(), 'model_jieba.pth')






Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/qd/pr_ybl6s7c7d3nbyj6yt9g0w0000gn/T/jieba.cache
Loading model cost 0.654 seconds.
Prefix dict has been built successfully.


In [None]:
model_jieba = BoWClassifier_jieba(len(vocab_jieba), embed_dim=100, num_classes=2)

model_jieba.load_state_dict(torch.load('model_jieba.pth'))
model_jieba.eval()  # Set the model to evaluation mode
# Evaluation
all_predictions, all_labels = [], []
with torch.no_grad():
    for text, offsets, labels in test_loader_jieba:
        output = model_jieba(text, offsets)
        predictions = output.argmax(1)
        all_predictions.extend(predictions.numpy())
        all_labels.extend(labels.numpy())

accuracy = accuracy_score(all_labels, all_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='weighted')
print(f"Test Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1 Score: {f1}")

Test Accuracy: 0.7403993855606759
Precision: 0.8078955453149003
Recall: 0.7403993855606759
F1 Score: 0.6315126587944339
