## CS310 Natural Language Processing
## Assignment 3 (part 1). Recurrent Neural Networks for Language Modeling

**Total points**: 30

In this assignment, you will train a vanilla RNN language model on《论语》and evaluate its perplexity.

### 0. Import Necessary Libraries

In [56]:
from pprint import pprint
import torch.nn as nn
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

### 2. Build the Model

In [57]:
input_file = 'lunyu_20chapters.txt'

from utils import CorpusReader
corpus = CorpusReader(inputFileName=input_file, min_count=1)

word2id: dict = {}
id2word: dict = {}

word2id.update({'[PAD]': 0})
word2id.update({k: v+1 for k, v in corpus.word2id.items()})
id2word = {v: k for k, v in word2id.items()}


lines = []
with open(input_file, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        lines.append(line.strip())

embedding_lunyu = nn.Embedding(len(word2id), 50)
rnn_lunyu = nn.RNN(50, 100, batch_first=True)

seq_ids = [torch.tensor([word2id.get(w, 0) for w in line], dtype=torch.long) for line in lines]
seq_lens = torch.tensor([len(line) for line in seq_ids])
seq_ids_padded = nn.utils.rnn.pad_sequence(seq_ids, batch_first=True)

seq_embs = embedding_lunyu(seq_ids_padded)
seq_embs_packed = nn.utils.rnn.pack_padded_sequence(seq_embs, seq_lens, batch_first=True, enforce_sorted=False)

out_packed,_= rnn_lunyu(seq_embs_packed)
out_unpacked,_= nn.utils.rnn.pad_packed_sequence(out_packed, batch_first=True)


targets_padded = seq_ids_padded.clone()
for i in range(len(targets_padded)):
    targets_padded[i, :-1] = targets_padded[i, 1:].clone()
    targets_padded[i, -1] = word2id.get('[PAD]', 0)





Total vocabulary: 1352


In [58]:
class RNNLM(nn.Module):
    def __init__(self, **kwargs):
        # super(RNNLM, self).__init__()
        # self.embedding = nn.Embedding(kwargs['vocab_size'], kwargs['emb_size'])
        # self.rnn = nn.RNN(kwargs['emb_size'], kwargs['hidden_size'], batch_first=True)
        # self.fc = nn.Linear(kwargs['hidden_size'], kwargs['vocab_size'])

        super(RNNLM, self).__init__()
        self.embedding = nn.Embedding(kwargs['vocab_size'], kwargs['emb_size'])

        # 多层RNN
        num_layers = 3
        self.rnn_layers = nn.ModuleList()
        for i in range(num_layers):
            input_size = kwargs['emb_size'] if i == 0 else kwargs['hidden_size']
            self.rnn_layers.append(nn.RNN(input_size, kwargs['hidden_size'], batch_first=True))

        self.fc = nn.Linear(kwargs['hidden_size'], kwargs['vocab_size'])

    def forward(self, seq, seq_lens):
            embedded = self.embedding(seq)
            packed = nn.utils.rnn.pack_padded_sequence(embedded, seq_lens, batch_first=True, enforce_sorted=False)

            # 多层RNN的前向传播
            rnn_output = packed
            for rnn_layer in self.rnn_layers:
                rnn_output, _ = rnn_layer(rnn_output)

            padded, _ = nn.utils.rnn.pad_packed_sequence(rnn_output, batch_first=True)
            logits = self.fc(padded)
            return logits


### 3. Train and Evaluate

In [60]:
fc = nn.Linear(100, len(word2id))
logits = fc(out_unpacked)
# log_probs = F.log_softmax(logits, dim=-1)
#
#
# # Test result
# print('logits:', logits.size())
# print('log_probs:', log_probs.size())
#
# # Report Compute Perplexity
# print('Report Compute Perplexity:')
# loss_fn = nn.NLLLoss(ignore_index=0, reduction='none')
#
#
# # Calculate the loss
# with torch.no_grad():
#     loss = loss_fn(log_probs.view(-1, log_probs.size(-1)), targets_padded.view(-1))
#
# # Test result
# print('loss:', loss.size())



# 定义计算困惑度的函数
def compute_perplexity(logits, targets):
    with torch.no_grad():
        log_probs = F.log_softmax(logits, dim=-1)
        loss = F.nll_loss(log_probs.view(-1, log_probs.size(-1)), targets.view(-1), ignore_index=0, reduction='none')
        # num_words = targets.ne(0).sum().item()  # 计算非填充词的数量
        perplexity = torch.exp(loss.mean())
    return perplexity

perplexity = compute_perplexity(logits, targets_padded)
print(f"Perplexity on training set: {perplexity.item()}")

vocab_size = len(word2id)

emb_size = embedding_lunyu.embedding_dim

hidden_size = rnn_lunyu.hidden_size

# # 输入序列的填充后的张量
# seq_ids_padded = seq_ids_padded
# # 输入序列的长度
# seq_lens = seq_lens
# # 目标序列的填充后的张量
# targets_padded = targets_padded

model = RNNLM(vocab_size=vocab_size, emb_size=emb_size, hidden_size=hidden_size)

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# 训练过程
model.train()
num_epochs = 5  # 迭代次数
for epoch in range(num_epochs):
    optimizer.zero_grad()
    logits = model(seq_ids_padded, seq_lens)
    loss = criterion(logits.view(-1, logits.shape[-1]), targets_padded.view(-1))
    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")


Perplexity on training set: 2.005084276199341
Epoch [1/5], Loss: 7.145854949951172
Epoch [2/5], Loss: 7.0641632080078125
Epoch [3/5], Loss: 6.982481002807617
Epoch [4/5], Loss: 6.900819301605225
Epoch [5/5], Loss: 6.8191680908203125


In [None]:
def generate_sentence(model, start_tokens, max_length=20):
    model.eval()
    with torch.no_grad():
        current_token = torch.tensor(start_tokens).unsqueeze(0)  # 添加批次维度
        hidden_state = None

        generated_sentence = start_tokens.copy()

        for _ in range(max_length):
            logits, hidden_state = model(current_token, hidden_state)
            probabilities = torch.softmax(logits[:, -1, :], dim=-1)
            predicted_token = torch.multinomial(probabilities, num_samples=1).squeeze(1)
            generated_sentence.append(predicted_token.item())

            if predicted_token.item() == end_token:
                break

            current_token = predicted_token.unsqueeze(0)

    return generated_sentence

# 假设你有一个已经训练好的模型实例model和开始标记start_tokens和结束标记end_token
start_tokens = ["子", "曰"]  # 开始标记组成的列表
end_token = "。"  # 结束标记的值

print(type(generate_sentence(model, start_tokens)))
print(generate_sentence(model, start_tokens))

# 生成句子
# num_sentences = 5  # 要生成的句子数量
# for _ in range(num_sentences):
#     sentence = generate_sentence(model, start_tokens)
#     print("Generated Sentence:", sentence)

Perplexity on training set: 2.0097408294677734
Epoch [1/5], Loss: 7.250707149505615
Epoch [2/5], Loss: 7.168994903564453


### 4. Experiments

In [None]:
import gensim

skipGram_model = gensim.models.KeyedVectors.load_word2vec_format(f'100_5_15.txt')

# Get embeddings as numpy array
pretrained_embeddings = skipGram_model.emb_v.cpu().weight.data.numpy()

# # Truncated SVD
# svd = TruncatedSVD(n_components=2)
# embeddings_2d = svd.fit_transform(embeddings)

# 应用预训练嵌入到模型的嵌入层
model.embedding.weight.data.copy_(pretrained_embeddings)

# 计算使用预训练嵌入的困惑度
logits_with_pretrained = model(seq_ids_padded, seq_lens)
perplexity_with_pretrained = compute_perplexity(logits_with_pretrained, targets_padded)
print(f"Perplexity with pretrained embeddings: {perplexity_with_pretrained.item()}")

# 重新随机初始化模型的嵌入层
model.embedding.reset_parameters()

# 计算使用随机初始化嵌入的困惑度
logits_random = model(seq_ids_padded, seq_lens)
perplexity_random = compute_perplexity(logits_random, targets_padded)
print(f"Perplexity with randomly initialized embeddings: {perplexity_random.item()}")