## CS310 Natural Language Processing
## Assignment 3 (part 1). Recurrent Neural Networks for Language Modeling

**Total points**: 30

In this assignment, you will train a vanilla RNN language model on《论语》(*The Analects*) and evaluate its perplexity.

### 0. Import Necessary Libraries

In [None]:
from pprint import pprint
import torch.nn as nn
import torch
import torch.nn as nn
import torch.optim as optim

### 2. Build the Model

In [None]:
input_file = 'lunyu_20chapters.txt'

# You can use the CorpusReader code from the previous lab or rewrite it.
# Hint: you can comment out the `self.initTableNegatives()` call in its `__init__` method.
from utils import CorpusReader
corpus = CorpusReader(inputFileName=input_file, min_count=1)

### START YOUR CODE ###
# Reserve id 0 for the padding token '[PAD]' and shift every corpus word id up by 1.
# id2word is then rebuilt as the inverse mapping, so id 0 maps back to '[PAD]'.
# Hint: Both word2id and id2word in utils.CorpusReader are dict objects
word2id: dict = {'[PAD]': 0, **{word: idx + 1 for word, idx in corpus.word2id.items()}}
id2word: dict = {idx: word for word, idx in word2id.items()}
### END YOUR CODE ###

# Test result: the five entries with the smallest ids in each mapping
print('id2word:', sorted(id2word.items(), key=lambda pair: pair[0])[:5])
print('word2id:', sorted(word2id.items(), key=lambda pair: pair[1])[:5])




### START YOUR CODE ###

# Keep at most the first 16 lines of the corpus file, stripped of surrounding whitespace.
# zip() stops after 16 items, or earlier if the file is shorter.
with open(input_file, 'r', encoding='utf-8') as f:
    lines = [raw.strip() for _, raw in zip(range(16), f)]

# Embedding (dim 50) feeding a vanilla RNN (hidden size 100), batch-first layout.
embedding_lunyu = nn.Embedding(len(word2id), 50)
rnn_lunyu = nn.RNN(50, 100, batch_first=True)

# One id tensor per line, one id per character; characters missing from word2id map to 0.
seq_ids = [
    torch.tensor([word2id.get(ch, 0) for ch in text], dtype=torch.long)
    for text in lines
]
seq_lens = torch.tensor([ids.size(0) for ids in seq_ids])
seq_ids_padded = nn.utils.rnn.pad_sequence(seq_ids, batch_first=True)

seq_embs = embedding_lunyu(seq_ids_padded)
# enforce_sorted=False lets the batch be packed without sorting it by length first.
seq_embs_packed = nn.utils.rnn.pack_padded_sequence(
    seq_embs, seq_lens, batch_first=True, enforce_sorted=False
)

# Run the RNN on the packed batch, then restore a padded batch-first tensor.
out_packed, _ = rnn_lunyu(seq_embs_packed)
out_unpacked, _ = nn.utils.rnn.pad_packed_sequence(out_packed, batch_first=True)
### END YOUR CODE ###

# Test result
print('seq_ids_padded:', seq_ids_padded.size())
print('seq_embs:', seq_embs.size())
print('out_unpacked:', out_unpacked.size())

# You should expect to see:
# seq_ids_padded: torch.Size([16, 85])
# seq_embs: torch.Size([16, 85, 50])
# out_unpacked: torch.Size([16, 85, 100])



### START YOUR CODE ###
# Import the functional API here so that `F` is defined when this cell runs
# (without it, `F.log_softmax` below raises NameError).
import torch.nn.functional as F

# Project each 100-dim RNN output onto the vocabulary, then normalize over the
# vocabulary (last) dimension to obtain log-probabilities.
fc = nn.Linear(100, len(word2id))
logits = fc(out_unpacked)
log_probs = F.log_softmax(logits, dim=-1)
### END YOUR CODE ###

# Test result
print('logits:', logits.size())
print('log_probs:', log_probs.size())

# You should expect to see:
# logits: torch.Size([16, 85, 1353])
# log_probs: torch.Size([16, 85, 1353])


### START YOUR CODE ###
# Next-token targets: every row of seq_ids_padded shifted one step to the left,
# with the vacated last column holding the '[PAD]' id.
targets_padded = torch.full_like(seq_ids_padded, word2id.get('[PAD]', 0))
targets_padded[:, :-1] = seq_ids_padded[:, 1:]
### END YOUR CODE ###

# Test result
print('targets_padded:', targets_padded.size())
print('last column of targets_padded:', targets_padded[:, -1])

# You should expect to see:
# targets_padded: torch.Size([16, 85])
# last column of targets_padded: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
class RNNLM(nn.Module):
    """Vanilla RNN language model: embedding -> RNN -> linear projection onto the vocabulary.

    Keyword Args:
        vocab_size (int): number of token ids (embedding rows and output logits).
        emb_size (int): dimension of the token embeddings.
        hidden_size (int): dimension of the RNN hidden state.

    Raises:
        KeyError: if any of the three keyword arguments is missing.
    """

    def __init__(self, **kwargs):
        super().__init__()
        vocab_size = kwargs['vocab_size']
        emb_size = kwargs['emb_size']
        hidden_size = kwargs['hidden_size']
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.rnn = nn.RNN(emb_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, seq, seq_lens):
        """Compute next-token logits for a padded batch of token-id sequences.

        Args:
            seq: batch-first tensor of token ids, shape (batch, max_len).
            seq_lens: unpadded length of each row of ``seq``, as a list of ints
                or a 1-D tensor (moved to the CPU here if needed).

        Returns:
            Tensor of shape (batch, max_len, vocab_size) holding unnormalized
            logits. Positions past a sequence's length come from an all-zero
            RNN output, i.e. they equal the bias of the output layer.
        """
        # pack_padded_sequence requires tensor lengths to live on the CPU,
        # even when the model and `seq` are on another device.
        if torch.is_tensor(seq_lens):
            seq_lens = seq_lens.cpu()

        embedded = self.embedding(seq)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, seq_lens, batch_first=True, enforce_sorted=False
        )
        output, _ = self.rnn(packed)
        # total_length keeps the time dimension equal to seq.size(1); without it
        # the output is truncated to max(seq_lens) and can mismatch the targets.
        padded, _ = nn.utils.rnn.pad_packed_sequence(
            output, batch_first=True, total_length=seq.size(1)
        )
        return self.fc(padded)


### 3. Train and Evaluate

### 4. Experiments