# RNN with the Penn Tree dataset

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
# Read the text from the manually imported file
with open('/ptbdataset/ptb_train.txt', 'r') as file:
    text = file.read()

# Check the length of the text
print("Length of text: ", len(text))


Length of text:  5101618


In [None]:
# Creating character mappings
chars = sorted(list(set(text)))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# Creating sequences
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print("Number of sequences:", len(sentences))

Number of sequences: 1700526


In [None]:
# Vectorizing the data
X = torch.zeros((len(sentences), maxlen, len(chars)), dtype=torch.float32)
y = torch.zeros((len(sentences), len(chars)), dtype=torch.float32)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [None]:
# Building the model
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])
        out = self.softmax(out)
        return out

In [None]:
input_size = len(chars)
hidden_size = 128
output_size = len(chars)

model = RNNModel(input_size, hidden_size, output_size)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
# Training the model
num_epochs = 2
batch_size = 128
for epoch in range(num_epochs):
    for i in range(0, len(X), batch_size):
        X_batch = X[i:i+batch_size]
        y_batch = y[i:i+batch_size]

        # Forward pass
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/2], Loss: 3.7936
Epoch [2/2], Loss: 3.7936


## Evaluation phase

In [None]:
with open('/ptbdataset/ptb_test.txt', 'r') as file:
    test_text = file.read()

# Creating sequences
test_sentences = []
test_next_chars = []
for i in range(0, len(test_text) - maxlen, step):
    test_sentences.append(test_text[i: i + maxlen])
    test_next_chars.append(test_text[i + maxlen])
print("Number of test sequences:", len(test_sentences))

# Vectorizing
X_test = torch.zeros((len(test_sentences), maxlen, len(chars)), dtype=torch.float32)
y_test = torch.zeros((len(test_sentences), len(chars)), dtype=torch.float32)
for i, sentence in enumerate(test_sentences):
    for t, char in enumerate(sentence):
        X_test[i, t, char_indices[char]] = 1
    y_test[i, char_indices[test_next_chars[i]]] = 1


Number of test sequences: 149969


In [None]:
# Evaluation function
def evaluate_model(model, X_test, y_test):
    with torch.no_grad():

        # Forward pass
        y_pred = model(X_test)

        # Compute loss
        criterion = nn.CrossEntropyLoss()
        loss = criterion(y_pred, y_test)

        # Calculate perplexity
        perplexity = torch.exp(loss)

    return loss.item(), perplexity.item()


In [None]:
# Evaluate the model on the test set

test_loss, test_perplexity = evaluate_model(model, X_test, y_test)

print(f'Test Loss: {test_loss:.4f}')
print(f'Test Perplexity: {test_perplexity:.4f}')

Test Loss: 3.7626
Test Perplexity: 43.0603
