Rupesh Bharambe (AI3107)

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import requests
import re
from collections import Counter
import os


In [11]:
# Download dataset
url = "https://www.gutenberg.org/files/11/11-0.txt"
text = requests.get(url).text

In [12]:

# Clean and tokenize
text = re.sub(r'[^a-zA-Z\s]', '', text.lower())  # remove punctuation and lowercase
words = text.split()

In [13]:
# Build vocabulary
vocab = sorted(set(words))
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}


In [14]:
# Create sequences
sequence_length = 5
inputs = []
targets = []

for i in range(len(words) - sequence_length):
    seq = words[i:i + sequence_length]
    target = words[i + sequence_length]
    inputs.append([word2idx[w] for w in seq])
    targets.append(word2idx[target])

In [15]:

class WordDataset(Dataset):
    def __init__(self, inputs, targets):
        self.x = torch.tensor(inputs)
        self.y = torch.tensor(targets)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

dataset = WordDataset(inputs, targets)
loader = DataLoader(dataset, batch_size=128, shuffle=True)

class LSTMWordGen(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        output, _ = self.lstm(x)
        output = self.fc(output[:, -1, :])  # only last output
        return output

vocab_size = len(vocab)
model = LSTMWordGen(vocab_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)



In [16]:
# Training loop
for epoch in range(5):
    for x_batch, y_batch in loader:
        optimizer.zero_grad()
        out = model(x_batch)
        loss = criterion(out, y_batch)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


Epoch 1, Loss: 5.0866
Epoch 2, Loss: 4.8182
Epoch 3, Loss: 4.1361
Epoch 4, Loss: 3.4140
Epoch 5, Loss: 2.3527


In [17]:
import random

def generate_words(model, seed_text, num_words=20):
    model.eval()
    words = seed_text.lower().split()
    for _ in range(num_words):
        input_seq = [word2idx.get(w, 0) for w in words[-sequence_length:]]
        input_tensor = torch.tensor(input_seq).unsqueeze(0)
        with torch.no_grad():
            out = model(input_tensor)
            pred_idx = torch.argmax(out, dim=1).item()
            words.append(idx2word[pred_idx])
    return ' '.join(words)

seed = "the rabbit hole was very"
print(generate_words(model, seed, 30))


the rabbit hole was very likely true down down down down her face brightened up at the top of his voice and was going to begin again for some time without interrupting and began to
