Rupesh Bharambe (AI3107)

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import requests
import os




In [2]:
# Download dataset
url = "https://s3.amazonaws.com/text-datasets/nietzsche.txt"
file_path = "nietzsche.txt"
if not os.path.exists(file_path):
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(requests.get(url).text)

In [3]:
# Load text
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()

In [4]:
# Vocabulary processing
chars = sorted(list(set(text)))
char2idx = {ch: i for i, ch in enumerate(chars)}
idx2char = {i: ch for ch, i in char2idx.items()}
vocab_size = len(chars)

In [5]:
# Prepare sequences
seq_length = 100
step = 1
sequences = []
targets = []
for i in range(0, len(text) - seq_length, step):
    sequences.append(text[i:i+seq_length])
    targets.append(text[i+seq_length])

In [6]:
def vectorize(seq):
    return [char2idx[ch] for ch in seq]

class CharDataset(Dataset):
    def __init__(self, sequences, targets):
        self.sequences = [vectorize(seq) for seq in sequences]
        self.targets = [char2idx[ch] for ch in targets]

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx]), torch.tensor(self.targets[idx])

dataset = CharDataset(sequences, targets)
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

In [7]:
# Define LSTM model
class LSTMTextGen(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_layers):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden=None):
        x = self.embed(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out[:, -1, :])
        return out, hidden

model = LSTMTextGen(vocab_size, hidden_size=256, num_layers=2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

In [8]:
# Training loop
for epoch in range(5):
    for inputs, labels in dataloader:
        outputs, _ = model(inputs)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

Epoch 1, Loss: 1.4644
Epoch 2, Loss: 1.4616
Epoch 3, Loss: 1.3060
Epoch 4, Loss: 1.1291
Epoch 5, Loss: 1.2719


In [9]:
# Text generation
def generate_text(model, start_str, length=200):
    model.eval()
    input_seq = torch.tensor([char2idx[c] for c in start_str]).unsqueeze(0)
    generated = start_str

    hidden = None
    with torch.no_grad():
        for _ in range(length):
            output, hidden = model(input_seq[:, -seq_length:], hidden)
            prob = torch.softmax(output, dim=1).squeeze()
            idx = torch.multinomial(prob, num_samples=1).item()
            generated += idx2char[idx]
            input_seq = torch.cat([input_seq, torch.tensor([[idx]])], dim=1)
    return generated

print(generate_text(model, "Nietzsche said: ", 300))

Nietzsche said: as not no latterly, the happiness--and alw-leng to be
imaging suffer swip as we truth, and no goal from lifest and mode you hat-philosopherse world upon thoughts of learn alassi-Goside. Thoiles, bying?--Moral truth, and the ullat? Indowed to us now
lution is power Gretctest afterwards altogracting i
