Rupesh Bharambe (AI3107)

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import math, re, requests
from collections import Counter

In [10]:
# Download dataset: Project Gutenberg eBook #100, "The Complete Works of
# William Shakespeare" (this cell was previously mislabelled as Alice in
# Wonderland; the URL has always pointed at the Shakespeare text).
url = "https://www.gutenberg.org/files/100/100-0.txt"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being used as the corpus.
response = requests.get(url, timeout=60)
response.raise_for_status()
raw_text = response.text

In [11]:
# Preprocess: cut away the Project Gutenberg front matter and license, then
# reduce the text to lower-case words made only of ASCII letters.
#
# str.find returns -1 when a marker is missing. Used directly as a slice bound
# that would silently drop the last character (missing end marker) or nearly
# the whole text (missing start marker), so fall back to the document edges.
start_idx = raw_text.find("THE SONNETS")
if start_idx == -1:
    start_idx = 0
# Only look for the end marker after the start marker.
end_idx = raw_text.find("End of the Project Gutenberg", start_idx)
if end_idx == -1:
    end_idx = len(raw_text)
text = raw_text[start_idx:end_idx]
text = re.sub(r'[^a-zA-Z\s]', '', text.lower())  # remove punctuation, digits, non-ASCII
words = text.split()
# Fail here, with a clear message, rather than later with an empty vocabulary.
assert words, "preprocessing produced no words; check the downloaded text"

In [12]:
# Vocabulary: each distinct word gets an integer id, assigned in sorted order.
vocab = sorted(set(words))
word2idx = dict(zip(vocab, range(len(vocab))))  # word -> id
idx2word = dict(enumerate(vocab))               # id -> word
vocab_size = len(word2idx)

In [13]:
# Create sequences: slide a window of `sequence_length` words over the corpus;
# each window is one input and the word right after it is the target.
sequence_length = 10
inputs, targets = [], []

i = 0
while i < len(words) - sequence_length:
    j = i + sequence_length
    seq, target = words[i:j], words[j]
    inputs.append(list(map(word2idx.__getitem__, seq)))
    targets.append(word2idx[target])
    i += 1

print(f"Total sequences: {len(inputs)} | Vocab size: {vocab_size}")

Total sequences: 962800 | Vocab size: 29876


In [14]:
class WordDataset(Dataset):
    """Map-style dataset of (input sequence, target) pairs.

    ``X`` and ``Y`` are converted to tensors once at construction time;
    item ``i`` is the pair ``(X[i], Y[i])``.
    """

    def __init__(self, X, Y):
        self.X, self.Y = torch.tensor(X), torch.tensor(Y)

    def __len__(self):
        # Number of examples = length of the first dimension of X.
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        label = self.Y[idx]
        return features, label

# Wrap the encoded pairs and serve them as shuffled mini-batches of 128.
dataset = WordDataset(X=inputs, Y=targets)
loader = DataLoader(dataset=dataset, shuffle=True, batch_size=128)


In [15]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    Even feature columns hold ``sin(pos * w_k)`` and odd columns hold
    ``cos(pos * w_k)`` with ``w_k = 10000 ** (-2k / dim)``.

    Args:
        dim: Size of the feature (last) dimension of the inputs.
        max_len: Largest sequence length for which encodings are precomputed.
    """

    def __init__(self, dim, max_len=5000):
        super().__init__()
        pos = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, dim, 2, dtype=torch.float32) * -(math.log(10000.0) / dim)
        )
        angles = pos * div_term  # (max_len, ceil(dim / 2))

        pe = torch.zeros(max_len, dim)
        pe[:, 0::2] = torch.sin(angles)
        # For odd `dim` there is one fewer odd column than even column, so
        # trim the angles; without this the assignment fails on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : dim // 2])

        # Registered as a buffer so the table follows `.to(device)` / `.cuda()`
        # with the module instead of being copied to the device on every call.
        # persistent=False keeps it out of state_dict, so checkpoints are unchanged.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, sequence, dim): the table is sliced along
        ``x.size(1)`` and broadcast over the leading dimension.
        """
        # `.to` is a no-op when the buffer is already on x's device; it is kept
        # so the module still works if only the input was moved.
        return x + self.pe[:, : x.size(1)].to(x.device)

class TransformerTextGen(nn.Module):
    """Next-word predictor built from an embedding, positional encoding,
    a stack of ``nn.TransformerDecoderLayer`` blocks and a linear output head.

    Args:
        vocab_size: Number of distinct token ids (embedding rows and logits).
        d_model: Embedding / hidden width.
        nhead: Number of attention heads per decoder layer.
        num_layers: Number of stacked decoder layers.
    """

    def __init__(self, vocab_size, d_model=128, nhead=4, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model)
        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Map a (batch, seq_len) tensor of token ids to (batch, vocab_size) logits.

        Only the output at the last sequence position is projected to logits.
        """
        # Scale embeddings by sqrt(d_model). The previous code used
        # sqrt(x.size(-1)) while `x` was still the id tensor, which is
        # sqrt(seq_len) and not the embedding width.
        x = self.embedding(x) * math.sqrt(self.embedding.embedding_dim)
        x = self.pos_encoding(x)

        # Causal mask for the self-attention over the target sequence.
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(x.size(1)).to(x.device)

        # The decoder layers were built without batch_first, so they expect
        # (seq_len, batch, d_model).
        seq_first = x.transpose(0, 1)
        # The same sequence is passed as both target and memory; the
        # cross-attention over the memory is not masked.
        decoded = self.transformer_decoder(seq_first, seq_first, tgt_mask=tgt_mask)
        return self.fc(decoded[-1])  # use only last token's output


In [20]:
# Put the model on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = TransformerTextGen(vocab_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training: 24 passes over `loader`, printing the mean batch loss after each.
for epoch in range(24):
    model.train()
    total_loss = 0
    for x_batch, y_batch in loader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        # Clear old gradients, then forward / backward / update.
        optimizer.zero_grad()
        out = model(x_batch)
        loss = criterion(out, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}")


Epoch 1, Loss: 6.7674
Epoch 2, Loss: 6.4719
Epoch 3, Loss: 6.3805
Epoch 4, Loss: 6.3282
Epoch 5, Loss: 6.2945
Epoch 6, Loss: 6.2631
Epoch 7, Loss: 6.2417
Epoch 8, Loss: 6.2095
Epoch 9, Loss: 6.1867
Epoch 10, Loss: 6.1690
Epoch 11, Loss: 6.1504
Epoch 12, Loss: 6.1333
Epoch 13, Loss: 6.1195
Epoch 14, Loss: 6.1135
Epoch 15, Loss: 6.1106
Epoch 16, Loss: 6.1089
Epoch 17, Loss: 6.0824
Epoch 18, Loss: 6.0788
Epoch 19, Loss: 6.0827
Epoch 20, Loss: 6.0582
Epoch 21, Loss: 6.0561
Epoch 22, Loss: 6.0325
Epoch 23, Loss: 6.0471
Epoch 24, Loss: 6.0425


In [21]:
def generate_text(model, seed_text, num_words=50, temperature=1.0):
    """Extend ``seed_text`` word by word using ``model``.

    At every step the last ``sequence_length`` words are encoded with
    ``word2idx``, fed to the model, and the next word is drawn from the
    resulting distribution. Relies on the notebook globals ``sequence_length``,
    ``word2idx``, ``idx2word`` and ``device``. Puts ``model`` in eval mode.

    Args:
        model: Callable mapping a (1, sequence_length) LongTensor of word ids
            to (1, vocab_size) logits.
        seed_text: Starting text; it is lower-cased and split on whitespace.
        num_words: Number of words to append.
        temperature: Logits are divided by this before the softmax.
            ``0`` selects the most likely word at every step (greedy).

    Returns:
        The seed words followed by the generated words, joined by spaces.

    Raises:
        ValueError: If ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError("temperature must be non-negative")

    model.eval()
    tokens = seed_text.lower().split()

    # The vocabulary is built from text stripped of '<' and '>', so it has no
    # '<unk>' / '<pad>' entries unless they are added separately. Without
    # them, unknown words and left-padding both map to id 0.
    unk_id = word2idx.get('<unk>', 0)
    pad_id = word2idx.get('<pad>', unk_id)

    with torch.no_grad():
        for _ in range(num_words):
            # Ensure input length = sequence_length (left-pad if short).
            context = tokens[-sequence_length:]
            input_ids = [pad_id] * (sequence_length - len(context))
            input_ids += [word2idx.get(w, unk_id) for w in context]
            input_tensor = torch.tensor([input_ids], dtype=torch.long, device=device)

            logits = model(input_tensor).squeeze(0)
            if temperature == 0:
                # Dividing by zero would make the probabilities NaN; treat a
                # zero temperature as its limit, i.e. argmax.
                next_word_id = torch.argmax(logits, dim=-1).item()
            else:
                probs = torch.softmax(logits / temperature, dim=-1)
                next_word_id = torch.multinomial(probs, num_samples=1).item()

            tokens.append(idx2word[next_word_id])

    return ' '.join(tokens)

# Generate 100 words after a short seed, with temperature 0.9.
print(
    generate_text(
        model,
        "alice was beginning to get very tired",
        num_words=100,
        temperature=0.9,
    )
)

alice was beginning to get very tired and and demetrius the cressids head absent makes the ones of war cannot offend henry this sweet queen distract the thing to fight and what enter parolles and rosencrantz richard must on your his body with the sixth of somerset blood and by a declining white well deserved a sail a man too i beseech you lear aside them to not the inclination of it enter sir pericles stand friar peace what good master word sir andrew let me see you know too i fear it utter we the madam and i care captain york me to bed your quits
