In [4]:
import os
import math
import urllib.request
from collections import Counter
import torch
import torch.nn as nn
import torch.nn.functional as F

# Run on the GPU when one is available, otherwise fall back to the CPU.
# (Author's note: training locally on an RTX 4060 laptop GPU was much faster
# than on Google Colab.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# WikiText-2 splits as shipped with the PyTorch word_language_model example.
urls = {
    "train": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/train.txt",
    "valid": "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/valid.txt",
    "test":  "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/test.txt"
}

# Fetch each split once.  The download goes to a temporary ".part" file that
# is renamed only after the transfer finishes, so an interrupted download can
# never leave a truncated "<split>.txt" that later runs would mistake for a
# complete copy and silently train on.
for k, v in urls.items():
    target = f"{k}.txt"
    if os.path.exists(target):
        continue
    partial = f"{target}.part"
    urllib.request.urlretrieve(v, partial)
    os.replace(partial, target)

def read_text(p):
    """Return the full contents of the UTF-8 text file at path ``p``."""
    with open(p, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

def tokenize(t):
    """Lower-case ``t`` and split it on runs of whitespace.

    Returns a list of tokens; an empty or all-whitespace string yields ``[]``.
    """
    lowered = t.lower()
    pieces = lowered.split()
    return pieces

# Lower-cased, whitespace-split token streams for each split.
train_tokens = tokenize(read_text("train.txt"))
val_tokens = tokenize(read_text("valid.txt"))
test_tokens = tokenize(read_text("test.txt"))

# The vocabulary is built from the training split only; id 0 is reserved for
# unknown words.
counter = Counter(train_tokens)
vocab = {"<unk>": 0}
for w in counter:
    # The training text may itself contain the literal token "<unk>"
    # (WikiText marks rare words that way).  Plain assignment would move that
    # key off id 0 WITHOUT growing the dict, so the next new word would be
    # handed the very same id and the two would become indistinguishable.
    # setdefault leaves an existing entry untouched, keeping every id unique.
    vocab.setdefault(w, len(vocab))

# Reverse lookup and size are computed before the alias below is added, so
# vocab_size is exactly the number of distinct ids (0 .. vocab_size - 1).
ivocab = {i: w for w, i in vocab.items()}
vocab_size = len(ivocab)
ivocab[0] = "<UNK>"  # id 0 is rendered in upper case when decoding

# Upper-case alias for direct lookups; it shares id 0 with "<unk>".
vocab["<UNK>"] = 0

def numericalize(tokens):
    """Map ``tokens`` to a 1-D ``torch.long`` tensor of vocabulary ids.

    Tokens that are not keys of the module-level ``vocab`` are mapped to id 0.
    """
    ids = []
    for tok in tokens:
        ids.append(vocab.get(tok, 0))
    return torch.tensor(ids, dtype=torch.long)

def batchify(data, bs):
    """Reshape a 1-D id tensor into ``bs`` parallel columns on ``device``.

    Trailing elements that do not fill a complete row of ``bs`` are dropped.
    The result has shape ``(len(data) // bs, bs)``; each column is a
    contiguous slice of the original sequence.
    """
    usable = (data.size(0) // bs) * bs
    rows = data[:usable].view(bs, -1)
    columns = rows.t().contiguous()
    return columns.to(device)

# Convert the train and validation token streams to id tensors arranged in
# 32 parallel columns (train first, then validation).
train_data, val_data = (
    batchify(numericalize(split_tokens), 32)
    for split_tokens in (train_tokens, val_tokens)
)

class LSTMLM(nn.Module):
    """LSTM language model whose output layer shares the embedding matrix.

    Args:
        vs: Vocabulary size (number of embedding rows / output logits).
        ed: Embedding dimension.
        hd: LSTM hidden size.  Must equal ``ed`` because the output
            projection reuses the ``(vs, ed)`` embedding weight.
        nl: Number of stacked LSTM layers.
        dp: Dropout probability handed to ``nn.LSTM``.

    Raises:
        ValueError: If ``ed != hd``.
    """

    def __init__(self, vs, ed, hd, nl, dp):
        super().__init__()
        # The tied weight has shape (vs, ed) while the projection consumes
        # hd-dimensional LSTM outputs.  Fail here with a clear message rather
        # than with a shape error on the first forward pass.
        if ed != hd:
            raise ValueError(
                f"tied weights require ed == hd, got ed={ed} and hd={hd}"
            )
        self.emb = nn.Embedding(vs, ed)
        self.lstm = nn.LSTM(ed, hd, nl, dropout=dp)
        self.fc = nn.Linear(hd, vs)
        # Weight tying: input embedding and output projection share one matrix.
        self.fc.weight = self.emb.weight

    def forward(self, x, h=None):
        """Return ``(logits, state)`` for a batch of token ids.

        Args:
            x: Integer tensor of token ids.  ``nn.LSTM`` is built with its
                default ``batch_first=False``, so the layout is
                ``(seq_len, batch)``.
            h: Optional ``(h_0, c_0)`` LSTM state.  ``None`` lets ``nn.LSTM``
                start from a zero state.

        Returns:
            A tuple of the logits, shaped ``(seq_len, batch, vs)``, and the
            updated ``(h_n, c_n)`` state.
        """
        x = self.emb(x)
        o, h = self.lstm(x, h)
        o = self.fc(o)
        return o, h

# Model / training hyperparameters.
embed_dim = hidden_dim = 200
num_layers, dropout = 2, 0.5
seq_len, epochs = 35, 5

# Build the model and move it to the selected device (the GPU when one is
# available, otherwise the CPU).
model = LSTMLM(vocab_size, embed_dim, hidden_dim, num_layers, dropout)
model = model.to(device)

# Cross-entropy over the vocabulary, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def init_hidden(bs):
    """Return a zero ``(h_0, c_0)`` LSTM state for a batch of size ``bs``.

    Both tensors have shape ``(num_layers, bs, hidden_dim)`` and live on
    ``device``.
    """
    shape = (num_layers, bs, hidden_dim)
    h0 = torch.zeros(shape, device=device)
    c0 = torch.zeros(shape, device=device)
    return (h0, c0)

def train_epoch(model, data):
    """Train ``model`` for one pass over ``data`` and return the mean loss.

    ``data`` is walked in consecutive windows of ``seq_len`` rows; the target
    for each window is the same window shifted forward by one row.  The LSTM
    state is carried from window to window but detached, so gradients do not
    flow across window boundaries.

    Args:
        model: Module called as ``model(x, h)`` and returning ``(logits, h)``.
        data: 2-D id tensor laid out as ``(rows, batch)``.

    Returns:
        The cross-entropy loss averaged over the windows actually processed.

    Raises:
        ValueError: If ``data`` has too few rows to form one full window plus
            its shifted target.
    """
    model.train()
    h = init_hidden(data.size(1))
    starts = range(0, data.size(0) - seq_len, seq_len)
    if len(starts) == 0:
        raise ValueError(
            f"data needs more than {seq_len} rows, got {data.size(0)}"
        )
    total = 0.0
    for i in starts:
        x = data[i:i+seq_len]
        y = data[i+1:i+seq_len+1].reshape(-1)
        optimizer.zero_grad()
        o, h = model(x, h)
        loss = criterion(o.reshape(-1, vocab_size), y)
        loss.backward()
        # Clip the global gradient norm to keep updates bounded.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        # Keep the state values but cut the graph before the next window.
        h = tuple(v.detach() for v in h)
        total += loss.item()
    # Divide by the real number of windows: data.size(0) // seq_len is one
    # too many whenever the row count is an exact multiple of seq_len.
    return total / len(starts)

def evaluate(model, data):
    """Return the perplexity of ``model`` on ``data``.

    ``data`` is walked in the same ``seq_len``-row windows used for training,
    with gradients disabled and the LSTM state carried across windows.

    Args:
        model: Module called as ``model(x, h)`` and returning ``(logits, h)``.
        data: 2-D id tensor laid out as ``(rows, batch)``.

    Returns:
        ``exp`` of the cross-entropy loss averaged over the windows actually
        processed.

    Raises:
        ValueError: If ``data`` has too few rows to form one full window plus
            its shifted target.
    """
    model.eval()
    h = init_hidden(data.size(1))
    starts = range(0, data.size(0) - seq_len, seq_len)
    if len(starts) == 0:
        raise ValueError(
            f"data needs more than {seq_len} rows, got {data.size(0)}"
        )
    total = 0.0
    with torch.no_grad():
        for i in starts:
            x = data[i:i+seq_len]
            y = data[i+1:i+seq_len+1].reshape(-1)
            # No detach needed: nothing is tracked under no_grad.
            o, h = model(x, h)
            loss = criterion(o.reshape(-1, vocab_size), y)
            total += loss.item()
    # Divide by the real number of windows: data.size(0) // seq_len is one
    # too many whenever the row count is an exact multiple of seq_len.
    return math.exp(total / len(starts))

# Main training loop: one training pass followed by a validation pass per
# epoch.  train_epoch returns a mean loss, so it is exponentiated here to
# report perplexity; evaluate already returns perplexity.
for e in range(epochs):
    tl = train_epoch(model, train_data)
    vp = evaluate(model, val_data)
    print(f"Epoch {e+1} | Train Perplexity {math.exp(tl):.2f} | Val Perplexity {vp:.2f}")

def generate_text(model, start_text, max_len=50, temperature=1.0):
    """Sample a continuation of ``start_text`` from ``model``.

    Args:
        model: Module called as ``model(x, h)`` and returning ``(logits, h)``.
        start_text: Prompt; it is tokenized with ``tokenize`` and words
            missing from ``vocab`` are replaced by id 0.
        max_len: Number of tokens to sample after the prompt.
        temperature: Positive divisor applied to the logits before the
            softmax; values below 1 sharpen the distribution, values above 1
            flatten it.

    Returns:
        The prompt ids followed by the sampled ids, decoded through
        ``ivocab`` and joined with single spaces.

    Raises:
        ValueError: If ``temperature`` is not positive or ``start_text``
            contains no tokens.
    """
    # Validate up front: a zero temperature produces inf/nan probabilities
    # and an empty prompt leaves nothing to feed the model, both of which
    # would otherwise fail deep inside torch with an obscure message.
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")
    ids = [vocab.get(t, 0) for t in tokenize(start_text)]
    if not ids:
        raise ValueError("start_text must contain at least one token")

    model.eval()
    # Shape (prompt_len, 1): a single sequence in a batch of one.
    inp = torch.tensor(ids, dtype=torch.long, device=device).unsqueeze(1)
    h = init_hidden(1)
    with torch.no_grad():
        # Prime the state on every prompt token except the last in one pass;
        # the last token is the first input of the sampling loop below.
        if len(ids) > 1:
            _, h = model(inp[:-1], h)
        cur = inp[-1:]
        for _ in range(max_len):
            o, h = model(cur, h)
            logits = o.squeeze(0).squeeze(0) / temperature
            probs = F.softmax(logits, dim=0)
            nxt = torch.multinomial(probs, 1)
            ids.append(nxt.item())
            # Back to shape (1, 1) so it can be fed as the next input.
            cur = nxt.unsqueeze(0)
    return " ".join(ivocab[i] for i in ids)

# Print a 60-token sample at temperature 0.8, seeded with a short prompt.
print(generate_text(model, "a random talk with my lstm", 60, 0.8))


Epoch 1 | Train Perplexity 655.08 | Val Perplexity 341.17
Epoch 2 | Train Perplexity 363.54 | Val Perplexity 264.85
Epoch 3 | Train Perplexity 285.22 | Val Perplexity 228.96
Epoch 4 | Train Perplexity 240.12 | Val Perplexity 207.58
Epoch 5 | Train Perplexity 211.01 | Val Perplexity 194.39
a random talk with my <UNK> and ( ' ( , while ( ( ( ) is ( and [ the ( ] ( " . it is intended to be ... the trench ( and the ( . a part of the decade of the tooth who was owned by the problems of the sport , who were inspired by djedkare , who were a
