In [1]:
f = open("./TinyShakespeare/input.txt")
text = f.read()
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [2]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
vocab_size

65

In [3]:
char_to_idx = {ch:i for i, ch in enumerate(chars)}
idx_to_char = {i:ch for i, ch in enumerate(chars)}

def encode(text):
    return [char_to_idx[ch] for ch in text]

def decode(idx):
    return [idx_to_char[i] for i in idx]

In [4]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
data.shape

torch.Size([1115394])

In [5]:
n = int(0.9 * len(data))
train_data = data[:n]
test_data = data[n:]

In [6]:
sequence_length = 8
x = train_data[:sequence_length]
y = train_data[1:sequence_length + 1]
for t in range(sequence_length):
    context = x[:t + 1]
    target = y[t]
    print(context, target)
x.shape

tensor([18]) tensor(47)
tensor([18, 47]) tensor(56)
tensor([18, 47, 56]) tensor(57)
tensor([18, 47, 56, 57]) tensor(58)
tensor([18, 47, 56, 57, 58]) tensor(1)
tensor([18, 47, 56, 57, 58,  1]) tensor(15)
tensor([18, 47, 56, 57, 58,  1, 15]) tensor(47)
tensor([18, 47, 56, 57, 58,  1, 15, 47]) tensor(58)


torch.Size([8])

In [7]:
import torch
from torch.utils.data import Dataset
import random

class ContextTargetDataset(Dataset):
    def __init__(self, data, window_size):
        self.data = data
        self.window_size = window_size

    def __len__(self):
        return len(self.data) - self.window_size

    def __getitem__(self, idx):
        start_idx = random.randint(0, len(self.data) - self.window_size - 1)
        x = self.data[start_idx : start_idx + self.window_size]
        y = self.data[start_idx + 1 : start_idx + self.window_size + 1]
        
        return x, y

    def collate_fn(batch):
        xs, ys = zip(*batch)
        return torch.stack(xs), torch.stack(ys)



In [8]:
from torch.utils.data import DataLoader
train_dataset = ContextTargetDataset(train_data, window_size=8)
train_loader = DataLoader(train_dataset, batch_size=2)

test_dataset = ContextTargetDataset(test_data, window_size=8)
test_loader = DataLoader(test_dataset, batch_size=2)
for batch_x, batch_y in test_loader:
    print("X: ", batch_x)
    print("Y: ", batch_y)
    break

X:  tensor([[53,  1, 58, 46, 43,  1, 58, 47],
        [47, 57,  1, 50, 47, 60, 43, 57]])
Y:  tensor([[ 1, 58, 46, 43,  1, 58, 47, 51],
        [57,  1, 50, 47, 60, 43, 57,  1]])


In [9]:
embed_dim = 32
hidden_size = 64
output_size = vocab_size
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# RNN

In [10]:
import torch
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, output_size):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        
        self.hidden_size = hidden_size
        self.W_xh = nn.Parameter(torch.randn(hidden_size, embed_dim))
        self.W_hh = nn.Parameter(torch.randn(hidden_size, hidden_size))
        self.b_h = nn.Parameter(torch.zeros(hidden_size))
        
        self.W_hy = nn.Parameter(torch.randn(output_size, hidden_size))
        self.b_y = nn.Parameter(torch.zeros(output_size))

    def step(self, x_t, h_t):
        h_t = torch.tanh(x_t @ self.W_xh.T + h_t @ self.W_hh.T + self.b_h)
        y_t = h_t @ self.W_hy.T + self.b_y
        return y_t, h_t

    def forward(self, x):
        batch_size, seq_len = x.shape
        x_embed = self.embedding(x)
        h_t = torch.zeros(batch_size, self.hidden_size).to(device)

        outputs = []
        
        for t in range(seq_len):
            x_t = x_embed[:,t, :]

            y_t, h_t = self.step(x_t, h_t)
            outputs.append(y_t)
            
        outputs = torch.stack(outputs, dim=1)
        return outputs

    def generate(self, start_token, sample_size = 100):
        self.eval()
    
        input_seq = torch.tensor([[char_to_idx[start_token]]], dtype=torch.long).to(device)
        generated = [start_token]
        
        h_t = torch.zeros(1, model.hidden_size).to(device)
    
        with torch.no_grad():
            for _ in range(sample_size):
                x_embed = model.embedding(input_seq[:, -1])
                y_t, h_t = self.step(x_embed, h_t)
                
                probs = torch.softmax(y_t, dim=-1)
                next_token_id = torch.multinomial(probs, num_samples=1).item()
                
                next_token = idx_to_char[next_token_id]
                generated.append(next_token)
                
                input_seq = torch.cat([input_seq, torch.tensor([[next_token_id]]).to(device)], dim=1)
        
        return ''.join(generated)

In [11]:
model = RNN(vocab_size, embed_dim, hidden_size, output_size)

In [12]:
from tqdm.notebook import tqdm
train_loader = DataLoader(train_dataset, batch_size=32)
epochs = 5
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()

model.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)

    for  i, (batch_x, batch_y) in enumerate(loop, 1):
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        outputs = model(batch_x)
        loss = criterion(outputs.view(-1, vocab_size), batch_y.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        avg_loss = total_loss / i
        loop.set_postfix(batch_loss=loss.item(), avg_loss=avg_loss)
print(f"After {epochs} epochs, loss: {avg_loss}")

Epoch 1/5:   0%|          | 0/31371 [00:00<?, ?it/s]

Epoch 2/5:   0%|          | 0/31371 [00:00<?, ?it/s]

Epoch 3/5:   0%|          | 0/31371 [00:00<?, ?it/s]

Epoch 4/5:   0%|          | 0/31371 [00:00<?, ?it/s]

Epoch 5/5:   0%|          | 0/31371 [00:00<?, ?it/s]

After 5 epochs, loss: 1.8875628292326878


In [42]:
print(model.generate(start_token='\n', sample_size = 500))


Lof to sanavour the shall fathersigh the cearth, your you hore of do treball, bid ther.

LEONTHARD SICINIUS:
Ray
To good Sonot yoest that to blenspest pittle hear acty.
But; the come,
Puto
Lord pead, and.
Proudin;
Why frised! Pettell all go thal with is lord?

VOLUOMEIO:
Nall Rave so, witd the to tooy sovather,'s out, go; their thim. no, fremeetherd.

AANUS:
Where me upon it fripe,
Ward?

ey!

ISABELLA:
By; 'rd's tom louthrul all, and,
And not to
RICLA:
I thope Pale
Your lastispar I sonst che he


# Torch RNN

In [14]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

class TorchRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, output_size):
        super(TorchRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x_embed = self.embedding(x)
        out, _ = self.rnn(x_embed)
        logits = self.fc(out)
        return logits

    def generate(self, start_token, sample_size = 100):
        self.eval()
        generated = [start_token]
        
        input_token = torch.tensor([[char_to_idx.get(start_token)]]).to(device)
        h_t = None
        with torch.no_grad():
            for _ in range(sample_size):
                logits = self.forward(input_token)
                logits = logits[:, -1, :] 
                probs = torch.softmax(logits, dim=-1)
                
                next_token_id = torch.multinomial(probs, num_samples=1).item()
                next_token = idx_to_char[next_token_id]
                generated.append(next_token)
                
                input_token = torch.tensor([[next_token_id]]).to(device)
    
        return ''.join(generated)


In [15]:
torch_model = TorchRNN(vocab_size, embed_dim, hidden_size, output_size).to(device)

In [16]:
torch_optimizer = torch.optim.AdamW(torch_model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for epoch in range(5):
    torch_model.train()
    total_loss = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/5", leave=False)

    for i, (batch_x, batch_y) in enumerate(loop, 1):
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        outputs = torch_model(batch_x)

        loss = criterion(outputs.view(-1, vocab_size), batch_y.view(-1))

        torch_optimizer.zero_grad()
        loss.backward()
        torch_optimizer.step()

        total_loss += loss.item()
        avg_loss = total_loss / i
        loop.set_postfix(batch_loss=loss.item(), avg_loss=avg_loss)

print(f"After {epochs} epochs, loss: {avg_loss}")

Epoch 1/5:   0%|          | 0/31371 [00:00<?, ?it/s]

Epoch 2/5:   0%|          | 0/31371 [00:00<?, ?it/s]

Epoch 3/5:   0%|          | 0/31371 [00:00<?, ?it/s]

Epoch 4/5:   0%|          | 0/31371 [00:00<?, ?it/s]

Epoch 5/5:   0%|          | 0/31371 [00:00<?, ?it/s]

After 5 epochs, loss: 1.8620590770374938


In [36]:
output = torch_model.generate(start_token = '\n', sample_size = 500)
print(output)


Ask onghean, s y ARKI:
Wh silit
HAn se ftusowen, I os poube lll sarwnonlly sor p,
I g ar?
thit t, lesthacemangull th hasenes linth itha wondioureanglatlld.
SI arvin'd s se ORack, dedordis f sere pn matrsisearndid. ENTe halot s by, he ce an t senort ad.
V: van I horet ig CUSThthen ivee y d, s incin akinon,
PE:
Thawallon.

A we anghingr,
Me

Th thame t
Sth n ng le thos withthisouth
TOf inof allldorthis:
Swbanghele onobeng whonothat oushfe tls:
H:


D:
T:
F k wod th mar atay st buse ot ceeldootyod 


# LSTM

In [18]:
import torch
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, output_size):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        self.hidden_size = hidden_size
        self.W_ii = nn.Parameter(torch.randn(hidden_size, embed_dim))
        self.W_hi = nn.Parameter(torch.randn(hidden_size, hidden_size))

        self.W_if = nn.Parameter(torch.randn(hidden_size, embed_dim))
        self.W_hf = nn.Parameter(torch.randn(hidden_size, hidden_size))

        self.W_ig = nn.Parameter(torch.randn(hidden_size, embed_dim))
        self.W_hg = nn.Parameter(torch.randn(hidden_size, hidden_size))

        self.W_io = nn.Parameter(torch.randn(hidden_size, embed_dim))
        self.W_ho = nn.Parameter(torch.randn(hidden_size, hidden_size))

        self.b_ii = nn.Parameter(torch.zeros(hidden_size))
        self.b_hi = nn.Parameter(torch.zeros(hidden_size))
        self.b_if = nn.Parameter(torch.zeros(hidden_size))
        self.b_hf = nn.Parameter(torch.zeros(hidden_size))
        self.b_ig = nn.Parameter(torch.zeros(hidden_size))
        self.b_hg = nn.Parameter(torch.zeros(hidden_size))
        self.b_io = nn.Parameter(torch.zeros(hidden_size))
        self.b_ho = nn.Parameter(torch.zeros(hidden_size))

        self.fc_out = nn.Linear(hidden_size, output_size)

    def step(self, x_t, h_t, c_t):
        i_t = torch.sigmoid(x_t @ self.W_ii.T + self.b_ii + h_t @ self.W_hi.T + self.b_hi)
        f_t = torch.sigmoid(x_t @ self.W_if.T + self.b_if + h_t @ self.W_hf.T + self.b_hf)
        g_t = torch.tanh(x_t @ self.W_ig.T + self.b_ig + h_t @ self.W_hg.T + self.b_hg)
        o_t = torch.sigmoid(x_t @ self.W_io.T + self.b_io + h_t @ self.W_ho.T + self.b_ho)

        c_t = f_t * c_t + i_t * g_t
        h_t = o_t * torch.tanh(c_t)

        return h_t, c_t
    
    def forward(self, x):
        batch_size, seq_len = x.shape
        x_embed = self.embedding(x)
        h_t = torch.zeros(batch_size, self.hidden_size).to(device)
        c_t = torch.zeros(batch_size, self.hidden_size).to(device)
        
        outputs = []
        for t in range(seq_len):
            x_t = x_embed[:,t, :]
            h_t, c_t = self.step(x_t, h_t, c_t)
            
            outputs.append(h_t)
            
        outputs = torch.stack(outputs, dim=1)
        outputs = self.fc_out(outputs)
        return outputs

    def generate(self, start_token, sample_size=100):
        self.eval()
        input_seq = torch.tensor([[char_to_idx[start_token]]], dtype=torch.long).to(device)
        generated = [start_token]

        h_t = torch.zeros(1, self.hidden_size).to(device)
        c_t = torch.zeros(1, self.hidden_size).to(device)

        with torch.no_grad():
            for _ in range(sample_size):
                x_embed = self.embedding(input_seq[:, -1])
                h_t, c_t = self.step(x_embed, h_t, c_t)
                y_t = self.fc_out(h_t)
                probs = torch.softmax(y_t, dim=-1)
                next_token_id = torch.multinomial(probs, num_samples=1).item()
                next_token = idx_to_char[next_token_id]
                generated.append(next_token)

                input_seq = torch.cat([input_seq, torch.tensor([[next_token_id]]).to(device)], dim=1)

        return ''.join(generated)

In [19]:
lstm_model = LSTM(vocab_size, embed_dim, hidden_size, output_size)

In [20]:
from tqdm.notebook import tqdm
train_loader = DataLoader(train_dataset, batch_size=32)
epochs = 5
optimizer = torch.optim.AdamW(lstm_model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()

lstm_model.to(device)

for epoch in range(epochs):
    lstm_model.train()
    total_loss = 0

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)

    for  i, (batch_x, batch_y) in enumerate(loop, 1):
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        outputs = lstm_model(batch_x)
        loss = criterion(outputs.view(-1, vocab_size), batch_y.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        avg_loss = total_loss / i
        loop.set_postfix(batch_loss=loss.item(), avg_loss=avg_loss)
print(f"After {epochs} epochs, loss: {avg_loss}")

Epoch 1/5:   0%|          | 0/31371 [00:00<?, ?it/s]

Epoch 2/5:   0%|          | 0/31371 [00:00<?, ?it/s]

Epoch 3/5:   0%|          | 0/31371 [00:00<?, ?it/s]

Epoch 4/5:   0%|          | 0/31371 [00:00<?, ?it/s]

Epoch 5/5:   0%|          | 0/31371 [00:00<?, ?it/s]

After 5 epochs, loss: 1.7623790918608435


In [29]:
output = lstm_model.generate("\n", 500)
print(output)


How both:
Of him.

ROMEO:
Where have, take homb, with pricklifi'nse gencal hence-damed hatpementle bect base though dears, what faclest alowse! for Michongue
To plancommy power, more come, you rict it true my right death,
Nestrange, fledsul, his trithalle; as unto Nith his breather foll.

JULIET:
As and I shting, percreasonsic
Whom my wirempher goward, alt go:
I am thou have is you are nett takeshesion,
But barms, what form to are suffelt am Hermis the art the tere must:
Ime, to night is thou ma


# GRU

In [22]:
import torch
import torch.nn as nn
class GRU(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size , output_size):
        super(GRU, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        self.hidden_size = hidden_size

        self.W_ir = nn.Parameter(torch.randn(hidden_size, embed_dim))
        self.W_hr = nn.Parameter(torch.randn(hidden_size, hidden_size))
        
        self.W_iz = nn.Parameter(torch.randn(hidden_size, embed_dim))
        self.W_hz = nn.Parameter(torch.randn(hidden_size, hidden_size))
        
        self.W_in = nn.Parameter(torch.randn(hidden_size, embed_dim))
        self.W_hn = nn.Parameter(torch.randn(hidden_size, hidden_size))

        self.b_ir = nn.Parameter(torch.zeros(hidden_size))
        self.b_hr = nn.Parameter(torch.zeros(hidden_size))
        self.b_iz = nn.Parameter(torch.zeros(hidden_size))
        self.b_hz = nn.Parameter(torch.zeros(hidden_size))
        self.b_in = nn.Parameter(torch.zeros(hidden_size))
        self.b_hn = nn.Parameter(torch.zeros(hidden_size))

        self.fc_out = nn.Linear(hidden_size, output_size)


    def step(self, x_t, h_t):
        r_t = torch.sigmoid(x_t @ self.W_ir.T + self.b_ir + h_t @ self.W_hr.T + self.b_hr)
        z_t = torch.sigmoid(x_t @ self.W_iz.T + self.b_iz + h_t @ self.W_hz.T + self.b_hz)
        n_t = torch.tanh(x_t @ self.W_in.T + self.b_in + r_t * (h_t @ self.W_hn.T + self.b_hn))
        h_t = (1 - z_t) * n_t + z_t * h_t

        return h_t
    
    def forward(self, x):
        batch_size, seq_len = x.shape
        x_embed = self.embedding(x)
        h_t = torch.zeros(batch_size, self.hidden_size).to(device)
        
        outputs = []
        for t in range(seq_len):
            x_t = x_embed[:,t, :]
            h_t = self.step(x_t, h_t)
            
            outputs.append(h_t)
            
        outputs = torch.stack(outputs, dim=1)
        outputs = self.fc_out(outputs)
        return outputs

    def generate(self, start_token, sample_size=100):
        self.eval()
        input_seq = torch.tensor([[char_to_idx[start_token]]], dtype=torch.long).to(device)
        generated = [start_token]

        h_t = torch.zeros(1, self.hidden_size).to(device)

        with torch.no_grad():
            for _ in range(sample_size):
                x_embed = self.embedding(input_seq[:, -1])
                h_t = self.step(x_embed, h_t)
                y_t = self.fc_out(h_t)
                probs = torch.softmax(y_t, dim=-1)
                next_token_id = torch.multinomial(probs, num_samples=1).item()
                next_token = idx_to_char[next_token_id]
                generated.append(next_token)

                input_seq = torch.cat([input_seq, torch.tensor([[next_token_id]]).to(device)], dim=1)

        return ''.join(generated)
        

In [23]:
gru_model = GRU(vocab_size, embed_dim, hidden_size, output_size)

In [24]:
from tqdm.notebook import tqdm
train_loader = DataLoader(train_dataset, batch_size=32)
epochs = 5
optimizer = torch.optim.AdamW(gru_model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()

gru_model.to(device)

for epoch in range(epochs):
    gru_model.train()
    total_loss = 0

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)

    for  i, (batch_x, batch_y) in enumerate(loop, 1):
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        outputs = gru_model(batch_x)
        loss = criterion(outputs.view(-1, vocab_size), batch_y.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        avg_loss = total_loss / i
        loop.set_postfix(batch_loss=loss.item(), avg_loss=avg_loss)

print(f"After {epochs} epochs, loss: {avg_loss}")

Epoch 1/5:   0%|          | 0/31371 [00:00<?, ?it/s]

Epoch 2/5:   0%|          | 0/31371 [00:00<?, ?it/s]

Epoch 3/5:   0%|          | 0/31371 [00:00<?, ?it/s]

Epoch 4/5:   0%|          | 0/31371 [00:00<?, ?it/s]

Epoch 5/5:   0%|          | 0/31371 [00:00<?, ?it/s]

After 5 epochs, loss: 1.800975421192367


In [27]:
output = gru_model.generate("\n", 500)
print(output)


If I was and Angelf and the RICAY:
Iwict
Ot I we best me indordman know I have he
ch'd onined greadom this gracioure on thesels for and ambed of.

CKINCELIZE:
Go; seen
By dividing time him.
My.
Teepantave; not
Ochrays
He more you, my from son-boy.

First Buick and!
What be studon to lie.

CLIFK:
Loicle! see acle, from himselfes.

ESCALUS: brotten,
Ingar?

POMPEY:
With minstill'd shall noth see:
Noth to and thus broads, wifl, I foul love for not wells,' did chosely got:

Sevoling buse her betimel
