<a href="https://colab.research.google.com/github/simply-pouria/The-LMs-Book/blob/main/TheLMBook_Chapter3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementing an RNN

### Elman RNN Unit

In [None]:
import torch
import torch.nn as nn
class ElmanRNNUnit(nn.Module):
    """Single Elman recurrent cell.

    Computes the next hidden state as ``tanh(x @ Wh + h @ Uh + b)``, where
    ``Wh`` projects the current input, ``Uh`` projects the previous hidden
    state and ``b`` is a bias vector. Both matrices are square
    ``(emb_dim, emb_dim)``, so input and hidden state share one width.
    """

    def __init__(self, emb_dim):
        super().__init__()
        square = (emb_dim, emb_dim)
        # Uh is drawn before Wh; keeping this order preserves the values
        # obtained under a fixed random seed.
        self.Uh = nn.Parameter(torch.randn(*square))
        self.Wh = nn.Parameter(torch.randn(*square))
        self.b = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x, h):
        """Return the new hidden state for input ``x`` and previous state ``h``."""
        input_term = x @ self.Wh
        recurrent_term = h @ self.Uh
        pre_activation = input_term + recurrent_term + self.b
        return torch.tanh(pre_activation)

### Implementing the Elman RNN itself

In [None]:
class ElmanRNN(nn.Module):
    """Stack of Elman RNN layers unrolled over the time dimension.

    Args:
        emb_dim: Width of the input vectors and of every layer's hidden state.
        num_layers: Number of stacked ``ElmanRNNUnit`` layers.
    """

    def __init__(self, emb_dim, num_layers):
        super().__init__()
        self.emb_dim = emb_dim
        self.num_layers = num_layers
        self.rnn_units = nn.ModuleList(
            [ElmanRNNUnit(emb_dim) for _ in range(num_layers)]
        )

    def forward(self, x):
        """Run the stacked RNN over a batch of sequences.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, emb_dim)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, emb_dim)`` holding the
            top layer's hidden state at every time step.
        """
        batch_size, seq_len, emb_dim = x.shape
        # One zero-initialised hidden state per layer. ``new_zeros`` inherits
        # both device and dtype from ``x``, so the model also works after
        # ``.double()`` / ``.half()`` instead of failing on a dtype mismatch
        # at the first ``h @ Uh``.
        h_prev = [
            x.new_zeros(batch_size, emb_dim)
            for _ in range(self.num_layers)
        ]
        outputs = []
        for t in range(seq_len):
            input_t = x[:, t]
            for layer_idx, rnn_unit in enumerate(self.rnn_units):
                h_new = rnn_unit(input_t, h_prev[layer_idx])
                h_prev[layer_idx] = h_new  # Carry state to the next time step
                input_t = h_new            # Feed this layer's output upward
            outputs.append(input_t)        # Top layer's state at step t
        return torch.stack(outputs, dim=1)

### RNN as a Language Model

In [None]:
class RecurrentLanguageModel(nn.Module):
    """Token-level language model: embedding -> Elman RNN -> vocabulary logits.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output logits per position.
        emb_dim: Embedding width, also used as the RNN hidden size.
        num_layers: Number of stacked RNN layers.
        pad_idx: Token id passed to ``nn.Embedding`` as ``padding_idx``.
    """

    def __init__(self, vocab_size, emb_dim, num_layers, pad_idx):
        super().__init__()
        # Sub-modules are created in the order embedding, rnn, fc.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
            padding_idx=pad_idx,
        )
        self.rnn = ElmanRNN(emb_dim, num_layers)
        self.fc = nn.Linear(in_features=emb_dim, out_features=vocab_size)

    def forward(self, x):
        """Map token ids ``x`` to per-position logits over the vocabulary."""
        return self.fc(self.rnn(self.embedding(x)))

## Initializing Training

In [None]:
import torch, torch.nn as nn

def set_seed(seed):
    """Seed Python's and PyTorch's random number generators.

    Also switches cuDNN to deterministic mode and turns off its benchmark
    autotuner.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Imported here because this file never imports ``random`` at module
    # level; without it the first statement raised NameError.
    import random

    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
from transformers import AutoTokenizer

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the pretrained tokenizer of the Phi-3.5-mini-instruct checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/Phi-3.5-mini-instruct"
)

# The tokenizer's length sizes both the embedding table and the output layer.
vocab_size = len(tokenizer)

# NOTE(review): get_hyperparameters, download_and_prepare_data and
# initialize_weights are not defined in the visible part of this notebook;
# they must be defined before this cell runs.
emb_dim, num_layers, batch_size, learning_rate, num_epochs = get_hyperparameters()

data_url = "https://www.thelmbook.com/data/news"
train_loader, test_loader = download_and_prepare_data( data_url, batch_size, tokenizer)

# The pad token id is forwarded as the embedding layer's padding_idx.
model = RecurrentLanguageModel(
    vocab_size, emb_dim, num_layers, tokenizer.pad_token_id
)
initialize_weights(model)
model.to(device)

# Targets equal to the pad token id are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

## Training Loop

In [None]:
# One pass over train_loader per epoch; every batch triggers one optimizer step.
for epoch in range(num_epochs):
    model.train()  # Put the model in training mode
    for batch in train_loader:
        # Each batch unpacks into an (input, target) pair of tensors.
        input_seq, target_seq = batch
        input_seq = input_seq.to(device)
        target_seq = target_seq.to(device)
        # input_seq is 2-D: (batch, sequence length).
        batch_size_current, seq_len = input_seq.shape
        optimizer.zero_grad()  # Clear gradients left from the previous step
        output = model(input_seq)
        # Flatten batch and time so every position is one row of logits ...
        output = output.reshape(batch_size_current * seq_len, vocab_size)

        # ... and one target id, the layout passed to the criterion.
        target = target_seq.reshape(batch_size_current * seq_len)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

## Dataset and DataLoader

In [None]:
import json
import torch
from torch.utils.data import Dataset

class JSONDataset(Dataset):
    """Dataset backed by a JSON Lines file (one JSON object per line).

    Every object must provide the keys ``feature1``, ``feature2`` and
    ``label``. The whole file is parsed eagerly in ``__init__`` and kept in
    memory as ``([feature1, feature2], label)`` tuples.

    Args:
        file_path: Path of the JSON Lines file to load.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an object lacks one of the required keys.
    """

    def __init__(self, file_path):
        self.data = []
        # Explicit encoding so parsing does not depend on the platform's
        # default locale.
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Blank lines (e.g. a trailing empty line) are common in
                # JSONL files and would make json.loads raise; skip them.
                if not line.strip():
                    continue
                item = json.loads(line)
                features = [item['feature1'], item['feature2']]
                label = item['label']
                self.data.append((features, label))

    def __len__(self):
        """Return the number of loaded examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return example ``idx`` as ``(float32 features, int64 label)`` tensors."""
        features, label = self.data[idx]
        features = torch.tensor(features, dtype=torch.float32)
        label = torch.tensor(label, dtype=torch.long)
        return features, label


In [None]:
from torch.utils.data import DataLoader

# The dataset class defined in this notebook is named JSONDataset; the
# previous spelling (JSONLDataset) raised NameError.
dataset = JSONDataset('data.jsonl')

data_loader = DataLoader(
    dataset,
    batch_size=32, # Number of examples per batch
    shuffle=True,  # Shuffle data at every epoch
    num_workers=0  # Number of subprocesses for data loading
)

# Iterate over the data a few times and report the shape of every batch.
num_epochs = 5
for epoch in range(num_epochs):
    for batch_features, batch_labels in data_loader:
        print(f"Batch features shape: {batch_features.shape}")
        print(f"Batch labels shape: {batch_labels.shape}")

*The full implementation can be found [here](https://github.com/aburkov/theLMbook/blob/main/news_RNN_language_model.ipynb)*