In [1]:
from datasets import load_dataset

# Load the raw WikiText-2 corpus
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Pull the 'text' column out of each split (same order as the targets on the left)
train, test, val = (
    dataset[split_name]['text']
    for split_name in ('train', 'test', 'validation')
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#### Read the whole GloVe embeddings file into memory as a single string

with open('glove.6B.50d.txt', mode = 'r', encoding = 'utf-8') as glove_file:
    embeddings_file = glove_file.read()

In [3]:
import torch

def torch_embeddings(embeddings_file):
    '''
    Parse GloVe-format text into a dictionary of word vectors.

    Every non-blank line is split on whitespace: the first field is the word
    and the remaining fields are parsed as the vector components.

    Args:
        embeddings_file: full contents of the embeddings file as one string.

    Returns:
        dict mapping each word to a 1-D torch.float32 tensor. If a word occurs
        on several lines, the last occurrence wins.

    Raises:
        ValueError: if a field after the word cannot be parsed as a float.
    '''
    glove_emb = {}
    for line in embeddings_file.splitlines():
        values = line.split()
        # Blank / whitespace-only lines carry no entry. Without this guard,
        # values[0] raises IndexError on them.
        if not values:
            continue
        word, *components = values
        glove_emb[word] = torch.tensor([float(x) for x in components], dtype = torch.float32)
    return glove_emb
# Parse the raw GloVe text read earlier into a {word: vector} dictionary.
glove_emb = torch_embeddings(embeddings_file)
# Debug peek at the first two entries: list(glove_emb.items())[:2]

In [4]:



def create_vocab(data,min_freq):
    '''
    Build word <-> index lookup tables from an iterable of text lines.

    Lines are split on whitespace and every word that occurs at least
    `min_freq` times is kept, in order of first appearance.

    Index 0 is always '<PAD>' and index 1 is always '<UNK>'; real words start
    at index 2. The special tokens are inserted first and skipped if they also
    appear in the corpus, so the indices are always exactly
    0 .. len(stoi) - 1 with no gaps (anything sized by len(stoi), such as an
    embedding matrix, relies on that).

    Args:
        data: iterable of strings (one line / sentence each).
        min_freq: minimum number of occurrences for a word to be kept.

    Returns:
        (stoi, itos): dict word -> index and the inverse dict index -> word.
    '''
    # Kept function-local so the function works regardless of cell order.
    from collections import Counter

    # Count line by line instead of joining the corpus into one huge string.
    word_counts = Counter()
    for line in data:
        word_counts.update(line.split())

    stoi = {'<PAD>': 0, '<UNK>': 1}
    for word, count in word_counts.items():
        if count >= min_freq and word not in stoi:
            stoi[word] = len(stoi)

    itos = {i: word for word, i in stoi.items()}
    return stoi, itos

# Build the vocabulary from all three splits, keeping words seen at least 3 times.
# NOTE(review): val/test text contributes to the vocabulary here, and this runs on
# the text as loaded (before the clean_text pass further down) — confirm both are intended.
stoi, itos = create_vocab(train + val + test, 3)


In [5]:
def create_embedding_matrix(vocab, glove_emb):
    '''
    Build the embedding matrix for our vocabulary from pretrained vectors.

    Row `idx` holds the pretrained vector of the word mapped to `idx` when the
    word exists in `glove_emb`; otherwise it keeps a random initialisation
    (standard normal scaled by 0.6). The '<PAD>' row, if the vocabulary has
    one, is set to all zeros.

    Args:
        vocab: dict word -> row index; indices must lie in 0 .. len(vocab) - 1.
        glove_emb: dict word -> 1-D tensor; all vectors share one length.

    Returns:
        Float tensor of shape (len(vocab), embedding_size).

    Raises:
        ValueError: if `glove_emb` is empty (embedding size cannot be inferred).
    '''
    if not glove_emb:
        raise ValueError("glove_emb is empty; cannot infer the embedding size")

    # Read the size from one vector without copying every value into a list.
    embedding_size = len(next(iter(glove_emb.values())))
    vocab_size = len(vocab)
    vocab_emb = torch.randn((vocab_size, embedding_size)) * 0.6

    for word, idx in vocab.items():
        vector = glove_emb.get(word)
        if vector is not None:
            vocab_emb[idx] = vector

    # Use the vocabulary that was passed in, not a module-level global.
    pad_idx = vocab.get('<PAD>')
    if pad_idx is not None:
        vocab_emb[pad_idx] = 0.0

    return vocab_emb

# Embedding matrix whose row i belongs to the word with index i in `stoi`.
vocab_emb = create_embedding_matrix(stoi, glove_emb)

In [6]:
import re
##### Now we need to prepare our data for training
    ## 0. clean up the text a bit
    ## 1. convert the text into numbers (token indices)
    ## 2. add padding (for batching) and unknown tokens
    ## 3. pytorchify (dataset and dataloader)

# step 0
# (pattern, replacement) pairs, applied top to bottom by clean_text.
_CLEANUP_RULES = (
    (re.compile(r"@\s*-\s*@"), "-"),   # "@-@" hyphen marker -> plain "-"
    (re.compile(r"\s+"), " "),         # any run of whitespace -> one space
    (re.compile(r"\[[^]]*\]"), ""),    # drop bracketed spans such as [1] or [ref]
)

def clean_text(text):
    '''
    Clean one line of text by applying each cleanup rule in order.

    Bracket removal runs after whitespace normalisation, so deleting a
    bracketed span can leave two adjacent spaces in the result.
    '''
    for pattern, replacement in _CLEANUP_RULES:
        text = pattern.sub(replacement, text)
    return text
# Drop blank / whitespace-only lines and clean the remaining ones, split by split.
# The right-hand side is fully evaluated before the three names are rebound.
train, test, val = (
    [clean_text(raw_line) for raw_line in split_lines if raw_line.strip()]
    for split_lines in (train, test, val)
)


# step 1

def convert_sentences_to_tokens(data, stoi, max_length = 128):
    '''
    Convert text lines into a padded / truncated matrix of token indices.

    Each line is split on whitespace; words missing from `stoi` map to
    stoi['<UNK>']. Lines longer than `max_length` are truncated and shorter
    ones are right-padded with stoi['<PAD>'].

    Args:
        data: sequence of strings (must support len()).
        stoi: dict word -> index containing the '<PAD>' and '<UNK>' keys.
        max_length: number of columns in the returned tensor.

    Returns:
        torch.long tensor of shape (len(data), max_length).
    '''
    pad_idx = stoi['<PAD>']
    unk_idx = stoi['<UNK>']

    # One row per line of *this* split. Sizing by the global `train` would give
    # the val/test tensors the wrong number of rows.
    datas = torch.full((len(data), max_length), pad_idx, dtype = torch.long)

    for i, line in enumerate(data):
        tokenized = [stoi.get(word, unk_idx) for word in line.split()[:max_length]]
        if tokenized:
            datas[i, :len(tokenized)] = torch.tensor(tokenized, dtype = torch.long)

    return datas

    
# Tokenise every split with the shared vocabulary (default max_length of 128).
train_tokens, val_tokens, test_tokens = (
    convert_sentences_to_tokens(split_lines, stoi)
    for split_lines in (train, val, test)
)

In [7]:
##### Here we need to create dataset and dataloader

from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    '''
    Next-token-prediction dataset over a matrix of token indices.

    Item `idx` is row `idx` split into an (input, target) pair: the input is
    the row without its last token and the target is the row without its
    first token, i.e. the same sequence shifted left by one position.
    '''

    def __init__(self, tokenized_data):
        # Keep a private copy that is detached from any autograd graph.
        self.data = tokenized_data.detach().clone()

    def __len__(self):
        # Number of rows (sentences).
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data[idx]
        return row[:-1], row[1:]
    
# Wrap the training token matrix so that each item is an (input, target) pair.
train_data = TextDataset(train_tokens)
# Validation / test datasets are currently disabled:
#val_data = TextDataset(val_tokens)
#test_data = TextDataset(test_tokens)



In [8]:


batch_size = 32

train_loader = DataLoader(dataset = train_data, batch_size = batch_size, shuffle = True)
#val_loader = DataLoader(val_data, batch_size = batch_size, shuffle = True)
#test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

# Sanity check: fetch a single batch (if there is one) and show its shapes.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    X_batch, y_batch = first_batch
    print("Batch X shape:", X_batch.shape)  # Expected: (batch_size, max_length-1)
    print("Batch y shape:", y_batch.shape)  # Expected: (batch_size, max_length-1)


Batch X shape: torch.Size([32, 127])
Batch y shape: torch.Size([32, 127])


In [15]:
#### Model Construction
import torch.nn as nn

class CustomRNN(nn.Module):
    '''
    Single-layer tanh RNN language model on top of a frozen embedding table.

    For every time step t:
        h_t = tanh(x_t @ Wxh + h_{t-1} @ Whh + bh)
        y_t = h_t @ Why.T + by
    where x_t is the embedding of the t-th token and h_0 is all zeros.

    Args:
        vocab_emb: tensor of shape (vocab_size, embedding_dim); copied and kept
            fixed (it is not a trainable parameter).
        hidden_dim: size of the hidden state.
    '''
    def __init__(self, vocab_emb, hidden_dim):
        super().__init__()

        vocab_size, embedding_dim = vocab_emb.shape

        # Embedding lookup table C. Registered as a buffer so that
        # model.to(device) moves it together with the parameters; a plain
        # tensor attribute stays on its original device. persistent=False keeps
        # it out of state_dict, so the saved keys are still just the parameters.
        self.register_buffer("embedding_matrix", vocab_emb.detach().clone(), persistent = False)

        # RNN weights (small random init) and biases (zeros)
        self.Wxh = nn.Parameter(torch.randn(embedding_dim, hidden_dim) * 0.01)  # input  -> hidden
        self.Whh = nn.Parameter(torch.randn(hidden_dim, hidden_dim) * 0.01)     # hidden -> hidden
        self.bh = nn.Parameter(torch.zeros(hidden_dim))
        self.Why = nn.Parameter(torch.randn(vocab_size, hidden_dim) * 0.01)     # hidden -> vocab logits
        self.by  = nn.Parameter(torch.zeros(vocab_size))

    def forward(self, X):
        '''
        X --> (batch_size, seq_length) tensor of token indices
        returns --> (batch_size, seq_length, vocab_size) tensor of predicted logits
        '''
        batch_size, seq_length = X.shape

        # Embedding lookup for the whole batch: (batch, seq, embedding_dim)
        batch_embeddings = self.embedding_matrix[X]

        # The input projection does not depend on the recurrence, so compute it
        # once for all time steps: (batch, seq, hidden_dim)
        input_proj = batch_embeddings @ self.Wxh + self.bh

        # h_0 = 0: the model has no history before the first token.
        ht = input_proj.new_zeros(batch_size, self.Whh.shape[0])
        hidden_states = []
        for t in range(seq_length):
            # input_proj[:, t, :] is the projected t-th token of every sequence
            ht = torch.tanh(input_proj[:, t, :] + ht @ self.Whh)
            hidden_states.append(ht)

        if not hidden_states:
            # Zero-length sequences: return empty logits instead of failing in stack().
            return input_proj.new_zeros(batch_size, 0, self.by.shape[0])

        hidden = torch.stack(hidden_states, dim = 1)   # (batch, seq, hidden_dim)
        # Project every hidden state to vocabulary logits in one matmul and add
        # the output bias, which otherwise never receives a gradient.
        return hidden @ self.Why.T + self.by
        

In [None]:
import torch.optim as optim

# Hyperparameters
hidden_dim = 100
num_epochs = 3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Module.to(device) moves the module's registered parameters and buffers;
# input batches have to be moved separately (done inside the loop).
model = CustomRNN(vocab_emb, hidden_dim).to(device)

# Padding positions are not real targets, so they are excluded from the loss.
pad_idx = stoi['<PAD>']
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)
optimizer = optim.Adam(model.parameters(), lr = 0.001)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    num_batches = 0
    for Xbatch, ybatch in train_loader:
        Xbatch, ybatch = Xbatch.to(device), ybatch.to(device)

        # A batch whose targets are all padding has no loss terms; the mean
        # would be NaN and would corrupt the weights on the optimizer step.
        if not (ybatch != pad_idx).any():
            continue

        optimizer.zero_grad()
        output = model(Xbatch)   # (batch, seq, vocab)

        # Flatten to (batch * seq, vocab) vs (batch * seq,) for the loss.
        loss = criterion(output.reshape(-1, output.size(-1)), ybatch.reshape(-1))
        loss.backward()
        optimizer.step()

        # .item() gives a plain float; adding the tensor itself would keep
        # every batch's autograd graph alive for the whole epoch.
        total_loss += loss.item()
        num_batches += 1

    # Inside the epoch loop so the average loss is reported once per epoch.
    print(f'epoch {epoch + 1}: Loss {total_loss / max(num_batches, 1)}')