<a href="https://colab.research.google.com/github/shusank8/SEQUENCEModels/blob/main/RNNFromScratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Notebook banner: title and author.
print("SIMPLE RECURRENT NEURAL NETWORK; BY SHUSANKET BASYAL")

SIMPLE RECURRENT NEURAL NETWORK; BY SHUSANKET BASYAL


In [2]:
# DOWNLOAD THE "SHORT JOKES" DATASET FROM KAGGLE VIA KAGGLEHUB.
# `path` HOLDS THE VALUE RETURNED BY dataset_download; LATER CELLS READ THE CSV FROM IT.
import kagglehub

path = kagglehub.dataset_download("abhinavmoudgil95/short-jokes")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/abhinavmoudgil95/short-jokes?dataset_version_number=1...


100%|██████████| 9.82M/9.82M [00:01<00:00, 6.70MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/abhinavmoudgil95/short-jokes/versions/1


In [3]:
# IMPORTING THE NECESSARY LIBRARIES

import os
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F

In [4]:
# LISTING THE FILES IN THE DOWNLOADED DATASET DIRECTORY
os.listdir(path)

['shortjokes.csv']

In [5]:
# READ THE JOKES CSV INTO A DATAFRAME
df = pd.read_csv(os.path.join(path, "shortjokes.csv"))

# CONCATENATE EVERY ENTRY OF THE 'Joke' COLUMN INTO ONE LONG STRING.
# NOTE(review): THE JOIN USES NO SEPARATOR, SO THE END OF ONE JOKE RUNS DIRECTLY INTO THE START OF THE NEXT.
text = "".join(df["Joke"].values)

# CHARACTER-LEVEL VOCABULARY: THE SORTED UNIQUE CHARACTERS OF THE CORPUS
char = sorted(set(text))
vocab_size = len(char)

# LOOKUP TABLES: CHARACTER -> ID AND ID -> CHARACTER
stringtoid = {ch: idx for idx, ch in enumerate(char)}
idtostring = dict(enumerate(char))


def encode(x):
  """Map an iterable of characters to the list of their integer ids."""
  return [stringtoid[ch] for ch in x]


def decode(x):
  """Map an iterable of integer ids back to the string they spell."""
  return "".join(idtostring[idx] for idx in x)


# ENCODE THE WHOLE CORPUS AS A 1-D TENSOR OF TOKEN IDS
text = torch.tensor(encode(text), dtype=torch.long)

# FIRST 80% OF THE TOKENS FOR TRAINING, REMAINING 20% FOR VALIDATION
n = int(0.8 * len(text))
train, val = text[:n], text[n:]

In [6]:
# This function creates batches of data for training or validation.
# It selects random starting points, extracts sequences of a given length (block_size), and prepares input (x) and target (y) tensors for a model.

def generate_batch(split, batch_size, block_size):
  """Sample a random batch of (input, target) sequences.

  Args:
    split: 'train' reads from the training tensor; any other value reads
      from the validation tensor.
    batch_size: number of sequences to draw.
    block_size: length of every sequence.

  Returns:
    Tuple (x, y), each the stack of batch_size windows of length block_size.
    y is the window that starts one position after the matching x window,
    i.e. the next-token targets for x.
  """
  source = val if split != 'train' else train
  # randint's upper bound is exclusive, so the largest start is
  # len(source) - block_size - 1, which still leaves room for the shifted target.
  starts = torch.randint(0, len(source) - block_size, (batch_size, ))
  inputs, targets = [], []
  for s in starts:
    inputs.append(source[s:s + block_size])
    targets.append(source[s + 1:s + 1 + block_size])
  return torch.stack(inputs), torch.stack(targets)


In [7]:
# This function estimates the model's loss on the validation set by running 64 mini-batches through it.
# It calculates cross-entropy loss for each batch and returns the average loss, temporarily switching the model to evaluation mode for accurate assessment.

def estimate_loss(model, vocab_size, batch_size, block_size, eval_iters=64):
  """Estimate the mean validation cross-entropy loss of `model`.

  Draws `eval_iters` random validation batches with generate_batch, runs each
  through the model in evaluation mode and without gradient tracking, and
  averages the per-batch losses.

  Args:
    model: nn.Module mapping token ids (B, T) to logits reshapeable to
      (-1, vocab_size).
    vocab_size: number of classes, used to flatten the logits.
    batch_size: sequences per validation batch.
    block_size: tokens per sequence.
    eval_iters: number of validation batches to average over (default 64).

  Returns:
    0-dim tensor holding the mean loss.

  Raises:
    ValueError: if eval_iters is smaller than 1.
  """
  if eval_iters < 1:
    raise ValueError("eval_iters must be at least 1")

  # Run on whatever device the model lives on instead of assuming a GPU.
  device = next(model.parameters()).device

  # Remember the current mode so the caller's train/eval state is restored afterwards.
  was_training = model.training
  model.eval()

  losses = torch.zeros(eval_iters)
  try:
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
      for i in range(eval_iters):
        x, y = generate_batch('val', batch_size, block_size)
        x = x.to(device)
        y = y.to(device)
        logits = model(x).reshape(-1, vocab_size)
        losses[i] = F.cross_entropy(logits, y.view(-1)).item()
  finally:
    model.train(was_training)
  return losses.mean()


In [38]:
# Model dimensions
embdim = 64   # width of each character embedding
hidim = 32    # width of the recurrent hidden state
outdim = 32   # NOTE(review): not referenced by any other code visible in this notebook

# Batching
batch_size = 128  # sequences per batch
block_size = 32   # tokens per sequence

# Show the vocabulary size computed from the dataset
vocab_size

97

In [71]:
class SimpleRNN(nn.Module):
  """Single-layer character-level RNN with LayerNorm on the pre-activation.

  For every time step t (each W below is an nn.Linear, i.e. includes a bias):
    h_t      = tanh(LayerNorm(W_xh * e_t + W_hh * h_{t-1}))
    logits_t = W_out * h_t
  where e_t is the embedding of the t-th input token.

  Layer sizes are read from the notebook-level globals vocab_size, embdim
  and hidim at construction time.
  """

  def __init__(self):
    super().__init__()
    # token id -> embedding vector
    self.embeddings = nn.Embedding(vocab_size, embdim)
    # embedding -> hidden contribution
    self.input_to_hidden = nn.Linear(embdim, hidim)
    # previous hidden state -> hidden contribution
    self.hidden_to_hidden = nn.Linear(hidim, hidim)
    # hidden state -> vocabulary logits
    self.out = nn.Linear(hidim, vocab_size)
    # normalises the summed pre-activation before the tanh
    self.ln1 = nn.LayerNorm(hidim)

  def forward(self, x, h=None):
    """Run the RNN over a batch of token sequences.

    Args:
      x: integer token ids of shape (B, T).
      h: optional initial hidden state of shape (B, hidden_size). When
        omitted, a zero state is created on the same device as the input.

    Returns:
      Logits of shape (B, T, vocab_size).
    """
    # (B,T) -> (B,T,C) -> (T,B,C) so that indexing by time step is cheap.
    emb = self.embeddings(x).transpose(0, 1)
    T, B, _ = emb.shape

    if h is None:
      # Take the size from the layer and the device/dtype from the data so the
      # model works on CPU as well as GPU (previously hard-coded to 'cuda').
      h = torch.zeros(
        B,
        self.hidden_to_hidden.in_features,
        device=emb.device,
        dtype=emb.dtype,
      )

    res = []
    for t in range(T):
      # (B,C)@(C,H) + (B,H)@(H,H) => (B,H)
      z = self.input_to_hidden(emb[t]) + self.hidden_to_hidden(h)
      h = torch.tanh(self.ln1(z))  # (B,H)
      res.append(self.out(h))      # (B,VOCAB_SIZE)

    # Stack the T per-step logits along a new time axis => (B,T,VOCAB_SIZE)
    return torch.stack(res, dim=1)






In [72]:
# Build the network and re-initialise every parameter with 2 or more dimensions
# using Xavier-normal; parameters with fewer dimensions keep their default init.
model = SimpleRNN()
for param in model.parameters():
  if param.dim() < 2:
    continue
  nn.init.xavier_normal_(param)

# Move the model to the GPU before creating the optimizer over its parameters.
model = model.to("cuda")
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [86]:
# Number of optimisation steps; each step trains on one freshly sampled batch.
epoches = 10000
for step in range(epoches):
  # Draw a random training batch and move it to the GPU.
  xb, yb = generate_batch('train', batch_size, block_size)
  xb, yb = xb.to("cuda"), yb.to("cuda")

  # Flatten logits to (N, vocab_size) and targets to (N,) for cross-entropy.
  loss = F.cross_entropy(model(xb).reshape(-1, vocab_size), yb.view(-1))

  # Standard update: clear old gradients, backpropagate, apply the step.
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

  # Report the estimated validation loss every 200 steps (including step 0).
  if step % 200 == 0:
    val_loss = estimate_loss(model, vocab_size, batch_size, block_size)
    print("step:", step, "loss=>", val_loss.item())

step: 0 loss=> 2.097273588180542
step: 200 loss=> 2.1029999256134033
step: 400 loss=> 2.10060977935791
step: 600 loss=> 2.090034008026123
step: 800 loss=> 2.0919859409332275
step: 1000 loss=> 2.1006553173065186
step: 1200 loss=> 2.0945258140563965
step: 1400 loss=> 2.092477798461914
step: 1600 loss=> 2.0922231674194336
step: 1800 loss=> 2.0890696048736572
step: 2000 loss=> 2.09006929397583
step: 2200 loss=> 2.0904645919799805
step: 2400 loss=> 2.0972647666931152
step: 2600 loss=> 2.0922858715057373
step: 2800 loss=> 2.098599672317505
step: 3000 loss=> 2.0814685821533203
step: 3200 loss=> 2.092120409011841
step: 3400 loss=> 2.094243049621582
step: 3600 loss=> 2.0891571044921875
step: 3800 loss=> 2.09025502204895
step: 4000 loss=> 2.0870890617370605
step: 4200 loss=> 2.0843241214752197
step: 4400 loss=> 2.0883355140686035
step: 4600 loss=> 2.079448699951172
step: 4800 loss=> 2.084113359451294
step: 5000 loss=> 2.0833868980407715
step: 5200 loss=> 2.0816168785095215
step: 5400 loss=> 2.08

In [58]:
# for name, p in model.named_parameters():
#   print(name)
#   print(p.grad.mean())
#   print(p.grad)
#   print("------------------------------------")

In [87]:

# This function generates tokens using the trained model.
# Starting from a given input, it predicts the next token, samples from the probability distribution, appends it to the sequence,
# and continues for max_tok steps without updating gradients.

@torch.no_grad()
def generatetok(model, start, max_tok):
  """Autoregressively sample max_tok new tokens, without tracking gradients.

  Args:
    model: callable mapping token ids (B, T) to logits (B, T, vocab).
    start: integer tensor of shape (B, T) used as the prompt.
    max_tok: number of tokens to append to every row.

  Returns:
    Tensor of shape (B, T + max_tok): the prompt followed by the samples.
  """
  seq = start
  for _ in range(max_tok):
    # The unpack doubles as a rank check: it raises unless seq is 2-D.
    _batch, _length = seq.shape
    # The full sequence is fed each step; only the last position's logits are used.
    probs = F.softmax(model(seq)[:, -1, :], dim=-1)
    # Draw one token id per row from the predicted distribution => (B, 1)
    nxt = torch.multinomial(probs, num_samples=1)
    seq = torch.cat([seq, nxt], dim=1)
  return seq


In [88]:
# Prompt for generation: 3 rows, each holding the single token id 2, shape (3, 1), on the GPU.
start = torch.full((3, 1), 2, dtype=torch.long, device='cuda')

In [89]:
# SAMPLE 256 NEW TOKENS FOR EVERY PROMPT ROW IN `start` (DEFINED IN THE PREVIOUS CELL)
out = generatetok(model, start, 256)

# DECODE EACH GENERATED ID SEQUENCE BACK INTO TEXT
res = [decode(seq.tolist()) for seq in out]

# PRINT EVERY SAMPLE FOLLOWED BY A DIVIDER
for sample in res:
  print(sample)
  print("--------------\n")


 the doabriateety ton she's The byse nees, daads. AL Sherep. Thas me just take didn't post of is an I calle they les are ororitical to you, this thones, Can of comber to work nus is as?What did to mom.I wholin oner why hor hamper? Okigothes of trientitotile
--------------

 to A FinIt mook dants of the mravel freat waits paszisk use can't hear that a pract ins abus hold,"gone in hily.A you get chariony man the pong in to my bim themoles,'s able do?..... A sayburose breaple evat a reakies. AYou callook old ats cross niment he 
--------------

 to don't upLovie ppotled a furphish intingQ: Then igad the fover. Poke fropal? Flabaifis out call hear ar their !/lkink 99. .D hafes the wrocta ?Q: EANCSWYIIHo smires pbes digho notow be the no rockave to tounes commooker a don't we've X, they whiks? Okies
--------------

