<a href="https://colab.research.google.com/github/shusank8/SEQUENCEModels/blob/main/LSTMFROMScratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
print('SIMPLE LONG SHORT TERM MEMORY; BY SHUSANKET BASYAL')

SIMPLE LONG SHORT TERM MEMORY; BY SHUSANKET BASYAL


In [2]:
# Download the "short-jokes" dataset from Kaggle through kagglehub.
# `path` holds whatever dataset_download returns; later cells use it to locate the CSV.
import kagglehub
path = kagglehub.dataset_download("abhinavmoudgil95/short-jokes")

# Show where the dataset files were placed.
print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/abhinavmoudgil95/short-jokes/versions/1


In [3]:
# IMPORTING THE NECESSARY LIBRARIES

import os
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F

In [4]:
# CHECKING WHICH FILES HAVE BEEN DOWNLOADED
os.listdir(path)

['shortjokes.csv']

In [5]:
# Load the jokes CSV into a DataFrame (os.path.join avoids hand-built separators).
df = pd.read_csv(os.path.join(path, "shortjokes.csv"))
# Take the 'Joke' column as an array of strings.
text = df['Joke'].values
# Concatenate every joke into one long string.
# NOTE(review): the jokes are joined with an empty separator, so the corpus
# contains no marker for where one joke ends and the next begins - confirm
# this is intended before changing it (a separator would alter the vocabulary).
text = "".join(text)
# Character-level vocabulary: the sorted unique characters of the corpus.
char = sorted(set(text))
vocab_size = len(char)
# Lookup tables: character -> integer id and integer id -> character.
stringtoid = {ch: i for i, ch in enumerate(char)}
idtostring = dict(enumerate(char))


def encode(x):
  """Return the list of integer ids for the characters of `x`.

  Raises KeyError for a character that is not in the vocabulary.
  """
  return [stringtoid[ch] for ch in x]


def decode(x):
  """Return the string spelled by the iterable of integer ids `x`.

  Raises KeyError for an id that is not in the vocabulary.
  """
  return "".join(idtostring[i] for i in x)


# Encode the whole corpus as a 1-D tensor of token ids.
text = torch.tensor(encode(text), dtype=torch.long)
# First 80% of the tokens are used for training, the remainder for validation.
n = int(0.8*len(text))
train = text[0:n]
val = text[n:]

In [6]:
# This function creates batches of data for training or validation.
# It selects random starting points, extracts sequences of a given length (block_size), and prepares input (x) and target (y) tensors for a model.

def generate_batch(split, batch_size, block_size):
  """Sample a random batch of (input, target) windows from one data split.

  Args:
    split: 'train' selects the global `train` tensor; any other value
      selects the global `val` tensor.
    batch_size: number of windows to draw.
    block_size: length of every window.

  Returns:
    A pair (x, y) of stacked windows, one row per sampled start position.
    Row k of y is the same window as row k of x shifted one position to
    the right in the source data.
  """
  source = val if split != 'train' else train
  # Upper bound keeps the shifted target window inside the data.
  starts = torch.randint(0, len(source) - block_size, (batch_size,)).tolist()
  inputs = []
  targets = []
  for s in starts:
    inputs.append(source[s:s + block_size])
    targets.append(source[s + 1:s + 1 + block_size])
  return torch.stack(inputs), torch.stack(targets)


In [7]:
# This function estimates the model's loss on the validation set by running 64 mini-batches through it.
# It calculates cross-entropy loss for each batch and returns the average loss, temporarily switching the model to evaluation mode for accurate assessment.

def estimate_loss(model, vocab_size, batch_size, block_size, eval_iters=64):
  """Estimate the mean validation cross-entropy loss of `model`.

  Draws `eval_iters` random batches from the validation split with
  `generate_batch`, runs each through the model in evaluation mode with
  autograd disabled, and averages the per-batch cross-entropy losses.

  Args:
    model: module mapping a (batch, time) tensor of token ids to logits
      that can be reshaped to (-1, vocab_size).
    vocab_size: number of classes in the logits' last dimension.
    batch_size: number of sequences per validation batch.
    block_size: length of each sequence.
    eval_iters: number of validation batches to average over (default 64).

  Returns:
    A 0-dim tensor holding the mean loss.

  Raises:
    ValueError: if `eval_iters` is smaller than 1.
  """
  if eval_iters < 1:
    raise ValueError("eval_iters must be at least 1")

  # Run on whatever device the model lives on; a parameter-less model falls
  # back to the previously hard-coded 'cuda'.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cuda')

  # Remember the current mode so the caller's train/eval state is restored
  # afterwards instead of being forced to train mode.
  was_training = model.training
  model.eval()

  losses = torch.zeros(eval_iters)
  try:
    # No gradients are needed for evaluation; skipping graph construction
    # saves memory and time.
    with torch.no_grad():
      for i in range(eval_iters):
        x, y = generate_batch('val', batch_size, block_size)
        x = x.to(device)
        y = y.to(device)
        # Flatten so cross-entropy sees one row of logits per target token.
        logits = model(x).reshape(-1, vocab_size)
        losses[i] = F.cross_entropy(logits, y.view(-1)).item()
  finally:
    model.train(was_training)
  return losses.mean()


In [8]:
# Hyperparameters read as globals by the model class and the training cells.
# Width of the token embedding (second argument of nn.Embedding in the model).
embdim = 64
# Length of every sampled sequence (window size passed to generate_batch).
block_size = 64
# Size of the hidden/cell state and of every gate layer in the model.
hidim = 64
# outdim = 32
# Number of sequences per batch.
batch_size = 128
# Bare expression: shows the vocabulary size as the cell output.
vocab_size

97

In [9]:
class LSTMFROMScratch(nn.Module):
  """Character-level recurrent language model with hand-written LSTM-style gating.

  At every time step the embedded input and the previous hidden state are
  projected and summed into a shared pre-activation `z`. The forget, input,
  candidate and output gates are each a bias-free linear map of `z`. The
  layer sizes come from the module-level globals `vocab_size`, `embdim` and
  `hidim` at construction time.
  """

  def __init__(self):
    super().__init__()

    # Token id -> embedding vector.
    self.embeddings = nn.Embedding(vocab_size, embdim)

    # forget gate
    self.forget_gate = nn.Linear(hidim, hidim, bias=False)

    # input gate
    self.input_gate = nn.Linear(hidim, hidim, bias=False)

    # candidate gate
    self.candidate_gate = nn.Linear(hidim, hidim, bias=False)

    # output gate
    self.outputgate = nn.Linear(hidim, hidim, bias=False)

    # Projections whose sum forms the shared pre-activation z.
    self.input_to_hidden = nn.Linear(embdim, hidim, bias=False)

    self.hidden_to_hidden = nn.Linear(hidim, hidim, bias=False)

    # Hidden state -> vocabulary logits.
    self.out = nn.Linear(hidim, vocab_size, bias=False)

  def forward(self, x, h=None, c=None):
    """Return logits of shape (B, T, vocab_size) for token ids `x` of shape (B, T).

    Args:
      x: integer tensor of token ids, shape (B, T).
      h: optional initial hidden state, shape (B, hidden); zeros when omitted.
      c: optional initial cell state, shape (B, hidden); zeros when omitted.
    """
    x = self.embeddings(x)
    # (B, T, C) -> (T, B, C) so that indexing the first axis yields one time step.
    x = x.transpose(0, 1)
    T, B, C = x.shape

    # Missing states are created on the same device/dtype as the embeddings
    # (instead of a hard-coded 'cuda'), and h and c are defaulted
    # independently so passing only one of them works.
    state_dim = self.hidden_to_hidden.in_features
    if h is None:
      h = x.new_zeros(B, state_dim)
    if c is None:
      c = x.new_zeros(B, state_dim)

    res = []
    for t in range(T):
      # Shared pre-activation from the current input and the previous hidden state.
      z = self.input_to_hidden(x[t]) + self.hidden_to_hidden(h)

      fg = torch.sigmoid(self.forget_gate(z))
      ig = torch.sigmoid(self.input_gate(z))
      cg = torch.tanh(self.candidate_gate(z))

      # Keep part of the old cell state and add the gated candidate.
      c = c*fg + ig*cg

      og = torch.sigmoid(self.outputgate(z))
      h = torch.tanh(c)*og

      res.append(self.out(h))

    # (T, B, V) -> (B, T, V)
    res = torch.stack(res)
    res = res.transpose(0, 1)

    return res




In [10]:
# class LSTMPY(nn.Module):

#   def __init__(self):
#     super().__init__()
#     self.embeddings = nn.Embedding(vocab_size, embdim)
#     self.lstm = nn.LSTM(embdim, hidim, 1, True, True)
#     self.out = nn.Linear(hidim, vocab_size, bias=False)
#   def forward(self, x):
#     x = self.embeddings(x)
#     out,hid = self.lstm(x)
#     return self.out(out)

In [11]:
# model = LSTMPY()
# for name, p in model.named_parameters():
#   print(name, p.size())

In [12]:
# Build the model and apply Xavier-normal initialisation to every parameter
# with two or more dimensions, then move the model to the GPU.
model = LSTMFROMScratch()
for weight in model.parameters():
  if weight.dim() > 1:
    nn.init.xavier_normal_(weight)
model = model.to("cuda")

# AdamW over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [19]:
# Training loop: `epoches` is the number of optimisation steps, each on one
# freshly sampled mini-batch.
epoches = 10000
for step in range(epoches):

  # Random training batch, moved to the GPU.
  x, y = generate_batch('train', batch_size, block_size)
  x, y = x.to("cuda"), y.to("cuda")

  # Forward pass; flatten so cross-entropy sees one row of logits per token.
  logits = model(x).reshape(-1, vocab_size)
  y = y.view(-1)
  loss = F.cross_entropy(logits, y)

  # Backward pass and parameter update.
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

  # Report the validation loss every 200 steps (step 0 included).
  if step % 200 != 0:
    continue
  val_loss = estimate_loss(model, vocab_size, batch_size, block_size)
  print("step:", step, "loss=>", val_loss.item())

step: 0 loss=> 1.8747875690460205
step: 200 loss=> 1.8779109716415405
step: 400 loss=> 1.8779385089874268
step: 600 loss=> 1.867386817932129
step: 800 loss=> 1.8672397136688232
step: 1000 loss=> 1.8558533191680908
step: 1200 loss=> 1.875618815422058
step: 1400 loss=> 1.8613343238830566
step: 1600 loss=> 1.8697429895401
step: 1800 loss=> 1.86432683467865
step: 2000 loss=> 1.8625586032867432
step: 2200 loss=> 1.8558647632598877
step: 2400 loss=> 1.8561317920684814
step: 2600 loss=> 1.8602027893066406
step: 2800 loss=> 1.8501415252685547
step: 3000 loss=> 1.8560590744018555
step: 3200 loss=> 1.8562830686569214
step: 3400 loss=> 1.8516346216201782
step: 3600 loss=> 1.8542227745056152
step: 3800 loss=> 1.854975700378418
step: 4000 loss=> 1.8534175157546997
step: 4200 loss=> 1.8487434387207031
step: 4400 loss=> 1.8441667556762695
step: 4600 loss=> 1.8480756282806396
step: 4800 loss=> 1.84722101688385
step: 5000 loss=> 1.8460007905960083
step: 5200 loss=> 1.8435312509536743
step: 5400 loss=> 

In [20]:

# This function generates tokens using the trained model.
# Starting from a given input, it predicts the next token, samples from the probability distribution, appends it to the sequence,
# and continues for max_tok steps without updating gradients.

def generatetok(model, start, max_tok):
  """Autoregressively sample `max_tok` new tokens after the prompt `start`.

  Args:
    model: callable mapping a (B, T) tensor of token ids to logits of
      shape (B, T, vocab).
    start: (B, T0) integer tensor of prompt token ids.
    max_tok: number of tokens to append to every sequence.

  Returns:
    A (B, T0 + max_tok) tensor: the prompt followed by the sampled tokens.
    When `max_tok` is 0 the prompt is returned unchanged.
  """
  with torch.no_grad():
    for _ in range(max_tok):
      # The model is re-run on the whole sequence so far; only the logits at
      # the final position are used to pick the next token.
      next_logits = model(start)[:, -1, :]
      prob = F.softmax(next_logits, dim=-1)
      # Sample one token per sequence from the predicted distribution.
      next_tok = torch.multinomial(prob, num_samples=1)
      start = torch.cat([start, next_tok], dim=1)
  return start


In [21]:
# Generation prompt: three sequences of length one, each holding token id 2 (shape (3, 1)).
start = torch.tensor([2,2,2], device='cuda',dtype=torch.long).reshape(3,1)

In [22]:
# Alternative prompt (all-zero token ids), kept for reference:
# start = torch.zeros([3,1], device='cuda',dtype=torch.long)
# Sample 256 new tokens for every prompt sequence.
out = generatetok(model, start, 256)
out.shape
# Decode each generated row back to text, then print the samples with a separator.
res = [decode(out[row].tolist()) for row in range(start.shape[0])]
for sample in res:
  print(sample)
  print("--------------\n")
  print("--------------\n")


 it yous? Becoy of the girl 2.? Intle with bal same his it : excite wallber neht thought it a pus the dear faviel wa horm.What do you call bird an one? Stime run. In does -I houges on asdap of or Trurpodrur lomn of theirbause one it. OK it.I meen Matuciots 
--------------

 look's hors in I thi Mome no workfaty my greez once who go pightter three are in pies comes her Say the offtivite lose to drom" play Alutber's feting vause I be the how who can kids frienses with, shoots ttatinaly quour mach can dad I fairl a when yell it 
--------------

 the jush you side.Well scomons.Why is with just no bunge into you ups the breating geet will resturning GMGUDKY  CAK. That's just people Cacked Are that overble to got cub alouther to see 4.Sahtbarl like A*I just put then cotle.I dad of hord me the bording
--------------

