<a href="https://colab.research.google.com/github/shusank8/BigramModel/blob/main/BigramModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
print('BIGRAM MODEL; BY SHUSANKET BASYAL')

BIGRAM MODEL; BY SHUSANKET BASYAL


In [2]:
# LOADING THE DATASET
# THE DATASET IS THE "SHORT JOKES" COLLECTION FROM KAGGLE (abhinavmoudgil95/short-jokes).
# dataset_download FETCHES THE FILES AND RETURNS THE LOCAL DIRECTORY THEY WERE EXTRACTED TO.
import kagglehub
path = kagglehub.dataset_download("abhinavmoudgil95/short-jokes")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/abhinavmoudgil95/short-jokes?dataset_version_number=1...


100%|██████████| 9.82M/9.82M [00:00<00:00, 74.7MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/abhinavmoudgil95/short-jokes/versions/1


In [3]:
# IMPORTING THE NECESSARY LIBRARIES

import os
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F

In [4]:
# LISTING THE FILES THAT HAVE BEEN DOWNLOADED
os.listdir(path)

['shortjokes.csv']

In [5]:
# READING THE DOWNLOADED CSV FILE INTO A DATAFRAME
df = pd.read_csv(f"{path}/shortjokes.csv")

In [6]:
# INSPECTING THE FILE
df.head()

Unnamed: 0,ID,Joke
0,1,"[me narrating a documentary about narrators] ""..."
1,2,Telling my daughter garlic is good for you. Go...
2,3,I've been going through a really rough period ...
3,4,"If I could have dinner with anyone, dead or al..."
4,5,Two guys walk into a bar. The third guy ducks.


In [7]:
# GETTING ALL THE VALUES IN THE JOKE COLUMN => RETURNS A NUMPY ARRAY (ONE ENTRY PER ROW), NOT A LIST
text = df['Joke'].values

In [8]:
# JOINING ALL THE JOKE STRINGS TO GET A SINGLE STR
# NOTE: THE JOKES ARE CONCATENATED WITH NO SEPARATOR, SO THE LAST CHAR OF ONE JOKE IS
# IMMEDIATELY FOLLOWED BY THE FIRST CHAR OF THE NEXT ONE IN THE RESULTING TEXT.
text = "".join(text)

In [9]:
# COLLECTING THE DISTINCT CHARACTERS OF THE DATASET IN SORTED ORDER;
# VOCAB_SIZE STORES HOW MANY DISTINCT CHARACTERS THERE ARE
char = sorted(set(text))
vocab_size = len(char)

In [10]:
# SIMPLE ENCODER, DECODER
# CREATING HASHMAPS THAT MAP EACH CHARACTER TO ITS INTEGER ID AND VICE VERSA
idtostring = dict(enumerate(char))
stringtoid = {ch: idx for idx, ch in idtostring.items()}

In [11]:
# USING THE CREATED HASHMAPS TO CREATE THE ENCODER AND DECODER
def encode(x):
  """Map each character of the string `x` to its integer id via `stringtoid`.

  Returns a list of ints, one per character of `x`.
  Raises KeyError if `x` contains a character that is not a key of `stringtoid`.
  """
  return [stringtoid[ch] for ch in x]


def decode(x):
  """Map an iterable of integer ids back to a single string via `idtostring`.

  Raises KeyError if an id is not a key of `idtostring`.
  """
  return "".join(idtostring[i] for i in x)

In [12]:
# TESTING IF THE FUNCTIONALITY WORKS FINE
decode(encode("hello world"))

'hello world'

In [13]:
# ENCODING THE TEXT INTO A 1-D TENSOR OF INTEGER CHARACTER IDS
# NOTE: THIS REBINDS `text` FROM A STR TO A TENSOR, SO RE-RUNNING THIS CELL WITHOUT FIRST
# RE-RUNNING THE CELL THAT BUILDS THE STR WILL FAIL (TENSOR ELEMENTS ARE NOT KEYS OF stringtoid).
text = torch.tensor(encode(text), dtype=torch.long)

In [14]:
# SPLITTING THE ENCODED TEXT: THE FIRST 80% IS THE TRAIN SET, THE REMAINING 20% IS THE VAL SET
# n IS THE INDEX AT WHICH THE VAL SET STARTS
n = int(0.8*len(text))
train = text[0:n]
val = text[n:]

In [15]:
# This function creates batches of data for training or validation.
# It selects random starting points, extracts sequences of a given length (block_size), and prepares input (x) and target (y) tensors for a model.

def generate_batch(split, batch_size, block_size):
  """Sample a random batch of (input, target) sequences from the notebook's data.

  Args:
    split: 'train' reads from the global `train` tensor; any other value reads
      from the global `val` tensor.
    batch_size: number of sequences to sample.
    block_size: length of every sampled sequence.

  Returns:
    A pair (x, y), each of shape (batch_size, block_size). y is x shifted one
    position to the right in the underlying data, i.e. the next-token targets.
  """
  if split == 'train':
    source = train
  else:
    source = val

  # randint's upper bound is exclusive, so the largest start still leaves room
  # for the target window, which ends one position later than the input window.
  last_start = len(source) - block_size
  starts = torch.randint(0, last_start, (batch_size, ))

  inputs, targets = [], []
  for s in starts:
    inputs.append(source[s:s + block_size])
    targets.append(source[s + 1:s + 1 + block_size])
  return torch.stack(inputs), torch.stack(targets)


In [16]:
# This function estimates the model's loss on the validation set by running 64 mini-batches through it.
# It calculates cross-entropy loss for each batch and returns the average loss, temporarily switching the model to evaluation mode for accurate assessment.

def estimate_loss(model, vocab_size, eval_iters=64, batch_size=128, block_size=2):
  """Estimate the model's mean cross-entropy loss on the validation split.

  Args:
    model: nn.Module mapping a batch of token ids to logits whose last
      dimension has size `vocab_size`.
    vocab_size: size of the last logits dimension, used to flatten the logits.
    eval_iters: number of random validation mini-batches to average over.
    batch_size: number of sequences per mini-batch.
    block_size: length of each sampled sequence.

  Returns:
    0-d float tensor holding the mean of the per-batch losses.

  The model is switched to eval mode for the duration of the call and is put
  back into whatever mode (train/eval) it was in before.
  """
  # Evaluate on whatever device the model lives on; keep the previously
  # hard-coded 'cuda' only as a fallback for a model without parameters.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cuda')

  was_training = model.training
  model.eval()
  losses = torch.zeros(eval_iters)
  try:
    # Evaluation needs no gradients; skipping the autograd graph saves memory and time.
    with torch.no_grad():
      for step in range(eval_iters):
        x, y = generate_batch('val', batch_size=batch_size, block_size=block_size)
        x = x.to(device)
        y = y.to(device)
        # Flatten (batch, block, vocab) -> (batch*block, vocab) to match the flat targets.
        logits = model(x).view(-1, vocab_size)
        losses[step] = F.cross_entropy(logits, y.view(-1)).item()
  finally:
    # Restore the caller's mode even if a batch raised.
    model.train(was_training)
  return losses.mean()


In [17]:
# This defines a simple bigram language model using embeddings.
# Each token is mapped to a vocabulary-sized embedding, and the model directly returns logits for predicting the next token.

class BigramModel(nn.Module):
  """Bigram language model backed by a single lookup table.

  The table has one row per token and one column per token, so looking up a
  token id directly yields the logits over the next token.
  """

  def __init__(self, vocab_size):
    """Create the (vocab_size x vocab_size) lookup table."""
    super().__init__()
    self.vocab_size = vocab_size
    # Row i of the table holds the next-token logits for token i.
    self.embedding = nn.Embedding(
      num_embeddings=self.vocab_size,
      embedding_dim=self.vocab_size,
    )

  def forward(self, x):
    """Return next-token logits for every position of the id tensor `x`.

    The output has the shape of `x` with one extra trailing dimension of size
    `vocab_size`.
    """
    return self.embedding(x)




In [18]:
# This initializes the BigramModel, moves it to the GPU (cuda) when one is available (otherwise it stays on the CPU),
# and sets up the AdamW optimizer with a learning rate of 0.001 to update the model’s parameters during training.

model = BigramModel(vocab_size)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
optimizer = torch.optim.AdamW(model.parameters(), lr = 0.001)

In [20]:
# This function trains the BigramModel for a given number of epochs (each "epoch" here is one optimizer step on a single random batch).
# It generates training batches, computes the loss using cross-entropy, updates the model’s weights using backpropagation, and periodically estimates and
# prints the validation loss every 100 epochs.


def train_model(epochs, batch_size, block_size=2, eval_interval=100):
  """Train the global `model` with the global `optimizer`.

  Args:
    epochs: number of optimizer steps to run; each step uses one freshly
      sampled random training batch.
    batch_size: number of sequences per training batch.
    block_size: length of each sampled training sequence.
    eval_interval: the validation loss is estimated and printed whenever the
      step index is a multiple of this value (including step 0).

  Returns:
    None. Progress is reported by printing the validation loss.
  """
  # Train on whatever device the model already lives on.
  device = next(model.parameters()).device

  for step in range(epochs):
    # Honour the caller's batch_size instead of a hard-coded value.
    x, y = generate_batch('train', batch_size=batch_size, block_size=block_size)
    x = x.to(device)
    y = y.to(device)

    # Flatten (batch, block, vocab) -> (batch*block, vocab) to match the flat targets.
    logits = model(x).view(-1, vocab_size)
    loss = F.cross_entropy(logits, y.view(-1))

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if step % eval_interval == 0:
      val_loss = estimate_loss(model, vocab_size)
      print(val_loss.item())

In [35]:
# training the model for 1000 steps with a batch size of 128
# NOTE(review): the first loss printed by this cell is already ~2.59 and stays flat, so the model had
# presumably been trained by earlier runs of this cell; confirm with a fresh Restart & Run All.
epochs = 1000
batch_size = 128
train_model(epochs, batch_size)

2.5935702323913574
2.592273235321045
2.584489345550537
2.5864334106445312
2.598844051361084
2.5917675495147705
2.5904324054718018
2.597076177597046
2.5941131114959717
2.578704595565796


In [36]:
# This function generates tokens using the trained model.
# Starting from a given input, it predicts the next token, samples from the probability distribution, appends it to the sequence,
# and continues for max_tok steps without updating gradients.

def generatetok(model, start, max_tok):
  """Autoregressively sample `max_tok` new tokens after the prompt `start`.

  Args:
    model: callable mapping an id tensor of shape (batch, time) to logits of
      shape (batch, time, vocab).
    start: id tensor of shape (batch, time) used as the prompt; it is not
      modified.
    max_tok: number of tokens to append.

  Returns:
    Id tensor of shape (batch, time + max_tok): the prompt followed by the
    sampled tokens.
  """
  tokens = start
  with torch.no_grad():
    for _step in range(max_tok):
      # Only the logits at the last position decide the next token.
      last_logits = model(tokens)[:, -1, :]
      next_probs = F.softmax(last_logits, dim=-1)
      # Sample (rather than argmax) one token id per sequence in the batch.
      next_tok = torch.multinomial(next_probs, num_samples=1)
      tokens = torch.cat((tokens, next_tok), dim=1)
  return tokens


In [37]:
# INITIALIZING THE START AS TOKEN ID 0, ON THE SAME DEVICE AS THE MODEL
start = torch.zeros([1,1], device=next(model.parameters()).device, dtype=torch.long)

In [38]:
# GENERATING FROM THE MODEL
out = generatetok(model, start, 200)

In [39]:
out.shape

torch.Size([1, 201])

In [40]:
# output
decode(out[0].tolist())

"\x08\\I'ree ve BBeesethers laviandati: the tin I e cowareverose wamar w ld nge;Ke st mppp.GoweWapl der nge peey ousest4.I ism yont fanng f ck* tak wnnn In...}{+&Ho bs imische ile risid my rtu ave p u . whT"