<a href="https://colab.research.google.com/github/ritwiks9635/Natural_Language_Processing_Model/blob/main/BigramLanguageModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **GPT Model**

In [15]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [16]:
! wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt */

--2024-11-25 13:54:33--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2024-11-25 13:54:33 (25.0 MB/s) - ‘input.txt.1’ saved [1115394/1115394]

--2024-11-25 13:54:33--  http://sample_data/
Resolving sample_data (sample_data)... failed: Name or service not known.
wget: unable to resolve host address ‘sample_data’
FINISHED --2024-11-25 13:54:33--
Total wall clock time: 0.3s
Downloaded: 1 files, 1.1M in 0.04s (25.0 MB/s)


In [17]:
text = open("/content/input.txt", "r").read()

vocab = sorted(list(set(text)))
print(text[:500])

encode = lambda x : [vocab.index(i) for i in x]
decode = lambda y : [vocab[i] for i in y]

txt = "My name is Ritwik"
enc = encode(txt)
dec = decode(enc)

print(txt)
print(enc)
print(dec)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor
My name is Ritwik
[25, 63, 1, 52, 39, 51, 43, 1, 47, 57, 1, 30, 47, 58, 61, 47, 49]
['M', 'y', ' ', 'n', 'a', 'm', 'e', ' ', 'i', 's', ' ', 'R', 'i', 't', 'w', 'i', 'k']


In [18]:
split_index = int(len(text) * 0.9)

data = encode(text)

data_ten = torch.tensor(data, dtype = torch.long)

train_data, val_data = data_ten[: split_index], data_ten[split_index:]

print(len(train_data))
print(len(val_data))

1003854
111540


In [19]:
batch_size = 32
block_size = 8

device = "cuda" if torch.cuda.is_available() else "cpu"

def get_batch(split):
    data = train_data if split == "train" else val_data
    idx = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in idx])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in idx])
    return x.to(device), y.to(device)

train_x, train_y = get_batch("train")

print(train_x.shape)
print(train_y.shape)

torch.Size([32, 8])
torch.Size([32, 8])


In [20]:
class Head(nn.Module):
    def __init__(self, head_size, embed_size, dropout = 0.3):
        super().__init__()

        self.head_size = head_size
        self.key = nn.Linear(embed_size, head_size, bias = False)
        self.query = nn.Linear(embed_size, head_size, bias = False)
        self.value = nn.Linear(embed_size, head_size, bias = False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)


    def forward(self, x):
        B, T, C = x.shape
        K = self.key(x)
        Q = self.query(x)
        V = self.value(x)

        weight = Q@K.transpose(2, 1) / self.head_size**0.5
        weight = weight.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        weight = F.softmax(weight, dim = 2)
        weight = self.dropout(weight)
        out = weight@V
        return out

![](https://miro.medium.com/v2/resize:fit:1100/format:webp/1*CrM1qbWX8I4_AhVDspEjNA.png)

In [21]:
class MultiHeadAttention(nn.Module):
    def __init__(self, head_size, num_heads, embed_size, dropout = 0.3):
        super().__init__()
        self.m_head = nn.ModuleList([Head(head_size, embed_size) for _ in range(num_heads)])
        self.dropout = nn.Dropout(dropout)
        self.proj = nn.Linear(embed_size, embed_size)

    def forward(self, x):
        x = torch.cat([head(x) for head in self.m_head], dim = -1)
        x = self.dropout(self.proj(x))
        return x

![](https://miro.medium.com/v2/resize:fit:368/format:webp/1*ilctVE_zmOZv4pKc2SGHcA.png)

In [22]:
class FeedForward(nn.Module):
    def __init__(self, embed_size, dropout = 0.3):
        super().__init__()

        self.ff = nn.Sequential(
            nn.Linear(embed_size, 4 * embed_size),
            nn.ReLU(),
            nn.Linear(4 * embed_size, embed_size),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        x = self.ff(x)
        return x

In [23]:
class Block(nn.Module):
    def __init__(self, embed_size, num_heads):
        super().__init__()

        head_size = embed_size // num_heads
        self.multihead = MultiHeadAttention(head_size, num_heads, embed_size)
        self.ff = FeedForward(embed_size)
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)

    def forward(self, x):
        x = x + self.multihead(self.ln1(x))
        x = x + self.ff(self.ln2(x))
        return x

![](https://miro.medium.com/v2/resize:fit:1100/format:webp/1*0N0aHoN6MzSvFloJiSS1Rg.png)

In [24]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers):
        super().__init__()

        self.token_embedding = nn.Embedding(vocab_size, embed_size)
        self.pos_embed = nn.Embedding(block_size, embed_size)
        self.linear = nn.Linear(embed_size, vocab_size)
        self.block = nn.Sequential(*[Block(embed_size, num_heads) for _ in range(num_layers)])
        self.layer_norm = nn.LayerNorm(embed_size)

    def forward(self, idx, target = None):
        B, T = idx.shape
        logits = self.token_embedding(idx)
        pos = self.pos_embed(torch.arange(T, device = device))
        x = logits + pos
        logits = self.block(x)
        logits = self.linear(self.layer_norm(logits))

        if target == None:
            loss = None

        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            target = target.view(B * T)
            loss = F.cross_entropy(logits, target)
        return logits, loss


    def generate(self, idx, max_new_token):
        idx = idx.to(device)
        for _ in range(max_new_token):
            crop_idx = idx[:, -block_size:].to(device)
            logits, loss = self(crop_idx)
            logits = logits[:, -1, :]
            prob = F.softmax(logits, dim = -1)
            idx_next = torch.multinomial(prob, num_samples = 1).to(device)
            idx = torch.cat((idx, idx_next), dim = 1)

        return idx

In [25]:
model = BigramLanguageModel(vocab_size = 65, embed_size= 64, num_heads = 8, num_layers = 4).to(device)

In [26]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)

# training the model, cause I won't give up without a fight
for epoch in range(5000):

    # Printing the Training and Validation Loss
    if epoch%1000==0:
        model.eval()
        Loss= 0.0
        Val_Loss = 0.0
        for k in range(200):
            x, y = get_batch(True)

            val_ , val_loss = model(x, y)
            x1, y1 = get_batch(False)

            _, train_loss = model(x1, y1)
            Loss += train_loss.item()
            Val_Loss += val_loss.item()
        avg_loss = Val_Loss/(k+1)

        avg_train_loss = Loss/(k+1)
        model.train()

        print("Epoch: {} \n The validation loss is:{}    The Loss is:{}".format(epoch, avg_loss, avg_train_loss))
    # Forward
    data, target = get_batch(False)
    logits, loss = model(data, target)
    #Backward
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

Epoch: 0 
 The validation loss is:4.361562628746032    The Loss is:4.36268013715744
Epoch: 1000 
 The validation loss is:2.171871321797371    The Loss is:2.1896412241458894
Epoch: 2000 
 The validation loss is:2.0455833995342254    The Loss is:2.036001957654953
Epoch: 3000 
 The validation loss is:1.9809299886226654    The Loss is:1.9586844688653946
Epoch: 4000 
 The validation loss is:1.923082172870636    The Loss is:1.9273563402891158


In [29]:
idx = torch.tensor(encode("Before we proceed any further"), dtype = torch.long).unsqueeze(0)
print(idx.shape)
print(idx)

torch.Size([1, 29])
tensor([[14, 43, 44, 53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1,
         39, 52, 63,  1, 44, 59, 56, 58, 46, 43, 56]])


In [30]:
print("".join(decode(model.generate(torch.zeros([1,1], dtype=torch.long) , max_new_token=2000)[0].tolist())))


APTHARINA:
Honk
PROSPgd thed to ts tid forcess.

jedinced widen not in but marr, Shave will stome,
Sughth if the your comed.
Hir 'is her pre a.

PETRUCHIO:

Lut thane has tand twendel wha by,
To psoke.

KATHARIANIOSPe, any. Sor a this ipplife
Who herre's fring old the have you. gave.
Heware dusek reans in uchas.

VINTONDELO:
Hars cones prest wich whon soot--ris neved with cosions mive bove reno?

PRONIO:
Tend! way i cout. comsurtio?

FERLUMIRO:
Foo eBut sien stel at spry winded they geas bisterg the pingrir-elce, leestleam lico!

KARINCANTIO:
My hou?

MIRANDELLO:
Whis wor pentleack to gitens?

GREMIO:
For it willobkes,
Thate
Ain.

MIRAN:
Aven.

PEMIRe more?

PROSPETRUCHIO:
My thit thes my be stirs all a ouger wing to sepce cort.

BASTINA:
TrANTONIO:

HORDANMIO:
Alo!
But blook that's mald I whine so I ary seeny hy men of my the now, culan
unk, whe and,
Awaiftair?
Why would badzes, I cont leang of her the hake wh krewate.

FTENTIO:
Nasciocgmementil cast sill?

TRANDONIO:
Were wor far, a