# Importing libraries and reading the dataset

In [2]:
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
# Seed PyTorch's global RNG so weight initialisation and sampling repeat across runs.
torch.manual_seed(1337)

<torch._C.Generator at 0x11ad035f0>

In [3]:
# Read the whole text corpus into memory as one string.
with open('input.txt', encoding='utf-8') as file:
	data = file.read()


# Creating vocabulary dictionaries and data-loader functions

In [76]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(data))

# Lookup tables between characters and their integer ids.
ctoi = {ch: idx for idx, ch in enumerate(chars)}
itoc = dict(enumerate(chars))
vocab_size = len(chars)
print("Vocab size is: ", vocab_size)


def encode(x):
	"""Map each character of `x` to its integer id.

	Args:
		x: string (or iterable of single characters) to encode.

	Returns:
		List of integer ids, one per character.

	Raises:
		KeyError: if a character is not in the vocabulary.
	"""
	return [ctoi[ch] for ch in x]


def decode(x):
	"""Map each integer id in `x` back to its character.

	Args:
		x: iterable of integer ids.

	Returns:
		List of characters (not a string); use ''.join(...) to get text.

	Raises:
		KeyError: if an id is not in the vocabulary.
	"""
	return [itoc[idx] for idx in x]


check_str = 'how do you do'

# Round-trip sanity check: decoding an encoded string must give it back.
assert check_str == ''.join(decode(encode(check_str))), "Something is wrong with encode/decode function"

Vocab size is:  65


In [81]:
# Encode the whole corpus, then split it by position: first 90% train, last 10% test.
encoded_data = encode(data)
data_split_index = int(0.9 * len(encoded_data))

train_ids, test_ids = encoded_data[:data_split_index], encoded_data[data_split_index:]
train_data = torch.tensor(train_ids, dtype=torch.long)
test_data = torch.tensor(test_ids, dtype=torch.long)

In [90]:
# Scratch check: draw four random start offsets that leave room for a 5-token window.
torch.randint(low=0, high=len(train_data) - 5, size=(4,))

tensor([733043, 913292, 272072,  32495])

In [97]:
def create_batch(batch_size, context_size, data=None):
	"""Sample a random batch of input/target windows from a 1-D token tensor.

	Args:
		batch_size: number of windows to draw.
		context_size: number of tokens in each window.
		data: 1-D tensor of token ids to sample from. Defaults to the
			notebook-level `train_data`, so existing two-argument calls
			behave as before; pass another tensor (e.g. `test_data`) to
			draw batches from a different split.

	Returns:
		Tuple (x, y) of tensors with shape (batch_size, context_size).
		y holds the same windows as x shifted one token to the right,
		i.e. y[b, t] is the token that follows x[b, t] in `data`.

	Raises:
		ValueError: if `data` has fewer than context_size + 1 tokens.
	"""
	if data is None:
		data = train_data

	if len(data) <= context_size:
		raise ValueError(
			f"data has {len(data)} tokens but context_size={context_size} "
			f"needs at least {context_size + 1}"
		)

	# randint's upper bound is exclusive, so the largest start still leaves
	# room for the target window, which ends one token later than the input.
	starts = torch.randint(0, len(data) - context_size, size=(batch_size, ))

	# (batch_size, 1) + (context_size,) broadcasts to a grid of positions,
	# letting both windows be gathered with a single indexing operation each.
	positions = starts.unsqueeze(1) + torch.arange(context_size)

	x = data[positions]
	y = data[positions + 1]

	return x, y


# Draw a tiny batch (4 windows of 5 tokens) for quick checks.
sample_x, sample_y = create_batch(batch_size=4, context_size=5)




# Designing the bigram model class

In [127]:
class bigram(nn.Module):
	"""Character-level bigram language model.

	Each token id is embedded and then linearly projected to
	vocabulary-sized logits. No information is mixed across positions, so
	the prediction at a position depends only on the token at that position.
	"""

	def __init__(self, vocab_size, embedding_dim = 512):
		"""Build the embedding table and the output projection.

		Args:
			vocab_size: number of distinct token ids.
			embedding_dim: width of the token embedding.
		"""
		super().__init__()
		self.vocab_size = vocab_size
		self.embedding_dim = embedding_dim
		self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
		self.dense = nn.Linear(self.embedding_dim, self.vocab_size)

	def forward(self, x, targets = None):
		"""Compute next-token logits and, optionally, the training loss.

		Args:
			x: integer tensor of token ids, shape (batch_size, context_size).
			targets: optional integer tensor of the same shape as `x`
				holding the expected next token for every position.

		Returns:
			Tuple (logits, loss).
			Without targets: logits has shape
			(batch_size, context_size, vocab_size) and loss is None.
			With targets: logits is flattened to
			(batch_size * context_size, vocab_size) and loss is the scalar
			cross-entropy over all positions. The two logits shapes differ
			on purpose to stay compatible with existing callers.
		"""
		embedding_output = self.embedding(x) # batch_size, context_size, embedding_dim
		logits = self.dense(embedding_output) # batch_size, context_size, vocab_size

		if targets is None:
			return logits, None

		batch_size, context_size, vocab_size = logits.shape

		# cross_entropy expects (N, classes) logits and (N,) targets.
		# reshape (rather than view) also accepts non-contiguous targets.
		logits = logits.reshape(batch_size * context_size, vocab_size)
		targets = targets.reshape(batch_size * context_size)

		loss = F.cross_entropy(logits, targets)

		return logits, loss

	@torch.no_grad()
	def generate(self, x, context_size):
		"""Extend `x` by sampling new tokens one at a time.

		Args:
			x: integer tensor of token ids, shape (batch_size, length),
				used as the start of every sequence.
			context_size: number of NEW tokens to sample. Despite the name
				this is not a window length; the name is kept for
				compatibility with existing keyword calls.

		Returns:
			Integer tensor of shape (batch_size, length + context_size):
			the input followed by the sampled tokens.
		"""
		# Runs under no_grad: sampling never back-propagates, so recording an
		# autograd graph for every step would only waste memory.
		for _ in range(context_size):

			# Logits at a position depend only on that position's token, so
			# feeding just the last token gives the same prediction as
			# re-processing the whole, ever-growing sequence.
			logits, _unused_loss = self(x[:, -1:]) # batch_size, 1, vocab_size

			logits = logits[:, -1, :] # batch_size, vocab_size

			probs = F.softmax(logits, dim=-1)

			next_sample = torch.multinomial(probs, 1) # batch_size, 1

			x = torch.cat((x, next_sample), dim=1)

		return x


# Instantiate the model and run the sample batch through it before any training.
bigram_model = bigram(vocab_size, embedding_dim=512)

a, b = bigram_model(sample_x, targets=sample_y)
print(f"Logits shape is: {a.shape}, loss for sample data is: {b.item():.3f}")

# Sample 500 tokens from the untrained model, starting from token id 0.
start_tokens = torch.zeros(1, 1, dtype=torch.long)
generated_ids = bigram_model.generate(x=start_tokens, context_size=500)[0].tolist()
print(''.join(decode(generated_ids)))

Logits shape is: torch.Size([20, 65]), loss for sample data is: 4.511

kQi'pK';ogOdzJnQLZ: 
L
S;VAoLauZrMBwgI,cIsr&?ukct'.&wIVi&XcJXIA3k$JfxBVH:In:SQTZ,&I:NU
HffkKFSBSBXy
zCq?'.OnUAgqvwJ,OtUCxeVXLC'$q:IqRcEv-CbAz&xDqjdb-fl.
SZv3Vkbbxe$B
SVzf;BdzWfK:hMvdU LN ;q'iOEi,kcSPga!fVL
NdV:qXaiz!RUVp HZpagIGxCiV
i
N'ABo U:sm,Iy?wBVVW?YDMbhXXny,3j
i,cDDXOMrDU$zU
wrwKuzPF3 Mdg'U'KXhZNtqRMZi
!Bkn3e$mcESqF.;DFUk;IbCyX'tgiLk,'sn-nKhZOcj.H!cIj&w,yKfjTd?kHFrKQZN
NhyIEwFcsEmcr,!KB; &e
?&Jy!oYosJH-bJpFCZNKTQLgYsJPw:Nbej!bTt'CqPaQYqIaRvV3 tcUbMxHvsQC F
?Pe$!qTxV:t3moN&AB,Ooheneo?Fo tj


In [128]:
# Training hyperparameters. Each "epoch" below is a single optimizer step on
# one freshly sampled random batch, not a full pass over the training data.
epochs = 200
batch_size = 100
context_size = 10

# AdamW over every parameter of the bigram model.
optimizer = torch.optim.AdamW(bigram_model.parameters(), lr=1e-3)

for epoch in range(epochs):

	# Drop gradients left over from the previous step.
	optimizer.zero_grad(set_to_none=True)

	trainx, trainy = create_batch(batch_size, context_size)
	logits, loss = bigram_model(trainx, targets=trainy)

	loss.backward()
	optimizer.step()

	# Report the loss of the current batch on every 10th step.
	if not epoch % 10:
		print(f"For epoch: {epoch}, loss is: {loss.item():.3f}")



For epoch: 0, loss is: 4.301
For epoch: 10, loss is: 3.244
For epoch: 20, loss is: 2.930
For epoch: 30, loss is: 2.666
For epoch: 40, loss is: 2.634
For epoch: 50, loss is: 2.575
For epoch: 60, loss is: 2.644
For epoch: 70, loss is: 2.510
For epoch: 80, loss is: 2.496
For epoch: 90, loss is: 2.549
For epoch: 100, loss is: 2.520
For epoch: 110, loss is: 2.487
For epoch: 120, loss is: 2.490
For epoch: 130, loss is: 2.506
For epoch: 140, loss is: 2.527
For epoch: 150, loss is: 2.526
For epoch: 160, loss is: 2.473
For epoch: 170, loss is: 2.454
For epoch: 180, loss is: 2.568
For epoch: 190, loss is: 2.516


In [133]:
# Sample 500 tokens from the model after training, starting from token id 0.
start_tokens = torch.zeros(1, 1, dtype=torch.long)
generated_ids = bigram_model.generate(x=start_tokens, context_size=500)[0].tolist()
print(''.join(decode(generated_ids)))


Wha myonghinday irerbbearorodeathistre kan of spiey plouth he chesed, s s f dwite frdee
HAntis ceshiommarerwhuk t hig.
Th m ay, prery.
TERCor terlpe lers wiceng ferand carm at htheverdwhe a w k,
LOUMwn parind s
My, gos terthawo,
FDUMIbua omouple thbandan tometlathelee Yound thitholedeans IO, anou heie we s.

Becer ouk' meng you gooweexPrit ay otud l'dedeyou.
aillou!
PUCalinkne n wn wh fome; y hentour the bean d ous shesw ar.


'shouson-thave efo rdomOoovethond qus; witr yougithak?
sthe atie iom;
