In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F 
# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)
# Seed PyTorch's global RNG so batch sampling and generation are repeatable.
torch.manual_seed(1)

cuda


<torch._C.Generator at 0x29260808070>

In [2]:
# Corpus: plain-text books, concatenated in the order listed below.
DATA_DIR = r'./../Datasets/Text'
BOOK_FILES = [
    'Carmilla.txt',
    'DorianGray.txt',
    'Dracula.txt',
    'EdgarAllanPoe.txt',
    'Frankenstein.txt',
    'TurnOfScrew.txt',
    'Vampyre.txt',
    'WutheringHeights.txt',
    'YellowWalpaper.txt',
]

# Read each book inside a `with` block so its handle is closed immediately, and
# decode explicitly instead of relying on the platform's default encoding.
# NOTE(review): assumes the files are UTF-8 (the corpus contains non-ASCII
# characters such as dashes and curly quotes) — confirm for newly added files.
book_texts = []
for book_file in BOOK_FILES:
    with open(f'{DATA_DIR}/{book_file}', encoding='utf-8') as book:
        book_texts.append(book.read())
text = ''.join(book_texts)

In [3]:
# Preview only the beginning of the corpus; printing all of it floods the notebook output.
print(text[:1000])

The Project Gutenberg eBook of Carmilla
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: Carmilla

Author: Joseph Sheridan Le Fanu

Release date: November 1, 2003 [eBook #10007]
                Most recently updated: September 12, 2024

Language: English

Credits: Suzanne Shell, Sjaani and PG Distributed Proofreaders


*** START OF THE PROJECT GUTENBERG EBOOK CARMILLA ***




Carmilla

by Joseph Sheridan Le Fanu

Copyright 1872


Contents

 PROLOGUE
 CHAPTER I. An Early Fright
 CHAPTER II. A Guest
 CHAPTER III. We Compare Notes
 CHAPTER IV. Her Habits—A Saunter
 CHAPTER V. A Wonderful L

In [4]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print("No of possible Characters:", vocab_size)
print("All possible Characters:")
print("".join(chars))

No of possible Characters: 116
All possible Characters:

 !"#$%&()*+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz{}£ÆÉàáâäæçèéêëïôöœΑΓΕΛΜΞ—‘’“”•™


In [5]:
# Tokenisation (character -> integer):
# a character's position in `chars` is used as its token id.
stoi = dict(zip(chars, range(len(chars))))  # char -> token id
itos = dict(enumerate(chars))               # token id -> char

def encode(chars):
    """Translate a string (any iterable of characters) into a list of token ids.

    Raises KeyError for a character that is not a key of `stoi`.
    """
    tokens = []
    for ch in chars:
        tokens.append(stoi[ch])
    return tokens
def decode(tokens):
    """Translate an iterable of token ids back into the string they spell.

    Raises KeyError for an id that is not a key of `itos`.
    """
    pieces = (itos[i] for i in tokens)
    return ''.join(pieces)

In [6]:
# Round-trip check: first the token ids, then the text recovered from them.
print(encode("hi there"), decode(encode("hi there")), sep="\n")

[65, 66, 1, 77, 65, 62, 75, 62]
hi there


In [7]:
# Dataset: the whole corpus as a single 1-D tensor of integer token ids.
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.size())

torch.Size([3760234])


In [8]:
# Train/validation split: the first 90% of the tokens train the model,
# the remaining tail is held out for validation.
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

In [9]:
# Context length used for each training example.
block_size = 8
# A chunk of block_size + 1 tokens supplies block_size (context, target) pairs.
train_data[:1 + block_size]

tensor([48, 65, 62,  1, 44, 75, 72, 67, 62])

In [10]:
# Inputs are the first block_size tokens; targets are the same window shifted by one.
x, y = train_data[:block_size], train_data[1:block_size + 1]
for t in range(block_size):
    # Everything up to and including position t predicts the token at position t + 1.
    context, target = x[:t + 1], y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([48]) the target: 65
when input is tensor([48, 65]) the target: 62
when input is tensor([48, 65, 62]) the target: 1
when input is tensor([48, 65, 62,  1]) the target: 44
when input is tensor([48, 65, 62,  1, 44]) the target: 75
when input is tensor([48, 65, 62,  1, 44, 75]) the target: 72
when input is tensor([48, 65, 62,  1, 44, 75, 72]) the target: 67
when input is tensor([48, 65, 62,  1, 44, 75, 72, 67]) the target: 62


In [11]:
# batch_size: number of sequences per batch; block_size: tokens per sequence.
batch_size, block_size = 4, 8

#generate small batches of x and y
def get_batch(split):
    """Sample one random mini-batch of (inputs, targets).

    `split == 'train'` draws from train_data; any other value draws from val_data.
    Returns two tensors of shape (batch_size, block_size), both moved to `device`;
    the targets are the input windows shifted one token to the right.
    """
    source = train_data if split == 'train' else val_data
    # One random window start per sequence (torch.randint(high, shape));
    # the bound keeps the shifted target window inside the data.
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    # torch.stack turns the list of 1-D windows into a single 2-D tensor.
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

In [12]:
# Draw one training batch and display its inputs.
xb, yb = get_batch(split='train')
xb

tensor([[27,  1, 77, 65, 62,  1, 41, 58],
        [ 1,  1, 72, 63,  1, 58, 60, 60],
        [82, 72, 78,  1, 58, 75, 62,  1],
        [77, 65, 62,  0,  1,  1,  1,  1]], device='cuda:0')

In [13]:
# Targets of the same batch: get_batch builds each row from the matching input window shifted one token ahead.
yb

tensor([[ 1, 77, 65, 62,  1, 41, 58, 76],
        [ 1, 72, 63,  1, 58, 60, 60, 66],
        [72, 78,  1, 58, 75, 62,  1, 69],
        [65, 62,  0,  1,  1,  1,  1,  1]], device='cuda:0')

In [14]:
# Step-by-step view of batch sampling: one random start offset per sequence.
ix = torch.randint(0, len(data) - block_size, (batch_size,))
ix

tensor([ 491263, 1297317, 1230521,  428475])

In [15]:
# The block_size-long window of tokens that starts at each sampled offset.
[data[start:start + block_size] for start in ix]

[tensor([65,  1, 73, 72, 73, 73, 66, 62]),
 tensor([64, 28,  1, 48, 65, 62,  1, 68]),
 tensor([77, 65,  1, 65, 66, 76,  1, 76]),
 tensor([68, 76,  1, 66, 71,  1, 58, 69])]

In [16]:
# Spell out every (context, target) training example packed into the batch:
# row b, position t uses tokens 0..t of xb as context and yb[b, t] as target.
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")

when input is [27] the the target 1
when input is [27, 1] the the target 77
when input is [27, 1, 77] the the target 65
when input is [27, 1, 77, 65] the the target 62
when input is [27, 1, 77, 65, 62] the the target 1
when input is [27, 1, 77, 65, 62, 1] the the target 41
when input is [27, 1, 77, 65, 62, 1, 41] the the target 58
when input is [27, 1, 77, 65, 62, 1, 41, 58] the the target 76
when input is [1] the the target 1
when input is [1, 1] the the target 72
when input is [1, 1, 72] the the target 63
when input is [1, 1, 72, 63] the the target 1
when input is [1, 1, 72, 63, 1] the the target 58
when input is [1, 1, 72, 63, 1, 58] the the target 60
when input is [1, 1, 72, 63, 1, 58, 60] the the target 60
when input is [1, 1, 72, 63, 1, 58, 60, 60] the the target 66
when input is [82] the the target 72
when input is [82, 72] the the target 78
when input is [82, 72, 78] the the target 1
when input is [82, 72, 78, 1] the the target 58
when input is [82, 72, 78, 1, 58] the the targe

In [17]:
class BigramLanguageModel(nn.Module):
    """Character-level bigram language model.

    The only parameters are a (vocab_size, vocab_size) embedding table: looking up
    token i returns the logits over the next token given that the current one is i.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Trainable lookup table: nn.Embedding(num_embeddings, embedding_dim).
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of the next-token ids.

        Returns:
            A (logits, loss) pair. Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is flattened to (B*T, C) and loss is
            the cross-entropy over all B*T positions.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C): batch, time, channels

        if targets is None:
            return logits, None

        # F.cross_entropy takes (N, C) logits and (N,) targets, so batch and time are
        # folded into one axis. reshape (unlike view) also accepts non-contiguous input.
        B, T, C = logits.shape
        logits = logits.reshape(B * T, C)
        targets = targets.reshape(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates, so skip building autograd graphs
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in idx by max_new_tokens sampled tokens.

        Args:
            idx: (B, T) integer tensor holding the starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)  # (B, T, C); the loss is None without targets
            # Only the prediction at the last time step decides the next token.
            logits = logits[:, -1, :]  # (B, C)
            # Softmax turns the logits into a probability distribution over the vocabulary.
            probs = F.softmax(logits, dim=-1)
            # Draw the next token at random from that distribution (sampling, not argmax).
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # Append the sampled token so it is part of the context for the next step.
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

In [18]:
# Build the untrained model on the target device and run one forward pass on the
# current batch; with targets supplied, the logits come back flattened to (B*T, C).
model = BigramLanguageModel(vocab_size).to(device)
logits, loss = model(xb, targets=yb)
print(logits.shape)
print(loss)

torch.Size([32, 116])
tensor(5.3621, device='cuda:0', grad_fn=<NllLossBackward0>)


In [19]:
# Start from a single token with id 0 and sample 100 further token ids
# from the still-untrained model.
idx = torch.zeros(1, 1, dtype=torch.long, device=device)
print("Input:", idx)
print("Output:", model.generate(idx, max_new_tokens=100))

Input: tensor([[0]], device='cuda:0')
Output: tensor([[  0,  56,  22,   3,  11,  45,  43,   7,  25,  51,  94,  61,  73,  32,
           6,  50,  72,  35,  84,  43,  46,  59, 113,  10,  71, 107,  21,  10,
          66,  25,  94,   9,  36,  94,  11,  78,  30, 114,  48,  65,  82, 101,
          62,  35,  23,  91,  84, 101,  98,  88,  53,  69,  90, 110,  86,  71,
          97, 101,  37,   5,  65,  99,  24,  20,  59,  31,  73,  23, 110,  94,
          49,  19,  78,  49,  67, 106, 114,  55,  17,  56,  65,  12,  24,  53,
          63,  76, 101, 108,  57,  52,  89,  83,  62,  10,  42, 103,  95,  92,
           7,  65, 109]], device='cuda:0')


In [20]:
# Same experiment, decoded back to characters: the seed token, then a fresh sample.
print("Input:", decode([idx[0, 0].item()]))
print("Output:", decode(model.generate(idx, max_new_tokens=100)[0].tolist()))

Input: 

Output: 
Sâéœ7âB,+cΛfJi!H!Guu6%af_7CT?JWeΛ+R”0
‘£nÉ—ΛTTOhΞu$wGUxΓdDz!ôSmxïfï3äuHi*]-"pœæn‘Kz{oI™fZqΛZ352)vJHL


In [21]:
# Training loop
batch_size = 32  # get_batch reads this global, so batches are larger from here on
optimizer = torch.optim.AdamW(model.parameters(), lr=0.1)
for steps in range(10_000):
    xb, yb = get_batch('train')
    # Clear the gradients of the previous step before computing new ones.
    optimizer.zero_grad(set_to_none=True)
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()

# Loss of the final mini-batch only.
print(loss.item())

2.58994460105896


In [22]:
# Sample 100 characters from the trained model, again seeded with token id 0.
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(idx, max_new_tokens=100)[0].tolist()))



d ofory f t’s
tearro. ht  bla h I Hed, by?” are:-
pepele  m  trracllo
Thearo theng!
Have pu byod,” 
