In [2]:
!pip install torch



In [3]:
#Reading in all of the text as a string
with open('drake-lyrics.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()

#Preview only the start; echoing the whole string would put the entire corpus into the cell output
text[:200]



In [4]:
#len() of a str counts characters, so this is the corpus size in characters
print("This is length of drake lyrics: ", len(text))

This is length of drake lyrics:  302961


In [5]:
#Peek at the first thousand characters of the lyrics to sanity-check what was loaded
print(text[:1000])

Uh Yea Man, Vo, Back Again Like We Never Left Huh
Yea Man, Im Sending This Out To Any Of My Girls Thats Different Man
Anybody Thats Special
You Know I Wanna Be Honest With You Women So Ima Start Off Like This

Uh Look, I'm Bold From The Get Go
Go By The Title Of Drake
And Treating Me Like The Rest Is A Vital Mistake
I'm Hopin That Aint Nobody Else As Special As You
When I Say I've Been Disappointed Im Addressin A Few
I Finesse Then I Groove
While Most Of The 8 To 9's In My Past
Now Get A Rating Thats Less Than A 2
Its True, I Been Talking To Aleshia, Keisha And Nadia
And Shadia, Had To Throw Peace Signs To A Lot Of Ya's
Payback, To The Same Chicks From Way Back
Never Call, Now They Wanna Be Where I Stays At
Now They Got The New Boy, I'm Trying To Move Forward
And Pass Me, Classy, Something Like Drew Lord
A New Me, Cool, See I Never Get Around
If I Dont Change Now I Dont Think I'll Ever Settle Down
Even If I Gotta Travel A Bit
I'll Go Around The World And Back
Cuz I Know That Once I Hav

In [6]:
#Vocabulary: every distinct character in the corpus, ordered by Unicode code point
#(set() drops the duplicates, sorted() turns the set back into an ordered list)
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !"#%&'()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz©ãéóú–…
90


In [7]:
#Tokenization: turn raw text into a sequence of integers, one id per character
#A character's id is its position in the sorted `chars` list
stoi = dict(zip(chars, range(len(chars))))
itos = dict(enumerate(chars))

def encode(s):
    """Encoder: takes a string, outputs a list of integers."""
    return [stoi[c] for c in s]

def decode(l):
    """Decoder: takes a list of integers and outputs a string."""
    return ''.join(itos[i] for i in l)

print(encode("yoo hello"))
print(decode(encode("yoo hello")))

[81, 71, 71, 1, 64, 61, 68, 68, 71]
yoo hello


In [8]:
#Now we will encode all of the drake lyrics and store it into a torch tensor
import torch
data = torch.tensor(encode(text), dtype=torch.long) #1-D tensor of token ids, one id per character of the corpus
print(data.shape, data.dtype)
print(data[:1000]) #the ids of the same first 1000 characters that were printed as text earlier

torch.Size([302961]) torch.int64
tensor([48, 64,  1, 52, 61, 57,  1, 40, 57, 70, 11,  1, 49, 71, 11,  1, 29, 57,
        59, 67,  1, 28, 63, 57, 65, 70,  1, 39, 65, 67, 61,  1, 50, 61,  1, 41,
        61, 78, 61, 74,  1, 39, 61, 62, 76,  1, 35, 77, 64,  0, 52, 61, 57,  1,
        40, 57, 70, 11,  1, 36, 69,  1, 46, 61, 70, 60, 65, 70, 63,  1, 47, 64,
        65, 75,  1, 42, 77, 76,  1, 47, 71,  1, 28, 70, 81,  1, 42, 62,  1, 40,
        81,  1, 34, 65, 74, 68, 75,  1, 47, 64, 57, 76, 75,  1, 31, 65, 62, 62,
        61, 74, 61, 70, 76,  1, 40, 57, 70,  0, 28, 70, 81, 58, 71, 60, 81,  1,
        47, 64, 57, 76, 75,  1, 46, 72, 61, 59, 65, 57, 68,  0, 52, 71, 77,  1,
        38, 70, 71, 79,  1, 36,  1, 50, 57, 70, 70, 57,  1, 29, 61,  1, 35, 71,
        70, 61, 75, 76,  1, 50, 65, 76, 64,  1, 52, 71, 77,  1, 50, 71, 69, 61,
        70,  1, 46, 71,  1, 36, 69, 57,  1, 46, 76, 57, 74, 76,  1, 42, 62, 62,
         1, 39, 65, 67, 61,  1, 47, 64, 65, 75,  0,  0, 48, 64,  1, 39, 71, 71,
       

In [9]:
#Hold out the tail of the corpus for validation: the first 90% trains, the last 10% validates
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

In [10]:
#block_size is the context length; one extra token is shown because every input position needs a following target
block_size = 8
train_data[:block_size + 1]

tensor([48, 64,  1, 52, 61, 57,  1, 40, 57])

In [11]:
#A single chunk of block_size+1 tokens packs block_size training examples: one per prefix length
x = train_data[:block_size] #inputs into the transformer
y = train_data[1:block_size+1] #targets: the same window shifted one character ahead
for t in range(block_size):
    #the context is everything up to and including position t; the target is the character right after it
    context, target = x[:t+1], y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([48]) the target: 64
when input is tensor([48, 64]) the target: 1
when input is tensor([48, 64,  1]) the target: 52
when input is tensor([48, 64,  1, 52]) the target: 61
when input is tensor([48, 64,  1, 52, 61]) the target: 57
when input is tensor([48, 64,  1, 52, 61, 57]) the target: 1
when input is tensor([48, 64,  1, 52, 61, 57,  1]) the target: 40
when input is tensor([48, 64,  1, 52, 61, 57,  1, 40]) the target: 57


In [12]:
#Seed torch's global RNG before the first random call so that batch sampling, weight
#initialisation and text generation give the same results on every Restart & Run All
torch.manual_seed(1337)

batch_size = 4 #how many independent sequences we will be processing in parallel
block_size = 8 # maximum context length for the predictions

def get_batch(split):
    """Sample a small random batch of inputs x and targets y.

    Reads the notebook-level ``train_data`` / ``val_data`` tensors and the
    ``batch_size`` / ``block_size`` settings at call time.

    Args:
        split: 'train' draws from ``train_data``; any other value draws from
            ``val_data``.

    Returns:
        Tuple ``(x, y)``. Each stacks ``batch_size`` windows of ``block_size``
        consecutive tokens; every window in ``y`` starts one position after
        the matching window in ``x``.
    """
    source = train_data if split == 'train' else val_data
    #random start offsets; the upper bound leaves room for the target window, which ends one token later
    ix = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for i in ix:
        inputs.append(source[i:i+block_size])
        targets.append(source[i+1:i+block_size+1])
    return torch.stack(inputs), torch.stack(targets)

#Draw one training batch and show its inputs and targets
xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')

#Each row of the batch is an independent sequence, and each position t inside a row is
#its own (context -> next token) training example
for b in range(batch_size):
    for t in range(block_size):
        context, target = xb[b, :t+1], yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[75, 70,  7, 76,  1, 74, 61, 57],
        [ 1, 32, 78, 61, 74, 81, 76, 64],
        [61,  1, 75, 59, 57, 74, 61, 60],
        [75, 64, 61,  1, 76, 64, 61,  1]])
targets:
torch.Size([4, 8])
tensor([[70,  7, 76,  1, 74, 61, 57, 60],
        [32, 78, 61, 74, 81, 76, 64, 65],
        [ 1, 75, 59, 57, 74, 61, 60,  1],
        [64, 61,  1, 76, 64, 61,  1, 62]])
when input is [75] the target: 70
when input is [75, 70] the target: 7
when input is [75, 70, 7] the target: 76
when input is [75, 70, 7, 76] the target: 1
when input is [75, 70, 7, 76, 1] the target: 74
when input is [75, 70, 7, 76, 1, 74] the target: 61
when input is [75, 70, 7, 76, 1, 74, 61] the target: 57
when input is [75, 70, 7, 76, 1, 74, 61, 57] the target: 60
when input is [1] the target: 32
when input is [1, 32] the target: 78
when input is [1, 32, 78] the target: 61
when input is [1, 32, 78, 61] the target: 74
when input is [1, 32, 78, 61, 74] the target: 81
when input is [1, 32, 78, 61, 

In [13]:
#The batch of token ids that is fed to the model below
print(xb)

tensor([[75, 70,  7, 76,  1, 74, 61, 57],
        [ 1, 32, 78, 61, 74, 81, 76, 64],
        [61,  1, 75, 59, 57, 74, 61, 60],
        [75, 64, 61,  1, 76, 64, 61,  1]])


In [14]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    """Bigram character-level language model.

    The whole model is a single ``vocab_size x vocab_size`` embedding table:
    row ``i`` holds the logits for the token that follows token ``i``, so every
    prediction depends only on the current token.
    """

    def __init__(self, vocab_size):
        """Create the lookup table for a vocabulary of ``vocab_size`` tokens."""
        super().__init__()
        #each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets = None):
        """Score the next token at every position and optionally compute the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor with the id of the true next token
                at every position.

        Returns:
            Tuple ``(logits, loss)``. Without ``targets``, ``logits`` has shape
            (B, T, C) and ``loss`` is None. With ``targets``, ``logits`` is
            returned flattened to (B*T, C) and ``loss`` is the cross-entropy
            between those logits and the flattened targets.
        """
        logits = self.token_embedding_table(idx) #(B, T, C): batch by time by channel
        if targets is None:
            return logits, None

        #F.cross_entropy wants one row of class scores per example, so fold batch and time together
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        #measures how well the logits predict the next character
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad() #sampling is never back-propagated through, so skip building the autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend every sequence in ``idx`` by sampling ``max_new_tokens`` tokens.

        Args:
            idx: (B, T) tensor of integer token ids used as the starting context.
            max_new_tokens: number of tokens to append to each sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the given context followed by the
            sampled continuation.
        """
        for _ in range(max_new_tokens):
            #get the predictions; no targets are passed, so the returned loss is None and is ignored
            logits, _unused_loss = self(idx)
            #focus only on the last timestep
            logits = logits[:, -1, :] #(B, C)
            #apply softmax to get the probabilities
            probabilities = F.softmax(logits, dim = -1) #(B, C)
            #sample from the distribution
            idx_next = torch.multinomial(probabilities, num_samples = 1) #(B, 1)
            #append the sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) #(B, T+1)
        return idx
             
#Build the untrained model and run the sample batch through it once
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

#Generate 100 tokens starting from a single token with id 0 and decode them back to text
print(decode(
    m.generate(
        idx=torch.zeros((1, 1), dtype=torch.long),
        max_new_tokens=100,
    )[0].tolist()
))

torch.Size([32, 90])
tensor(4.5061, grad_fn=<NllLossBackward0>)

VnlkuD[7Tx–/M-hjNBuwU…whe[/"P,–rpU%ã_hdWQP"t
h''M_' ,_C©*N_O3g…,ãAX]ú©©]LH:rXó"XNZ4jN W/Ha05é%H)jGCO


In [15]:
#create a PyTorch optimizer over the model's parameters (here only the embedding table) with learning rate 1e-3
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3)

In [16]:
batch_size = 32 #get_batch reads this global, so batches hold 32 sequences from here on
for steps in range(10000):
    #clear the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    #sample a fresh mini-batch and evaluate the loss on it
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)

    #backpropagate and update the parameters
    loss.backward()
    optimizer.step()

#loss of the final mini-batch only, not an average over the run
print(loss.item())

2.4382619857788086


In [17]:
#Sample 300 new tokens starting from a single token with id 0 and decode them back to text
print(decode(
    m.generate(
        idx=torch.zeros((1, 1), dtype=torch.long),
        max_new_tokens=300,
    )[0].tolist()
))


B m teyoum
Tred omy d aprg s I tholsspxa I ck n ingarnon yme s y, hen
JLan treeenp, a ba bis lp lll k tho ju ack t ardooth Hakndve gND
Spop
Ist f mind myngA I' bemsodas youryop I itcassacy, s s ta wher20ithe Hamme fust kngeouce at t Den w Thourealee mes, ca kur st ingheveit DrdoEx
I ha ilindos g ged


In [18]:
#consider the following toy example:
B = 4 #batch
T = 8 #time
C = 2 #channels
x = torch.randn((B, T, C))
x.shape

torch.Size([4, 8, 2])