In [97]:
import torch, tiktoken

In [98]:
tokenizer = tiktoken.get_encoding("gpt2")

In [99]:
with open("thelostrace.txt", "r") as f:
    raw_text = f.read()

raw_text

'Cororuc glanced about him and hastened his pace. He was no coward, but he did not like the place. Tall trees rose all about, their sullen branches shutting out the sunlight. The dim trail led in and out among them, sometimes skirting the edge of a ravine, where Cororuc could gaze down at the tree-tops beneath. Occasionally, through a rift in the forest, he could see away to the forbidding hills that hinted of the ranges much farther to the west, that were the mountains of Cornwall.\n\nIn those mountains the bandit chief, Buruc the Cruel, was supposed to lurk, to descend upon such victims as might pass that way. Cororuc shifted his grip on his spear and quickened his step. His haste was due not only to the menace of the outlaws, but also to the fact that he wished once more to be in his native land. He had been on a secret mission to the wild Cornish tribesmen; and though he had been more or less successful, he was impatient to be out of their inhospitable country. It had been a long, 

In [100]:
enc_text = tokenizer.encode(raw_text)
enc_text[:10]


[10606, 273, 1229, 27846, 546, 683, 290, 19338, 2945, 465]

In [101]:
from torch.utils.data import Dataset, DataLoader

In [102]:
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(txt) #tokenizes all of text

        for i in range(0, len(token_ids) - max_length, stride): # uses sliding window to chunk book into sequences of max_length
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))


    def __len__(self): #returns total # of rows in data set
        return len(self.input_ids)
    def __getitem__(self, idx): #returns single row from dataset
        return self.input_ids[idx], self.target_ids[idx]



In [103]:
#data loader for gptdatasetv1
def create_dataloader_v1(txt, batch_size=4, max_length=256,stride=128, shuffle=True, drop_last=True, num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2") # initializes tokenizer
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)#creates data set
    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            shuffle=shuffle,
                            drop_last=drop_last,
                            num_workers=num_workers) #drops last batch if its shorter than the specificed batch_size wanted for training
                            #also number of cpu processes for preprocessing
    return dataloader 



In [104]:
# testing data loader
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)




[tensor([[10606,   273,  1229, 27846]]), tensor([[  273,  1229, 27846,   546]])]


In [105]:
#trying another batch

second_batch = next(data_iter)
second_batch


[tensor([[  273,  1229, 27846,   546]]),
 tensor([[ 1229, 27846,   546,   683]])]

In [106]:
#testing other variabls
dataloader = create_dataloader_v1(raw_text, batch_size=5, max_length=2, stride=2, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[10606,   273],
        [ 1229, 27846],
        [  546,   683],
        [  290, 19338],
        [ 2945,   465]]), tensor([[  273,  1229],
        [27846,   546],
        [  683,   290],
        [19338,  2945],
        [  465,  8761]])]


In [107]:
#and messing with stride variable as well as max_length/batch_size and seeing how shuffle true looks vs false

dataloader = create_dataloader_v1(raw_text, batch_size=2, max_length=9, stride=3, shuffle=True)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  284,   307, 11544,   508,   815,   407,   307,  2474,   198],
        [  422,   262, 15857,   372,  1969,   416,   262, 24343,   500]]), tensor([[  307, 11544,   508,   815,   407,   307,  2474,   198,   198],
        [  262, 15857,   372,  1969,   416,   262, 24343,   500,   287]])]


In [108]:
#messed with different variables to see what stride did with input field changing position but using this setup for this section
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[10606,   273,  1229, 27846],
        [  546,   683,   290, 19338],
        [ 2945,   465,  8761,    13],
        [  679,   373,   645, 26769],
        [   11,   475,   339,   750],
        [  407,   588,   262,  1295],
        [   13, 22676,  7150,  8278],
        [  477,   546,    11,   511]])

Targets:
 tensor([[  273,  1229, 27846,   546],
        [  683,   290, 19338,  2945],
        [  465,  8761,    13,   679],
        [  373,   645, 26769,    11],
        [  475,   339,   750,   407],
        [  588,   262,  1295,    13],
        [22676,  7150,  8278,   477],
        [  546,    11,   511,   424]])


In [109]:

# needs inputs to turn into a list before decoding
for row in inputs:
    print( tokenizer.decode( row.tolist() ) )


Cororuc glanced
 about him and hast
ened his pace.
 He was no coward
, but he did
 not like the place
. Tall trees rose
 all about, their


In [110]:
#embedding layer 
vocab_size = 4
output_dim = 8

embedding_inputs =  torch.nn.Embedding(vocab_size, output_dim)
print(embedding_inputs.weight) 

#these vectors are so important because it allows the model to know where certain tokens have 
# interactions with others, ie: when many sentences follow a similar path, 
# it recognizes that pattern, and the E vector allows that to be seen


Parameter containing:
tensor([[-1.6419,  0.4539,  0.5244, -1.0051,  0.1259, -0.2979,  0.9161,  1.4429],
        [ 0.4693, -0.5049,  0.7408,  0.2740,  1.7436,  0.1419, -0.3101,  0.2015],
        [ 2.1423, -0.8404, -0.8767,  0.5941,  0.7905,  0.3133,  0.5227, -1.0171],
        [ 0.6914, -0.5581,  1.7775,  1.1747,  0.3262,  1.2015,  1.0287, -0.2045]],
       requires_grad=True)


In [111]:
#get the weights of embedding_inputs
input_weights = embedding_inputs.weight.data
input_weights

tensor([[-1.6419,  0.4539,  0.5244, -1.0051,  0.1259, -0.2979,  0.9161,  1.4429],
        [ 0.4693, -0.5049,  0.7408,  0.2740,  1.7436,  0.1419, -0.3101,  0.2015],
        [ 2.1423, -0.8404, -0.8767,  0.5941,  0.7905,  0.3133,  0.5227, -1.0171],
        [ 0.6914, -0.5581,  1.7775,  1.1747,  0.3262,  1.2015,  1.0287, -0.2045]])

In [112]:
# use query for embedding vector

query = input_weights[2]

#calculate attention scores w the snippet from in class use dot product
attention_scores2 = torch.empty(len(input_weights))
for i in range(len(input_weights)):
    attention_scores2[i] = torch.dot(query,input_weights[i])
print(attention_scores2)

tensor([-5.9385,  1.9987,  8.4484,  2.4698])


In [113]:
#normalize using softmax function
attention_weights_2 = torch.softmax(attention_scores2, dim = 0)
attention_weights_2


tensor([5.6241e-07, 1.5746e-03, 9.9590e-01, 2.5221e-03])

In [114]:
#verify our sum = 1
attention_weights_2.sum()

tensor(1.)

In [115]:
# compute context vectors as the weighted sum with input weight vectors
context_vector_2 = torch.zeros (query.shape)
for i in range(len(attention_weights_2)):
    context_vector_2 += attention_weights_2[i] * input_weights[i]
context_vector_2

tensor([ 2.1360, -0.8391, -0.8675,  0.5951,  0.7909,  0.3153,  0.5227, -1.0132])

In [116]:
#get attention scores with matrix multiplication
attention_scores_2 = input_weights @ input_weights.T
attention_scores_2

tensor([[ 7.2131, -0.7028, -5.9385, -1.3068],
        [-0.7028,  4.2961,  1.9987,  2.6240],
        [-5.9385,  1.9987,  8.4484,  2.4698],
        [-1.3068,  2.6240,  2.4698,  7.9791]])

In [117]:
#normalize again with softmax to find attention weights
attention_weights = torch.softmax(attention_scores_2, dim = -1)
attention_weights

tensor([[9.9943e-01, 3.6470e-04, 1.9413e-06, 1.9935e-04],
        [5.2085e-03, 7.7213e-01, 7.7620e-02, 1.4504e-01],
        [5.6241e-07, 1.5746e-03, 9.9590e-01, 2.5221e-03],
        [9.1908e-05, 4.6823e-03, 4.0135e-03, 9.9121e-01]])

In [118]:
#make sure the sum =1 
attention_weights[0].sum()

tensor(1.0000)

In [119]:
# now we calculate context vectors 
context_vectors = attention_weights @ input_weights
context_vectors

tensor([[-1.6407,  0.4534,  0.5247, -1.0042,  0.1265, -0.2974,  0.9157,  1.4422],
        [ 0.6204, -0.5337,  0.7645,  0.4228,  1.4556,  0.3066, -0.0449,  0.0545],
        [ 2.1360, -0.8391, -0.8675,  0.5951,  0.7909,  0.3153,  0.5227, -1.0132],
        [ 0.6960, -0.5589,  1.7619,  1.1679,  0.3347,  1.1929,  1.0204, -0.2058]])