In [1]:
# A tensor is a fundamental data structure that represents a multi-dimensional array.
# Tensors can be of various dimensions, including: 
# 1. scalars (0-dimensional tensors)
# 2. vectors (1-dimensional tensors)
# 3. matrices (2-dimensional tensors)
# 4. higher-dimensional arrays
# Tensor libraries, such as TensorFlow and PyTorch, provide efficient implementations of tensor operations and
# are widely used in machine learning for building and training neural networks.

import torch

In [2]:
# Load the whole corpus into memory as a single string.
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# The vocabulary is every distinct character in the corpus; sorting gives
# each character a stable position that is later used as its integer id.
chars = sorted(set(text))
vocabulary_size = len(chars)

In [3]:
# Show the vocabulary and, on the next line, how many distinct characters it has.
print(chars, vocabulary_size, sep='\n')

['\n', ' ', '!', '(', ')', ',', '-', '.', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '‘', '’', '“', '”']
68


In [4]:
# Character-level lookup tables: character -> position in the sorted
# vocabulary, and position -> character.
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = dict(enumerate(chars))


# These used to be lambdas (lambda is the keyword for defining an anonymous
# function). Binding a lambda to a name defeats the point of it being
# anonymous, so they are plain defs: same call signature, but with a real
# name in tracebacks and room for a docstring.
def encode(s):
    """Return the list of integer ids for the characters of ``s``.

    Raises KeyError if ``s`` contains a character that is not in ``chars``.
    """
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the integer ids in ``l``.

    Raises KeyError if an id is not a key of ``int_to_string``.
    """
    return ''.join([int_to_string[i] for i in l])


# The whole corpus as a 1-D tensor of integer character ids.
data = torch.tensor(encode(text), dtype=torch.long)

In [6]:
# Tokenization - the process of splitting a sequence of text into smaller parts, known as tokens
    ## here we use a character-level tokenizer (one token per character)
    ## a word-level tokenizer is another option
    ## a subword tokenizer is a middle ground between word and character tokenizers

In [12]:
# Round trip through the tokenizer: text -> integer ids -> text.
encoded_hello = encode("hello")
decoded_hello = decode(encoded_hello)

In [13]:
# One value per line: the encoded ids, the decoded string, the number of
# tokens in the corpus, and the first 100 token ids.
print(encoded_hello, decoded_hello, len(data), data[:100], sep='\n')

[44, 41, 48, 48, 51]
hello
205515
tensor([30, 44, 41,  1, 33, 51, 50, 40, 41, 54, 42, 57, 48,  1, 33, 45, 62, 37,
        54, 40,  1, 51, 42,  1, 25, 62,  0,  0, 13, 44, 37, 52, 56, 41, 54,  1,
        19,  0, 30, 44, 41,  1, 13, 61, 39, 48, 51, 50, 41,  0,  0, 14, 51, 54,
        51, 56, 44, 61,  1, 48, 45, 58, 41, 40,  1, 45, 50,  1, 56, 44, 41,  1,
        49, 45, 40, 55, 56,  1, 51, 42,  1, 56, 44, 41,  1, 43, 54, 41, 37, 56,
         1, 21, 37, 50, 55, 37, 55,  1, 52, 54])


In [None]:
# Splitting into train and validation sets
    # The train set is the first 80% of the dataset; it is the data the model is trained on.
    # The remaining 20% is the validation set: the model never trains on it, so it gives us a way to check that the model generalizes instead of just memorizing the training text.

# Bigram
    # Pair of consecutive written units such as letters, syllables or words
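    # HRV (Croatian) - bigram ili digram je niz susjednih elemenata iz niza tokena koji su obično slova, slogovi ili riječi
    #   EN: a bigram (or digram) is a sequence of adjacent elements from a string of tokens, which are usually letters, syllables or words
        # Bigram je n-gram za n=2 (EN: a bigram is an n-gram with n=2)
    # BIGRAM LLM - we give the model one character and it predicts the next one. It considers only the previous character when predicting the next.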

# Block size - the length of a small random snippet taken from the text corpus, from which we build predictions and targets
    # a block is a bunch of consecutive encoded characters that we make predictions for
    # the tensor is processed block by block
    # for each position we measure how far the prediction is from the target

In [8]:
# Train on the first 80% of the token stream and hold out the rest for validation.
n = int(0.8 * len(data))  # index of the first validation token
train_data, val_data = data[:n], data[n:]

In [14]:
# Sequential example: walk through the first block of the training data.

block_size = 8

# x holds the inputs (contexts): the first block_size tokens of train_data.
x = train_data[:block_size]
# y holds the targets: the same window shifted one token to the right,
# so y[t] is the token that comes right after x[t].
y = train_data[1:block_size + 1]

# Each prefix x[:t+1] is a context, and y[t] is the token that follows it.
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print('When input is', context, 'target is', target)

8
8
When input is tensor([30]) target is tensor(44)
When input is tensor([30, 44]) target is tensor(41)
When input is tensor([30, 44, 41]) target is tensor(1)
When input is tensor([30, 44, 41,  1]) target is tensor(33)
When input is tensor([30, 44, 41,  1, 33]) target is tensor(51)
When input is tensor([30, 44, 41,  1, 33, 51]) target is tensor(50)
When input is tensor([30, 44, 41,  1, 33, 51, 50]) target is tensor(40)
When input is tensor([30, 44, 41,  1, 33, 51, 50, 40]) target is tensor(41)
