In [2]:
# A tensor is a fundamental data structure that represents a multi-dimensional array.
# Tensors can have any number of dimensions, including:
# 1. scalars (0-dimensional tensors)
# 2. vectors (1-dimensional tensors)
# 3. matrices (2-dimensional tensors)
# 4. higher-dimensional arrays
# Tensor libraries, such as TensorFlow and PyTorch, provide efficient implementations of tensor operations and
# are widely used in machine learning for building and training neural networks.

import torch

In [3]:
# Read the entire book into memory as a single string.
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# The character-level vocabulary: every distinct character that occurs in
# the text, in sorted order, together with how many of them there are.
chars = sorted(set(text))
vocabulary_size = len(chars)

In [4]:
# Show the vocabulary and its size, one item per line.
print(chars, vocabulary_size, sep='\n')

['\n', ' ', '!', '(', ')', ',', '-', '.', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '‘', '’', '“', '”']
68


In [5]:
# Character-level tokenizer: each character in `chars` is paired with its
# index in that list, giving a lookup table in each direction.
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = dict(enumerate(chars))


def encode(s):
    """Map a string to a list of integer ids, one id per character.

    Raises KeyError if `s` contains a character that is not a key of
    `string_to_int`.
    """
    return [string_to_int[c] for c in s]


def decode(l):
    """Map an iterable of integer ids back to the string they spell.

    Raises KeyError if an id is not a key of `int_to_string`.
    """
    return ''.join(int_to_string[i] for i in l)


# Encode the whole corpus once and keep it as a 1-D tensor of 64-bit
# integer ids (torch.long).
data = torch.tensor(encode(text), dtype=torch.long)

In [6]:
# Tokenization - the process of converting a sequence of text into smaller parts, known as tokens

In [7]:
# Round-trip a short sample through the tokenizer: string -> list of ids -> string.
encoded_hello = encode("Hello")
decoded_hello = decode(encoded_hello)

In [10]:
# Show, one per line: the sample's ids, the decoded sample, the number of
# tokens in the corpus, and the first 400 token ids.
print(
    encoded_hello,
    decoded_hello,
    len(data),
    data[:400],
    sep='\n',
)

[18, 41, 48, 48, 51]
Hello
205515
tensor([30, 44, 41,  1, 33, 51, 50, 40, 41, 54, 42, 57, 48,  1, 33, 45, 62, 37,
        54, 40,  1, 51, 42,  1, 25, 62,  0,  0, 13, 44, 37, 52, 56, 41, 54,  1,
        19,  0, 30, 44, 41,  1, 13, 61, 39, 48, 51, 50, 41,  0,  0, 14, 51, 54,
        51, 56, 44, 61,  1, 48, 45, 58, 41, 40,  1, 45, 50,  1, 56, 44, 41,  1,
        49, 45, 40, 55, 56,  1, 51, 42,  1, 56, 44, 41,  1, 43, 54, 41, 37, 56,
         1, 21, 37, 50, 55, 37, 55,  1, 52, 54, 37, 45, 54, 45, 41, 55,  5,  1,
        59, 45, 56, 44,  1, 31, 50, 39, 48, 41,  0, 18, 41, 50, 54, 61,  5,  1,
        59, 44, 51,  1, 59, 37, 55,  1, 37,  1, 42, 37, 54, 49, 41, 54,  5,  1,
        37, 50, 40,  1, 11, 57, 50, 56,  1, 15, 49,  5,  1, 59, 44, 51,  1, 59,
        37, 55,  1, 56, 44, 41,  1, 42, 37, 54, 49, 41, 54, 65, 55,  1, 59, 45,
        42, 41,  7,  1, 30, 44, 41, 45, 54,  0, 44, 51, 57, 55, 41,  1, 59, 37,
        55,  1, 55, 49, 37, 48, 48,  5,  1, 42, 51, 54,  1, 56, 44, 41,  1, 48,
      

In [None]:
# Splitting to validation and train sets
    # The train set consists of 80% of the dataset and is the data on which we are going to build our model.
    # The rest is held out as the validation set, so we have a way to check that the model generalizes (produces its own text) instead of memorizing the training data.

# Bigram
    # Pair of consecutive written units such as letters, syllables or words
    # A bigram (or digram) is a sequence of two adjacent elements from a string of tokens, which are typically letters, syllables, or words
        # A bigram is an n-gram for n=2