In [30]:
import torch
from tqdm import tqdm
device = 'cuda' if torch.cuda.is_available() else 'cpu'

block_size = 8
batch_size = 4 # how many blocks we want to process in parallel

In [1]:
with open("wizard_of_oz.txt", "r", encoding="utf-8") as f:
    text = f.read()
print(text[:200])

DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW YO


In [5]:
chars = sorted(set(text))
print(chars)
vocabualry_size = len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [6]:
vocabualry_size

79

In [10]:
string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

In [12]:
encode('hello')

[60, 57, 64, 64, 67]

In [11]:
decode(encode('hello'))

'hello'

In [15]:
encoded_hello = torch.tensor(encode('hello'), dtype=torch.long)

In [16]:
encoded_hello

tensor([60, 57, 64, 64, 67])

In [17]:
# Transform all our data to tensor:
data = torch.tensor(encode(text), dtype=torch.long)

In [18]:
data

tensor([27, 38, 41,  ..., 66, 56,  0])

In [21]:
data.shape

torch.Size([232143])

In [22]:
print(data[:100])

tensor([27, 38, 41, 38, 43, 31, 48,  1, 24, 37, 27,  1, 43, 31, 28,  1, 46, 32,
        49, 24, 41, 27,  1, 32, 37,  1, 38, 49,  0,  0,  1,  1, 25, 48,  0,  0,
         1,  1, 35, 10,  1, 29, 41, 24, 37, 34,  1, 25, 24, 44, 36,  0,  0,  1,
         1, 24, 44, 43, 31, 38, 41,  1, 38, 29,  1, 43, 31, 28,  1, 46, 32, 49,
        24, 41, 27,  1, 38, 29,  1, 38, 49,  8,  1, 43, 31, 28,  1, 35, 24, 37,
        27,  1, 38, 29,  1, 38, 49,  8,  1, 38])


In [23]:
n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]

In [29]:
# block_size = 5
# ... [5, 67, 21, 58, 40] 35 ...
# ... 5 [67, 21, 58, 40, 35] ...

x = train_data[:block_size]
y = train_data[block_size:]

print(train_data[:16])

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"When input is {context} target is {target}")

tensor([27, 38, 41, 38, 43, 31, 48,  1, 24, 37, 27,  1, 43, 31, 28,  1])
When input is tensor([27]) target is 24
When input is tensor([27, 38]) target is 37
When input is tensor([27, 38, 41]) target is 27
When input is tensor([27, 38, 41, 38]) target is 1
When input is tensor([27, 38, 41, 38, 43]) target is 43
When input is tensor([27, 38, 41, 38, 43, 31]) target is 31
When input is tensor([27, 38, 41, 38, 43, 31, 48]) target is 28
When input is tensor([27, 38, 41, 38, 43, 31, 48,  1]) target is 1
