In [57]:
# Read the entire lyrics corpus into memory as one string.
# (text read mode is open()'s default; UTF-8 is requested explicitly)
with open('../data/songs.txt', encoding='utf-8') as f:
    plain_data = f.read()

In [59]:
# total number of characters in the raw corpus
len(plain_data)

612105

In [61]:
# peek at the first 500 characters to see how the file is laid out
print(plain_data[:500])

Song Name: Tim McGraw
He said the way my blue eyes shined
Put those Georgia stars to shame that night
I said Thats a lie
Just a boy in a Chevy truck
That had a tendency of gettin stuck
On back roads at night
And I was right there beside him all summer long
And then the time we woke up to find that summer gone

But when you think Tim McGraw
I hope you think my favorite song
The one we danced to all night long
The moon like a spotlight on the lake
When you think happiness
I hope you think that lit


In [62]:
# List every distinct character in the corpus (in sorted order) so we can
# spot anything that needs cleaning up.
print(*sorted(set(plain_data)), sep='')


 !&'(),-./0123456789:?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz®éíïó


In [64]:
# Normalise the few non-ASCII characters found in the corpus in a single pass.
# Each key is replaced by its value; a value of None deletes the character.
plain_data = plain_data.translate(str.maketrans({
    '®': None,  # registered-trademark sign: dropped entirely
    'é': 'e',
    'ó': 'o',
    'í': 'i',
    'ï': 'i',
}))

In [65]:
# Build the vocabulary: the sorted list of distinct characters in the text,
# then show the characters on one line and the vocabulary size on the next.
chars = sorted(set(plain_data))
vocab_size = len(chars)
print(''.join(chars), vocab_size, sep='\n')


 !&'(),-./0123456789:?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
75


In [66]:
# basic character-level tokenizer
# each character is converted to its index in the unique characters array
# (`chars`) and back again
# for comparison: Google uses SentencePiece to tokenize,
# OpenAI uses tiktoken to tokenize

stoi = {ch: idx for idx, ch in enumerate(chars)}  # character -> integer id
itos = {idx: ch for idx, ch in enumerate(chars)}  # integer id -> character


def encode(s):
    """Convert a string into a list of integer ids, one per character.

    Raises:
        KeyError: if `s` contains a character that is not in `chars`.
    """
    return [stoi[ch] for ch in s]


def decode(s):
    """Convert an iterable of integer ids back into the string they spell.

    Raises:
        KeyError: if an id is not a valid index into `chars`.
    """
    return ''.join(itos[idx] for idx in s)

In [67]:
# sanity check: encode a short sample string into its list of integer ids
encoded_data = encode('hello world')
print("Encoded data is: ", encoded_data)

Encoded data is:  [56, 53, 60, 60, 63, 1, 71, 63, 66, 60, 52]


In [68]:
# decoding the ids from the previous cell should give back the sample string
print("Decoded data is: ", decode(encoded_data))

Decoded data is:  hello world


In [69]:
import torch

In [70]:
# converting the data to a tensor:
# encode the whole cleaned corpus and store the ids as a 1-D tensor of
# 64-bit integers (torch.long)
data = torch.tensor(encode(plain_data), dtype=torch.long)
print(data.shape, data.dtype)
# show the first 500 ids
print(data[:500])

torch.Size([612104]) torch.int64
tensor([41, 63, 62, 55,  1, 36, 49, 61, 53, 21,  1, 42, 57, 61,  1, 35, 51, 29,
        66, 49, 71,  0, 30, 53,  1, 67, 49, 57, 52,  1, 68, 56, 53,  1, 71, 49,
        73,  1, 61, 73,  1, 50, 60, 69, 53,  1, 53, 73, 53, 67,  1, 67, 56, 57,
        62, 53, 52,  0, 38, 69, 68,  1, 68, 56, 63, 67, 53,  1, 29, 53, 63, 66,
        55, 57, 49,  1, 67, 68, 49, 66, 67,  1, 68, 63,  1, 67, 56, 49, 61, 53,
         1, 68, 56, 49, 68,  1, 62, 57, 55, 56, 68,  0, 31,  1, 67, 49, 57, 52,
         1, 42, 56, 49, 68, 67,  1, 49,  1, 60, 57, 53,  0, 32, 69, 67, 68,  1,
        49,  1, 50, 63, 73,  1, 57, 62,  1, 49,  1, 25, 56, 53, 70, 73,  1, 68,
        66, 69, 51, 59,  0, 42, 56, 49, 68,  1, 56, 49, 52,  1, 49,  1, 68, 53,
        62, 52, 53, 62, 51, 73,  1, 63, 54,  1, 55, 53, 68, 68, 57, 62,  1, 67,
        68, 69, 51, 59,  0, 37, 62,  1, 50, 49, 51, 59,  1, 66, 63, 49, 52, 67,
         1, 49, 68,  1, 62, 57, 55, 56, 68,  0, 23, 62, 52,  1, 31,  1, 71, 49,
       

In [71]:
# Split the token ids into a training set (first 90%) and a validation set
# (remaining 10%); the text order is preserved, nothing is shuffled.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

In [73]:
# defining a block size, sometimes also called the context length:
# the longest run of tokens used as context for one prediction
# we slice block_size+1 tokens because a chunk of that length holds
# block_size (context -> next token) examples, one for each context length
# from 1 up to block_size
block_size = 8
train_data[:block_size+1]

tensor([41, 63, 62, 55,  1, 36, 49, 61, 53])

In [76]:
# Spell out every (context, target) pair contained in the first chunk:
# the target is always the token that immediately follows the context.
for i in range(block_size):
    context, target = train_data[:i+1], train_data[i+1]
    print(f"When the context is: {context} the output is: {target}")

When the context is: tensor([41]) the output is: 63
When the context is: tensor([41, 63]) the output is: 62
When the context is: tensor([41, 63, 62]) the output is: 55
When the context is: tensor([41, 63, 62, 55]) the output is: 1
When the context is: tensor([41, 63, 62, 55,  1]) the output is: 36
When the context is: tensor([41, 63, 62, 55,  1, 36]) the output is: 49
When the context is: tensor([41, 63, 62, 55,  1, 36, 49]) the output is: 61
When the context is: tensor([41, 63, 62, 55,  1, 36, 49, 61]) the output is: 53
