In [12]:
# import display markdown
from IPython.display import Markdown, display

In [19]:
# import datasets

# # load tiny shakespeare dataset
# # dataset = datasets.load_dataset('tiny_shakespeare', cache_dir="cache")

with open("data/input.txt", "r") as f:
  text = f.read()

display(Markdown(f"""
  # Tiny Shakespeare Dataset
  | Metrix | Value |
  | --- | --- |
  | Number of characters | {len(text)} |
  | Number of unique characters | {len(set(text))} |
  | Number of lines | {len(text.splitlines())} |
  | Number of words | {len(text.split())} |
  """)
)



  # Tiny Shakespeare Dataset
  | Metrix | Value |
  | --- | --- |
  | Number of characters | 1115394 |
  | Number of unique characters | 65 |
  | Number of lines | 40000 |
  | Number of words | 202651 |
  

In [24]:
STOI = {ch: i for i, ch in enumerate(sorted(set(text)))}
ITOS = {i: ch for ch, i in STOI.items()}

def encode(text: str):
  return [STOI[ch] for ch in text]

def decode(indices: list):
  return ''.join(ITOS[i] for i in indices)

display(Markdown(f"""
  # Encoding and Decoding
  | Text | Encoded | Decoded |
  | --- | --- | --- |
  | {text[:10]} | {encode(text[:10])} | {decode(encode(text[:10]))} |
  | {text[-10:-1]} | {encode(text[-10:])} | {decode(encode(text[-10:]))} |
                 
  """))


  # Encoding and Decoding
  | Text | Encoded | Decoded |
  | --- | --- | --- |
  | First Citi | [18, 47, 56, 57, 58, 1, 15, 47, 58, 47] | First Citi |
  | t waking. | [58, 1, 61, 39, 49, 47, 52, 45, 8, 0] | t waking.
 |
                 
  

In [26]:
import torch
data = torch.Tensor(encode(text))
data[:100]

tensor([18., 47., 56., 57., 58.,  1., 15., 47., 58., 47., 64., 43., 52., 10.,
         0., 14., 43., 44., 53., 56., 43.,  1., 61., 43.,  1., 54., 56., 53.,
        41., 43., 43., 42.,  1., 39., 52., 63.,  1., 44., 59., 56., 58., 46.,
        43., 56.,  6.,  1., 46., 43., 39., 56.,  1., 51., 43.,  1., 57., 54.,
        43., 39., 49.,  8.,  0.,  0., 13., 50., 50., 10.,  0., 31., 54., 43.,
        39., 49.,  6.,  1., 57., 54., 43., 39., 49.,  8.,  0.,  0., 18., 47.,
        56., 57., 58.,  1., 15., 47., 58., 47., 64., 43., 52., 10.,  0., 37.,
        53., 59.])

In [27]:
split = int(len(data) * 0.8)
train_data = data[:split]
val_data = data[split:]

display(Markdown(f"""
  # Train and Validation Data
  | Data | Length |
  | --- | --- |
  | Train | {len(train_data)} |
  | Validation | {len(val_data)} |
  | **Total** | **{len(train_data) + len(val_data)}** |
"""))


  # Train and Validation Data
  | Data | Length |
  | --- | --- |
  | Train | 892315 |
  | Validation | 223079 |
  | **Total** | **1115394** |


In [36]:

torch.manual_seed(1337)       # Set the random seed for reproducibility
CONTEXT_LENGTH = 8            # Maximum context length.
BATCH_SIZE = 4                # Number of independent sequences to train on in parallel

def get_batch(split: str):
  data = train_data if split == 'train' else val_data
  start_idx = torch.randint(0, len(data) - CONTEXT_LENGTH, (BATCH_SIZE,))
  end_idx = start_idx + CONTEXT_LENGTH
  inputs = [data[start:end] for start, end in zip(start_idx, end_idx)]
  targets = [data[start+1:end+1] for start, end in zip(start_idx, end_idx)]
  return torch.stack(inputs), torch.stack(targets)

inputs, targets = get_batch('train')
display(Markdown(f"""
  # Batch Data
  | Data | Shape |
  | --- | --- |
  | Inputs | {inputs.shape} |
  | Targets | {targets.shape} |

  Inputs: {inputs}  
  Targets: {targets} 
"""))


  # Batch Data
  | Data | Shape |
  | --- | --- |
  | Inputs | torch.Size([4, 8]) |
  | Targets | torch.Size([4, 8]) |

  Inputs: tensor([[58., 63.,  8.,  0.,  0., 19., 24., 27.],
        [39., 59., 45., 46., 58.,  1., 46., 43.],
        [49., 43., 57.,  1., 53., 50., 42.,  1.],
        [52., 41., 47., 43., 52., 58.,  1., 56.]])  
  Targets: tensor([[63.,  8.,  0.,  0., 19., 24., 27., 33.],
        [59., 45., 46., 58.,  1., 46., 43.,  1.],
        [43., 57.,  1., 53., 50., 42.,  1., 46.],
        [41., 47., 43., 52., 58.,  1., 56., 47.]]) 
