# Introduction to Generative AI

In [2]:
# Import Libraries
import io
import os
import sys
import zipfile
from datetime import datetime

import ipdb
import requests

# Tokenizer
import sentencepiece as spm

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from requests import Response
from tqdm import tqdm

# Turn on the allow_tf32 flag for both CUDA matmul and cuDNN
for _backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
    _backend.allow_tf32 = True

# Clear CUDA cache
torch.cuda.empty_cache()


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/admin/Library/Caches/pypoetry/virtualenvs/generative-ai-zqkb-BGY-py3.12/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/admin/Library/Caches/pypoetry/virtualenvs/generative-ai-zqkb-BGY-py3.12/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/admin/Library/Caches/

In [2]:
# Download necessary files: fetch a zip archive and unpack it into ./data
files_url = "https://ideami.com/llm_train"
print("Downloading files using Python ...")
# Without a timeout, requests waits indefinitely on a stalled connection.
# The value bounds connect time and each wait for data, not the whole download.
response: Response = requests.get(files_url, timeout=60)
# Stop here on an HTTP error (4xx/5xx) rather than passing an error page to
# ZipFile, which would fail later with a less helpful BadZipFile.
response.raise_for_status()

# The archive is held fully in memory and extracted from an in-memory buffer.
with zipfile.ZipFile(file=io.BytesIO(initial_bytes=response.content)) as zip_ref:
    zip_ref.extractall(path="./data")
print("Download complete")

Downloading files using Python ...
Download complete


In [6]:
# --- Architecture parameters ---
batch_size = 8
context = 512
embed_size = 384
n_layers = 7
n_heads = 7
BIAS = True

# --- Hyperparameters ---
learning_rate = 1e-4
dropout = 5e-2
weight_decay = 1e-2
grad_clip = 1.0
epochs = 10

# --- Training parameters ---
train_iters = 100_000
eval_interval = 50
eval_iters = 10
# NOTE(review): this name shadows Python's built-in compile(); it is kept
# as-is because later cells may read it.
compile = True
checkpoint_dir = "models"
checkpoint_file_name = "latest.pt"
checkpoint_load_file_name = "latest.pt"
dtype: torch.dtype = torch.bfloat16

# --- Mode ---
inference = False

# --- Device: use CUDA when it is available, otherwise fall back to the CPU ---
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device:{device}")

Device:cpu


In [7]:
# Logging: Weights & Biases settings
wandb_log = True
wandb_project = "llm1"
# Run name = the "lm1" prefix immediately followed by the current local
# timestamp (YYYYMMDD-HHMMSS), so each run gets a distinct name.
wandb_run_name: str = f"lm1{datetime.now():%Y%m%d-%H%M%S}"

# wandb is imported only when logging is switched on.
if wandb_log:
    import wandb

    wandb.init(project=wandb_project, name=wandb_run_name)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/admin/.netrc


In [9]:
# Read the entire wiki text file into one string (text mode is open()'s default)
with open("./data/wiki.txt", encoding="utf-8") as f:
    text: str = f.read()

# Show the character count with thousands separators, then a 500-character excerpt
print(format(len(text), ","))
print(text[20_000:20_500])

178,255,102
's treatment "appalling".


Alanis Morissette

Alanis Nadine Morissette (born June 1, 1974) is a Grammy Award-winning Canadian-American singer and songwriter. She was born in Ottawa, Canada. She began singing in Canada as a teenager in 1990. In 1995, she became popular all over the world.

As a young child in Canada, Morissette began to act on television, including 5 episodes of the long-running series, "You Can't Do That on Television". Her first album was released only in Canada in 1990.

Her 


In [11]:
# Tokenizer: load the SentencePiece model file and report its piece count
sp = spm.SentencePieceProcessor(model_file="./data/wiki_tokenizer.model")
vocab_size = sp.get_piece_size()
print("Vocab Size:", vocab_size)

Vocab Size: 4096


In [18]:
def encode(s: str) -> list[int]:
    """Tokenize a string with the module-level SentencePiece processor ``sp``.

    Args:
        s: Text to tokenize.

    Returns:
        The token ids returned by ``sp.Encode`` for ``s``.
    """
    token_ids = sp.Encode(input=s)
    return token_ids


def decode(n: list[int]) -> str:
    """Turn token ids back into text with the module-level processor ``sp``.

    Args:
        n: Token ids to decode.

    Returns:
        The string returned by ``sp.DecodeIds`` for ``n``.
    """
    decoded_text = sp.DecodeIds(input=n)
    return decoded_text


# Round-trip demo: encode a sample string, then decode the ids that were just
# produced. Decoding the real result (instead of a hard-coded id list copied
# from an earlier run) keeps the demo valid if the tokenizer model changes.
sample_ids = encode(s="Hello World!")
print(sample_ids)
print(decode(sample_ids))

[312, 471, 4037, 870, 36]
Hello World!


In [20]:
# Encoded-text cache: reuse the saved tensor when it exists, otherwise encode
# `text` once and save the result for later runs.
# NOTE(review): the cache is keyed only by this path; it is not refreshed if
# the source text or the tokenizer model changes. Delete the file to re-encode.
encoded_data_path = "./data/encoded_data.pt"

if os.path.exists(path=encoded_data_path):
    print("Loading encoding ...")
    # weights_only=True limits unpickling to tensors and plain containers, so a
    # tampered cache file cannot run arbitrary code. It also gives the same
    # behaviour on torch versions whose default is still weights_only=False.
    data = torch.load(f=encoded_data_path, weights_only=True)
else:
    data = torch.tensor(data=encode(s=text), dtype=torch.long)
    torch.save(obj=data, f=encoded_data_path)

Loading encoding ...


# Lecture 25