© 2025 Yegor Tkachenko. 

Python, Deep Learning, and LLMs: A Crash Course for Complete Beginners.

Selected code from *Chapter 6: Language models for text prediction*.

Version: 2025-09-03. https://python2llms.org. 

In [1]:
import requests # downloads
from bs4 import BeautifulSoup # html parsing
import re # regular expressions for text processing
import textwrap # wrap print output

# run the following code to download the text
# or you can read in the provided version directly

url = 'https://www.gutenberg.org/files/11/11-h/11-h.htm'
response = requests.get(url) # get web site's html
response.encoding = 'utf-8' # correct encoding
html_content = response.text # extract html content
soup = BeautifulSoup(html_content,'html.parser') # parse html
text = soup.get_text() # raw text from html as a string

# remove repeated sequential spaces and newline characters
text = re.sub(' +', ' ', text) 
text = re.sub(r'\n\s*\n', '\n\n', text)

# save the file
with open("alice.txt", "w", encoding="utf-8") as file:
    file.write(text)

# read the saved file
# with open("alice.txt", "r", encoding="utf-8") as file:
#     text = file.read()

print("Text string length:", len(text))
print("Text:",textwrap.fill(text[:600], 58))

Text string length: 146458
Text:   Alice’s Adventures in Wonderland | Project Gutenberg
*** START OF THE PROJECT GUTENBERG EBOOK 11 ***  Alice’s
Adventures in Wonderland by Lewis Carroll THE MILLENNIUM
FULCRUM EDITION 3.0  Contents   CHAPTER I.Down the Rabbit-
Hole   CHAPTER II.The Pool of Tears   CHAPTER III.A
Caucus-Race and a Long Tale   CHAPTER IV.The Rabbit Sends
in a Little Bill   CHAPTER V.Advice from a Caterpillar
CHAPTER VI.Pig and Pepper   CHAPTER VII.A Mad Tea-Party
CHAPTER VIII.The Queen’s Croquet-Ground   CHAPTER IX.The
Mock Turtle’s Story   CHAPTER X.The Lobster Quadrille
CHAPTER XI.Who Stole the Tarts?


In [2]:
vocab = sorted(list(set(text)))
V = len(vocab) # vocabulary size
print("Vocabulary:\n", textwrap.fill(str(vocab), 60))
print("Vocabulary size:", V)

Vocabulary:
 ['\n', '\r', ' ', '!', '(', ')', '*', ',', '-', '.', '0',
'1', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c',
'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|',
'\xa0', 'ù', '—', '‘', '’', '“', '”']
Vocabulary size: 78


In [3]:
# token - integer index mapping
t2i = {tok:i for i,tok in enumerate(vocab)}
i2t = {i:tok for i,tok in enumerate(vocab)}

# functions to convert between integers and strings
str2ind = lambda s: [t2i[t] for t in s] 
ind2str = lambda l: ''.join([i2t[i] for i in l])

In [4]:
print(textwrap.fill(text[1500:1600], 60))

she ought to have wondered at this, but at the time it all
seemed  quite natural); but when the Rabb


In [5]:
print("Index encoding of a string:\n", 
    textwrap.fill(str( str2ind(text[1500:1600]) ), 60))

Index encoding of a string:
 [62, 51, 48, 2, 58, 64, 50, 51, 63, 2, 63, 58, 2, 51, 44,
65, 48, 2, 66, 58, 57, 47, 48, 61, 48, 47, 2, 44, 63, 2, 63,
51, 52, 62, 7, 2, 45, 64, 63, 2, 44, 63, 2, 63, 51, 48, 2,
63, 52, 56, 48, 2, 52, 63, 2, 44, 55, 55, 2, 62, 48, 48, 56,
48, 47, 1, 0, 60, 64, 52, 63, 48, 2, 57, 44, 63, 64, 61, 44,
55, 5, 14, 2, 45, 64, 63, 2, 66, 51, 48, 57, 2, 63, 51, 48,
2, 33, 44, 45, 45]


In [6]:
# integers back to string
print( textwrap.fill(ind2str(str2ind(text[1500:1600])), 60))

she ought to have wondered at this, but at the time it all
seemed  quite natural); but when the Rabb


In [7]:
from collections import Counter

# frequency of adjacent token pairs
def get_freq(sequence):
    pairs = Counter()
    for i in range(len(sequence) - 1):
        pair = (sequence[i], sequence[i + 1])
        pairs[pair] += 1
    return pairs

# merge the most frequent token pair
def merge_tokens(sequence, pair_to_merge):
    a, b = pair_to_merge
    merged_token = a + b
    i = 0
    new_sequence = []
    while i < len(sequence):
        if i < len(sequence) - 1 and sequence[i] == a and sequence[i + 1] == b:
            new_sequence.append(merged_token)
            i += 2
        else:
            new_sequence.append(sequence[i])
            i += 1
    return new_sequence

# example
sequence = "a b r a c a d a b r a _ r a".split()
vocab_bpe = sorted(set(sequence))
V_bpe = 10  # target vocabulary size
merge_rules = [] # to store merge history

print(f"Step 1: {' '.join(sequence)}")
print(f"Vocabulary: {sorted(vocab_bpe)}\n")

step = 2
while len(vocab_bpe) < V_bpe:
    stats = get_freq(sequence)
    if not stats:
        break
    most_common = stats.most_common(1)[0][0]
    merge_rules.append(most_common)
    sequence = merge_tokens(sequence, most_common)
    vocab_bpe.append(most_common[0] + most_common[1])
    print(f"Step {step}: {' '.join(sequence)}")
    print(textwrap.fill(f"Vocabulary: {vocab_bpe}",60)+"\n")
    step += 1

Step 1: a b r a c a d a b r a _ r a
Vocabulary: ['_', 'a', 'b', 'c', 'd', 'r']

Step 2: a b ra c a d a b ra _ ra
Vocabulary: ['_', 'a', 'b', 'c', 'd', 'r', 'ra']

Step 3: ab ra c a d ab ra _ ra
Vocabulary: ['_', 'a', 'b', 'c', 'd', 'r', 'ra', 'ab']

Step 4: abra c a d abra _ ra
Vocabulary: ['_', 'a', 'b', 'c', 'd', 'r', 'ra', 'ab',
'abra']

Step 5: abrac a d abra _ ra
Vocabulary: ['_', 'a', 'b', 'c', 'd', 'r', 'ra', 'ab',
'abra', 'abrac']



In [8]:
# apply learned merge rules to a new sequence
def apply_merges(sequence, merge_rules):
    for a, b in merge_rules:
        i = 0
        new_sequence = []
        while i < len(sequence):
            if i < len(sequence) - 1 and sequence[i] == a and sequence[i + 1] == b:
                new_sequence.append(a + b)
                i += 2
            else:
                new_sequence.append(sequence[i])
                i += 1
        sequence = new_sequence  # update after each rule
    return sequence

print(textwrap.fill("Learned merge rules: "+ 
    str(merge_rules),60))

# tokenize new sequence
new_seq = "a b b r a _ r a".split()
tokenized = apply_merges(new_seq, merge_rules)

print("New text:", ' '.join(new_seq))
print("Tokenized:", tokenized)

Learned merge rules: [('r', 'a'), ('a', 'b'), ('ab', 'ra'),
('abra', 'c')]
New text: a b b r a _ r a
Tokenized: ['ab', 'b', 'ra', '_', 'ra']


In [9]:
# example: "Don't panic" (biè huāng) in Chinese
chi = "别慌"

utf8_bytes = chi.encode('utf-8')
print("UTF-8 bytes:", utf8_bytes)

# convert each byte into an integer token (0-255)
seq = list(utf8_bytes)
print("Byte-level tokens:", seq)

# decode back (for illustration)
decoded = bytes(seq).decode('utf-8')
print("Decoded string:", decoded)

UTF-8 bytes: b'\xe5\x88\xab\xe6\x85\x8c'
Byte-level tokens: [229, 136, 171, 230, 133, 140]
Decoded string: 别慌


In [10]:
import numpy as np

X_raw = np.array([
["H", "e", "l", "l", "o", "."],
["i", "d", " ", "i", "t", "\n"],
["!", " ", "W", "h", "y", "?"],
["a", "r", "e", " ", "y", "o"],
])
print(X_raw)
print(X_raw.shape)

[['H' 'e' 'l' 'l' 'o' '.']
 ['i' 'd' ' ' 'i' 't' '\n']
 ['!' ' ' 'W' 'h' 'y' '?']
 ['a' 'r' 'e' ' ' 'y' 'o']]
(4, 6)


In [11]:
B, S = X_raw.shape
X_ind = np.zeros((B,S), dtype=int)
for b in range(B):
    for s in range(S):
        X_ind[b,s] = t2i[X_raw[b,s]] # token to index

print(X_ind)

[[23 48 55 55 58  9]
 [52 47  2 52 63  0]
 [ 3  2 38 51 68 15]
 [44 61 48  2 68 58]]


In [12]:
np.random.seed(999)

E = 2 # embedding size

# array of all embeddings shaped as
# Alice in Wonderland vocab size V x embedding size E
token_embeddings = np.random.normal(size=(V, E)) 

# array of token indices to array of embeddings
X_emb = token_embeddings[X_ind]
print(X_emb.shape)
print(X_emb)

(4, 6, 2)
[[[ 0.60487573  0.30054313]
  [ 0.10892936 -0.03912716]
  [ 0.43816732  0.18814676]
  [ 0.43816732  0.18814676]
  [ 0.48026069 -1.08525169]
  [ 0.98434258 -0.83544737]]

 [[-0.92767682 -0.71360919]
  [ 0.61275229  0.25506099]
  [-0.26613444 -0.64890071]
  [-0.92767682 -0.71360919]
  [ 0.66565147  1.07865774]
  [ 0.12715784  1.40189088]]

 [[ 1.56626757 -2.09137019]
  [-0.26613444 -0.64890071]
  [ 0.15188388  1.08328312]
  [ 0.07300425  0.73492972]
  [-0.30719913  0.52414483]
  [ 0.55931017  0.4740131 ]]

 [[ 2.0691885  -0.30205227]
  [ 0.04509789  0.8590863 ]
  [ 0.10892936 -0.03912716]
  [-0.26613444 -0.64890071]
  [-0.30719913  0.52414483]
  [ 0.48026069 -1.08525169]]]


In [13]:
token_embeddings[0]

array([0.12715784, 1.40189088])

In [14]:
# average embedding within each sequence
X_avg_emb = np.mean(X_emb, axis=1)

print(X_avg_emb.shape)
print(X_avg_emb)

(4, 2)
[[ 0.50912383 -0.2138316 ]
 [-0.11932108  0.10991509]
 [ 0.29618872  0.01268331]
 [ 0.35502381 -0.11535012]]


In [15]:
np.random.seed(123)

# layer params
W = np.random.normal(size=(E, V)) 
b = np.random.normal(size=(1, V)) 
# note that bias is commonly initialized to zero

# BxV array of logits / utilities
X_logits = X_avg_emb.dot(W) + b

print(X_logits.shape)

(4, 78)


In [16]:
def row_wise_softmax(x):
    # subtract the maximum value for numerical stability
    # this operation does not affect the resulting probability value 
    # because it is equivalent to dividing numerator and denominator
    # by the same exp(max) value
    e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    # divide by the sum of exponentials for normalization
    return e_x / np.sum(e_x, axis=1, keepdims=True)

X_prob = row_wise_softmax(X_logits)

print(X_prob.shape)

# to verify probs. sum to 1 across full vocab.
print(X_prob.sum(1)) 

(4, 78)
[1. 1. 1. 1.]


In [17]:
samples = np.array([
    np.random.choice(X_prob.shape[1], p=row) for row in X_prob
])

print(samples)  # e.g., array([1, 2, 0])

[61  0 10 55]


In [18]:
# row-wise cumulative probabilities
# shape: (batch size B, vocab size V)
cumulative = np.cumsum(X_prob, axis=1)  

# one uniform random number per row
u = np.random.rand(B)

# compare uniform draw with cumulative distribution to sample
samples = (cumulative < u[:, None]).sum(axis=1)

print(samples)

[ 2 50 75 61]


In [19]:
# continuation of sequences in X_raw
y_raw = np.array([
[" "],
["f"],
["\n"],
["u"]
])

# indices
y_ind = np.zeros((B,1), dtype=int)
for b in range(B):
    y_ind[b,0] = t2i[y_raw[b,0]]

# cross entropy for each batch sequence
# i.e., negative log probability assigned by the model 
# to the actual realized next character
CE_s = -np.log(X_prob[np.arange(B), y_ind.flatten()])
print(CE_s)

# average cross entropy
CE = np.mean(CE_s)
print(CE)

[3.61284783 5.0697951  4.55075799 5.66530036]
4.724675319705576


In [21]:
import torch
import torch.nn as nn
from torch.nn import functional as F

if torch.cuda.is_available():
    device = 'cuda' # gpu
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

print("Fastest available hardware for deep learning:", device)

Fastest available hardware for deep learning: mps


In [22]:
# integer value pytorch tensor 
# containing integer encoding of the full 
# Alice in Wonderland text
data = torch.tensor(str2ind(text), dtype=torch.long)  
n = int(0.8*data.shape[0])
data_train = data[:n]
data_test = data[n:]

print("First 5 character indices:", data_train[:5])

First 5 character indices: tensor([ 0,  0, 16, 55, 52])


In [23]:
B = 4 # batch size - number of token sequences
S = 6 # input sequence length

def get_batch(split):
    # get text sequences from random starting points
    if split == 'train':
        d = data_train
    if split == "test":
        d = data_test

    # sampling starting points for sequences 
    start_ind = torch.randint(d.shape[0] - S, (B,)) 
    # (d.shape[0] - S) is 1 above the largest integer to be drawn

    X_ind = torch.stack([d[i:i+S] for i in start_ind])
    y_ind = torch.stack([d[i+S] for i in start_ind])
    X_ind, y_ind = X_ind.to(device), y_ind.to(device)
    return X_ind, y_ind

get_batch('train')

(tensor([[62, 64, 61, 59, 61, 52],
         [ 2, 76, 27, 48, 63,  2],
         [48,  2, 53, 64, 56, 59],
         [58, 61,  2, 63, 51, 48]], device='mps:0'),
 tensor([62, 64, 52,  1], device='mps:0'))

In [24]:
# random seed
torch.manual_seed(999)

# V is Alice in Wonderland vocabulary size
B = 4  # batch size - number of token sequences
S = 8  # input sequence length
E = 16 # embedding size

# model definition
class AvgEmbeddingModel(nn.Module):

    def __init__(self, V, E):
        super().__init__() 
        # this calls the __init__ of the parent class nn.Module

        # V x E array of all embedding vectors
        self.token_embeddings = nn.Embedding(V, E)

        # a fully-connected layer of V neurons
        # each neuron takes E input values
        self.output_layer = nn.Linear(E, V)

    def forward(self, X_ind):
        # X_ind is B x S array 
        # sequences of integers
      
        # X_emb is B x S x E array
        # sequences of embeddings
        X_emb = self.token_embeddings(X_ind)  

        # X_emb_avg is B x E array
        # average within-sequence embeddings
        X_emb_avg = torch.mean(X_emb, 1)

        # X_logits is B x V array
        # utilities / logits for possible next characters
        # row-wise softmax would generate probabilities
        X_logits = self.output_layer(X_emb_avg)
        return X_logits

    def next_token_prob(self, X_logits):
        # X_prob is B x V array of probabilities
        # for possible next characters
        X_prob = F.softmax(X_logits, dim=-1) # B x V
        return X_prob

    def loss(self, X_logits, y_ind):
        # cross-entropy loss
        return F.cross_entropy(X_logits, y_ind)

net = AvgEmbeddingModel(V, E)
net = net.to(device)

In [25]:
print('Number of trainable parameters (embeddings + output layer)\n in AvgEmbeddingModel:', 
  sum(p.numel() for p in net.parameters()))

Number of trainable parameters (embeddings + output layer)
 in AvgEmbeddingModel: 2574


In [26]:
import time

# training settings
learning_rate = 1e-2 # 0.01
max_iters = 3000
eval_interval = 300
eval_iters = 200

# AdamW optimizer
optimizer = torch.optim.AdamW(net.parameters(), lr=learning_rate)

# train and test set evaluation
@torch.no_grad() # this decorator disables gradient calculation within the function
def evaluate(net):
    record = {}
    net.eval() # evaluation mode (e.g., turns off dropout when present)
    for split in ['train', 'test']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X_ind, Y_ind = get_batch(split)
            X_logits = net(X_ind)
            loss = net.loss(X_logits, Y_ind)
            losses[k] = loss.item()
        record[split] = losses.mean()
    net.train() # train mode
    return record

# training loop
start_time = time.time()
for i in range(max_iters):
    
    # regularly evaluate the loss on train and test sets
    if i % eval_interval == 0 or i == max_iters - 1:
        eval_ = evaluate(net)
        print(f"Step {i}: "
              f"Train loss = {eval_['train']:.4f}, "
              f"Test loss = {eval_['test']:.4f}")

    # sample a batch of data
    X_batch, y_batch = get_batch('train')

    # evaluate the loss and update the parameters
    X_logits = net(X_batch)
    loss = net.loss(X_logits, y_batch)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

elapsed = time.time() - start_time
print(f"\n\t Elapsed time = {elapsed/60:.2f} mins")

Step 0: Train loss = 4.4215, Test loss = 4.3955


Step 300: Train loss = 3.0962, Test loss = 3.1353


Step 600: Train loss = 3.1422, Test loss = 3.1990


Step 900: Train loss = 3.0638, Test loss = 3.0082


Step 1200: Train loss = 2.9821, Test loss = 3.0287


Step 1500: Train loss = 2.8936, Test loss = 3.0107


Step 1800: Train loss = 3.0589, Test loss = 3.0315


Step 2100: Train loss = 2.9676, Test loss = 2.9623


Step 2400: Train loss = 2.9257, Test loss = 3.0203


Step 2700: Train loss = 2.8846, Test loss = 2.9998


Step 2999: Train loss = 2.9716, Test loss = 2.9562

	 Elapsed time = 0.32 mins


In [27]:
print("Final test set cross-entropy (CE):", 
    float(f"{eval_['test']:.4f}"))
print("Probability of picking the correct next token: ")
print("> Neural net (exp(-CE)):", 
    float(f"{torch.exp(-eval_['test']):.4f}"))
print("> Uniform random guess (1/V):", round(1/V,4))

Final test set cross-entropy (CE): 2.9562
Probability of picking the correct next token: 
> Neural net (exp(-CE)): 0.052
> Uniform random guess (1/V): 0.0128


In [28]:
def generate_text(net, n_new_tokens):
    X_ind = torch.zeros((1, 1), dtype=torch.long, device=device)
    for i in range(n_new_tokens):
        # crop to sequence size
        X_ind_crop = X_ind[:, -S:] # 1 x S
        X_logits = net(X_ind_crop) # 1 x V
        X_prob = net.next_token_prob(X_logits) # 1 x V
        X_ind_next = torch.multinomial(X_prob, num_samples=1) # 1 x 1
        X_ind = torch.cat((X_ind, X_ind_next), dim=1) # 1 x S+1
    return ind2str(X_ind[0].tolist())

net.eval()
s = generate_text(net, n_new_tokens=500)
print(textwrap.fill(s, 60))

 Areie se npryataw  nidtha ,sgeetil We hens mof uoot aotn—ug
Wlhieatigs etb hhro seo natpd ?h tet olnaadtak nnatlheat”! ’
tohdahey rhws  ehWeh pesi  oano tu e mhhMsoeok sjn—dhee nbas
Ao e  sofhsol r entihoH use,laan  niatgmhlel r ouosa
irupnstwe ehrerud lo, hb etrrobe“f  hror mefelann role
oonttohi,: a wts tnae ihttdr Kli iivttns si aaatnins yohe
aAt tseireon w siomei nt eshd“a xertpab esitrfs eha out‘vnss
a g  i“ygo ‘e’’ oroattae peyntt oehse inthtoeng zin
TEeithir  nta  edohont ehpa rohtsi


In [29]:
def my_decorator(func):
    def wrapper():
        print("Something happens before func runs")
        func()
        print("Something happens after func runs")
    return wrapper

@my_decorator
def say_hello():
    print("Hello!")

say_hello()

Something happens before func runs
Hello!
Something happens after func runs


In [30]:
# toy example
torch.manual_seed(999)

B = 4 # batch size (number of sequences)
S = 6 # sequence length
E = 2 # embedding vector length

# random batch - sequences of embeddings
X_emb = torch.randn(B, S, E) 

print("Random batch:", X_emb)
print("Batch shape:", X_emb.shape)

Random batch: tensor([[[-0.2528,  1.4072],
         [ 0.2910,  1.0365],
         [-0.9816, -3.4219],
         [ 1.4910,  0.2422],
         [ 1.4832, -0.3704],
         [ 0.0941,  2.1528]],

        [[ 0.6271, -1.1666],
         [-0.7862,  0.0759],
         [-0.0086, -0.6568],
         [-1.0011,  0.2992],
         [ 0.6396, -1.0857],
         [-1.6153,  1.5635]],

        [[-1.7952,  0.6095],
         [-0.7203,  0.6119],
         [ 0.3259, -1.6059],
         [-0.5272,  0.3401],
         [-1.3832,  1.1149],
         [-0.7776,  0.2738]],

        [[ 0.9147, -1.1896],
         [-0.7501, -1.5465],
         [ 1.0044, -0.0986],
         [ 1.3962, -0.9138],
         [-1.1788, -0.6681],
         [-0.3168,  0.9893]]])
Batch shape: torch.Size([4, 6, 2])


In [31]:
X_emb_avg = torch.mean(X_emb, dim=1) # B x E
X_emb_avg

tensor([[ 0.3542,  0.1744],
        [-0.3575, -0.1618],
        [-0.8129,  0.2240],
        [ 0.1783, -0.5712]])

In [32]:
# weights - vector of 6 scalars of 1/6
w = torch.ones(S) / S

# dot product
X_emb_avg = w @ X_emb 
X_emb_avg

# due to PyTorch broadcasting along batch dimension:
# S  @  B x S x E  >>>  B x S  @  B x S x E  >>>  B x E

tensor([[ 0.3542,  0.1744],
        [-0.3575, -0.1618],
        [-0.8129,  0.2240],
        [ 0.1783, -0.5712]])

In [33]:
w = torch.ones(S)

# softmax - gives equal probabilities
w = torch.exp(w)
w = w / w.sum()

# broadcasting magic again
# S  @  B x S x E  >>>  B x E
X_emb_avg = w @ X_emb 
X_emb_avg

tensor([[ 0.3542,  0.1744],
        [-0.3575, -0.1618],
        [-0.8129,  0.2240],
        [ 0.1783, -0.5712]])

In [34]:
torch.manual_seed(999)

B, S, E = 4, 6, 2

# input data
X_emb = torch.randn(B, S, E)

# attention parameters
m_1 = m_2 = E # setting key/query and value matrix sizes
W_q = torch.randn(E, m_1)
W_k = torch.randn(E, m_1)
W_v = torch.randn(E, m_2)

# attention forward computation
X_q = X_emb @ W_q  # B x S x m_1
X_k = X_emb @ W_k  # B x S x m_1
X_v = X_emb @ W_v  # B x S x m_2

W = X_q @ X_k.transpose(-2, -1) # B x S x S
W = W / (m_1**0.5)
W = torch.softmax(W, dim=-1)

X_out = W @ X_v # B x S x m_2  
print(X_out.shape)

torch.Size([4, 6, 2])


In [35]:
torch.manual_seed(999)
get_batch('train')

(tensor([[46, 44, 57, 75, 63,  2],
         [61, 62, 48, 55, 49,  1],
         [62,  2, 44, 62,  2, 62],
         [77,  2, 16, 55, 52, 46]], device='mps:0'),
 tensor([63,  0, 63, 48], device='mps:0'))

In [36]:
torch.manual_seed(999)

B = 4    # number of token sequences
S = 6    # input sequence length

def get_batch(split):
    # get text sequences from random starting points
    if split == 'train':
        d = data_train
    if split == 'test':
        d = data_test

    start_ind = torch.randint(d.shape[0] - S, (B,)) 
    X_ind = torch.stack([d[i:i+S] for i in start_ind])
    
    # NOTE THE CHANGE:
    Y_ind = torch.stack([d[i+1:i+S+1] for i in start_ind]) 

    X_ind, Y_ind = X_ind.to(device), Y_ind.to(device)
    return X_ind, Y_ind

X_ind, Y_ind = get_batch('train')

print("X_ind:",X_ind)
print("Y_ind:",Y_ind)

X_ind: tensor([[46, 44, 57, 75, 63,  2],
        [61, 62, 48, 55, 49,  1],
        [62,  2, 44, 62,  2, 62],
        [77,  2, 16, 55, 52, 46]], device='mps:0')
Y_ind: tensor([[44, 57, 75, 63,  2, 63],
        [62, 48, 55, 49,  1,  0],
        [ 2, 44, 62,  2, 62, 63],
        [ 2, 16, 55, 52, 46, 48]], device='mps:0')


In [37]:
X_ind[0,:3] # first 3 elements in the first sampled sequence
Y_ind[0,3]  # 4th element in that sequence, a valid prediction target for the first sequence
X_ind[0,4]  # this is the same as Y_ind[0,3]

tensor(63, device='mps:0')

In [38]:
torch.manual_seed(999)
B, S, E = 4, 6, 2  # batch size, sequence length, embd size
X_emb = torch.randn(B, S, E)
X_emb.shape

torch.Size([4, 6, 2])

In [39]:
X_emb[0] # first sequence of embeddings

tensor([[-0.2528,  1.4072],
        [ 0.2910,  1.0365],
        [-0.9816, -3.4219],
        [ 1.4910,  0.2422],
        [ 1.4832, -0.3704],
        [ 0.0941,  2.1528]])

In [40]:
for i in range(S): 
    print(torch.mean(X_emb[0][:i+1,:],0))

tensor([-0.2528,  1.4072])
tensor([0.0191, 1.2218])
tensor([-0.3144, -0.3261])
tensor([ 0.1369, -0.1840])
tensor([ 0.4062, -0.2213])
tensor([0.3542, 0.1744])


In [41]:
# initialize the lower triangular matrix
tril = torch.tril( torch.ones(S, S) )
tril

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])

In [42]:
# normalize its rows to sum to 1
W = tril / tril.sum(1, keepdim=True)
W

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667]])

In [43]:
# dot product to get the averages
W @ X_emb[0]

tensor([[-0.2528,  1.4072],
        [ 0.0191,  1.2218],
        [-0.3144, -0.3261],
        [ 0.1369, -0.1840],
        [ 0.4062, -0.2213],
        [ 0.3542,  0.1744]])

In [44]:
W = torch.ones(S, S) # arbitrary weight matrix
mask = torch.tril(torch.ones(S, S))
W = W.masked_fill(mask == 0, float('-inf')) 
W

tensor([[1., -inf, -inf, -inf, -inf, -inf],
        [1., 1., -inf, -inf, -inf, -inf],
        [1., 1., 1., -inf, -inf, -inf],
        [1., 1., 1., 1., -inf, -inf],
        [1., 1., 1., 1., 1., -inf],
        [1., 1., 1., 1., 1., 1.]])

In [45]:
W = torch.softmax(W, dim=-1)
W

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667]])

In [46]:
W @ X_emb[0]

tensor([[-0.2528,  1.4072],
        [ 0.0191,  1.2218],
        [-0.3144, -0.3261],
        [ 0.1369, -0.1840],
        [ 0.4062, -0.2213],
        [ 0.3542,  0.1744]])

In [47]:
torch.manual_seed(999)

B, S, E = 4, 6, 2

# input data
X_emb = torch.randn(B, S, E)

# attention parameters
m_1 = m_2 = E # setting key/query and value matrix sizes
W_q = torch.randn(E, m_1)
W_k = torch.randn(E, m_1)
W_v = torch.randn(E, m_2)

# attention forward computation
X_q = X_emb @ W_q  # B x S x m_1
X_k = X_emb @ W_k  # B x S x m_1
X_v = X_emb @ W_v  # B x S x m_2

W = X_q @ X_k.transpose(-2, -1)  # B x S x S
W = W / (m_1**0.5)

# causal filter

# (S, S) lower triangular mask
mask = torch.tril(torch.ones(S, S))  
W = W.masked_fill(mask == 0, float('-inf'))
W = torch.softmax(W, dim=-1)

# the mask needs to be broadcasted to (B, S, S) if necessary
# where mask is 0, set to negative infty >>> 0 after exp() 

X_out = W @ X_v  # B x S x m_2  
print(X_out.shape)

torch.Size([4, 6, 2])


In [48]:
B = 64 # batch size -- number of sequences
S = 256 # sequence length, tokens per sequence
E = 64 # embedding size per token

n_heads = 4 # number of parallel attention heads
n_layers = 4 # number of transformer blocks
dropout = 0.2 # dropout probability (regularization)

In [49]:
class Attention(nn.Module):
    # causal self-attention module

    def __init__(self, m):
        super().__init__()
        self.m = m # m_1 = m_2

        self.key = nn.Linear(E, m, bias=False) # W_k
        self.query = nn.Linear(E, m, bias=False) # W_q
        self.value = nn.Linear(E, m, bias=False) # W_v
        
        # create a non-trainable lower triangular matrix
        self.register_buffer('tril', 
            torch.tril(torch.ones(S, S)))
        self.dropout = nn.Dropout(dropout) # regularization

    def forward(self, X_emb):
        B, S, m = X_emb.shape 

        X_k = self.key(X_emb)   # B x S x m
        X_q = self.query(X_emb) # B x S x m
        X_v = self.value(X_emb) # B x S x m

        # B x S x m  @  B x m x S  >>>  B x S x S
        W = X_q @ X_k.transpose(-2,-1) / self.m**0.5
        
        W = W.masked_fill(self.tril[:S, :S] == 0, 
                float('-inf'))   # B x S x S
        W = F.softmax(W, dim=-1) # B x S x S
        W = self.dropout(W)
        
        # B x S x S  @  B x S x m  >>>  B x S x m
        X_out = W @ X_v
        return X_out

In [50]:
class MultiHeadAttention(nn.Module):
    # group of attention heads running in parallel

    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([
            Attention(head_size) for i in range(n_heads)
        ])
        self.lin = nn.Linear(E, E)
        self.dropout = nn.Dropout(dropout)

    def forward(self, X_emb):
        X_out = torch.cat([
            h(X_emb) for h in self.heads
        ], dim=-1) # B x S x E
        X_out = self.dropout(self.lin(X_out))
        return X_out

In [51]:
class FeedForward(nn.Module):
    # neural net of two fully connected layers 

    def __init__(self, E):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(E, 4*E),
            nn.ReLU(),
            nn.Linear(4*E, E),
            nn.Dropout(dropout))

    def forward(self, X_emb):
        return self.layers(X_emb)

In [52]:
class LayerNorm(nn.Module):
    # layer normalization

    def __init__(self, D, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(D))
        self.bias = nn.Parameter(torch.zeros(D))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, unbiased=False, keepdim=True)
        x_norm = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * x_norm + self.bias

In [53]:
class TransformerBlock(nn.Module):
    # key transformer's building block 

    def __init__(self, E, n_heads):
        super().__init__()
        # E and n_heads should be selected 
        # so that E / n_heads has 0 remainder
        head_size = E // n_heads
        self.mha = MultiHeadAttention(n_heads, head_size)
        self.ffwd = FeedForward(E)
        self.lnorm1 = LayerNorm(E)
        self.lnorm2 = LayerNorm(E)

    def forward(self, X_emb):
        # skip connections via addition of input to output
        X_emb = X_emb + self.mha(self.lnorm1(X_emb)) 
        X_emb = X_emb + self.ffwd(self.lnorm2(X_emb))
        return X_emb

In [54]:
class Transformer(nn.Module):
    # a basic transformer architecture LLM

    def __init__(self, V, E):
        super().__init__()
        
        # an embedding for each token
        self.token_embeddings = nn.Embedding(V, E)

        # and an embedding for each sequence position
        self.position_embeddings = nn.Embedding(S, E)

        # sequence of repeated transformer blocks
        self.blocks = nn.Sequential(*[TransformerBlock(E, 
            n_heads=n_heads) for i in range(n_layers)
        ])

        # output layer
        self.output_lnorm = LayerNorm(E) 
        self.output_layer = nn.Linear(E, V) 

    def forward(self, X_ind):
        B, S = X_ind.shape
        tok_emb = self.token_embeddings(X_ind) # B x S x E
        pos_emb = self.position_embeddings(
            torch.arange(S, device=device)) # S x E
        X_emb = tok_emb + pos_emb # B x S x E
        X_emb = self.blocks(X_emb) # B x S x E
        X_emb = self.output_lnorm(X_emb) # B x S x E
        X_logits = self.output_layer(X_emb) # B x S x V
        return X_logits

    def loss(self, X_logits, Y_ind):
        # X_logits is B x S x V; Y_ind is B x S
        return F.cross_entropy(
                    X_logits.view(-1, X_logits.shape[-1]), 
                    Y_ind.view(-1))

    def next_token_prob(self, X_logits):
        # X_logits is B x S x V
        X_logits = X_logits[:, -1, :] # B x V 
        X_prob = F.softmax(X_logits, dim=-1) # B x V
        return X_prob

In [55]:
def init_weights(module):
    if isinstance(module, (nn.Linear, nn.Embedding)):
        nn.init.normal_(module.weight, mean=0.0, std=0.02)
        if hasattr(module, 'bias') and module.bias is not None:
            nn.init.zeros_(module.bias)
    elif isinstance(module, LayerNorm):
        nn.init.ones_(module.scale)
        nn.init.zeros_(module.bias)

In [56]:
torch.manual_seed(999)
net = Transformer(V, E)
net.apply(init_weights)
net = net.to(device)

In [57]:
print(sum(p.numel() for p in net.parameters())/1e6, 'million parameters')

0.225742 million parameters


In [58]:
max_iters = 30000
eval_interval = 3000
learning_rate = 1e-4 # 0.0001

optimizer = torch.optim.AdamW(net.parameters(), 
    lr=learning_rate, weight_decay=0.1)

start_time = time.time()
for i in range(max_iters):
    
    # evaluation
    if i % eval_interval == 0 or i == max_iters - 1:
        # elapsed time in seconds
        elapsed = time.time() - start_time  
        # reusing the eval function from earlier
        eval_ = evaluate(net) 
        print(f"Step {i}: "
            f"Train loss = {eval_['train']:.4f}, "
            f"Test loss = {eval_['test']:.4f}, " 
            f"\n\t Elapsed time = {elapsed/60:.2f} mins, "
            f"{100*(i+1)/max_iters:.0f} % complete\n")

    # training
    X_batch, Y_batch = get_batch('train')
    X_logits = net(X_batch)
    loss = net.loss(X_logits, Y_batch)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Step 0: Train loss = 4.3803, Test loss = 4.3808, 
	 Elapsed time = 0.00 mins, 0 % complete



Step 3000: Train loss = 1.6440, Test loss = 1.7307, 
	 Elapsed time = 5.19 mins, 10 % complete



Step 6000: Train loss = 1.2971, Test loss = 1.5063, 
	 Elapsed time = 9.55 mins, 20 % complete



Step 9000: Train loss = 1.1784, Test loss = 1.4534, 
	 Elapsed time = 13.89 mins, 30 % complete



Step 12000: Train loss = 1.1093, Test loss = 1.4398, 
	 Elapsed time = 18.21 mins, 40 % complete



Step 15000: Train loss = 1.0628, Test loss = 1.4304, 
	 Elapsed time = 22.49 mins, 50 % complete



Step 18000: Train loss = 1.0273, Test loss = 1.4260, 
	 Elapsed time = 26.84 mins, 60 % complete



Step 21000: Train loss = 1.0010, Test loss = 1.4223, 
	 Elapsed time = 31.12 mins, 70 % complete



Step 24000: Train loss = 0.9772, Test loss = 1.4253, 
	 Elapsed time = 35.43 mins, 80 % complete



Step 27000: Train loss = 0.9587, Test loss = 1.4209, 
	 Elapsed time = 39.76 mins, 90 % complete



Step 29999: Train loss = 0.9412, Test loss = 1.4215, 
	 Elapsed time = 44.12 mins, 100 % complete



In [59]:
print("Final test set cross-entropy (CE):", 
    float(f"{eval_['test']:.4f}"))
print("Probability of picking the correct next token: ")
print("> Neural net (exp(-CE)):", 
    float(f"{torch.exp(-eval_['test']):.4f}"))
print("> Uniform random guess (1/V):", round(1/V,4))

Final test set cross-entropy (CE): 1.4215
Probability of picking the correct next token: 
> Neural net (exp(-CE)): 0.2413
> Uniform random guess (1/V): 0.0128


In [60]:
net.eval()  # turns off dropout
s = generate_text(net, n_new_tokens=500)
print(textwrap.fill(s, 60))

 “Exactly,” For said Alice; “but what to know.”   And what
people half is Canto much happening and from like, but it
was it back  out out like to fall on, “I’ll at they but her
way I do is to my hear now.”   “See is their right the right
to she left!” exclas she King the Queen, turning at  here
way good outt.   “The exest of a shome gimmpent?” said
Alice, aloud: “what o the Queen into of  reaces.”   “Yes, as
you might to livery good round for to that do,” the Lory its
make  out of she had not sai


In [61]:
# only saving weights / parameter values
PATH = "./llm_weights.pth"
torch.save(net.state_dict(), PATH) 

In [62]:
#| output: false

net = Transformer(V, E)
net.load_state_dict(torch.load(PATH, 
    map_location=device, weights_only=True))
net = net.to(device)
net.eval()

Transformer(
  (token_embeddings): Embedding(78, 64)
  (position_embeddings): Embedding(256, 64)
  (blocks): Sequential(
    (0): TransformerBlock(
      (mha): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Attention(
            (key): Linear(in_features=64, out_features=16, bias=False)
            (query): Linear(in_features=64, out_features=16, bias=False)
            (value): Linear(in_features=64, out_features=16, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (lin): Linear(in_features=64, out_features=64, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=64, out_features=256, bias=True)
          (1): ReLU()
          (2): Linear(in_features=256, out_features=64, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (lnorm1): LayerNorm()
      (lnorm2): LayerNorm()
    )
    (

In [63]:
torch.manual_seed(999)

X_ind, Y_ind = get_batch('train') # both are B x S 

# as an example,
# we form initial token index sequence x 
# and y_w and y_l sequences of token inds that follow x
# y_w > y_l

x = X_ind[0,:24].unsqueeze(0) # 1 x 24
y_w = X_ind[0,24:].unsqueeze(0) # 1 x (S-24)
y_l = X_ind[1,24:].unsqueeze(0) # 1 x (S-24)

# y_w follows x in the text, so should be more likely 
# y_l does not follow x, so should be less likely
net.train()
y_w_logits = net(torch.cat((x,y_w),1))[:, 24:]
y_l_logits = net(torch.cat((x,y_l),1))[:, 24:]

print(y_w.shape) # 1 x (S-24) 
print(y_w_logits.shape) # 1 x (S-24) x V

torch.Size([1, 232])
torch.Size([1, 232, 78])


In [64]:
# negative cross entropy
r_w = -net.loss(y_w_logits, y_w)
r_l = -net.loss(y_l_logits, y_l)

print("r_w =", r_w.item())
print("r_l =", r_l.item())

r_w = -8.336804389953613
r_l = -8.146387100219727


In [65]:
simpo_loss = -F.logsigmoid(r_w - r_l)

optimizer_simpo = torch.optim.AdamW(net.parameters(), 
    lr=1e-6, weight_decay=0.1) # small learning rate

optimizer_simpo.zero_grad()
simpo_loss.backward() # accumulates back-propagated gradient
optimizer_simpo.step() # parameter update (fine-tuning)

print("SimPO loss:", simpo_loss.item())

SimPO loss: 0.7928813695907593


In [66]:
import copy

# creating reference model
net_ref = copy.deepcopy(net).to(device)

# freezing the reference model
net_ref.eval()
for p in net_ref.parameters():
    p.requires_grad = False

# forward pass under theta and reference models
net.train()
y_w_logits = net(torch.cat((x, y_w), 1))[:, 24:]
y_l_logits = net(torch.cat((x, y_l), 1))[:, 24:]
y_w_logits_ref = net_ref(torch.cat((x, y_w), 1))[:, 24:]
y_l_logits_ref = net_ref(torch.cat((x, y_l), 1))[:, 24:]

# compute per-token average log-probs = negative CE
r_w_theta = -net.loss(y_w_logits, y_w) # theta reward for y_w
r_l_theta = -net.loss(y_l_logits, y_l) # theta reward for y_l
r_w_ref   = -net_ref.loss(y_w_logits_ref, y_w)  # reference reward for y_w
r_l_ref   = -net_ref.loss(y_l_logits_ref, y_l)  # reference reward for y_l

# DPO hyperparameter
beta = 0.1

# DPO y_w and y_l rewards
r_w = beta*(r_w_theta - r_w_ref)
r_l = beta*(r_l_theta - r_l_ref)

# DPO loss
dpo_loss = -F.logsigmoid(r_w - r_l)

# optimizer
optimizer_dpo = torch.optim.AdamW(net.parameters(), lr=1e-6, weight_decay=0.1)

# backprop and update
optimizer_dpo.zero_grad()
dpo_loss.backward() # this step could be repeated multiple times to accumulate gradient across compared sequences
optimizer_dpo.step() # only updates theta net parameters, not reference

print("DPO loss:", dpo_loss.item())

DPO loss: 0.6938948035240173


In [67]:
# !pip install datasets
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict

data = load_dataset("jondurbin/py-dpo-v0.1")

# alternatively, load in the file directly
# url = ("https://huggingface.co/datasets/jondurbin/"
#          "py-dpo-v0.1/resolve/main/py-dpo.parquet"
#          )
# d = pd.read_parquet(url)
# d = Dataset.from_pandas(d, split="train")
# data = DatasetDict()
# data['train'] = d

# single row as an example
data['train'].to_pandas().head(1).T 

Unnamed: 0,0
prompt,Use the function to debug the given program an...
chosen,One possible solution to prevent the segmentat...
rejected,def debug_program(arr):\n n = len(arr)\n ...
id,8c94f83f-6a5a-5f8c-98a2-e242d7764938


In [68]:
# !pip install transformers==4.52.3
# specific version set for reproducibility
from transformers import pipeline, set_seed

model_name = "Qwen/Qwen2.5-0.5B-Instruct"

set_seed(99)

generator = pipeline('text-generation', 
        model=model_name, device=device)

prompt = "Give me a Python function to compute sample standard deviation based on a list of numbers, without numpy. Use [1, 2, 3, 4, 5] as an example."
messages = [
    {"role": "user", "content": prompt},
]

output = generator(
    messages,
    max_new_tokens=256
)

Device set to use mps


In [69]:
print(textwrap.fill(str(output), 60))

[{'generated_text': [{'role': 'user', 'content': 'Give me a
Python function to compute sample standard deviation based
on a list of numbers, without numpy. Use [1, 2, 3, 4, 5] as
an example.'}, {'role': 'assistant', 'content': "To compute
the sample standard deviation of a list of numbers in Python
without using `numpy`, you can follow these steps:\n\n1.
Sort the list.\n2. Calculate the mean of the list.\n3.
Subtract the mean from each element and square the
result.\n4. Sum all the squared differences.\n5. Divide the
sum by the number of elements minus one (n-1) to get the
variance.\n6. Take the square root of the variance to get
the standard deviation.\n\nHere's how you can implement this
in Python:\n\n```python\ndef
calculate_sample_std_dev(numbers):\n    # Step 1: Sort the
list\n    sorted_numbers = sorted(numbers)\n    \n    # Step
2: Calculate the mean of the list\n    n =
len(sorted_numbers)\n    mean = sum(sorted_numbers[:n]) /
n\n    \n    # Step 3: Calculate the sum of squared

In [70]:
s = output[0]['generated_text'][1]['content']
for si in s.split("\n"):
    print(textwrap.fill(si, 60))

To compute the sample standard deviation of a list of
numbers in Python without using `numpy`, you can follow
these steps:

1. Sort the list.
2. Calculate the mean of the list.
3. Subtract the mean from each element and square the
result.
4. Sum all the squared differences.
5. Divide the sum by the number of elements minus one (n-1)
to get the variance.
6. Take the square root of the variance to get the standard
deviation.

Here's how you can implement this in Python:

```python
def calculate_sample_std_dev(numbers):
    # Step 1: Sort the list
    sorted_numbers = sorted(numbers)

    # Step 2: Calculate the mean of the list
    n = len(sorted_numbers)
    mean = sum(sorted_numbers[:n]) / n

    # Step 3: Calculate the sum of squared differences from
the mean
    total_sum_of_squares = sum((x - mean) ** 2 for x in
sorted_numbers)

    # Step 4: Calculate the standard deviation
    std_deviation = total_sum_of_squares / (n - 1)

    return std_deviation

# Example usage:
numbers_list =

In [71]:
# !pip install trl
from trl import DPOConfig, DPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer

set_seed(99)

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct", 
            torch_dtype=torch.float32).to('cpu') 
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
train_dataset = load_dataset("jondurbin/py-dpo-v0.1", split="train")

training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO", 
                    fp16=False, bf16=False, use_cpu=True, 
                    num_train_epochs=1, max_steps=10, logging_steps=1,
                    per_device_train_batch_size=4, 
                    gradient_accumulation_steps=4)
trainer = DPOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)

start_time = time.time()
train_output = trainer.train()
elapsed = time.time() - start_time 

Step,Training Loss
1,0.6931
2,0.6511
3,0.6238
4,0.6046
5,0.6112
6,0.5573
7,0.5305
8,0.6018
9,0.562
10,0.5487


In [72]:
print(f"Total elapsed time = {elapsed/60:.2f} mins")
print(f"Last update training loss: {trainer.state.log_history[-2].get('loss')}")

Total elapsed time = 18.29 mins
Last update training loss: 0.5487


In [73]:
set_seed(99)

generator = pipeline('text-generation', 
        model=model, tokenizer=tokenizer, device=device)

output = generator(
    messages,
    max_new_tokens=256
)

Device set to use mps


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.




In [74]:
s = output[0]['generated_text'][1]['content']
for si in s.split("\n"):
    print(textwrap.fill(si, 60))

Sure! Here's a Python function that computes the sample
standard deviation using only basic arithmetic operations:
```python
import math

def sample_std_dev(numbers):
    mean = sum(numbers) / len(numbers)
    variance = sum((x - mean) ** 2 for x in numbers) /
(len(numbers) - 1)
    return math.sqrt(variance)

numbers = [1, 2, 3, 4, 5]
print(sample_std_dev(numbers))
```

This function first calculates the mean of the input list by
adding up all the numbers and dividing by the number of
elements in the list. It then calculates the variance by
subtracting each number from its mean and squaring the
result. The variance is then divided by the square root of
the number of elements in the list minus one, which gives us
the sample standard deviation.
Note that this implementation assumes that the input list
contains only integers. If you want to handle floating-point
numbers as well, you'll need to modify the code accordingly.
