# Quantize the Model
An experiment in quantizing a model in-place, leaving it in floating point, with the goals:
* Make logits consistent across environments (versions of cuda, cpu, etc)
* Preserve good compression on text
* Allow application to new models

In [1]:
from lac_llm import *

In [2]:
cuda_device = 'cuda:2'

In [3]:
make_torch_deterministic()

def about(in_x):
    x = in_x.to(torch.float64)
    print(f"{x.shape}, {in_x.dtype}, ({x.max()}, {x.mean()}, {x.min()}), {(x*x).mean(dtype=torch.float32):.3e}")
    #print(f"{(x*x).sum(dtype=torch.float32):.3e}, {x.norm():.3e}, {torch.linalg.vector_norm(x):.3e}, {(x.to(dtype=torch.float64)*x.to(dtype=torch.float64)).sum(dtype=torch.float64):.3e}")

#### Examples

In [4]:
about((torch.arange(11)-5.0)/2.5)

torch.Size([11]), torch.float32, (2.0, 0.0, -2.0), 1.600e+00, cpu


In [5]:
#![binary32](https://upload.wikimedia.org/wikipedia/commons/thumb/d/d2/Float_example.svg/1180px-Float_example.svg.png)

https://en.wikipedia.org/wiki/Single-precision_floating-point_format

![binary32](https://upload.wikimedia.org/wikipedia/commons/thumb/d/d2/Float_example.svg/590px-Float_example.svg.png)

#### Examples

In [6]:
s = '012345678abcdefghijklmnopqrstuvw'
f32_delimit(s)

'0|12345678|abcdefghijklmnopqrstuvw'

In [7]:
binstack((torch.arange(11)-5.0)/2.5)

['1|10000000|00000000000000000000000',
 '1|01111111|10011001100110011001101',
 '1|01111111|00110011001100110011010',
 '1|01111110|10011001100110011001101',
 '1|01111101|10011001100110011001101',
 '0|00000000|00000000000000000000000',
 '0|01111101|10011001100110011001101',
 '0|01111110|10011001100110011001101',
 '0|01111111|00110011001100110011010',
 '0|01111111|10011001100110011001101',
 '0|10000000|00000000000000000000000']

In [8]:
binstack(torch.tensor([-1]))

['1111111111111111111111111111111111111111111111111111111111111111']

In [9]:
binstack(torch.arange(9, dtype=torch.float64).reshape(3,3))

['0000000000000000000000000000000000000000000000000000000000000000',
 '0011111111110000000000000000000000000000000000000000000000000000',
 '0100000000000000000000000000000000000000000000000000000000000000',
 '0100000000001000000000000000000000000000000000000000000000000000',
 '0100000000010000000000000000000000000000000000000000000000000000',
 '0100000000010100000000000000000000000000000000000000000000000000',
 '0100000000011000000000000000000000000000000000000000000000000000',
 '0100000000011100000000000000000000000000000000000000000000000000',
 '0100000000100000000000000000000000000000000000000000000000000000']

In [10]:
float_to_binary(1/2)

'0|01111110|00000000000000000000000'

In [11]:
print(float_to_binary(0), "0")
print(float_to_binary(1), "1")
print(float_to_binary(1/2), "1/2")
print(float_to_binary(2/3), "2/3")
print(float_to_binary(torch.tensor([2/3]).round(decimals=2).item()), "2/3 round(decimals=2)")
print(float_to_binary(((torch.tensor([2/3])*(1<<8)).round() * 1.0/(1<<8)).item()), "2/3 binary rounded 8 places")
print(float_to_binary(((torch.tensor([2/3])*(1<<16)).round() * 1.0/(1<<16)).item()), "2/3 binary rounded 16 places")
print(float_to_binary(((torch.tensor([2/3])*(1<<23)).round() * 1.0/(1<<23)).item()), "2/3 binary rounded 23 places")
print(float_to_binary(((torch.tensor([2/3])*(1<<24)).round() * 1.0/(1<<24)).item()), "2/3 binary rounded 24 places")
print(float_to_binary(((torch.tensor([2/3])*(1<<25)).round() * 1.0/(1<<25)).item()), "2/3 binary rounded 25 places")

0000000000000000000000000000000000000000000000000000000000000000 0
0000000000000000000000000000000000000000000000000000000000000001 1
0|01111110|00000000000000000000000 1/2
0|01111110|01010101010101010101011 2/3
0|01111110|01010111000010100011111 2/3 round(decimals=2)
0|01111110|01010110000000000000000 2/3 binary rounded 8 places
0|01111110|01010101010101100000000 2/3 binary rounded 16 places
0|01111110|01010101010101010101100 2/3 binary rounded 23 places
0|01111110|01010101010101010101011 2/3 binary rounded 24 places
0|01111110|01010101010101010101011 2/3 binary rounded 25 places


In [12]:
float_to_binary(0.15625)

'0|01111100|01000000000000000000000'

In [13]:
float_to_binary(1.0/(1<<149))

'0|00000000|00000000000000000000001'

In [14]:
#2−126 × (1 − 2−23)
float_to_binary(1.0/(1<<126) * (1.0 - 1.0/(1<<23)))

'0|00000000|11111111111111111111111'

In [15]:
float_to_binary(1.0/(1<<126))

'0|00000001|00000000000000000000000'

In [16]:
float_to_binary(1.0 - (1.0/(1<<24)))

'0|01111110|11111111111111111111111'

In [17]:
float_to_binary(1.0)

'0|01111111|00000000000000000000000'

In [18]:
#2127 × (2 − 2−23)
float_to_binary((1<<127) * (2.0 - 1.0/(1<<23)))

'0|11111110|11111111111111111111111'

In [19]:
t = (torch.arange(11, dtype=torch.float32)-5.0)/2.5
binstack(t)

['1|10000000|00000000000000000000000',
 '1|01111111|10011001100110011001101',
 '1|01111111|00110011001100110011010',
 '1|01111110|10011001100110011001101',
 '1|01111101|10011001100110011001101',
 '0|00000000|00000000000000000000000',
 '0|01111101|10011001100110011001101',
 '0|01111110|10011001100110011001101',
 '0|01111111|00110011001100110011010',
 '0|01111111|10011001100110011001101',
 '0|10000000|00000000000000000000000']

In [20]:
t = torch.arange(5)
binstack(bitwise_or_reduce(t)), t

(['0000000000000000000000000000000000000000000000000000000000000111'],
 tensor([0, 1, 2, 3, 4]))

In [21]:
bitwise_or_reduce(torch.arange(1025, dtype=torch.int32))

tensor([2047], dtype=torch.int32)

In [22]:
torch.arange(9).reshape(3,3).flatten()

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [23]:
t = (torch.arange(11, dtype=torch.float32)-5.0)/2.5
or_all([t]), binor_all([t])#, binor_model(model)

(tensor([-2236961], dtype=torch.int32), ['1|11111111|10111011101110111011111'])

In [24]:
t = torch.tensor([2/3])

In [25]:
print(float_to_binary(t.item()))
print(float_to_binary(round_to_bits(t, 16).item()))
print(float_to_binary(t.item()))

0|01111110|01010101010101010101011
0|01111110|01010101010101100000000
0|01111110|01010101010101010101011


In [26]:
print(float_to_binary(t.item()))
print(float_to_binary(round_to_bits(t, 16).item()))
print(float_to_binary(t.item()))
print(float_to_binary(round_to_bits_(t, 16).item()))
print(float_to_binary(t.item()))

0|01111110|01010101010101010101011
0|01111110|01010101010101100000000
0|01111110|01010101010101010101011
0|01111110|01010101010101100000000
0|01111110|01010101010101100000000


In [27]:
t = torch.randn(2*3*5).reshape(2,3,5)/7.0
about(t)
about(round_to_bits(t, 8))
about(t)
about(round_to_bits_(t, 8))
about(t)

torch.Size([2, 3, 5]), torch.float32, (0.3517160713672638, -0.020990868488540097, -0.3402797281742096), 2.818e-02, cpu
torch.Size([2, 3, 5]), torch.float32, (0.3515625, -0.020963541666666665, -0.33984375), 2.811e-02, cpu
torch.Size([2, 3, 5]), torch.float32, (0.3517160713672638, -0.020990868488540097, -0.3402797281742096), 2.818e-02, cpu
torch.Size([2, 3, 5]), torch.float32, (0.3515625, -0.020963541666666665, -0.33984375), 2.811e-02, cpu
torch.Size([2, 3, 5]), torch.float32, (0.3515625, -0.020963541666666665, -0.33984375), 2.811e-02, cpu


In [28]:
binstack(torch.arange(5) * 1.0)

['0|00000000|00000000000000000000000',
 '0|01111111|00000000000000000000000',
 '0|10000000|00000000000000000000000',
 '0|10000000|10000000000000000000000',
 '0|10000001|00000000000000000000000']

In [29]:
binstack(torch.arange(5, dtype=torch.int32))

['0|00000000|00000000000000000000000',
 '0|00000000|00000000000000000000001',
 '0|00000000|00000000000000000000010',
 '0|00000000|00000000000000000000011',
 '0|00000000|00000000000000000000100']

In [30]:
t = torch.randn(5)
binstack(t), binor_each(t), binor_all(t)

(['0|01111100|01011011010011000001001',
  '1|01111111|10111100101111000110010',
  '1|01111101|11111100001010111111101',
  '1|01111111|01010100001110101100000',
  '0|01111111|00100011001001011000101'],
 ['0|01111100|01011011010011000001001',
  '1|01111111|10111100101111000110010',
  '1|01111101|11111100001010111111101',
  '1|01111111|01010100001110101100000',
  '0|01111111|00100011001001011000101'],
 ['1|01111111|11111111111111111111111'])

## Get model

In [31]:
model_name = 'internal'
#model_name = 'gpt2'
reference_model = provide_model_on_cpu(model_name)

In [32]:
model = copy.deepcopy(reference_model)
model, host_device_of(model)

(GPT(
   (transformer): ModuleDict(
     (wte): Embedding(50304, 768)
     (wpe): Embedding(1024, 768)
     (drop): Dropout(p=0.0, inplace=False)
     (h): ModuleList(
       (0-11): 12 x Block(
         (ln_1): LayerNorm()
         (attn): CausalSelfAttention(
           (c_attn): Linear(in_features=768, out_features=2304, bias=False)
           (c_proj): Linear(in_features=768, out_features=768, bias=False)
           (attn_dropout): Dropout(p=0.0, inplace=False)
           (resid_dropout): Dropout(p=0.0, inplace=False)
         )
         (ln_2): LayerNorm()
         (mlp): MLP(
           (c_fc): Linear(in_features=768, out_features=3072, bias=False)
           (gelu): GELU(approximate='none')
           (c_proj): Linear(in_features=3072, out_features=768, bias=False)
           (dropout): Dropout(p=0.0, inplace=False)
         )
       )
     )
     (ln_f): LayerNorm()
   )
   (lm_head): Linear(in_features=768, out_features=50304, bias=False)
 ),
 device(type='cpu'))

## Traversing the model

```python
def print_model_components(model):
    for name, module in model.named_modules():
        print([" ","A"][is_activation_class(module)], \
              # [" ","N"][isinstance(module, LayerNorm)], \
              [" ","W"][hasattr(module, "weight")], \
              hasattr(module, "weight") and module.weight is not None and "%24s" % (str(module.weight.shape)) or " "*24, \
              [" ","B"][hasattr(module, "bias")], \
              hasattr(module, "bias") and module.bias is not None and "%22s" % (str(module.bias.shape)) or " "*22, \
              name)
```

In [33]:
print_model_components(model)

                                                      
                                                      transformer
  W torch.Size([50304, 768])                          transformer.wte
  W  torch.Size([1024, 768])                          transformer.wpe
                                                      transformer.drop
                                                      transformer.h
                                                      transformer.h.0
  W        torch.Size([768]) B                        transformer.h.0.ln_1
                                                      transformer.h.0.attn
  W  torch.Size([2304, 768]) B                        transformer.h.0.attn.c_attn
  W   torch.Size([768, 768]) B                        transformer.h.0.attn.c_proj
                                                      transformer.h.0.attn.attn_dropout
                                                      transformer.h.0.attn.resid_dropout
  W        torch.Size([768]) B         

In [34]:
names = [name for name, module in model.named_modules()]
len(names), len(set(names))

(164, 164)

## Record model intermediates

In [35]:
record_module_output

<function lac_llm.record_module_output(name, output)>

```python
def hook_model_for_recording(model):
    record_handles = {}
    for name, module in model.named_modules():
        f = lambda module, args, output, name=name: record_module_output(name, output)
        record_handles[name] = module.register_forward_hook(f)
    return record_handles
```

In [36]:
record_handles = hook_model_for_recording(model)

In [37]:
record_handles.keys()

dict_keys(['', 'transformer', 'transformer.wte', 'transformer.wpe', 'transformer.drop', 'transformer.h', 'transformer.h.0', 'transformer.h.0.ln_1', 'transformer.h.0.attn', 'transformer.h.0.attn.c_attn', 'transformer.h.0.attn.c_proj', 'transformer.h.0.attn.attn_dropout', 'transformer.h.0.attn.resid_dropout', 'transformer.h.0.ln_2', 'transformer.h.0.mlp', 'transformer.h.0.mlp.c_fc', 'transformer.h.0.mlp.gelu', 'transformer.h.0.mlp.c_proj', 'transformer.h.0.mlp.dropout', 'transformer.h.1', 'transformer.h.1.ln_1', 'transformer.h.1.attn', 'transformer.h.1.attn.c_attn', 'transformer.h.1.attn.c_proj', 'transformer.h.1.attn.attn_dropout', 'transformer.h.1.attn.resid_dropout', 'transformer.h.1.ln_2', 'transformer.h.1.mlp', 'transformer.h.1.mlp.c_fc', 'transformer.h.1.mlp.gelu', 'transformer.h.1.mlp.c_proj', 'transformer.h.1.mlp.dropout', 'transformer.h.2', 'transformer.h.2.ln_1', 'transformer.h.2.attn', 'transformer.h.2.attn.c_attn', 'transformer.h.2.attn.c_proj', 'transformer.h.2.attn.attn_dro

### Get the pre-surgery logits

In [38]:
text = r"""You will rejoice to hear that no disaster has accompanied the
commencement of an enterprise which you have regarded with such evil
forebodings. I arrived here yesterday, and my first task is to assure
my dear sister of my welfare and increasing confidence in the success
of my undertaking.
"""

In [39]:
import tiktoken
enc = tiktoken.get_encoding("gpt2")
toks = enc.encode(text)
len(text), len(toks), type(toks)

(290, 62, list)

In [40]:
#idx = torch.tensor([[198]], dtype=torch.int64)
idx = torch.tensor([toks], dtype=torch.int64)

#### Calculated on cpu

In [41]:
config.model_record = []
#del(config.model_record)

In [42]:
y_pre_surgery, loss = model(idx)
y_pre_surgery

tensor([[[  4.1730,   2.5067,   4.7283,  ..., -10.1042, -10.1043, -10.1041]]])

In [43]:
len(config.model_record), len(record_handles)

(150, 164)

In [44]:
cpu_model_record = config.model_record
config.model_record = []

##### Copy model record to gpu for later analysis

In [45]:
def toer(v):
    match v:
        case torch.Tensor() as t:
            return t.to(cuda_device)
        case (t, _) if isinstance(t, torch.Tensor):
            return t.to(cuda_device)

cpu_model_record = [(name, toer(t)) for name, t in cpu_model_record]

#### Calculated on gpu

In [46]:
#model_g = copy.deepcopy(reference_model).to('cuda')
model_g = copy.deepcopy(model).to(cuda_device)
print(f"{model_g.training=}, {host_device_of(model_g)=}")

model_g.training=False, host_device_of(model_g)=device(type='cuda', index=2)


In [47]:
yg_pre_surgery, loss = model_g(idx.to(host_device_of(model_g)))

In [48]:
gpu_model_record = config.model_record
config.model_record = []

### Compare calculation records from cpu and gpu

```python
def compare_records(record_a, record_b, device='cpu'):
    assert all(c[0]==g[0] for c,g in zip(record_a, record_b))
    assert all(c[1].dtype==g[1].dtype for c,g in zip(record_a[:-1], record_b[:-1]))
    record_names = [v[0] for v in record_a]
    close_list = [torch.allclose(c[1].to(device), g[1].to(device), atol=1e-09) for c,g in zip(record_a[:-1], record_b[:-1])]
    close_dict = dict((name, (i, close)) for i, (name, close) in enumerate(zip(record_names, close_list)))
    for name, (i, close) in close_dict.items():
        if True or not close:
            print(name)
            about(record_b[i][1].to(device) - record_a[i][1].to(device))
```

In [49]:
compare_records(cpu_model_record, gpu_model_record)

transformer.wte
torch.Size([1, 62, 768]), torch.float32, (0.0, 0.0, 0.0), 0.000e+00, cpu
transformer.wpe
torch.Size([62, 768]), torch.float32, (0.0, 0.0, 0.0), 0.000e+00, cpu
transformer.drop
torch.Size([1, 62, 768]), torch.float32, (0.0, 0.0, 0.0), 0.000e+00, cpu
transformer.h.0.ln_1
torch.Size([1, 62, 768]), torch.float32, (9.5367431640625e-07, 6.322277850588479e-10, -1.430511474609375e-06), 2.063e-15, cpu
transformer.h.0.attn.c_attn
torch.Size([1, 62, 2304]), torch.float32, (0.0014039278030395508, 1.2933771252246972e-06, -0.0015301704406738281), 3.322e-08, cpu
transformer.h.0.attn.c_proj
torch.Size([1, 62, 768]), torch.float32, (0.00022935867309570312, 2.5189960301642997e-08, -6.835907697677612e-05), 3.404e-11, cpu
transformer.h.0.attn.resid_dropout
torch.Size([1, 62, 768]), torch.float32, (0.00022935867309570312, 2.5189960301642997e-08, -6.835907697677612e-05), 3.404e-11, cpu
transformer.h.0.attn
torch.Size([1, 62, 768]), torch.float32, (0.00022935867309570312, 2.5189960301642997e-

In [50]:
tok_emb = gpu_model_record[0]
binor_all(tok_emb[1])

['1|01111111|11111111111111111111111']

In [51]:
about(tok_emb[1])

torch.Size([1, 62, 768]), torch.float32, (0.15897409617900848, -0.0001894384528583231, -0.3170527517795563), 2.892e-04, cuda:2


### Inspect model intermediates

In [52]:
for name, t in cpu_model_record:
    match t:
        case torch.Tensor() as tensor if tensor.dtype == torch.float32:
            print(binor_all(tensor), name)
        case (tensor, _) if isinstance(tensor, torch.Tensor):
            print(binor_all(tensor), name, _)
        case _:
            print(f"{name} is {t}")

['1|01111111|11111111111111111111111'] transformer.wte
['1|01111111|11111111111111111111111'] transformer.wpe
['1|01111111|11111111111111111111111'] transformer.drop
['1|11111111|11111111111111111111111'] transformer.h.0.ln_1
['1|11111111|11111111111111111111111'] transformer.h.0.attn.c_attn
['1|01111111|11111111111111111111111'] transformer.h.0.attn.c_proj
['1|01111111|11111111111111111111111'] transformer.h.0.attn.resid_dropout
['1|01111111|11111111111111111111111'] transformer.h.0.attn
['1|11111111|11111111111111111111111'] transformer.h.0.ln_2
['1|11111111|11111111111111111111111'] transformer.h.0.mlp.c_fc
['1|11111111|11111111111111111111111'] transformer.h.0.mlp.gelu
['1|11111111|11111111111111111111111'] transformer.h.0.mlp.c_proj
['1|11111111|11111111111111111111111'] transformer.h.0.mlp.dropout
['1|11111111|11111111111111111111111'] transformer.h.0.mlp
['1|11111111|11111111111111111111111'] transformer.h.0
['1|11111111|11111111111111111111111'] transformer.h.1.ln_1
['1|1111111

## Quantizing the model
torch.FloatTensor on CPU and torch.cuda.FloatTensor on GPU is a [tensor](https://pytorch.org/docs/stable/tensors.html#torch.Tensor) of ``dtype`` ``torch.float32``

IEEE 754 [binary16](https://en.wikipedia.org/wiki/Single-precision_floating-point_format) has 24 bits of mantissa (1 implied and 23 expressed)

If we have two floats, a and b, quantized to m bits of mantissa (the lower bits zeros), their product can require 2m bits of mantissa to avoid loss of information. If we add n such products together, *all of which have the same exponent*, we need ceil(log2(n)) more bits

In [53]:
from torch import Tensor
import torch.nn as nn

In [54]:
binstack(model.transformer.h[0].mlp.gelu((torch.arange(11)-5.0)/2.5))

['1|01111010|01110100101111001010000',
 '1|01111011|01100111001000011111010',
 '1|01111100|00011010110010111001110',
 '1|01111100|01011011000110101001010',
 '1|01111100|00011010010001110101001',
 '0|00000000|00000000000000000000000',
 '0|01111101|00001100011101011111001',
 '0|01111110|01000010110100101111011',
 '0|01111111|00001111110110011100000',
 '0|01111111|10000011001001110111101',
 '0|01111111|11110100010110100001110']

## Surgery

### `quantize_module`

```python
def quantize_module(module, n_bits):
    def quantize_param(parameter):
        if parameter is not None:
            round_to_bits_(parameter, n_bits - parameter.abs().max().log2().ceil().int().item())
            
    def quantize_output(module, args, output):
        # A forward hook function, viz.
        # https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.register_forward_hook
        match output:
            case torch.Tensor() as t:
                round_to_bits_(t, n_bits) # Round in-place
            case (t, _) if isinstance(t, torch.Tensor):
                round_to_bits_(t, n_bits) # Round in-place
        return output
        
    quantize_handles = {}
    for name, module in module.named_modules():
        if hasattr(module, "weight"):
            quantize_param(module.weight)
        if hasattr(module, "bias"):
            quantize_param(module.bias)
        # Hook in the quantization of the module's output
        quantize_handles[name] = module.register_forward_hook(quantize_output)
    return quantize_handles
```

### Apply

In [82]:
model = copy.deepcopy(reference_model)
record_handles = hook_model_for_recording(model)
quantize_handles = quantize_module(model, 8)
len(quantize_handles), quantize_handles.keys() == record_handles.keys()

(164, True)

In [83]:
model, host_device_of(model)

(GPT(
   (transformer): ModuleDict(
     (wte): Embedding(50304, 768)
     (wpe): Embedding(1024, 768)
     (drop): Dropout(p=0.0, inplace=False)
     (h): ModuleList(
       (0-11): 12 x Block(
         (ln_1): LayerNorm()
         (attn): CausalSelfAttention(
           (c_attn): Linear(in_features=768, out_features=2304, bias=False)
           (c_proj): Linear(in_features=768, out_features=768, bias=False)
           (attn_dropout): Dropout(p=0.0, inplace=False)
           (resid_dropout): Dropout(p=0.0, inplace=False)
         )
         (ln_2): LayerNorm()
         (mlp): MLP(
           (c_fc): Linear(in_features=768, out_features=3072, bias=False)
           (gelu): GELU(approximate='none')
           (c_proj): Linear(in_features=3072, out_features=768, bias=False)
           (dropout): Dropout(p=0.0, inplace=False)
         )
       )
     )
     (ln_f): LayerNorm()
   )
   (lm_head): Linear(in_features=768, out_features=50304, bias=False)
 ),
 device(type='cpu'))

#### Verify all weights quantized

In [84]:
binor_model(model)

'1||1111111|1|11111110000000000000000'

for name, module in model.named_modules():
    #print([" ","A"][is_activation_class(module)], [" ","W"][hasattr(module, "weight")], name)
    if hasattr(module, "weight"):
        print(binor_all(module.weight), "W", name)
    if hasattr(module, "bias"):
        print(binor_all(module.weight), "B", name)

In [85]:
gact = model.transformer.h[0].mlp.gelu
print(f"{type(gact)=}")
binstack(gact((torch.arange(11)-5.0)/2.5))

type(gact)=<class 'torch.nn.modules.activation.GELU'>


['1|01111010|10000000000000000000000',
 '1|01111011|01100000000000000000000',
 '1|01111100|00011000000000000000000',
 '1|01111100|01011000000000000000000',
 '1|01111100|00011000000000000000000',
 '0|00000000|00000000000000000000000',
 '0|01111101|00001100000000000000000',
 '0|01111110|01000010000000000000000',
 '0|01111111|00010000000000000000000',
 '0|01111111|10000011000000000000000',
 '0|01111111|11110100000000000000000']

#### Get post-surgery logits

In [86]:
config.model_record = []

In [87]:
y_post_surgery, loss = model(idx)
cpu_model_record_post_surgery = config.model_record
config.model_record = []

#### Verify the weights are still quantized

In [88]:
binor_model(model)

'1||1111111|1|11111110000000000000000'

In [89]:
about(y_pre_surgery)
about(y_post_surgery)
about(y_post_surgery - y_pre_surgery)

torch.Size([1, 1, 50304]), torch.float32, (20.126220703125, -3.596728064276814, -23.976789474487305), 1.980e+01, cpu
torch.Size([1, 1, 50304]), torch.float32, (22.6328125, -4.290519811421557, -26.8671875), 2.728e+01, cpu
torch.Size([1, 1, 50304]), torch.float32, (2.506591796875, -0.6937917471180833, -3.5022177696228027), 7.810e-01, cpu


#### View intermediate states of the model calculation

```python
def view_record(model_record):
    for name, t in model_record:
        if hasattr(t, "dtype") and t.dtype == torch.float32:
            #print(f32_delimit(binstack(bitwise_or_reduce(view_as_int32(t)))[0]), name)
            print(binor(t), name)
        else:
            print(f"{name} is {t}")
```

In [90]:
view_record(cpu_model_record_post_surgery)

['1|01111111|11111100000000000000000'] transformer.wte
['1|01111111|11111100000000000000000'] transformer.wpe
['1|01111111|11111111000000000000000'] transformer.drop
['1|11111111|11111111110000000000000'] transformer.h.0.ln_1
['1|11111111|11111111110000000000000'] transformer.h.0.attn.c_attn
['1|01111111|11111100000000000000000'] transformer.h.0.attn.c_proj
['1|01111111|11111100000000000000000'] transformer.h.0.attn.resid_dropout
['1|01111111|11111100000000000000000'] transformer.h.0.attn
['1|11111111|11111111111111000000000'] transformer.h.0.ln_2
['1|11111111|11111111111110000000000'] transformer.h.0.mlp.c_fc
['1|11111111|11111111110110000000000'] transformer.h.0.mlp.gelu
['1|11111111|11111111111100000000000'] transformer.h.0.mlp.c_proj
['1|11111111|11111111111100000000000'] transformer.h.0.mlp.dropout
['1|11111111|11111111111100000000000'] transformer.h.0.mlp
['1|11111111|11111111111100000000000'] transformer.h.0
['1|11111111|11111111111110000000000'] transformer.h.1.ln_1
['1|1111111

In [91]:
#config.model_record[3][0], binstack(config.model_record[3][1])

In [92]:
about(model.transformer.h[0].ln_1.weight)

torch.Size([768]), torch.float32, (1.75, 0.60345458984375, 0.28125), 3.747e-01, cpu


In [93]:
#assert 0

## Try on cuda

In [94]:
# Copy surgically-altered model to GPU
model_g = copy.deepcopy(model).to('cuda')
print(f"{model_g.training=}, {host_device_of(model_g)=}")

model_g.training=False, host_device_of(model_g)=device(type='cuda', index=0)


In [95]:
all(torch.all(c==g.to('cpu')) for c,g in zip(model.parameters(), model_g.parameters()))

True

In [96]:
yg_post_surgery, loss = model_g(idx.to(host_device_of(model_g)))
gpu_model_record_post_surgery = config.model_record
config.model_record = []

In [97]:
view_record(gpu_model_record_post_surgery)

['1|01111111|11111100000000000000000'] transformer.wte
['1|01111111|11111100000000000000000'] transformer.wpe
['1|01111111|11111111000000000000000'] transformer.drop
['1|11111111|11111111110000000000000'] transformer.h.0.ln_1
['1|11111111|11111111110000000000000'] transformer.h.0.attn.c_attn
['1|01111111|11111100000000000000000'] transformer.h.0.attn.c_proj
['1|01111111|11111100000000000000000'] transformer.h.0.attn.resid_dropout
['1|01111111|11111100000000000000000'] transformer.h.0.attn
['1|11111111|11111111111111000000000'] transformer.h.0.ln_2
['1|11111111|11111111111110000000000'] transformer.h.0.mlp.c_fc
['1|11111111|11111111111110000000000'] transformer.h.0.mlp.gelu
['1|11111111|11111111111100000000000'] transformer.h.0.mlp.c_proj
['1|11111111|11111111111100000000000'] transformer.h.0.mlp.dropout
['1|11111111|11111111111100000000000'] transformer.h.0.mlp
['1|11111111|11111111111100000000000'] transformer.h.0
['1|11111111|11111111111110000000000'] transformer.h.1.ln_1
['1|1111111

## Compare model intermediates CPU to GPU

In [98]:
compare_records(cpu_model_record_post_surgery, gpu_model_record_post_surgery)

transformer.wte
torch.Size([1, 62, 768]), torch.float32, (0.0, 0.0, 0.0), 0.000e+00, cpu
transformer.wpe
torch.Size([62, 768]), torch.float32, (0.0, 0.0, 0.0), 0.000e+00, cpu
transformer.drop
torch.Size([1, 62, 768]), torch.float32, (0.0, 0.0, 0.0), 0.000e+00, cpu
transformer.h.0.ln_1
torch.Size([1, 62, 768]), torch.float32, (0.0, 0.0, 0.0), 0.000e+00, cpu
transformer.h.0.attn.c_attn
torch.Size([1, 62, 2304]), torch.float32, (0.0, 0.0, 0.0), 0.000e+00, cpu
transformer.h.0.attn.c_proj
torch.Size([1, 62, 768]), torch.float32, (0.00390625, -3.2814600134408605e-07, -0.00390625), 3.205e-09, cpu
transformer.h.0.attn.resid_dropout
torch.Size([1, 62, 768]), torch.float32, (0.00390625, -3.2814600134408605e-07, -0.00390625), 3.205e-09, cpu
transformer.h.0.attn
torch.Size([1, 62, 768]), torch.float32, (0.00390625, -3.2814600134408605e-07, -0.00390625), 3.205e-09, cpu
transformer.h.0.ln_2
torch.Size([1, 62, 768]), torch.float32, (0.17578125, 9.024015036962366e-07, -0.19921875), 6.969e-06, cpu
tran

# How did the logits compare between cpu and gpu before and after surgery?

In [99]:
about(yg_pre_surgery.to('cpu') - y_pre_surgery)
about(yg_post_surgery.to('cpu') - y_post_surgery)
#about((yg_post_surgery - y_post_surgery.to('cuda')).to('cpu'))

torch.Size([1, 1, 50304]), torch.float32, (0.024580001831054688, 0.003459617039354627, -0.019138336181640625), 2.027e-05, cpu
torch.Size([1, 1, 50304]), torch.float32, (0.23828125, 0.041479367942907124, -0.17578125), 2.758e-03, cpu


With ctx = first paragraph of Dr. Frankenstein, "You will rejoice ...":

In [100]:
# quantize to 16 bits, internal model flash attention
#torch.Size([1, 1, 50304]), torch.float32, (0.024580001831054688, 0.003459617039354627, -0.019138336181640625), 2.027e-05, cpu
#torch.Size([1, 1, 50304]), torch.float32, (0.0093841552734375, 0.0012059809294062414, -0.0066070556640625), 3.099e-06, cpu

# quantize to 16 bits, gpt2 flash attention
#torch.Size([1, 1, 50257]), torch.float32, (0.041912078857421875, 0.03370751197366046, 0.026607513427734375), 1.139e-03, cpu
#torch.Size([1, 1, 50257]), torch.float32, (0.0253448486328125, 0.012218502743068595, -0.0092010498046875), 1.591e-04, cpu

# quantize to 16 bits, gpt2 slow attention
#torch.Size([1, 1, 50257]), torch.float32, (-0.007354736328125, -0.014376958907047852, -0.021404266357421875), 2.088e-04, cpu
#torch.Size([1, 1, 50257]), torch.float32, (0.02532958984375, 0.0173440996189741, 0.0064239501953125), 3.050e-04, cpu

# quantize to 6 bits, gpt2 slow attention
#torch.Size([1, 1, 50257]), torch.float32, (-0.007354736328125, -0.014376958907047852, -0.021404266357421875), 2.088e-04, cpu
#torch.Size([1, 1, 50257]), torch.float32, (7.609375, 5.427644096842231, 2.578125), 2.966e+01, cpu

# quantize to 10 bits:
#torch.Size([1, 1, 50257]), torch.float32, (0.041912078857421875, 0.03370751197366046, 0.026607513427734375), 1.139e-03, cpu
#torch.Size([1, 1, 50257]), torch.float32, (0.1748046875, 0.12240598946415425, 0.0302734375), 1.521e-02, cpu

These are with `ctx = torch.tensor([[198]])`, which is just a newline:

In [101]:
#no quantize:
#torch.Size([1, 1, 50304]), torch.float32, (2.9206275939941406e-05, -3.924847043737883e-06, -2.8967857360839844e-05), 5.533e-11
#torch.Size([1, 1, 50304]), torch.float32, (2.9206275939941406e-05, -3.924847043737883e-06, -2.8967857360839844e-05), 5.533e-11

#quantize_module(model, 24)
#torch.Size([1, 1, 50304]), torch.float32, (2.9206275939941406e-05, -3.924847043737883e-06, -2.8967857360839844e-05), 5.533e-11
#torch.Size([1, 1, 50304]), torch.float32, (3.910064697265625e-05, 7.596433591618944e-06, -8.58306884765625e-06), 7.956e-11

#quantize_module(model, 16)
#torch.Size([1, 1, 50304]), torch.float32, (2.9206275939941406e-05, -3.924847043737883e-06, -2.8967857360839844e-05), 5.533e-11
#torch.Size([1, 1, 50304]), torch.float32, (0.00019979476928710938, 2.0292119843454003e-05, -0.00014019012451171875), 1.593e-09

#quantize_module(model, 12)
#torch.Size([1, 1, 50304]), torch.float32, (2.9206275939941406e-05, -3.924847043737883e-06, -2.8967857360839844e-05), 5.533e-11
#torch.Size([1, 1, 50304]), torch.float32, (0.0030562877655029297, 0.000693647690433132, -0.0014503002166748047), 7.557e-07

#quantize_module(model, 8)
#torch.Size([1, 1, 50304]), torch.float32, (2.9206275939941406e-05, -3.924847043737883e-06, -2.8967857360839844e-05), 5.533e-11
#torch.Size([1, 1, 50304]), torch.float32, (0.0013637542724609375, 0.00016730938150616168, -0.0010051727294921875), 1.011e-07

In [102]:
print('\n'.join(' '.join([c, g]) for c, g in zip(binstack(y_post_surgery[0,0,:10]), binstack(yg_post_surgery[0,0,:10]))))

0|10000001|00101000110000000000000 0|10000001|00100110110000000000000
0|10000000|01100000100000000000000 0|10000000|01011100100000000000000
0|10000001|00111111000000000000000 0|10000001|00111100110000000000000
0|01111101|10101000000000000000000 0|01111101|10011000000000000000000
0|01111111|10010001000000000000000 0|01111111|10001100000000000000000
0|01111111|01101101000000000000000 0|01111111|01100110000000000000000
0|10000000|10010101000000000000000 0|10000000|10001111000000000000000
0|10000000|10000110100000000000000 0|10000000|10000011100000000000000
0|10000001|01111000110000000000000 0|10000001|01110100000000000000000
0|10000000|01001101100000000000000 0|10000000|01000111100000000000000


In [103]:
binstack(model.transformer.wte.weight[0,:6])

['1|01111010|00000000000000000000000',
 '0|01111000|10000000000000000000000',
 '0|01111001|01100000000000000000000',
 '0|01111000|10000000000000000000000',
 '1|01111010|01000000000000000000000',
 '0|01111000|01000000000000000000000']

In [104]:
model.transformer.h[0].mlp.c_fc.weight

Parameter containing:
tensor([[ 0.0039,  0.0234,  0.0391,  ..., -0.0156,  0.0469,  0.0039],
        [ 0.0352, -0.0039, -0.0000,  ..., -0.0352, -0.0000, -0.0352],
        [-0.0117,  0.0117, -0.0234,  ..., -0.0039, -0.0273, -0.0117],
        ...,
        [-0.0312, -0.0273, -0.0156,  ..., -0.0078, -0.0352,  0.0234],
        [ 0.0273, -0.0078, -0.0117,  ...,  0.0039, -0.0000,  0.0352],
        [-0.0352, -0.0039,  0.0391,  ..., -0.0195,  0.0430, -0.0508]])

In [105]:
model

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=False)
          (c_proj): Linear(in_features=768, out_features=768, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=3072, out_features=768, bias=False)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=768, out_features=50304, bias=False)
)

In [106]:
model_g, host_device_of(model_g)

(GPT(
   (transformer): ModuleDict(
     (wte): Embedding(50304, 768)
     (wpe): Embedding(1024, 768)
     (drop): Dropout(p=0.0, inplace=False)
     (h): ModuleList(
       (0-11): 12 x Block(
         (ln_1): LayerNorm()
         (attn): CausalSelfAttention(
           (c_attn): Linear(in_features=768, out_features=2304, bias=False)
           (c_proj): Linear(in_features=768, out_features=768, bias=False)
           (attn_dropout): Dropout(p=0.0, inplace=False)
           (resid_dropout): Dropout(p=0.0, inplace=False)
         )
         (ln_2): LayerNorm()
         (mlp): MLP(
           (c_fc): Linear(in_features=768, out_features=3072, bias=False)
           (gelu): GELU(approximate='none')
           (c_proj): Linear(in_features=3072, out_features=768, bias=False)
           (dropout): Dropout(p=0.0, inplace=False)
         )
       )
     )
     (ln_f): LayerNorm()
   )
   (lm_head): Linear(in_features=768, out_features=50304, bias=False)
 ),
 device(type='cuda', index=0))

In [107]:
del model_g

# NEXT
1. Do the recording and quantization by ``register_forward_hook``
2. Examine the progress of the calculation side-by-side in recordings of cpu v gpu
3. Try logit quantization on model quant results
6. Measure effect of model quant on compression effectiveness
7. Try float64?
## Done or Obsolete
1. Write a loop to march through quantizations recording results
4. Make Quactivation also inspect what it's fed
5. Consider making a specific quant module to interpose
   1. Can it inspect its surroundings to decide quantization?
   2. Can it learn from experience how to quantize?

## Boneyard

In [108]:
assert 0, "Boneyard"

AssertionError: Boneyard

### Embedding

In [None]:
embed = model.transformer.wte
embed

In [None]:
about(embed.weight)
embedded = embed(torch.tensor([[198]]))
about(embedded)

In [None]:
binstack(embedded[0,0,:6])

In [None]:
about(embed.weight)
about(round_to_bits(embed.weight, 16))
about(embed.weight)
about(round_to_bits_(embed.weight, 16))
about(embed.weight)

In [None]:
float_to_binary(0.310760498046875)

In [None]:
round_to_bits_(embed.weight, 16)
about(embed.weight)
embedded = embed(torch.tensor([[198]]))
about(embedded)
binstack(embedded[0,0,:6])

Quantize the players, with eyeballed scaling:

In [None]:
qw = ((1<<25) * w).round().to(dtype=torch.int64)
qx = ((1<<25) * x).round().to(dtype=torch.int64)
qy = ((1<<50) * y).round().to(dtype=torch.int64)

Calculate the output in int64:

In [None]:
cqy = qw @ qx

A good match:

In [None]:
about(y)
about(qy.to(torch.float64) / (1<<50))
about(qy)
about(cqy)
about((cqy - qy).to(torch.float64) / (1<<50))

### Try it on gpu

In [None]:
device = 'cuda'
gqw = qw.to(device)
gqx = qx.to(device)

In [None]:
gcqy = gqw @ gqx

Try it with int32

In [None]:
fgqw = ((1<<16) * w).round().to(dtype=torch.int32, device=device)
fgqx = ((1<<16) * x).round().to(dtype=torch.int32, device=device)
fgcqy = fgqw @ fgqx

In [None]:
fgqw = qw.to(device=device, dtype=torch.int32)
fgqx = qx.to(device=device, dtype=torch.int32)
f

In [None]:
import psutil
torch.set_num_threads(psutil.cpu_count(logical=False))
torch.get_num_threads()

In [None]:
%%timeit
cqy = qw @ qx

In [None]:
test_types = (torch.int64, torch.int32, torch.int16, torch.int8,
        torch.float64, torch.float32, torch.float16, torch.bfloat16)
works = []
for device in ('cpu', 'cuda'):
    for w_type in test_types: 
        for x_type in test_types:
            w = torch.arange(6).reshape(2,3).to(dtype=w_type, device=device)
            x = torch.arange(3).to(dtype=x_type, device=device)
            try:
                y = w @ x
            except RuntimeError as e:
                print(f"{w_type=}, {x_type=}: {e}")
            else:
                print(f"{w_type=}, {x_type=}: {y}")
                works.append((device, w_type, x_type))
print(f"{works}")

In [None]:
sorted(works)

In [None]:
torch.__version__

In [None]:
#Chat4:
import struct

def float_to_binary(num):
    # Convert a float to a 32-bit binary string
    # First, pack the float into 4 bytes using IEEE 754 format
    packed = struct.pack('f', num)
    
    # Then, unpack those bytes into an integer
    [integer_representation] = struct.unpack('I', packed)
    
    # Finally, format the integer as a 32-bit binary string
    return format(integer_representation, '032b')

# Example usage
float_num = 1.2345
binary_representation = float_to_binary(float_num)
print(binary_representation)

### Pluck out a 2-d weight

In [None]:
def about(in_x):
    x = in_x.to(torch.float64)
    print(f"{x.shape}, {in_x.dtype}, ({x.max()}, {x.mean()}, {x.min()}), {(x*x).mean(dtype=torch.float32):.3e}")
    #print(f"{(x*x).sum(dtype=torch.float32):.3e}, {x.norm():.3e}, {torch.linalg.vector_norm(x):.3e}, {(x.to(dtype=torch.float64)*x.to(dtype=torch.float64)).sum(dtype=torch.float64):.3e}")

In [None]:
#w = model.transformer.h[0].attn.c_attn.weight
w = model.transformer.h[0].mlp.c_fc.weight
w.requires_grad = False
about(w)

Make a pretend input to the layer:

In [None]:
x = torch.randn(768)
about(x)

Calculate the output in model-native float32:

In [None]:
y =  w @ x
about(y)