In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
"""Goal is to build a Simple Linear Layer class where weights are stored in 8-bits we upcaste them 
to inputs dtype in forward"""

'Goal is to build a Simple Linear Layer class where weights are stored in 8-bits we upcaste them \nto inputs dtype in forward'

In [3]:
random_int8 = torch.randint(-128, 127, (32, 16)).to(torch.int8)
random_hs = torch.randn((1, 16), dtype=torch.bfloat16)
scales = torch.randn((32), dtype=torch.bfloat16)
bias = torch.randn((1, 32), dtype=torch.bfloat16)

In [4]:
def w8_a16_forward(weights,inputs,scales,bias=None):
    casted_weights = weights.to(inputs.dtype)
    out = F.linear(inputs,casted_weights) * scales
    if bias is not None:
        out = out + bias
    return out

In [5]:
class W8A16LinearLayer(nn.Module):
    #make init similar to nn.Linear
    def __init__(self,in_features,out_features,bias=None,dtype=torch.float32):
        super().__init__()
        #create params using register buffer
        self.register_buffer("int8_weights",torch.randint(-128,127,(out_features,in_features),dtype=torch.int8))
        self.register_buffer("scales",torch.randn((out_features),dtype=dtype))
        #bias
        if bias is not None:
            self.register_buffer("bias",torch.randn((1,out_features),dtype=dtype))
        else:
            self.bias = None
    def quantize(self,weights):
        #upcast weights for stability!?
        wp_32 = weights.clone().to(torch.float32)
        scales = wp_32.abs().max(dim=-1).values/127
        scales = scales.to(weights.dtype)
        int8_weights = torch.round(weights/scales.unsqueeze(1)).to(torch.int8)
        self.int8_weights = int8_weights
        self.scales = scales
    def forward(self,inputs):
        return w8_a16_forward(self.int8_weights,inputs,self.scales,self.bias)
    

In [6]:
module = W8A16LinearLayer(8,16)

In [7]:
module.int8_weights

tensor([[  38,   34,   96,   15, -118,  -11,  -37,   45],
        [ 115, -106, -109,   21,  -81, -122,   46,  -65],
        [  59,  126,  -47,   32,  -22,   70,   88, -124],
        [ -35,  -31,   52,  -38,  -49,   33,  105,  -13],
        [  74, -100,  -64,  -43,  -29,  -10,  -98,  -74],
        [  68,   61,   60,  -64,   95,  -86,  -76,  -48],
        [ -57,   19,  -61,  -82,  -70,  -54, -107, -111],
        [ -32, -114,  -92,  -28,   49,   36,  -81,  -12],
        [  18,   57,  -25,  -28,    7,   16,   75, -101],
        [ 120,  -59,  121,   26,  -16,   59, -122,   -9],
        [-100,  116,  -24,   44,  -88,   94,  105,  -74],
        [-126, -101,   45,   56,  -74,  -84,   68,  -81],
        [ 110,  -88,   81,  -20,  -24,  -74, -105,   99],
        [  34,   92,  -71, -102,   29,  -29,  -82,  108],
        [  19,    2,  -74,  -77,   76,   46,  -80, -112],
        [-127,   17,  122, -117,   48,  -61,   67,  -69]], dtype=torch.int8)

In [8]:
module.scales

tensor([ 0.4381,  0.0704,  1.0747,  0.0724, -0.8331, -0.4196, -1.5668,  1.5509,
        -0.2814,  0.0656, -0.2010,  0.6287,  1.2702, -0.3286,  0.9354,  1.8299])

In [9]:
dummy_init = W8A16LinearLayer(16,32,0.02)

In [10]:
module = W8A16LinearLayer(16,32)
dummy_hidden_state = torch.randn(1,6,16)

In [11]:
module(dummy_hidden_state).dtype

torch.float32

In [12]:
module.int8_weights.shape

torch.Size([32, 16])

In [13]:
def replace_quant_weights(module,target_class,module_name_to_exclude):
    for name,child in module.named_children():
        if isinstance(child,nn.Linear) and not any([x == name for x in module_name_to_exclude]):
            old_bias = child.bias
            new_module = target_class(child.in_features,child.out_features,
                                      old_bias is not None,
                                      child.weight.dtype)
            setattr(module,name,new_module)
            if old_bias is not None:
                getattr(module,name).bias = old_bias
        else:
            #recurssively call func for nested ones
            replace_quant_weights(child,target_class,module_name_to_exclude)

In [14]:
class DummyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = torch.nn.Embedding(1, 1)
        # Try with bias
        self.linear_1 = nn.Linear(1, 1)
        # Try without bias
        self.linear_2 = nn.Linear(1, 1, bias=False)
        # Lm prediction head
        self.lm_head = nn.Linear(1, 1, bias=False)

In [15]:
model_1 = DummyModel()
model_2 = DummyModel()

In [16]:
replace_quant_weights(model_1, W8A16LinearLayer, ["lm_head"])
print(model_1)

DummyModel(
  (emb): Embedding(1, 1)
  (linear_1): W8A16LinearLayer()
  (linear_2): W8A16LinearLayer()
  (lm_head): Linear(in_features=1, out_features=1, bias=False)
)


In [17]:
replace_quant_weights(model_2, W8A16LinearLayer, [])
print(model_2)

DummyModel(
  (emb): Embedding(1, 1)
  (linear_1): W8A16LinearLayer()
  (linear_2): W8A16LinearLayer()
  (lm_head): W8A16LinearLayer()
)


In [18]:
def replace_quant_weights_and_quant(module,target_class,module_name_to_exclude):
    for name,child in module.named_children():
        if isinstance(child,nn.Linear) and not any([x == name for x in module_name_to_exclude]):
            old_bias = child.bias
            old_weight = child.weight
            new_module = target_class(child.in_features,child.out_features,
                                      old_bias is not None,
                                      child.weight.dtype)
            setattr(module,name,new_module)
            #quantize
            getattr(module,name).quantize(old_weight)
            if old_bias is not None:
                getattr(module,name).bias = old_bias
        else:
            #recurssively call func for nested ones
            replace_quant_weights_and_quant(child,target_class,module_name_to_exclude)

In [19]:
model_3 = DummyModel()

In [20]:
replace_quant_weights_and_quant(model_3, W8A16LinearLayer, [])
print(model_3)

DummyModel(
  (emb): Embedding(1, 1)
  (linear_1): W8A16LinearLayer()
  (linear_2): W8A16LinearLayer()
  (lm_head): W8A16LinearLayer()
)


In [21]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_id = "Salesforce/codegen-350M-mono"

model = AutoModelForCausalLM.from_pretrained(model_id, 
                                    torch_dtype=torch.bfloat16, 
                                             low_cpu_mem_usage=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Some weights of the model checkpoint at Salesforce/codegen-350M-mono were not used when initializing CodeGenForCausalLM: ['transformer.h.0.attn.causal_mask', 'transformer.h.1.attn.causal_mask', 'transformer.h.10.attn.causal_mask', 'transformer.h.11.attn.causal_mask', 'transformer.h.12.attn.causal_mask', 'transformer.h.13.attn.causal_mask', 'transformer.h.14.attn.causal_mask', 'transformer.h.15.attn.causal_mask', 'transformer.h.16.attn.causal_mask', 'transformer.h.17.attn.causal_mask', 'transformer.h.18.attn.causal_mask', 'transformer.h.19.attn.causal_mask', 'transformer.h.2.attn.causal_mask', 'transformer.h.3.attn.causal_mask', 'transformer.h.4.attn.causal_mask', 'transformer.h.5.attn.causal_mask', 'transformer.h.6.attn.causal_mask', 'transformer.h.7.attn.causal_mask', 'transformer.h.8.attn.causal_mask', 'transformer.h.9.attn.causal_mask']
- This IS expected if you are initializing CodeGenForCausalLM from the checkpoint of a model trained on another task or with another architecture (e

In [22]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [23]:
print(pipe("def hello_world():", max_new_tokens=20, do_sample=False))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[{'generated_text': 'def hello_world():\n    print("Hello World")\n\nhello_world()\n\n# 파'}]


In [24]:
print("Model before:\n\n", model)

Model before:

 CodeGenForCausalLM(
  (transformer): CodeGenModel(
    (wte): Embedding(51200, 1024)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-19): 20 x CodeGenBlock(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (qkv_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): CodeGenMLP(
          (fc_in): Linear(in_features=1024, out_features=4096, bias=True)
          (fc_out): Linear(in_features=4096, out_features=1024, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=51200, bi

In [25]:
replace_quant_weights_and_quant(model, W8A16LinearLayer, ["lm_head"])

In [26]:
print(pipe("def hello_world():", max_new_tokens=20, 
           do_sample=False)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


def hello_world(): " " " " " " " " " " " " " " " " " " " "
