### Demonstrate the LLM GPT2 Model OnBoarding on Cloud AI 100 Platform

##### Download the open-source GPT2-based HuggingFace model and save it in the local *cache* directory

In [1]:
# Instantiate the original (unmodified) Transformer model from HuggingFace.
# `os` is also needed later in this notebook to derive the qpc output directory.
import os

from QEfficient import QEFFAutoModelForCausalLM

# Optional: uncomment and set an appropriate cache directory for transformers if you don't want to use the default ~/.cache dir.
# os.environ["TRANSFORMERS_CACHE"] = "/local/mnt/workspace/hf_cache"

# Optional: a different cache location for just this one model; pass it as `cache_dir` in the API below.
# ROOT_DIR = os.path.dirname(os.path.abspath(""))
# CACHE_DIR = os.path.join(ROOT_DIR, "tmp")

# Model-card name to be onboarded (this is the HF model-card name): https://huggingface.co/gpt2
model_name = "gpt2"  # Other model-card names can be used here to generate the corresponding models, provided support has been added in the lib.

# Load the model by its model-card name and print its module structure for reference.
qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
print(f"{model_name} from hugging-face \n", qeff_model)



gpt2 from hugging-face 
 GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


##### Now we modify the GPT2 classes using the optimized software library to generate a model for Cloud AI 100.
##### Here we generate models with the following optimizations:

* RMS Norm Fixes for FP16 Overflows and Underflow
* Causal Mask Fix
* Handling FP16 Overflows.
* KV Cache (Retention Changes).
* Triu/Tril Ops support.

In [2]:
import QEfficient

# Easy and minimal API to update the model: a single call applies the QEfficient transformations.
# In the repr printed below, the LM-head model, the inner model, the blocks and the attention
# modules show QEff-prefixed class names, while the remaining submodules keep their original names.
# NOTE(review): whether transform() modifies `qeff_model` in place or returns a new object
# is not visible from this notebook - confirm before reusing `qeff_model` afterwards.
model_transformed = QEfficient.transform(qeff_model, form_factor="cloud")

print("Model after Optimized transformations \n", model_transformed)



Model after Optimized transformations 
 QEffGPT2LMHeadModel(
  (transformer): QEffGPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x QEffGPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): QEffGPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


##### Export the Optimized Pytorch model to the Onnx Framework

In [3]:
from QEfficient.utils import load_hf_tokenizer

# Export the transformed model to ONNX.
# A single ONNX model is produced that serves both the prefill and the decode variations
# and is optimized for the Cloud AI 100 platform.
#
# According to the library authors, the export step:
#   1. generates the ONNX model and clips overflowing constants to fp16,
#   2. verifies the model on onnxruntime against PyTorch,
#   3. generates the inputs and the custom-IO yaml file required for compilation.
#
# The "kv" flag selects the KV-style model. Bert-style models carry no KV-cache
# optimizations and are the unoptimized version, so kv=True is recommended for performance.
tokenizer = load_hf_tokenizer(model_name, use_cache=True)

# Collect the export arguments in one place, then unpack them into the call.
export_options = {
    "model_name": model_name,
    "model_kv": model_transformed,
    "tokenizer": tokenizer,
    "kv": True,
    "form_factor": "cloud",
    "return_path": True,  # ask export() to hand back the generated paths
}
base_path, onnx_path = QEfficient.export(**export_options)

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

  if batch_size <= 0:
  assert value.shape[2] == seq_length
  attention_mask_RetainedState=attention_mask_retained if past_length > 0 else None,


verbose: False, log level: Level.ERROR



logits 		 7.62939453125e-05
attention_mask_RetainedState 		 0.0
past_keys (mean) 		 2.635022004445394e-06
past_value (mean) 		 5.5730342864990234e-06




logits 		 7.62939453125e-05
attention_mask_RetainedState 		 0.0
past_keys (mean) 		 2.635022004445394e-06
past_value (mean) 		 5.5730342864990234e-06




##### Compile the Optimized KV Cache Single Model on Cloud AI 100 (**Config: 14C;32PL;128CTX;FP16**)

In [4]:
# Please use the platform SDK to check the number of cores available on your card,
# and adjust NUM_CORES to match.
NUM_CORES = 14

# The compiled binaries are placed relative to the parent directory of `base_path`.
qpc_parent_dir = os.path.dirname(base_path)

generated_qpc_path = QEfficient.compile(
    onnx_path=onnx_path,
    num_cores=NUM_CORES,
    qpc_path=qpc_parent_dir,
    mxfp6=False,
    device_group=[0],
)

Running AI 100 compiler: /opt/qti-aic/exec/qaic-exec -m=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/onnx/gpt2_kv_clipped_fp16.onnx -aic-hw -aic-hw-version=2.0 -network-specialization-config=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/specializations.json -convert-to-fp16 -retained-state -aic-num-cores=14 -custom-IO-list-file=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/onnx/custom_io_fp16.yaml -compile-only -aic-binary-dir=/local/mnt/workspace/open-source/myown/efficient-transformers/qeff_models/gpt2/qpcs


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)






##### Execute the Optimized KV Model on H/W and Print the Latency Stats *(tok/sec)*

In [5]:
from QEfficient.generation.text_generation_inference import get_compilation_batch_size

# After compilation, run the KV model on AI 100 and print the generated tokens and latency stats.
# The compiled qpc is required for this step; generation is based on the greedy sampling approach.
device_ids = [0]
prompts = ["My name is"]

# The batch size must match the one the qpc was compiled with, so read it back from the qpc.
batch_size = get_compilation_batch_size(generated_qpc_path)
QEfficient.cloud_ai_100_exec_kv(
    batch_size=batch_size,
    tokenizer=tokenizer,
    qpc_path=generated_qpc_path,
    device_id=device_ids,
    prompt=prompts,
)

0 My name is  John .  I 'm  a  man  of  God .  I 'm  a  man  of  God .  I 'm  a  man  of  God .  I 'm  a  man  of  God .  I 'm  a  man  of  God .  I 'm  a  man  of  God .  I 'm  a  man  of  God .  I 'm  a  man  of  God .  I 'm  a  man  of  God .  I 'm  a  man  of  God .  I 'm  a  man  of  God .  I 'm  a  man  of  God .  I 'm  a  man  of  God .  I 'm  a  man 

Prefill time a.k.a TTFT is= 0.01 s
Decode: 220.31 tok/s
E2E: 216.88 tok/s
Total (E2E) inference time is= 0.44 s
