### Demonstrate the LLM GPT2 Model OnBoarding on Cloud AI 100 Platform

##### Download the open-source GPT2-based Hugging Face model and save it in the local *Cache* directory

In [None]:
# Instantiate the original Transformer model
import os

from transformers import AutoTokenizer
from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel

from QEfficient.utils import hf_download
from QEfficient.utils.constants import Constants

# Optional: uncomment the next line and point it at a suitable cache directory for
# transformers if you do not want to use the default ~/.cache directory.
# os.environ["TRANSFORMERS_CACHE"] = "/local/mnt/workspace/hf_cache"

# Parent directory of the current working directory (abspath("") resolves to the cwd).
ROOT_DIR = os.path.dirname(os.path.abspath(""))

# Hugging Face model-card name of the model to be onboarded: https://huggingface.co/gpt2
# Other model cards (e.g. gpt2-xl) can be used the same way, provided support for them
# has been added to the library.
model_name = "gpt2"

# Download the model repository into the library's cache directory, passing a list of
# file patterns to ignore. The returned value is used below as the local model path.
# NOTE(review): the keyword is spelled `ignore_pattrens` here — confirm it matches
# the parameter name in hf_download's signature.
model_hf_path = hf_download(
    repo_id=model_name,
    cache_dir=Constants.CACHE_DIR,
    ignore_pattrens=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf"],
)
# Load the original (untransformed) GPT2 model from the downloaded files with
# use_cache=True, switch it to eval mode, and print it for inspection.
model_hf = GPT2LMHeadModel.from_pretrained(model_hf_path, use_cache=True)
model_hf.eval()
print(f"{model_name} from hugging-face \n", model_hf)

##### Now we modify the GPT2 classes using the optimized software library to generate a model for Cloud AI 100.
##### The generated model includes the following optimizations:

* RMS Norm Fixes for FP16 Overflows and Underflows
* Causal Mask Fix
* Handling FP16 Overflows.
* KV Cache (Retention Changes).
* Triu/Tril Ops support.

In [None]:
import QEfficient

# Easy, minimal API to update the model: a single call takes the Hugging Face model
# loaded above and returns the transformed model.
model_transformed = QEfficient.transform(model_hf, type="Transformers", form_factor="cloud")

# Put the transformed model in eval mode and print it so it can be compared with
# the original model printed in the previous cell.
model_transformed.eval()
print("Model after Optimized transformations \n", model_transformed)

##### Export the Optimized Pytorch model to the Onnx Framework

In [None]:
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter

# We can now export the modified model to the ONNX framework.
# This generates a single ONNX model for both the prefill and decode variations,
# optimized for the Cloud AI 100 platform.

# The export will:
#   - generate the ONNX model and clip the overflow constants to fp16,
#   - verify the model on ONNX Runtime vs. PyTorch,
#   - generate the inputs and the custom-IO yaml file required for compilation.

# KV-style models are generated with the flag "kv".
# BERT-style models do not have any optimization w.r.t. KV cache changes and are the unoptimized version.
# It is recommended to use kv=True for better performance.

# NOTE(review): `use_cache=True` is passed to the tokenizer loader here as well —
# confirm it has any effect for a tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_hf_path, use_cache=True)
# With return_path=True the converter's result is unpacked into two paths, both of
# which are passed to the compile step in the next cell.
base_path, onnx_path = qualcomm_efficient_converter(
    model_kv=model_transformed,
    model_name=model_name,
    kv=True,
    form_factor="cloud",
    return_path=True,
    tokenizer=tokenizer,
)

##### Compile the Optimized KV Cache Single Model on Cloud AI 100 (**Config; 14C;32PL;128CTX;MXFP6**)

In [None]:
# Please use the platform SDK to check the number of cores (num_cores) available on your card.

# The compile entry point is imported under an explicit alias so that it does not
# rebind the name of Python's built-in compile() for the rest of the notebook session.
from QEfficient.cloud.compile import main as compile_qpc

# Compile the exported ONNX model (onnx_path) for device group [0] using 14 cores
# with mxfp6 enabled; base_path is passed as the qpc_path argument.
# The returned value is kept as the path of the generated QPC and is passed to the
# execution step in the next cell.
generated_qpc_path = compile_qpc(
    onnx_path=onnx_path,
    num_cores=14,
    qpc_path=base_path,
    mxfp6=True,
    device_group=[0],
)

##### Execute the Optimized KV Model on H/W and Print the Latency Stats *(tok/sec)*

In [None]:
from QEfficient.generation.text_generation_inference import latency_stats_kv

# After compilation, we can print the latency stats for the KV model. The library provides
# an API to print the generated tokens and latency stats on AI 100.
# The compiled prefill and decode QPC is needed to compute the generated tokens.
# Generation is based on a greedy sampling approach.
latency_stats_kv(tokenizer=tokenizer, qpc=generated_qpc_path, device_id=[0], prompt="My name is")