# Convert Qwen to Llama Architecture

This conversion allows the Qwen model to run on a serving endpoint with provisioned throughput.

[Source](https://www.databricks.com/blog/serving-qwen-models-databricks)

In [None]:
import json
import os
import shutil
from collections import OrderedDict
from pathlib import Path

import huggingface_hub
import torch
from safetensors import safe_open
from safetensors.torch import save_file

# transformers==4.44.2 is required.
from transformers.modeling_utils import (
    SAFE_WEIGHTS_INDEX_NAME,
    SAFE_WEIGHTS_NAME,
    shard_checkpoint,
)

# Hugging Face repository id of the model to download and convert.
model_name = "Qwen/Qwen2.5-Coder-7B"
# Root directory (relative to the working directory) under which the
# downloaded model and its converted copy are stored.
dbfs_root_target = "../fixtures/models"

# Convert Qwen to Llama architecture


In [None]:
# Define a function to convert and save Qwen weights


def save_weight(input_dir: str, output_dir: str, shard_size: str = "2GB") -> None:
    """
    Write a Llama-compatible copy of the Qwen weights in ``input_dir`` to ``output_dir``.

    Every tensor found in the ``*.safetensors`` files of ``input_dir`` is copied
    unchanged. For each ``attn.o_proj.weight`` tensor a zero-filled
    ``attn.o_proj.bias`` is inserted right after it, because Qwen checkpoints
    carry no bias for that projection while the Llama layout targeted here
    expects one. The result is re-sharded and saved as safetensors, together
    with an index file when more than one shard is produced.

    Args:
        input_dir: Directory containing the Qwen ``*.safetensors`` files.
        output_dir: Existing directory that receives the converted shards.
        shard_size: Maximum shard size, forwarded to ``shard_checkpoint``.

    Raises:
        FileNotFoundError: If ``input_dir`` contains no ``.safetensors`` file.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the tensor
    # order (and therefore the shard layout) reproducible across runs.
    weight_files = sorted(
        name for name in os.listdir(input_dir) if name.endswith(".safetensors")
    )
    if not weight_files:
        # Without this check an empty checkpoint would be written silently.
        raise FileNotFoundError(
            f"No .safetensors files found in '{input_dir}'; nothing to convert."
        )

    # Copy tensors and inject bias where needed to match Llama
    llama_state_dict = OrderedDict()
    for filename in weight_files:
        full_path = os.path.join(input_dir, filename)
        with safe_open(full_path, framework="pt", device="cpu") as sf:
            for key in sf.keys():
                tensor = sf.get_tensor(key)
                llama_state_dict[key] = tensor
                if "attn.o_proj.weight" in key:
                    # The bias has one entry per output row of the weight
                    # matrix; new_zeros keeps the weight's dtype and device.
                    bias_key = key.replace("attn.o_proj.weight", "attn.o_proj.bias")
                    llama_state_dict[bias_key] = tensor.new_zeros(tensor.shape[0])

    # Save weights using safetensors
    shards, index = shard_checkpoint(
        llama_state_dict, max_shard_size=shard_size, weights_name=SAFE_WEIGHTS_NAME
    )
    for shard_file, shard_data in shards.items():
        save_path = os.path.join(output_dir, shard_file)
        save_file(shard_data, save_path, metadata={"format": "pt"})

    # shard_checkpoint returns no index when everything fits in a single shard.
    if index is not None:
        index_path = os.path.join(output_dir, SAFE_WEIGHTS_INDEX_NAME)
        with open(index_path, "w", encoding="utf-8") as f:
            json.dump(index, f, indent=2, sort_keys=True)

In [None]:
def save_configs(input_dir: str, output_dir: str) -> None:
    """
    Write a Llama-style ``config.json`` to ``output_dir`` and copy the
    tokenizer and generation files that exist in ``input_dir`` next to it.

    The Qwen config is read from ``input_dir``, its architecture fields are
    overwritten with Llama values, the sliding-window fields are dropped, and
    the result is written with two-space indentation. Auxiliary files that are
    missing from ``input_dir`` are skipped without error.
    """
    config_name = "config.json"

    # Read the original Qwen configuration
    with open(os.path.join(input_dir, config_name), encoding="utf-8") as f:
        qwen_config_dict = json.load(f)

    # Start from a copy of the Qwen settings and override what identifies the
    # model as Llama.
    # NOTE(review): LlamaConfig documents 1 as the default for pretraining_tp -
    # confirm that 0 is intended here.
    llama_config_dict = {**qwen_config_dict}
    llama_config_dict.update(
        {
            "architectures": ["LlamaForCausalLM"],
            "model_type": "llama",
            "attention_bias": True,
            "mlp_bias": False,
            "pretraining_tp": 0,
        }
    )

    # Sliding-window settings are Qwen-specific; drop them when present
    for qwen_only_key in ("sliding_window", "use_sliding_window", "max_window_layers"):
        llama_config_dict.pop(qwen_only_key, None)

    # Persist the rewritten config in the output directory
    with open(os.path.join(output_dir, config_name), "w", encoding="utf-8") as f:
        json.dump(llama_config_dict, f, indent=2)

    # Tokenizer, merges, vocab and generation settings are copied verbatim
    for fname in (
        "generation_config.json",
        "merges.txt",
        "tokenizer.json",
        "tokenizer_config.json",
        "vocab.json",
    ):
        src = os.path.join(input_dir, fname)
        if not os.path.exists(src):
            continue
        shutil.copyfile(src, os.path.join(output_dir, fname))

In [None]:
# Master Function to Orchestrate the Qwen→Llama Conversion


def llamafy_qwen(input_dir: str, output_dir: str) -> None:
    """
    Converts Qwen2.5 into a Llama-like architecture by rewriting weights and configs.
    After this step, the resulting folder can be treated as if it's a Llama model.

    Args:
        input_dir: Directory holding the downloaded Qwen model.
        output_dir: Directory to create for the converted model. It must not
            exist yet.

    Raises:
        FileExistsError: If ``output_dir`` already exists.

    Any error raised while writing weights or configs is re-raised after the
    partially written ``output_dir`` has been removed.
    """
    # exist_ok=False: never write into (or later delete) a directory that was
    # not created by this call.
    os.makedirs(output_dir, exist_ok=False)

    try:
        # Rewrite Qwen weights to add missing biases
        save_weight(input_dir, output_dir)

        # Update config to make it a Llama model and copy other files
        save_configs(input_dir, output_dir)
    except BaseException:
        # A half-written directory would make a retry fail on exist_ok=False
        # and would look like a finished conversion to any caller that only
        # checks whether the directory exists. BaseException is caught so that
        # an interrupted run (KeyboardInterrupt) is cleaned up as well.
        shutil.rmtree(output_dir, ignore_errors=True)
        raise

    print(
        f"Successfully converted Qwen from '{input_dir}' to Llama format at '{output_dir}'."
    )

In [None]:
# Both settings must be non-empty before any path is derived from them.
assert model_name
assert dbfs_root_target

# Directory for the original checkpoint, and the sibling directory
# ("<name>-Llama") that will hold the converted copy.
target_dbfs_path = os.path.join(dbfs_root_target, model_name)
target_dbfs_modified_path = f"{target_dbfs_path}-Llama"

# Make sure the directory that will contain the download exists.
_download_parent = Path(target_dbfs_path).parent
if not os.path.exists(_download_parent):
    os.makedirs(_download_parent)

# Download only when no local copy is present yet.
if os.path.exists(target_dbfs_path):
    print(f"Already exists: {target_dbfs_path}")
else:
    print(f"Downloading to {target_dbfs_path}")
    huggingface_hub.snapshot_download(model_name, local_dir=target_dbfs_path)

In [None]:
# Convert only when the output directory does not exist yet.
if os.path.exists(target_dbfs_modified_path):
    print(f"Already exists: {target_dbfs_modified_path}")
else:
    print(f"Modifying Qwen model and writing to {target_dbfs_modified_path}")
    llamafy_qwen(target_dbfs_path, target_dbfs_modified_path)

# Register Llamafied Qwen Model to MLflow


In [None]:
import mlflow
import mlflow.tracking._model_registry.utils
from dotenv import load_dotenv

# The login and the MLflow model registry are configured this way for local
# development. If you are running this in Databricks, you can remove the login
# and use the regular MLflow registry.

# Load environment variables from a local .env file (presumably the
# credentials that mlflow.login() picks up - verify against your setup).
load_dotenv()

mlflow.login()

# Select the Unity Catalog model registry through the public API rather than
# by monkeypatching the private _get_registry_uri_from_spark_session helper,
# which can change or disappear between MLflow releases.
mlflow.set_registry_uri("databricks-uc")

mlflow.set_experiment("/LamafiedQwen")

In [None]:
import os

# Register the directory written by the conversion step. The previous
# hard-coded value started with "../../fixtures/models" while the conversion
# writes below dbfs_root_target ("../fixtures/models"), so the two steps did
# not refer to the same directory when run from one working directory.
model_path = target_dbfs_modified_path
# Unity Catalog catalog and schema that will contain the registered model.
catalog = "gen_ai_toolkit"
schema = "models"

In [None]:
assert model_path

# Unity Catalog object names may not contain periods, spaces or forward
# slashes, so a Hugging Face repo id such as "Qwen/Qwen2.5-Coder-7B" cannot be
# used as the last part of the three-level registered model name. Derive the
# name from the model directory when model_name is empty or contains one of
# those characters; a name that is already valid is kept as is.
if not model_name or any(ch in model_name for ch in "./ "):
    model_name = model_path.rstrip("/").split("/")[-1]
    model_name = model_name.replace(".", "").lower()

print(f"Model path: {model_path}")
print(f"Model name: {model_name}")

In [None]:
# Modify the tokenizer files and model config in the source directory.
# All files are read and written as UTF-8 explicitly, matching save_configs,
# so the result does not depend on the platform's default encoding.

# Load the tokenizer config
tokenizer_config_path = os.path.join(model_path, "tokenizer_config.json")
with open(tokenizer_config_path, encoding="utf-8") as f:
    tokenizer_config_obj = json.load(f)

# Remove Qwen's chat template since it is not recognized by model serving.
tokenizer_config_obj.pop("chat_template", None)

# Update the tokenizer class so that Databricks sees it as "PreTrainedTokenizerFast",
# since Qwen's tokenizer is not recognized, and it derives from PreTrainedTokenizerFast.
# This also avoids saving additional files during registration that model
# serving would not expect.
tokenizer_config_obj["tokenizer_class"] = "PreTrainedTokenizerFast"

# Model serving expects model_input_names to be specified for Llama models.
tokenizer_config_obj["model_input_names"] = ["input_ids", "attention_mask"]

# Write the updated tokenizer config back
with open(tokenizer_config_path, "w", encoding="utf-8") as f:
    json.dump(tokenizer_config_obj, f, indent=2)

config_path = os.path.join(model_path, "config.json")
with open(config_path, encoding="utf-8") as f:
    config_obj = json.load(f)

# Set 'max_position_embeddings' to 16000 for compatibility with certain throughput settings.
config_obj["max_position_embeddings"] = 16000

with open(config_path, "w", encoding="utf-8") as f:
    json.dump(config_obj, f, indent=2)

# don't need the slow tokenizer files
files_to_delete = ["merges.txt", "vocab.json"]
for file_to_delete in files_to_delete:
    stale_path = os.path.join(model_path, file_to_delete)
    if os.path.exists(stale_path):
        os.remove(stale_path)

In [None]:
# Register the model as a Llama model

# We'll create metadata that references Llama so Databricks sees "LlamaForCausalLM".
# Some of the specific versions/sizes referenced here don't matter for our purposes.
# The important thing is that model serving considers this a Llama model.
# Note that we register this as a completion model and removed the chat template.
# Chat formatting will need to be performed by the client.
task = "llm/v1/completions"
# Metadata attached to the logged model; "task" mirrors the value above.
metadata = {
    "task": task,
    "curation_version": 1,
    "databricks_model_family": "LlamaForCausalLM (llama-3.2)",
    "databricks_model_size_parameters": "7b",
    "databricks_model_source": "genai-fine-tuning",
    "source": "huggingface",
    "source_model_name": "meta-llama/Llama-3.2-3B-Instruct",
    "source_model_revision": "0cb88a4f764b7a12671c53f0838cd831a0843b95",
}

# Sample completion request that is logged with the model as its input example.
input_example = {
    "prompt": "def print_hello_world():",
    "max_tokens": 20,
    "temperature": 0.05,
    "stop": ["\n\n"],
}

In [None]:
# Every part of the registered name must be non-empty.
assert catalog
assert schema
assert model_name

# Three-level name: <catalog>.<schema>.<model>
registered_model_name = f"{catalog}.{schema}.{model_name}"

print(f"Registering model as {registered_model_name}")

# Log the converted model directory inside a run and register it under the
# name built above.
with mlflow.start_run():
    mlflow.transformers.log_model(
        transformers_model=model_path,
        artifact_path="model",
        task=task,
        torch_dtype="bfloat16",
        metadata=metadata,
        input_example=input_example,
        registered_model_name=registered_model_name,
    )