In [1]:
# Test merged model before GGUF conversion
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# from peft import PeftModel, PeftConfig
from huggingface_hub import login
from dotenv import load_dotenv

In [2]:
# Read the local .env file; override=True asks dotenv to replace variables
# that are already present in the process environment.
load_dotenv(override=True)

# Hugging Face Hub repository id of the fully fine-tuned checkpoint under test.
FFT_MODEL_PATH = "rtweera/qwen-choreo-full-ft-2025-May-18_14-33-09-final"

# Authenticate against the Hub with the token stored in the `hf_token` variable.
login(token=os.environ.get('hf_token'))


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\Ravindu\.cache\huggingface\token
Login successful


In [3]:
# Load the pushed model and its tokenizer from the Hub.
# Both names are used by every later cell, so a failure here must stop the run.
print("Testing already pushed model from Hub...")
try:
    merged_model = AutoModelForCausalLM.from_pretrained(
        FFT_MODEL_PATH,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(FFT_MODEL_PATH)
    print("Successfully loaded fft model.")
except Exception as e:
    print(f"Could not load fft model: {e}")
    # Re-raise instead of swallowing: otherwise `merged_model` / `tokenizer`
    # stay undefined and later cells fail with an unrelated NameError.
    raise


Testing already pushed model from Hub...


config.json:   0%|          | 0.00/683 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

Successfully loaded fft model.


In [4]:

# Test the model
def test_model(prompt, max_tokens=100):
    print(f"\nTesting with prompt: '{prompt}'")
    formatted_test = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
    inputs = tokenizer(formatted_test, return_tensors="pt").to(merged_model.device)
    
    print("Generating response...")
    outputs = merged_model.generate(
        input_ids=inputs["input_ids"],
        max_new_tokens=max_tokens,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=False)
    print("\nModel response:")
    print(response)
    
    # Check if response contains end token
    if "<|im_end|>" in response:
        print("\n✅ Response properly terminated with <|im_end|> token")
    else:
        print("\n❌ Response did NOT terminate with <|im_end|> token")
    
    return response


In [5]:

# Prompts to send through test_model, one generation per entry.
test_prompts = [
    "What is your name?",
    "How do I configure my applications to scale automatically?",
    "Tell me a short joke",
    "What are the steps to deploy a container?",
]

# Each call prints the prompt, the decoded response and the termination check.
for prompt in test_prompts:
    test_model(prompt)



Testing with prompt: 'What is your name?'
Generating response...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Model response:
<|im_start|>user
What is your name?<|im_end|>
<|im_start|>assistant
My name is Claude. How can I assist you today?<|im_end|>

✅ Response properly terminated with <|im_end|> token

Testing with prompt: 'How do I configure my applications to scale automatically?'
Generating response...

Model response:
<|im_start|>user
How do I configure my applications to scale automatically?<|im_end|>
<|im_start|>assistant
To configure your applications to scale automatically, you can follow these steps:\n\n### Step 1: Identify Your Applications\nFirst, identify the applications that are dependent on external resources. These could be databases, servers, or other services.\n\n### Step 2: Determine Resource Requirements\nFor each dependent application, determine how much resource (CPU, memory, disk space) it requires at peak usage. This is crucial for determining the scaling strategy.\n\n### Step 3: Choose a

✅ Response properly terminated with <|im_end|> token

Testing with prompt: 'Te

In [7]:
# Single ad-hoc prompt; as the last expression, the returned text is also displayed.
test_model(prompt="What is choreo by wso2")


Testing with prompt: 'What is choreo by wso2'
Generating response...

Model response:
<|im_start|>user
What is choreo by wso2<|im_end|>
<|im_start|>assistant
Choreo is a managed service for orchestrating and managing containerized applications. It provides orchestration capabilities, which allow you to manage multiple containers in a single console interface using the WSO2 platform. Choreo supports various types of orchestration, including:

1. **Single Container Orchestration**: You can define and run a single application instance with specific configurations.
2. **Multi-Container Orchestration**: You can orchestrate multiple applications across different environments or clusters within your organization.

Ch

✅ Response properly terminated with <|im_end|> token


'<|im_start|>user\nWhat is choreo by wso2<|im_end|>\n<|im_start|>assistant\nChoreo is a managed service for orchestrating and managing containerized applications. It provides orchestration capabilities, which allow you to manage multiple containers in a single console interface using the WSO2 platform. Choreo supports various types of orchestration, including:\r\n\r\n1. **Single Container Orchestration**: You can define and run a single application instance with specific configurations.\r\n2. **Multi-Container Orchestration**: You can orchestrate multiple applications across different environments or clusters within your organization.\r\n\r\nCh'