In [2]:
# Test merged model before GGUF conversion
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# from peft import PeftModel, PeftConfig
from huggingface_hub import login
from dotenv import load_dotenv

In [3]:
# Read a local .env file into the process environment; override=True lets
# values from .env replace variables that are already set.
load_dotenv(override=True)

# Model identifiers used by the cells below.
BASE_MODEL_PATH = "Qwen/Qwen2.5-0.5B-Instruct"
LORA_MODEL_PATH = "rtweera/qwen_choreo_ft_lora"
MERGED_MODEL_PATH = "rtweera/qwen_choreo_merged"

# Authenticate against the Hugging Face Hub with the token read from the
# `hf_token` environment variable (never hard-code the token here).
login(
    token=os.getenv('hf_token'),
)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\Ravindu\.cache\huggingface\token
Login successful


In [4]:

# Option 1: try to load the merged model directly (works if it was already pushed)
print("Testing already merged model from Hub...")
try:
    # Same loading options as before: bfloat16 weights, automatic device placement.
    load_kwargs = {
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
    }
    merged_model = AutoModelForCausalLM.from_pretrained(MERGED_MODEL_PATH, **load_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(MERGED_MODEL_PATH)
except Exception as load_error:
    # Any failure is reported and recorded in the flag rather than raised.
    print(f"Could not load already merged model: {load_error}")
    using_already_merged = False
else:
    using_already_merged = True
    print("Successfully loaded already merged model.")


Testing already merged model from Hub...


config.json:   0%|          | 0.00/710 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.54k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

Successfully loaded already merged model.


In [None]:

# Option 2: Create merged model fresh if Option 1 failed
# NOTE: this fallback is currently disabled (every line below is commented
# out), and the `PeftModel` import it needs is commented out in the imports
# cell as well. While it stays disabled, a failed Option 1 load does not
# define `merged_model` or `tokenizer` for the cells that follow.
# if not using_already_merged:
#     print("Creating fresh merged model...")
#     # Load base model and tokenizer
#     base_model = AutoModelForCausalLM.from_pretrained(
#         BASE_MODEL_PATH,
#         torch_dtype=torch.bfloat16,
#         device_map="auto"
#     )
#     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)

#     # Load LoRA and merge
#     peft_model = PeftModel.from_pretrained(base_model, LORA_MODEL_PATH)
#     merged_model = peft_model.merge_and_unload()
#     print("Successfully created fresh merged model.")


In [5]:

# Test the model
def test_model(prompt, max_tokens=100):
    print(f"\nTesting with prompt: '{prompt}'")
    formatted_test = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
    inputs = tokenizer(formatted_test, return_tensors="pt").to(merged_model.device)
    
    print("Generating response...")
    outputs = merged_model.generate(
        input_ids=inputs["input_ids"],
        max_new_tokens=max_tokens,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=False)
    print("\nModel response:")
    print(response)
    
    # Check if response contains end token
    if "<|im_end|>" in response:
        print("\n✅ Response properly terminated with <|im_end|> token")
    else:
        print("\n❌ Response did NOT terminate with <|im_end|> token")
    
    return response


In [6]:

# Test with a variety of prompts
test_prompts = [
    "What is your name?",
    "How do I configure my applications to scale automatically?",
    "Tell me a short joke",
    "What are the steps to deploy a container?"
]

# test_model samples its output (do_sample=True), so seed torch's RNG first;
# otherwise every "Restart & Run All" prints different responses and the
# termination results cannot be compared between runs on the same setup.
SEED = 42
torch.manual_seed(SEED)

for prompt in test_prompts:
    test_model(prompt)



Testing with prompt: 'What is your name?'
Generating response...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Model response:
<|im_start|>user
What is your name?<|im_end|>
<|im_start|>assistant
Hello! I am not a person, but rather an artificial intelligence designed to answer questions and provide information on various topics. Is there anything specific you would like to know or discuss?<|im_end|>

✅ Response properly terminated with <|im_end|> token

Testing with prompt: 'How do I configure my applications to scale automatically?'
Generating response...

Model response:
<|im_start|>user
How do I configure my applications to scale automatically?<|im_end|>
<|im_start|>assistant
Configuring your applications to scale automatically involves several steps, including setting up your application's infrastructure and implementing appropriate scaling policies. Here’s a step-by-step guide:

### 1. **Understand the Basics**
   - **Application Architecture**: Understand how your application is designed. Is it single-node or multi-node?
   - **Scalability Needs**: Determine what resources (CPU, memory, 

In [7]:

# Check tokenizer special tokens to ensure they're properly configured
print("\nTokenizer special tokens configuration:")
print(tokenizer.special_tokens_map)

# Check generation config
print("\nModel generation config:")
print(merged_model.generation_config)

# Verify the EOS token is properly set. eos_token_id may be None, a single
# int, or a list of ints, so normalise it to a list before testing membership.
configured_eos = merged_model.generation_config.eos_token_id
print("\nEOS Token ID:", configured_eos)
if configured_eos is None:
    eos_ids = []
elif isinstance(configured_eos, int):
    eos_ids = [configured_eos]
else:
    eos_ids = list(configured_eos)

# "<|im_end|>" has to map to exactly one token id. If the tokenizer split it
# into several pieces, taking the last piece would silently yield a wrong id.
im_end_ids = tokenizer.encode("<|im_end|>", add_special_tokens=False)
if len(im_end_ids) != 1:
    print(f"❌ Tokenizer does not encode <|im_end|> as a single token (got {im_end_ids}); cannot verify EOS.")
elif im_end_ids[0] not in eos_ids:
    if eos_ids:
        # EOS is configured, but generation would not stop at the end of the
        # assistant turn because <|im_end|> is not one of the stop ids.
        print("❌ EOS token IDs do not include <|im_end|>! Generation may run past the end of the reply.")
    else:
        print("❌ EOS token ID is not set! This could cause infinite generation.")
    # Try to fix by setting the EOS token (keeping any ids already configured)
    eos_token_id = eos_ids + [im_end_ids[0]] if eos_ids else im_end_ids[0]
    print(f"Setting EOS token ID to: {eos_token_id}")
    merged_model.generation_config.eos_token_id = eos_token_id
    # Test again with fixed configuration
    print("\nTesting again with fixed EOS token:")
    test_model("Tell me another short joke")


Tokenizer special tokens configuration:
{'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}

Model generation config:
GenerationConfig {
  "bos_token_id": 151643,
  "do_sample": true,
  "eos_token_id": [
    151645,
    151643
  ],
  "pad_token_id": 151643,
  "repetition_penalty": 1.1,
  "temperature": 0.7,
  "top_k": 20,
  "top_p": 0.8
}


EOS Token ID: [151645, 151643]
