# API vs Direct Chat Manual Inspection Notebook
This notebook provides two helper functions:
1. **api_chat** - calls the running BeautyAI REST API.
2. **direct_chat** - invokes the internal ChatService directly.

You can: 
- Ask the *same* question with identical parameters.
- Inspect whether responses match.
- Compare output structure / format.
- Check whether the same ChatService instance is reused across calls (`direct_chat` returns its `id()` as `chat_service_id`, which appears in the printed output).

No regex cleaning, no auto comparison; just raw outputs for your manual review.

In [16]:
import sys, json, logging, requests
from pathlib import Path
from typing import Any, Dict

# Silence noisy loggers: request a CRITICAL root level, then pin each chatty
# library logger to CRITICAL as well.
logging.basicConfig(level=logging.CRITICAL)
_NOISY_LOGGER_NAMES = ("uvicorn", "uvicorn.error", "uvicorn.access", "httpx", "transformers", "torch")
for noisy in _NOISY_LOGGER_NAMES:
    logging.getLogger(noisy).setLevel(logging.CRITICAL)

# Add backend source to the import path.
# NOTE(review): Path('..').resolve().parent is the grandparent of the working
# directory - confirm that is where the backend `src` directory lives relative
# to this notebook.
backend_src = Path('..').resolve().parent / 'src'
# Guard the insert so re-running this cell does not stack duplicate sys.path entries.
if str(backend_src) not in sys.path:
    sys.path.insert(0, str(backend_src))

from beautyai_inference.services.inference.chat_service import ChatService  # type: ignore
from beautyai_inference.config.config_manager import AppConfig  # type: ignore
from beautyai_inference.services.model.registry_service import ModelRegistryService  # type: ignore

# Base URL of the locally running BeautyAI REST API that api_chat posts to.
API_BASE_URL = 'http://127.0.0.1:8000'
# Module-level instances shared by every direct_chat call. direct_chat reports
# id(_CHAT_SERVICE) so reuse of the same ChatService instance can be checked.
_CHAT_SERVICE = ChatService()
_APP_CONFIG = AppConfig()
_MODEL_REGISTRY_SERVICE = ModelRegistryService()

def api_chat(model_name: str, message: str, **params) -> Dict[str, Any]:
    """POST a chat request to the REST API and return the decoded JSON body.

    Args:
        model_name: Sent as the ``model_name`` field of the JSON payload.
        message: Sent as the ``message`` field of the JSON payload.
        **params: Extra fields merged into the payload unchanged.

    Returns:
        The parsed JSON response. If the request cannot be completed or the
        body is not valid JSON, a dict with ``error``, ``status_code`` and
        ``text`` keys is returned instead (``status_code`` is ``None`` when no
        HTTP response was received).
    """
    payload = {'model_name': model_name, 'message': message} | params
    try:
        r = requests.post(f'{API_BASE_URL}/inference/chat', json=payload, timeout=60)
    except requests.RequestException as exc:
        # Server down or timed out: report it in the same shape as an
        # undecodable body so the calling cell can still run the direct path.
        return {'error': True, 'status_code': None, 'text': str(exc)}
    try:
        return r.json()
    except ValueError:
        # Response.json() raises a ValueError subclass when the body is not JSON.
        return {'error': True, 'status_code': r.status_code, 'text': r.text}

def direct_chat(model_name: str, message: str, **params) -> Dict[str, Any]:
    """Run one chat turn through the in-process ChatService, bypassing the REST API.

    Args:
        model_name: Name used to look the model up in the registry; also passed
            to the chat service.
        message: User message for this turn.
        **params: ``disable_content_filter`` is forwarded as its own argument
            (default ``False``); ``enable_thinking`` is removed and not
            forwarded; every other key becomes part of ``generation_config``.

    Returns:
        A dict with ``final_content``, ``detected_language``, ``session_id``,
        ``chat_service_id`` and ``model_name``, or ``{'error': ...}`` when the
        model is not found in the registry.
    """
    # Load the registry only when the config has none yet (attribute missing or falsy).
    current_registry = getattr(_APP_CONFIG, 'model_registry', None)
    if not current_registry:
        _APP_CONFIG.load_model_registry()

    model_cfg = _MODEL_REGISTRY_SERVICE.get_model(_APP_CONFIG, model_name)
    if model_cfg is None:
        return {'error': f'model {model_name} not in registry'}

    # Keep these two keys out of generation_config.
    # NOTE(review): enable_thinking is dropped here and never reaches
    # ChatService.chat, whereas api_chat sends it in the payload - confirm this
    # asymmetry is intended when comparing the two paths.
    excluded_keys = ('disable_content_filter', 'enable_thinking')
    gen_cfg = {key: value for key, value in params.items() if key not in excluded_keys}
    skip_content_filter = params.get('disable_content_filter', False)

    chat_output = _CHAT_SERVICE.chat(
        message=message,
        model_name=model_name,
        model_config=model_cfg,
        generation_config=gen_cfg,
        conversation_history=[],
        response_language='auto',
        session_id=None,
        disable_content_filter=skip_content_filter,
    )
    # The third element of the returned tuple is not used here.
    response, detected_language, _, session_id = chat_output

    summary: Dict[str, Any] = {}
    summary['final_content'] = response
    summary['detected_language'] = detected_language
    summary['session_id'] = session_id
    summary['chat_service_id'] = id(_CHAT_SERVICE)
    summary['model_name'] = model_name
    return summary
# Confirmation that the cell ran to completion and both helpers are defined.
print('Functions ready: api_chat, direct_chat')

Functions ready: api_chat, direct_chat


In [17]:
# Example usage: send one question through both paths with identical parameters.
# Adjust model, message, and params as needed.
model_name = 'qwen3-unsloth-q4ks'  # change if needed
message = '/no_think What is botox?'
params = dict(
    temperature=0.0,
    top_p=0.95,
    max_new_tokens=200,
    do_sample=False,
    disable_content_filter=True,
    enable_thinking=False,
)

api_result = api_chat(model_name, message, **params)
direct_result = direct_chat(model_name, message, **params)

# Dump both results untouched, API first, for side-by-side manual review.
for heading, raw_result in (
    ('=== API RESULT RAW JSON ===', api_result),
    ('=== DIRECT RESULT RAW JSON ===', direct_result),
):
    print(heading)
    print(json.dumps(raw_result, ensure_ascii=False, indent=2))

=== API RESULT RAW JSON ===
{
  "success": true,
  "data": null,
  "timestamp": "2025-08-22T02:24:52.188558Z",
  "execution_time_ms": 127.55966186523438,
  "response": "…",
  "session_id": "default",
  "model_name": "qwen3-unsloth-q4ks",
  "generation_stats": {
    "model_info": {},
    "generation_config_used": {
      "temperature": 0.0,
      "top_p": 0.95,
      "top_k": 20,
      "repetition_penalty": 1.05,
      "max_new_tokens": 200,
      "do_sample": false,
      "enable_thinking": false
    },
    "content_filter_config": {
      "strictness_level": "disabled"
    },
    "performance": {
      "total_time_ms": 127.55966186523438,
      "generation_time_ms": 125.53858757019043,
      "tokens_generated": 1,
      "tokens_per_second": 7.96567827753268,
      "thinking_tokens": 0
    }
  },
  "effective_config": {
    "temperature": 0.0,
    "top_p": 0.95,
    "top_k": 20,
    "repetition_penalty": 1.05,
    "max_new_tokens": 200,
    "do_sample": false,
    "enable_thinking": fa

In [18]:
# Simple test to debug the model response generation
import requests

# Test just the API with a simple question
test_payload = {
    "model_name": "qwen3-unsloth-q4ks",
    "message": "ما هي الفائدة الرئيسية لعلاج الضوء النبدي المكثف؟",
    "temperature": 0.7,
    "max_new_tokens": 150,
    "disable_content_filter": True,
}

print("Testing API with Arabic question...")
response = requests.post('http://127.0.0.1:8000/inference/chat', json=test_payload, timeout=60)
print(f"Status Code: {response.status_code}")

if response.status_code == 200:
    result = response.json()
    # Coerce to str before slicing: a JSON null in `response` would otherwise
    # raise TypeError on the [:500] slice.
    response_text = str(result.get('response', 'No response field'))
    print(f"Response: {response_text[:500]}")
    # NOTE(review): the recorded output shows no `language` key in the API
    # response - confirm the intended field name.
    print(f"Language: {result.get('language', 'No language field')}")
else:
    print(f"Error: {response.text}")

Testing API with Arabic question...


Status Code: 200
Response: <think>
Okay, the user is asking about the main benefit of Intense Pulsed Light (IPL) treatment. Let me recall what IPL is used for. It's a non-invasive procedure that uses broad-spectrum light to target various skin issues.
First, I should mention the primary uses: things like sun damage, age spots, freckles, and uneven skin tone. Also, it can help with vascular issues like spider veins and rosacea. Maybe also mention hair removal as another application.
But the user specifically asked for the 
Language: No language field


In [13]:
# Test the /no_think instruction specifically.
# The recorded output for this cell still begins with <think>.
test_payload_no_think = {
    "model_name": "qwen3-unsloth-q4ks",
    "message": "ما هي الفائدة الرئيسية لعلاج الضوء النبدي المكثف؟ /no_think",
    "temperature": 0.7,
    "max_new_tokens": 150,
    "disable_content_filter": True,
}

print("Testing with /no_think instruction...")
response = requests.post('http://127.0.0.1:8000/inference/chat', json=test_payload_no_think, timeout=60)
print(f"Status Code: {response.status_code}")

if response.status_code == 200:
    result = response.json()
    print(f"Response: {result.get('response', 'No response field')}")
    print(f"Language: {result.get('language', 'No language field')}")
    # `or ''` keeps len() from raising TypeError when `response` is a JSON null.
    print(f"Response Length: {len(result.get('response') or '')}")
else:
    print(f"Error: {response.text}")

Testing with /no_think instruction...
Status Code: 200
Response: <think>
Okay, the user is asking about the main benefit of Intense Pulsed Light (IPL) treatment. Let me recall what IPL is used for. It's a non-invasive procedure that uses broad-spectrum light to target various skin issues.
First, I should mention the primary uses: things like sun damage, pigmentation, and vascular lesions. The main benefit here is improving skin texture and appearance by reducing these issues. Also, IPL can help with hair removal, but the user might be more interested in skin rejuvenation.
I need to make sure the answer is clear and concise, in Arabic only. Avoid medical jargon so it's easy for patients to understand. Highlight the key points: treating pigmentation, sunspots
Language: No language field
Response Length: 702


In [19]:
# Test the fixed API with enable_thinking=False explicitly
test_payload_fixed = {
    "model_name": "qwen3-unsloth-q4ks",
    "message": "ما هي الفائدة الرئيسية لعلاج الضوء النبدي المكثف؟",
    "temperature": 0.3,
    "top_p": 0.95,
    "max_new_tokens": 150,
    "disable_content_filter": True,
    "enable_thinking": False,  # Explicitly disable thinking mode
}

print("Testing with enable_thinking=False...")
response = requests.post('http://127.0.0.1:8000/inference/chat', json=test_payload_fixed, timeout=60)
print(f"Status Code: {response.status_code}")

if response.status_code == 200:
    result = response.json()
    print(f"Response: {result.get('response', 'No response field')}")
    print(f"Language: {result.get('language', 'No language field')}")
    # `or ''` keeps len() from raising TypeError when `response` is a JSON null.
    print(f"Response Length: {len(result.get('response') or '')}")
    print(f"Thinking Enabled: {result.get('thinking_enabled', 'N/A')}")
else:
    print(f"Error: {response.text}")

Testing with enable_thinking=False...


Status Code: 200
Response: العلاج بالضوء النبدي المكثف (IPL) يُستخدم بشكل رئيسي لتحسين مظهر البشرة من خلال تقليل البقع الداكنة، التصبغات، والتهابات البشرة. كما يساعد في تقليل ظهور الشعر الزائد وتحسين نضارة البشرة.
Language: No language field
Response Length: 186
Thinking Enabled: False


In [15]:
# Check recent API logs for debug messages about enable_thinking
import subprocess

print("Checking recent API logs for enable_thinking debug messages...")
try:
    # NOTE(review): this runs journalctl through sudo - confirm the kernel's
    # user can do that without an interactive password prompt.
    result = subprocess.run([
        "sudo", "journalctl", "-u", "beautyai-api.service", 
        "--since", "2 minutes ago", "--no-pager"
    ], capture_output=True, text=True, timeout=10)

    if result.returncode != 0:
        # Surface command failures instead of silently printing nothing.
        print(f"journalctl exited with code {result.returncode}: {result.stderr.strip()}")

    keywords = ('enable_thinking', 'chat template', 'fallback', 'debug')
    lines = result.stdout.split('\n')
    relevant_lines = [
        line for line in lines
        if any(keyword in line.lower() for keyword in keywords)
    ]

    if not relevant_lines:
        # Distinguish "nothing matched" from "cell produced no output".
        print("No matching log lines found.")
    for line in relevant_lines[-10:]:  # Show last 10 relevant lines
        print(line)

except Exception as e:
    print(f"Error checking logs: {e}")

Checking recent API logs for enable_thinking debug messages...


In [None]:
# Test remote API availability and streaming voice status
import requests
import ssl
import urllib3

# Disable SSL warnings for self-signed certificates.
# The requests in this cell pass verify=False, which would otherwise emit an
# InsecureRequestWarning per call; this suppresses that warning category.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def test_remote_api(base_url: str = "https://api.gmai.sa", verify: bool = False):
    """Probe a remote deployment: health, streaming-voice status, and one chat call.

    Each probe runs in its own try/except, so a failure in one is printed and
    the remaining probes still run. Results are printed; nothing is returned.

    Args:
        base_url: Root URL of the deployment to probe. A trailing slash is
            tolerated.
        verify: Passed to ``requests`` as ``verify``. The default ``False``
            keeps the original behaviour of skipping TLS certificate
            verification; pass ``True`` (or a CA bundle path) to verify the
            server certificate.
    """
    base_url = base_url.rstrip("/")

    # Test basic API health
    try:
        response = requests.get(f"{base_url}/health", verify=verify, timeout=10)
        print(f"API Health Status: {response.status_code}")
        if response.status_code == 200:
            print(f"API Health Response: {response.json()}")
    except Exception as e:
        print(f"API Health Check Failed: {e}")

    # Test streaming voice status endpoint
    try:
        response = requests.get(f"{base_url}/api/v1/ws/streaming-voice/status", verify=verify, timeout=10)
        print(f"Streaming Voice Status: {response.status_code}")
        if response.status_code == 200:
            print(f"Streaming Voice Response: {response.json()}")
        else:
            print(f"Error Response: {response.text}")
    except Exception as e:
        print(f"Streaming Voice Status Check Failed: {e}")

    # Test basic inference endpoint
    try:
        test_payload = {
            "model_name": "qwen3-unsloth-q4ks",
            "message": "Hello test",
            "disable_content_filter": True,
            "max_new_tokens": 10
        }
        response = requests.post(f"{base_url}/inference/chat", json=test_payload, verify=verify, timeout=30)
        print(f"Inference Test Status: {response.status_code}")
        if response.status_code == 200:
            result = response.json()
            # `or ''` keeps len() from raising TypeError when `response` is a JSON null.
            print(f"Inference Test Success - Response length: {len(result.get('response') or '')}")
        else:
            print(f"Inference Error: {response.text[:200]}")
    except Exception as e:
        print(f"Inference Test Failed: {e}")

# Run the remote checks; results are printed by the function itself.
test_remote_api()