# Dependencies

In [1]:
import json
import os
from pydantic import BaseModel
from typing import List
from bespokelabs import curator
from transformers import AutoModelForCausalLM, AutoTokenizer




# Load Musique Test and Train Data

In [2]:
test_musique = json.load(open("/home/yigit/codebase/gsw-memory/playground_data/musique.json"))
# Load Musique Train Data jsonl to json format
train_musique = [json.loads(line) for line in open("/home/yigit/codebase/gsw-memory/playground_data/musique_full_v1.0_train.jsonl")]

In [3]:
test_musique_questions = {q["id"]: q["question"] for q in test_musique}
train_musique_questions = {q["id"]: q["question"] for q in train_musique}

In [4]:
# compare if there is any overlap between train and test questions
set(train_musique_questions) & set(test_musique_questions)


set()

In [5]:
len(train_musique_questions)

19938

In [6]:
# Print questions by type using train_musique_questions
datapoint_type_counts = {}
seen_ids = set()
duplicate_ids = set()
for qid in train_musique_questions:
    # count by type
    datapoint_type_counts[qid.split("_")[0]] = datapoint_type_counts.get(qid.split("_")[0], 0) + 1
    # check for same id
    if qid in seen_ids:
        duplicate_ids.add(qid)
    else:
        seen_ids.add(qid)
print("Datapoint type counts:", datapoint_type_counts)
if duplicate_ids:
    print("Duplicate IDs found:", duplicate_ids)
else:
    print("No duplicate IDs found.")


Datapoint type counts: {'2hop': 14376, '3hop1': 3737, '4hop2': 127, '4hop1': 648, '4hop3': 400, '3hop2': 650}
No duplicate IDs found.


In [7]:
# get 5 from each type from train_musique
q_type_keys = list(datapoint_type_counts.keys())
train_musique_questions_by_type = {q_type: [] for q_type in q_type_keys}
for q_type in q_type_keys:
    count = 0
    for qid in train_musique_questions:
        if qid.split("_")[0] == q_type:
            train_musique_questions_by_type[q_type].append(qid)
            count += 1
            if count == 135:
                break
            
# print type length
for q_type, qids in train_musique_questions_by_type.items():
    print(f"{q_type}: {len(qids)}")
    
# convert train_musique_questions_by_type to flat list
train_musique_questions_by_type_list = [q for q_type in q_type_keys for q in train_musique_questions_by_type[q_type]]
len(train_musique_questions_by_type_list)




2hop: 135
3hop1: 135
4hop2: 127
4hop1: 135
4hop3: 135
3hop2: 135


802

In [8]:
decompose_inputs = [
    {"question_id": qid, "question": train_musique_questions[qid]}
    for qid in train_musique_questions_by_type_list
]

In [9]:
len(decompose_inputs)

802

# Define Question Decomposition Class

In [10]:
class DecomposedQuestion(BaseModel):
    question: str
    requires_retrieval: bool

class DecomposedQuestionList(BaseModel):
    questions: List[DecomposedQuestion]

class ChainQuestionDecomposer(curator.LLM):
    """Curator class for decomposing multi-hop questions in parallel."""
    
    # return_completions_object = True
    
    def __init__(self, **kwargs):
        """Initialize the question decomposer."""
        super().__init__(**kwargs)
    
    def prompt(self, input):
        """Create a decomposition prompt for each question."""
        decomposition_prompt = f"""Your task is to break down a complex multi-hop question into the most efficient sequence of single-hop, **atomic** questions.

## Your Main Goal: Build Smart Bridges, Don't Just Collect Nouns
The most critical skill is to convert complex logical clauses (like "despite," "the country where," "the year before") into a single, powerful **bridging question**. This question should use a known entity as context to find the next one. Avoid finding all the entities separately and then trying to figure out how they connect.

---
## A Simple Analogy for Efficiency

**Question:** "What is the phone number of the mother of the tallest player on the Lakers?"

** Inefficient Path:**
1.  Who are the players on the Lakers?
2.  What are all their heights?
3.  Who is the mother of the tallest player? *(This step is a logical leap)*

** Efficient Path:**
1.  Who is the tallest player on the Lakers?
2.  Who is the mother of `<ENTITY_Q1>`?
3.  What is the phone number of `<ENTITY_Q2>`?

---
## How to Decompose a Question
This process follows a logical flow from high-level analysis to the fine-tuning of your question chain.

### 1. Analyze the Query's Components
First, break down the original question into its fundamental building blocks. Identify the core **entities** (people, places, organizations), their **properties** (attributes like rank, location, date), and the **relationships** that connect them.

### 2. Construct an Atomic Chain
Next, formulate a sequence of questions where each question retrieves a single fact.
* **Isolate Comparisons:** Don't ask "who is faster?" Ask for the specific rank or time of each person involved.
* **Link with Placeholders:** Use `<ENTITY_Qn>` to pass the answer from a previous question (`Qn`) into the next one.

### 3. Optimize for Efficiency and Precision
Your final goal is the **shortest and most direct path** to the answer.
* **Embed Constraints to Build Bridges:** If a piece of information is only a filter (like a date or location), embed it as a constraint in the next question instead of asking for it directly.
  **Important note for bridges:** There can be no `<ENTITY_Qn>` in the first question if the nth question DOES NOT require retrieval.

## Formatting
Format each decomposed question as follows:

<decomposition>
Question: [the question text]
Requires retrieval: [true/false]

And provide the response in the following json format:
{{
  "questions": [
    {{
      "question": "the decomposed question text",
      "requires_retrieval": "true/false"
    }}
  ]
}}

Examples:

Input: "What is the birth year of the spouse of the director of Casablanca?"
Output:
{{
    "questions": [
        {{
            "question": "Who directed Casablanca?",
            "requires_retrieval": "true"
        }},
        {{
            "question": "Who was <ENTITY_Q1>'s spouse?",
            "requires_retrieval": "true"
        }},
        {{
            "question": "What is <ENTITY_Q2>'s birth year?",
            "requires_retrieval": "true"
        }}
    ]
}}

Input: "Which film has the director who is older, Dune or The Dark Knight?"
Output:
{{
    "questions": [
        {{
            "question": "Who directed Dune?",
            "requires_retrieval": "true"
        }},
        {{
            "question": "Who directed The Dark Knight?",
            "requires_retrieval": "true"
        }},
        {{
            "question": "Who is older, <ENTITY_Q1> or <ENTITY_Q2>?",
            "requires_retrieval": "true"
        }},
        {{
            "question": "Who is older, <ENTITY_Q1> or <ENTITY_Q2>?",
            "requires_retrieval": "false"
        }}
    ]
}}


IMPORTANT:
    AVOID over-decomposition like this:
    DON'T break "Who is John Doe?" into:
    1. Who is John Doe? → "English"
    2. When was <ENTITY_Q1> born? → "When was English born?"

    DO ask directly: "When was John Doe born?"

Now decompose this question:
Input: "{input['question']}"
Output:
"""
        
        return [
            {"role": "system", "content": "You are a helpful assistant that breaks down complex questions into simple steps."},
            {"role": "user", "content": decomposition_prompt}
        ]
    
    def parse(self, input, response: DecomposedQuestionList):
        """Parse the decomposition response."""

        # print(response)
        questions = [{"question" : q.question, "requires_retrieval" : q.requires_retrieval} for q in response.questions]
        
        return [{
            "question_id": input['question_id'],
            "original_question": input['question'],
            "decomposed_questions": questions,
            # "raw_response": decomposition_text
        }]

In [11]:
golden_question_decomposer = ChainQuestionDecomposer(
                model_name="gpt-5",
                # model_name="gpt-4o",
                # generation_params={"temperature": 0.0}, 
                response_format=DecomposedQuestionList
            )

decomposition_dataset = golden_question_decomposer(decompose_inputs)

Output()

In [12]:
decomposition_results = {
            item["question_id"]: item
            for item in decomposition_dataset.dataset
        }

In [13]:
decomposition_results
# print the results as formatted JSON (without needing CustomJSONEncoder)
print(json.dumps(decomposition_results, indent=4, ensure_ascii=False))
with open('q_decomp_training_5.json', 'w', encoding='utf-8') as f:
    json.dump(decomposition_results, f, indent=4, ensure_ascii=False)



{
    "2hop__42543_20093": {
        "question_id": "2hop__42543_20093",
        "original_question": "What year did the writer of Crazy Little Thing Called Love die?",
        "decomposed_questions": [
            {
                "question": "Who wrote \"Crazy Little Thing Called Love\"?",
                "requires_retrieval": true
            },
            {
                "question": "In what year did <ENTITY_Q1> die?",
                "requires_retrieval": true
            }
        ]
    },
    "2hop__269805_135710": {
        "question_id": "2hop__269805_135710",
        "original_question": "What is the country where Nissedal is located named after?",
        "decomposed_questions": [
            {
                "question": "Which country is Nissedal located in?",
                "requires_retrieval": true
            },
            {
                "question": "What is <ENTITY_Q1> named after?",
                "requires_retrieval": true
            }
        ]
    },
  

# Create Training Dataset

In [14]:
# Convert decomposition_results dict to a list
decomposition_list = list(decomposition_results.values())

print(f"Total examples: {len(decomposition_list)}")
print(f"\nExample structure:")
print(f"Keys: {decomposition_list[0].keys()}")
print(f"\nFirst example:")
print(f"Question ID: {decomposition_list[0]['question_id']}")
print(f"Original Question: {decomposition_list[0]['original_question']}")
print(f"Decomposed Questions: {decomposition_list[0]['decomposed_questions']}")

Total examples: 802

Example structure:
Keys: dict_keys(['question_id', 'original_question', 'decomposed_questions'])

First example:
Question ID: 2hop__42543_20093
Original Question: What year did the writer of Crazy Little Thing Called Love die?
Decomposed Questions: [{'question': 'Who wrote Crazy Little Thing Called Love?', 'requires_retrieval': True}, {'question': 'What year did <ENTITY_Q1> die?', 'requires_retrieval': True}]


In [15]:
from datasets import Dataset

# Create HuggingFace Dataset from the decomposition list
raw_dataset = Dataset.from_list(decomposition_list)

print(f"Dataset info:")
print(raw_dataset)
print(f"\nColumn names: {raw_dataset.column_names}")
print(f"\nFirst example:")
print(raw_dataset[0])

Dataset info:
Dataset({
    features: ['question_id', 'original_question', 'decomposed_questions'],
    num_rows: 802
})

Column names: ['question_id', 'original_question', 'decomposed_questions']

First example:
{'question_id': '2hop__42543_20093', 'original_question': 'What year did the writer of Crazy Little Thing Called Love die?', 'decomposed_questions': [{'question': 'Who wrote Crazy Little Thing Called Love?', 'requires_retrieval': True}, {'question': 'What year did <ENTITY_Q1> die?', 'requires_retrieval': True}]}


In [16]:
import json

def create_chat_messages(example):
    """
    Convert a single example into chat format for training.
    
    Args:
        example: Dict with 'original_question' and 'decomposed_questions' keys
    
    Returns:
        Dict with 'messages' key containing the chat-formatted data
    """
    original_question = example['original_question']
    decomposed_questions = example['decomposed_questions']
    
    # Serialize the decomposed questions to JSON format (this is what the model should output)
    assistant_response = json.dumps(
        {"questions": decomposed_questions},
        indent=4,
        ensure_ascii=False
    )
    
    # Create the instruction prompt for the user
    user_prompt = f"""Your task is to break down a complex multi-hop question into the most efficient sequence of single-hop, **atomic** questions.

## Your Main Goal: Build Smart Bridges, Don't Just Collect Nouns
The most critical skill is to convert complex logical clauses (like "despite," "the country where," "the year before") into a single, powerful **bridging question**. This question should use a known entity as context to find the next one. Avoid finding all the entities separately and then trying to figure out how they connect.

---
## A Simple Analogy for Efficiency

**Question:** "What is the phone number of the mother of the tallest player on the Lakers?"

** Inefficient Path:**
1.  Who are the players on the Lakers?
2.  What are all their heights?
3.  Who is the mother of the tallest player? *(This step is a logical leap)*

** Efficient Path:**
1.  Who is the tallest player on the Lakers?
2.  Who is the mother of `<ENTITY_Q1>`?
3.  What is the phone number of `<ENTITY_Q2>`?

---
## How to Decompose a Question
This process follows a logical flow from high-level analysis to the fine-tuning of your question chain.

### 1. Analyze the Query's Components
First, break down the original question into its fundamental building blocks. Identify the core **entities** (people, places, organizations), their **properties** (attributes like rank, location, date), and the **relationships** that connect them.

### 2. Construct an Atomic Chain
Next, formulate a sequence of questions where each question retrieves a single fact.
* **Isolate Comparisons:** Don't ask "who is faster?" Ask for the specific rank or time of each person involved.
* **Link with Placeholders:** Use `<ENTITY_Qn>` to pass the answer from a previous question (`Qn`) into the next one.

### 3. Optimize for Efficiency and Precision
Your final goal is the **shortest and most direct path** to the answer.
* **Embed Constraints to Build Bridges:** If a piece of information is only a filter (like a date or location), embed it as a constraint in the next question instead of asking for it directly.
**Important note for bridges:** There can be no `<ENTITY_Qn>` in the first question if the nth question DOES NOT require retrieval.

## Formatting
Format each decomposed question as follows:

<decomposition>
Question: [the question text]
Requires retrieval: [true/false]

And provide the response in the following json format:
{{
  "questions": [
    {{
      "question": "the decomposed question text",
      "requires_retrieval": "true/false"
    }}
  ]
}}

Examples:

Input: "What is the birth year of the spouse of the director of Casablanca?"
Output:
{{
    "questions": [
        {{
            "question": "Who directed Casablanca?",
            "requires_retrieval": "true"
        }},
        {{
            "question": "Who was <ENTITY_Q1>'s spouse?",
            "requires_retrieval": "true"
        }},
        {{
            "question": "What is <ENTITY_Q2>'s birth year?",
            "requires_retrieval": "true"
        }}
    ]
}}

Input: "Which film has the director who is older, Dune or The Dark Knight?"
Output:
{{
    "questions": [
        {{
            "question": "Who directed Dune?",
            "requires_retrieval": "true"
        }},
        {{
            "question": "Who directed The Dark Knight?",
            "requires_retrieval": "true"
        }},
        {{
            "question": "Who is older, <ENTITY_Q1> or <ENTITY_Q2>?",
            "requires_retrieval": "true"
        }},
        {{
            "question": "Who is older, <ENTITY_Q1> or <ENTITY_Q2>?",
            "requires_retrieval": "false"
        }}
    ]
}}


IMPORTANT:
    AVOID over-decomposition like this:
    DON'T break "Who is John Doe?" into:
    1. Who is John Doe? → "English"
    2. When was <ENTITY_Q1> born? → "When was English born?"

    DO ask directly: "When was John Doe born?"

Now decompose this question:
Input: "{original_question}"
Output:
"""
    
    # Create the chat messages in the format expected by chat models
    messages = [
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": assistant_response},
    ]
    
    return {"messages": messages}


print("Preprocessing function created successfully!")

Preprocessing function created successfully!


In [17]:
# Apply the preprocessing to create the final training dataset
training_dataset = raw_dataset.map(
    create_chat_messages,
    remove_columns=raw_dataset.column_names,  # Remove original columns, keep only 'messages'
    desc="Creating chat-formatted training data"
)

print(f"Training dataset created!")
print(training_dataset)
print(f"\nColumn names: {training_dataset.column_names}")
print(f"\nFirst example messages:")
print(f"User message (first 500 chars): {training_dataset[0]['messages'][0]['content'][:500]}...")
print(f"\nAssistant response (first 500 chars): {training_dataset[0]['messages'][1]['content'][:500]}...")

Creating chat-formatted training data:   0%|          | 0/802 [00:00<?, ? examples/s]

Training dataset created!
Dataset({
    features: ['messages'],
    num_rows: 802
})

Column names: ['messages']

First example messages:
User message (first 500 chars): Your task is to break down a complex multi-hop question into the most efficient sequence of single-hop, **atomic** questions.

## Your Main Goal: Build Smart Bridges, Don't Just Collect Nouns
The most critical skill is to convert complex logical clauses (like "despite," "the country where," "the year before") into a single, powerful **bridging question**. This question should use a known entity as context to find the next one. Avoid finding all the entities separately and then trying to figure o...

Assistant response (first 500 chars): {
    "questions": [
        {
            "question": "Who wrote Crazy Little Thing Called Love?",
            "requires_retrieval": true
        },
        {
            "question": "What year did <ENTITY_Q1> die?",
            "requires_retrieval": true
        }
    ]
}...


# Test Formatting Function (Run Before Training)

In [18]:
# Test that the chat template works correctly before training
print("Testing chat template compatibility...")

# Load tokenizer for testing
test_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

# Configure padding
if test_tokenizer.pad_token is None:
    test_tokenizer.pad_token = test_tokenizer.eos_token
    test_tokenizer.pad_token_id = test_tokenizer.eos_token_id

test_tokenizer.padding_side = 'right'

# Check if chat template exists
if hasattr(test_tokenizer, 'chat_template') and test_tokenizer.chat_template:
    print("✓ Chat template found!")
    print(f"  Template preview (first 200 chars): {str(test_tokenizer.chat_template)[:200]}...")
else:
    print("✗ WARNING: No chat template found! This may cause errors.")
    print("  Consider using an instruct-tuned model variant.")

# Test with a sample from the training dataset
print("\nTesting formatting with sample data...")
try:
    sample = training_dataset[0]
    print(f"Sample messages structure: {list(sample.keys())}")
    
    # Test the formatting function
    formatted = test_tokenizer.apply_chat_template(
        sample["messages"],
        tokenize=False,
        add_generation_prompt=False
    )
    
    print(f"\n✓ Formatting successful!")
    print(f"  Original message length: {len(str(sample['messages']))}")
    print(f"  Formatted text length: {len(formatted)}")
    print(f"\nFormatted output preview (first 500 chars):")
    print(formatted[:500])
    print("\n... [truncated] ...")
    print(f"\nLast 200 chars:")
    print(formatted[-200:])
    
except Exception as e:
    print(f"\n✗ ERROR during formatting: {e}")
    print("  You may need to adjust the formatting function or use a different model.")

print("\n" + "="*60)
print("Test complete! Review the output above before training.")

Testing chat template compatibility...
✓ Chat template found!
  Template preview (first 200 chars): {%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "# Tools\n\nYou may call one or more f...

Testing formatting with sample data...
Sample messages structure: ['messages']

✓ Formatting successful!
  Original message length: 4429
  Formatted text length: 4311

Formatted output preview (first 500 chars):
<|im_start|>user
Your task is to break down a complex multi-hop question into the most efficient sequence of single-hop, **atomic** questions.

## Your Main Goal: Build Smart Bridges, Don't Just Collect Nouns
The most critical skill is to convert complex logical clauses (like "despite," "the country where," "the year before") into a single, powerful **bridging question**. This question should use a known entity as context to find the next one. Avoid finding all the entities sepa

# LoRA Fine-Tuning Setup for Local GPU

In [None]:
from dataclasses import dataclass, field
import torch
from datasets import Dataset
from peft import LoraConfig
from transformers import AutoTokenizer, TrainingArguments, AutoModelForCausalLM
from trl import SFTTrainer

In [20]:
# =============================================================================
# Model Loading and Training Loop Function (Local GPU Version)
# =============================================================================
def train(model_id, tokenizer, dataset, training_args):
    """
    Train a model with LoRA on local GPU.
    
    Args:
        model_id: HuggingFace model identifier (e.g., "Qwen/Qwen3-8B")
        tokenizer: Tokenizer instance
        dataset: Training dataset with 'messages' column
        training_args: TrainingArguments instance
    """
    dtype = torch.bfloat16 if training_args.bf16 else torch.float32
    
    # Load model for local GPU training
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=dtype,
        device_map="auto",  # Automatically distribute model across available GPUs
        # Optional: use 4-bit quantization to save memory (uncomment if needed)
        # load_in_4bit=True,
        # bnb_4bit_compute_dtype=dtype,
        # bnb_4bit_use_double_quant=True,
        # bnb_4bit_quant_type="nf4",
    )
    
    # LoRA configuration optimized for Qwen3
    # Standard config targets attention and FFN layers
    # Remove "embed_tokens" to avoid potential issues
    lora_config = LoraConfig(
        r=64,  # LoRA rank - higher = more parameters but better adaptation
        lora_alpha=128,  # Scaling factor (typically 2x rank)
        lora_dropout=0.05,  # Dropout for LoRA layers
        target_modules=[
            # Attention layers
            "q_proj",      # Query projection
            "k_proj",      # Key projection
            "v_proj",      # Value projection
            "o_proj",      # Output projection
            # Feed-forward network layers
            "gate_proj",   # Gate projection
            "up_proj",     # Up projection
            "down_proj",   # Down projection
        ],
        bias="none",
        task_type="CAUSAL_LM",
    )
    
    def formatting_function(example):
        """Format a single example using the tokenizer's chat template."""
        return tokenizer.apply_chat_template(
            example["messages"], 
            tokenize=False, 
            add_generation_prompt=False
        )
    
    # Initialize SFTTrainer with LoRA
    # Note: In newer versions of trl, use 'processing_class' instead of 'tokenizer'
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        peft_config=lora_config,
        processing_class=tokenizer,  # Use processing_class for newer trl versions
        train_dataset=dataset,
        formatting_func=formatting_function,  # Function to format each example
        # max_seq_length=4096,  # Maximum sequence length for training
        # packing=True,  # Pack multiple examples into one sequence for efficiency
    )
    
    # Start training
    trainer.train()
    
    # Save final model
    trainer.save_model()
    
    return trainer


# =============================================================================
# Defining the script-specific arguments
# =============================================================================
@dataclass
class ScriptArguments:
    model_id: str = field(
        metadata={"help": "The model that you want to train from the Hugging Face hub."},
    )
    output_dir: str = field(
        default="./question_decomp_lora",
        metadata={"help": "Directory to save the trained model."},
    )
    num_train_epochs: int = field(
        default=3,
        metadata={"help": "Number of training epochs."},
    )
    per_device_train_batch_size: int = field(
        default=4,
        metadata={"help": "Batch size per GPU for training."},
    )
    gradient_accumulation_steps: int = field(
        default=4,
        metadata={"help": "Number of gradient accumulation steps."},
    )
    learning_rate: float = field(
        default=2e-4,
        metadata={"help": "Learning rate for training."},
    )
    warmup_steps: int = field(
        default=100,
        metadata={"help": "Number of warmup steps."},
    )
    logging_steps: int = field(
        default=10,
        metadata={"help": "Log every N steps."},
    )
    save_steps: int = field(
        default=500,
        metadata={"help": "Save checkpoint every N steps."},
    )
    save_total_limit: int = field(
        default=3,
        metadata={"help": "Maximum number of checkpoints to keep."},
    )

# Example: Train the Model

In [21]:
# Set GPU visibility FIRST (before any CUDA operations)
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,3"

# Verify GPU configuration
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Number of visible GPUs: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")

# Example usage - Configure and run training

# Configure model and tokenizer
model_id = "Qwen/Qwen3-8B"  # Change to your desired Qwen model (or Qwen2.5-7B-Instruct, etc.)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Configure padding token properly
# Many chat models don't have a pad token, so we use EOS as pad
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Important: Set padding side to 'right' for training (not inference)
tokenizer.padding_side = 'right'

print(f"\nTokenizer configured:")
print(f"  Model: {model_id}")
print(f"  EOS token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})")
print(f"  PAD token: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})")
print(f"  BOS token: {tokenizer.bos_token} (ID: {tokenizer.bos_token_id})")
print(f"  Padding side: {tokenizer.padding_side}")

# Create training arguments
training_args = TrainingArguments(
    output_dir="./question_decomp_lora",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_steps=100,
    logging_steps=10,
    save_steps=500,
    save_total_limit=3,
    bf16=True,  # Use bfloat16 if your GPU supports it
    fp16=False,  # Use fp16 if bf16 is not supported
    gradient_checkpointing=True,  # Save memory
    optim="adamw_torch",
    logging_dir="./logs",
    report_to="none",  # Change to "wandb" or "tensorboard" if you want logging
)

# Start training using the training_dataset created above
# trainer = train(model_id, tokenizer, training_dataset, training_args)

# After training, you can save and use the model:
# trainer.save_model("./question_decomp_lora_final")
# Or push to HuggingFace Hub:
# trainer.push_to_hub("your-username/question-decomp-lora")

CUDA available: True
Number of visible GPUs: 4
  GPU 0: NVIDIA RTX A6000
  GPU 1: NVIDIA RTX A6000
  GPU 2: NVIDIA RTX A6000
  GPU 3: NVIDIA RTX A6000

Tokenizer configured:
  Model: Qwen/Qwen3-8B
  EOS token: <|im_end|> (ID: 151645)
  PAD token: <|endoftext|> (ID: 151643)
  BOS token: None (ID: None)
  Padding side: right


# Load Trained LoRA Adapter for Inference

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
import json
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
# Path to your trained LoRA adapter
adapter_path = "./question_decomp_lora/final"

# Load the PEFT config to get the base model name
peft_config = PeftConfig.from_pretrained(adapter_path)
print(f"Loading base model: {peft_config.base_model_name_or_path}")
print(f"LoRA config: r={peft_config.r}, alpha={peft_config.lora_alpha}, dropout={peft_config.lora_dropout}")

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(
    peft_config.base_model_name_or_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Load LoRA adapter on top of base model
model = PeftModel.from_pretrained(base_model, adapter_path)

# Merge adapter weights into base model for faster inference (optional)
# model = model.merge_and_unload()

print("✓ Model loaded successfully!")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(adapter_path)

# Load and apply the custom non-thinking chat template
with open('/home/yigit/codebase/gsw-memory/qwen3_nonthinking.jinja', 'r') as f:
    custom_chat_template = f.read()

tokenizer.chat_template = custom_chat_template
print("✓ Custom chat template loaded from qwen3_nonthinking.jinja")

# Ensure padding is set correctly for inference
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print(f"✓ Tokenizer loaded successfully!")
print(f"  PAD token: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})")
print(f"  EOS token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})")

# Set model to evaluation mode
model.eval()
print("✓ Model set to evaluation mode")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading base model: Qwen/Qwen3-8B
LoRA config: r=128, alpha=256, dropout=0.05


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

✓ Model loaded successfully!
✓ Custom chat template loaded from qwen3_nonthinking.jinja
✓ Tokenizer loaded successfully!
  PAD token: <|endoftext|> (ID: 151643)
  EOS token: <|im_end|> (ID: 151645)
✓ Model set to evaluation mode


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig


MODEL_NAME = "Qwen/Qwen3-8B"
ADAPTER_PATH = "/home/yigit/codebase/gsw-memory/playground/question_decomp_local/question_decomp_lora/final"
MERGED_MODEL_PATH = "Qwen3-8B-recipes-gpt5"

# Load base model
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

#change tokenizer chat template
with open('/home/yigit/codebase/gsw-memory/qwen3_nonthinking.jinja', 'r') as f:
    custom_chat_template = f.read()

tokenizer.chat_template = custom_chat_template

# Load adapter configuration and model
adapter_config = PeftConfig.from_pretrained(ADAPTER_PATH)
finetuned_model = PeftModel.from_pretrained(model, ADAPTER_PATH, config=adapter_config)

print("Saving tokenizer")
tokenizer.save_pretrained(MERGED_MODEL_PATH)
print("Saving model")
finetuned_model = finetuned_model.merge_and_unload()
finetuned_model.save_pretrained(MERGED_MODEL_PATH)


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Saving tokenizer
Saving model


In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer

MERGED_MODEL_PATH = "Qwen3-8B-recipes-gpt5"
HUB_MODEL_NAME = "yigitturali/qwen3-8b-qa-decomp-gsw-rank-128-gpt5-golden"

# Load and push tokenizer
tokenizer = AutoTokenizer.from_pretrained(MERGED_MODEL_PATH)
tokenizer.push_to_hub(HUB_MODEL_NAME)

# Load and push model
model = AutoModelForCausalLM.from_pretrained(MERGED_MODEL_PATH)
model.push_to_hub(HUB_MODEL_NAME)

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmp9_zcux4q/tokenizer.json       : 100%|##########| 11.4MB / 11.4MB            

No files have been modified since last commit. Skipping to prevent empty commit.


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...1j/model-00001-of-00007.safetensors:   1%|          | 33.5MB / 4.97GB            

  ...1j/model-00002-of-00007.safetensors:   0%|          | 11.8kB / 4.83GB            

  ...1j/model-00007-of-00007.safetensors:   1%|          | 33.5MB / 3.46GB            

  ...1j/model-00004-of-00007.safetensors:   0%|          |  551kB / 5.00GB            

  ...1j/model-00005-of-00007.safetensors:   0%|          |  551kB / 4.83GB            

  ...1j/model-00006-of-00007.safetensors:   0%|          |  551kB / 4.83GB            

  ...1j/model-00003-of-00007.safetensors:   0%|          | 1.65MB / 4.83GB            

CommitInfo(commit_url='https://huggingface.co/yigitturali/qwen3-8b-qa-decomp-gsw-rank-128-gpt5-golden/commit/a5780bae3536582ee7dc57651a56cf81b3c8106b', commit_message='Upload Qwen3ForCausalLM', commit_description='', oid='a5780bae3536582ee7dc57651a56cf81b3c8106b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/yigitturali/qwen3-8b-qa-decomp-gsw-rank-128-gpt5-golden', endpoint='https://huggingface.co', repo_type='model', repo_id='yigitturali/qwen3-8b-qa-decomp-gsw-rank-128-gpt5-golden'), pr_revision=None, pr_num=None)

# Fixed Inference Function (Handles `<think>` tags)

In [6]:
import re

def decompose_question_fixed(question: str, temperature: float = 0.0, max_new_tokens: int = 1024) -> dict:
    """
    Decompose a multi-hop question into atomic sub-questions using the fine-tuned LoRA model.
    FIXED: Properly handles <think> tags and 'assistant' prefix from Qwen models.
    
    Args:
        question: The multi-hop question to decompose
        temperature: Sampling temperature (0.0 for greedy decoding)
        max_new_tokens: Maximum number of tokens to generate
    
    Returns:
        Dictionary with original question and decomposed questions
    """
    
    # Create the same prompt format used during training
    user_prompt = f"""Your task is to break down a complex multi-hop question into the most efficient sequence of single-hop, **atomic** questions.

## Your Main Goal: Build Smart Bridges, Don't Just Collect Nouns
The most critical skill is to convert complex logical clauses (like "despite," "the country where," "the year before") into a single, powerful **bridging question**. This question should use a known entity as context to find the next one. Avoid finding all the entities separately and then trying to figure out how they connect.

---
## A Simple Analogy for Efficiency

**Question:** "What is the phone number of the mother of the tallest player on the Lakers?"

** Inefficient Path:**
1.  Who are the players on the Lakers?
2.  What are all their heights?
3.  Who is the mother of the tallest player? *(This step is a logical leap)*

** Efficient Path:**
1.  Who is the tallest player on the Lakers?
2.  Who is the mother of `<ENTITY_Q1>`?
3.  What is the phone number of `<ENTITY_Q2>`?

---
## How to Decompose a Question
This process follows a logical flow from high-level analysis to the fine-tuning of your question chain.

### 1. Analyze the Query's Components
First, break down the original question into its fundamental building blocks. Identify the core **entities** (people, places, organizations), their **properties** (attributes like rank, location, date), and the **relationships** that connect them.

### 2. Construct an Atomic Chain
Next, formulate a sequence of questions where each question retrieves a single fact.
* **Isolate Comparisons:** Don't ask "who is faster?" Ask for the specific rank or time of each person involved.
* **Link with Placeholders:** Use `<ENTITY_Qn>` to pass the answer from a previous question (`Qn`) into the next one.

### 3. Optimize for Efficiency and Precision
Your final goal is the **shortest and most direct path** to the answer.
* **Embed Constraints to Build Bridges:** If a piece of information is only a filter (like a date or location), embed it as a constraint in the next question instead of asking for it directly.
**Important note for bridges:** There can be no `<ENTITY_Qn>` in the first question if the nth question DOES NOT require retrieval.

## Formatting
Format each decomposed question as follows:

<decomposition>
Question: [the question text]
Requires retrieval: [true/false]

And provide the response in the following json format:
{{{{
  "questions": [
    {{{{
      "question": "the decomposed question text",
      "requires_retrieval": "true/false"
    }}}}
  ]
}}}}

Examples:

Input: "What is the birth year of the spouse of the director of Casablanca?"
Output:
{{{{
    "questions": [
        {{{{
            "question": "Who directed Casablanca?",
            "requires_retrieval": "true"
        }}}},
        {{{{
            "question": "Who was <ENTITY_Q1>'s spouse?",
            "requires_retrieval": "true"
        }}}},
        {{{{
            "question": "What is <ENTITY_Q2>'s birth year?",
            "requires_retrieval": "true"
        }}}}
    ]
}}}}

Input: "Which film has the director who is older, Dune or The Dark Knight?"
Output:
{{{{
    "questions": [
        {{{{
            "question": "Who directed Dune?",
            "requires_retrieval": "true"
        }}}},
        {{{{
            "question": "Who directed The Dark Knight?",
            "requires_retrieval": "true"
        }}}},
        {{{{
            "question": "Who is older, <ENTITY_Q1> or <ENTITY_Q2>?",
            "requires_retrieval": "true"
        }}}},
        {{{{
            "question": "Who is older, <ENTITY_Q1> or <ENTITY_Q2>?",
            "requires_retrieval": "false"
        }}}}
    ]
}}}}


IMPORTANT:
    AVOID over-decomposition like this:
    DON'T break "Who is John Doe?" into:
    1. Who is John Doe? → "English"
    2. When was <ENTITY_Q1> born? → "When was English born?"

    DO ask directly: "When was John Doe born?"

Now decompose this question:
Input: "{question}"
Output:
"""
    
    # Format as chat messages
    messages = [
        {"role": "user", "content": user_prompt}
    ]
    
    # Apply chat template (uses the custom template loaded on the tokenizer)
    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    # Tokenize
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=4096
    ).to(model.device)
    
    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=temperature > 0,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    
    # Decode the output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract the assistant's response (after "Output:\n")
    if "Output:\n" in generated_text:
        assistant_response = generated_text.split("Output:\n")[-1].strip()
    else:
        # Fallback: take everything after the last occurrence of "assistant"
        assistant_response = generated_text.split("assistant")[-1].strip()
    
    # Remove "assistant" prefix if it still exists at the beginning
    if assistant_response.lower().startswith("assistant"):
        assistant_response = assistant_response[len("assistant"):].strip()
    
    # Remove the <think> tags if present (Qwen models use this for reasoning)
    # Use regex to remove everything between <think> and </think> including the tags
    assistant_response = re.sub(r'<think>.*?</think>', '', assistant_response, flags=re.DOTALL).strip()
    
    # Try to parse as JSON
    try:
        parsed_json = json.loads(assistant_response)
        return {
            "original_question": question,
            "decomposed_questions": parsed_json.get("questions", []),
            "raw_response": assistant_response,
            "success": True
        }
    except json.JSONDecodeError as e:
        return {
            "original_question": question,
            "decomposed_questions": [],
            "raw_response": assistant_response,
            "success": False,
            "error": str(e)
        }

print("✓ Fixed inference function created successfully!")
print("  This version uses the custom chat template set on the tokenizer")
print("  It properly strips <think> tags and 'assistant' prefix")

✓ Fixed inference function created successfully!
  This version uses the custom chat template set on the tokenizer
  It properly strips <think> tags and 'assistant' prefix


# Apply Non-Thinking Chat Template for Re-Training

In [None]:
# Read the non-thinking chat template
with open('/home/yigit/codebase/gsw-memory/qwen3_nonthinking.jinja', 'r') as f:
    non_thinking_template = f.read()

print("Original template (lines 84-86 with add_generation_prompt):")
print(non_thinking_template.split('\n')[83:86])
print()

# Create a truly non-thinking version by removing the <think> tags from generation prompt
non_thinking_template_fixed = non_thinking_template.replace(
    "{{- '<|im_start|>assistant\\n<think>\\n\\n</think>\\n\\n' }}",
    "{{- '<|im_start|>assistant\\n' }}"
)

# Also remove the think tag handling from assistant messages (lines 44-45)
# This ensures training data doesn't include think tags either
non_thinking_template_fixed = non_thinking_template_fixed.replace(
    "{{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}",
    "{{- '<|im_start|>' + message.role + '\\n' + content.lstrip('\\n') }}"
)

print("Modified template - removed <think> tags from generation prompt")
print()

# Apply the fixed template to a new tokenizer for training
from transformers import AutoTokenizer

# Load fresh tokenizer
tokenizer_no_think = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

# Apply the modified chat template
tokenizer_no_think.chat_template = non_thinking_template_fixed

# Configure padding
if tokenizer_no_think.pad_token is None:
    tokenizer_no_think.pad_token = tokenizer_no_think.eos_token
    tokenizer_no_think.pad_token_id = tokenizer_no_think.eos_token_id

tokenizer_no_think.padding_side = 'right'

print("✓ Non-thinking tokenizer created!")
print()

# Test the template
test_messages = [
    {"role": "user", "content": "What is 2+2?"},
    {"role": "assistant", "content": "4"}
]

formatted_train = tokenizer_no_think.apply_chat_template(
    test_messages,
    tokenize=False,
    add_generation_prompt=False
)

formatted_infer = tokenizer_no_think.apply_chat_template(
    [{"role": "user", "content": "What is 2+2?"}],
    tokenize=False,
    add_generation_prompt=True
)

print("Test formatting (training - should have NO <think> tags):")
print(formatted_train)
print()
print("Test formatting (inference - should have NO <think> tags):")
print(formatted_infer)
print()

# Check if think tags are present
if '<think>' in formatted_train or '<think>' in formatted_infer:
    print("⚠ WARNING: <think> tags still present!")
else:
    print("✓ SUCCESS: No <think> tags in formatted output!")

# Re-train with Non-Thinking Template

In [None]:
# Set GPU visibility FIRST (before any CUDA operations)
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,3"

# Verify GPU configuration
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Number of visible GPUs: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")

print("\n" + "="*80)
print("Training Configuration with Non-Thinking Template")
print("="*80)

# Use the tokenizer with non-thinking template created above
model_id = "Qwen/Qwen3-8B"

print(f"\nTokenizer configured:")
print(f"  Model: {model_id}")
print(f"  EOS token: {tokenizer_no_think.eos_token} (ID: {tokenizer_no_think.eos_token_id})")
print(f"  PAD token: {tokenizer_no_think.pad_token} (ID: {tokenizer_no_think.pad_token_id})")
print(f"  BOS token: {tokenizer_no_think.bos_token} (ID: {tokenizer_no_think.bos_token_id})")
print(f"  Padding side: {tokenizer_no_think.padding_side}")
print(f"  Chat template: Modified to remove <think> tags")

# Create training arguments
training_args = TrainingArguments(
    output_dir="./question_decomp_lora_no_think",  # Different output dir
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_steps=100,
    logging_steps=10,
    save_steps=500,
    save_total_limit=3,
    bf16=True,  # Use bfloat16 if your GPU supports it
    fp16=False,  # Use fp16 if bf16 is not supported
    gradient_checkpointing=True,  # Save memory
    optim="adamw_torch",
    logging_dir="./logs_no_think",
    report_to="none",  # Change to "wandb" or "tensorboard" if you want logging
)

print(f"\nTraining args:")
print(f"  Output dir: {training_args.output_dir}")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"  Learning rate: {training_args.learning_rate}")

print("\n" + "="*80)
print("Ready to train! Uncomment the line below to start training:")
print("="*80)
print("# trainer = train(model_id, tokenizer_no_think, training_dataset, training_args)")
print()

# Uncomment to start training:
# trainer = train(model_id, tokenizer_no_think, training_dataset, training_args)

# After training completes, save the model:
# trainer.save_model("./question_decomp_lora_no_think_final")
# tokenizer_no_think.save_pretrained("./question_decomp_lora_no_think_final")

# Verify Template Comparison

In [None]:
# Compare the outputs from original vs non-thinking tokenizer
from transformers import AutoTokenizer

# Load original Qwen3 tokenizer (with default template)
tokenizer_original = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

test_messages = [
    {"role": "user", "content": "What is 2+2?"}
]

print("="*80)
print("COMPARISON: Original vs Non-Thinking Template")
print("="*80)

# Format with original tokenizer
formatted_original = tokenizer_original.apply_chat_template(
    test_messages,
    tokenize=False,
    add_generation_prompt=True
)

# Format with non-thinking tokenizer
formatted_no_think = tokenizer_no_think.apply_chat_template(
    test_messages,
    tokenize=False,
    add_generation_prompt=True
)

print("\n1. ORIGINAL TEMPLATE OUTPUT (with <think> tags):")
print("-" * 80)
print(formatted_original)
print("-" * 80)

print("\n2. NON-THINKING TEMPLATE OUTPUT (NO <think> tags):")
print("-" * 80)
print(formatted_no_think)
print("-" * 80)

print("\n3. DIFFERENCE:")
print("-" * 80)
if '<think>' in formatted_original and '<think>' not in formatted_no_think:
    print("✓ SUCCESS: Original has <think> tags, Non-thinking does NOT")
    print(f"\nOriginal ends with: ...{formatted_original[-100:]}")
    print(f"\nNon-thinking ends with: ...{formatted_no_think[-100:]}")
else:
    print("⚠ Something unexpected happened")
    print(f"Original has <think>: {'<think>' in formatted_original}")
    print(f"Non-thinking has <think>: {'<think>' in formatted_no_think}")

print("\n" + "="*80)
print("This confirms the non-thinking template will NOT generate <think> tags!")
print("="*80)

In [8]:
# Test the fixed function
test_question = "What year did the writer of Crazy Little Thing Called Love die?"

print(f"Testing fixed function with: {test_question}\n")
print("Generating decomposition...")

result = decompose_question_fixed(test_question, temperature=0.0)

print("\n" + "="*80)
print("RESULT WITH FIXED FUNCTION:")
print("="*80)
print(f"Success: {result['success']}")
print(f"\nOriginal Question: {result['original_question']}")

if result['success']:
    print(f"\nDecomposed Questions:")
    for i, q in enumerate(result['decomposed_questions'], 1):
        retrieval_marker = "🔍" if q['requires_retrieval'] else "💭"
        print(f"  {retrieval_marker} Q{i}: {q['question']}")
        print(f"      Requires retrieval: {q['requires_retrieval']}")
    
    print(f"\n✓ JSON parsing successful!")
    print(f"Raw JSON response:\n{result['raw_response']}")
else:
    print(f"\n✗ Error: {result.get('error', 'Unknown error')}")
    print(f"\nRaw response:\n{result['raw_response']}")
    
print("\n" + "="*80)

Testing fixed function with: What year did the writer of Crazy Little Thing Called Love die?

Generating decomposition...

RESULT WITH FIXED FUNCTION:
Success: True

Original Question: What year did the writer of Crazy Little Thing Called Love die?

Decomposed Questions:
  🔍 Q1: Who wrote Crazy Little Thing Called Love?
      Requires retrieval: True
  🔍 Q2: When did <ENTITY_Q1> die?
      Requires retrieval: True

✓ JSON parsing successful!
Raw JSON response:
{
    "questions": [
        {
            "question": "Who wrote Crazy Little Thing Called Love?",
            "requires_retrieval": true
        },
        {
            "question": "When did <ENTITY_Q1> die?",
            "requires_retrieval": true
        }
    ]
}



In [9]:
result

{'original_question': 'What year did the writer of Crazy Little Thing Called Love die?',
 'decomposed_questions': [{'question': 'Who wrote Crazy Little Thing Called Love?',
   'requires_retrieval': True},
  {'question': 'When did <ENTITY_Q1> die?', 'requires_retrieval': True}],
 'raw_response': '{\n    "questions": [\n        {\n            "question": "Who wrote Crazy Little Thing Called Love?",\n            "requires_retrieval": true\n        },\n        {\n            "question": "When did <ENTITY_Q1> die?",\n            "requires_retrieval": true\n        }\n    ]\n}',
 'success': True}

In [7]:
# Test with multiple questions from the MuSiQue test set
test_questions = [
    "Who succeeded the first President of Namibia?",
    "What currency is used where Billy Giles died?",
    "When was the first establishment that Mc-Donaldization is named after, open in the country Horndean is located?",
    "When did Napoleon occupy the city where the mother of the woman who brought Louis XVI style to the court died?",
    "How many Germans live in the colonial holding in Aruba’s continent that was governed by Prazeres’s country?",
    "When did the people who captured Malakoff come to the region where Philipsburg is located?"
]

print("Testing multiple questions from various domains...\n")

for i, question in enumerate(test_questions, 1):
    print(f"\n{'='*80}")
    print(f"TEST {i}/{len(test_questions)}")
    print(f"{'='*80}")
    print(f"Question: {question}\n")
    
    result = decompose_question_fixed(question, temperature=0.0)
    
    if result['success']:
        print("✓ Successfully decomposed!\n")
        print("Decomposition:")
        for j, q in enumerate(result['decomposed_questions'], 1):
            retrieval_marker = "🔍" if q['requires_retrieval'] else "💭"
            print(f"  {retrieval_marker} Q{j}: {q['question']}")
    else:
        print(f"✗ Failed to parse JSON")
        print(f"Error: {result.get('error', 'Unknown')}")
        print(f"Raw response: {result['raw_response'][:200]}...")

print(f"\n{'='*80}")
print("Testing complete!")
print(f"{'='*80}")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Testing multiple questions from various domains...


TEST 1/6
Question: Who succeeded the first President of Namibia?

✓ Successfully decomposed!

Decomposition:
  🔍 Q1: Who was the first President of Namibia?
  🔍 Q2: Who succeeded <ENTITY_Q1>?

TEST 2/6
Question: What currency is used where Billy Giles died?

✓ Successfully decomposed!

Decomposition:
  🔍 Q1: Where did Billy Giles die?
  🔍 Q2: What currency is used in <ENTITY_Q1>?

TEST 3/6
Question: When was the first establishment that Mc-Donaldization is named after, open in the country Horndean is located?

✓ Successfully decomposed!

Decomposition:
  🔍 Q1: What is the country where Horndean is located?
  🔍 Q2: What is the first establishment that Mc-Donaldization is named after?
  🔍 Q3: When was <ENTITY_Q2> open in <ENTITY_Q1>?

TEST 4/6
Question: When did Napoleon occupy the city where the mother of the woman who brought Louis XVI style to the court died?

✓ Successfully decomposed!

Decomposition:
  🔍 Q1: Who brought the Louis 