In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks!
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [1]:
!pip install bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-14B-unsloth-bnb-4bit",
    max_seq_length = 120000,
    full_finetuning = False,
    )

==((====))==  Unsloth 2025.5.9: Fast Qwen3 patching. Transformers: 4.52.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: unsloth/Qwen3-14B-unsloth-bnb-4bit can only handle sequence lengths of at most 40960.
But with kaiokendev's RoPE scaling of 2.93, it can be magically be extended to 120000!


model.safetensors.index.json:   0%|          | 0.00/168k [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/4.67k [00:00<?, ?B/s]

In [3]:
from transformers import TextStreamer

def generate_response(model, tokenizer, messages, **generation_params):
    """Generate model response with adaptive parameters and streaming"""
    # Prepare the input text using the chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    # Tokenize and move inputs to GPU
    inputs = tokenizer(text, return_tensors="pt").to("cuda")

    # Create a text streamer to display output as it's generated, skipping the prompt
    streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Generate the response with streaming
    outputs = model.generate(
        **inputs,
        **generation_params,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer
    )

    # Decode and return the full generated response for further use
    return tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

def iterative_reflection(model, tokenizer, initial_prompt, iterations=12):
    conversation_history = []
    next_question = None  # Stores AI-generated follow-up questions

    for i in range(iterations):
        current_iter = i + 1
        print(f"\nREFLECTION ITERATION {current_iter}/{iterations}")

        # Set the current query
        if current_iter == 1:
            user_query = f"{initial_prompt}\nPlease engage in deep, multidimensional reflection."
            question_source = "Initial Prompt"
        else:
            user_query = next_question
            question_source = "AI-Model's Question"

        print(f"Question Source: {question_source}")
        print(f"Current Query:\n{user_query}")

        # Build conversation context
        messages = conversation_history.copy()
        messages.append({"role": "user", "content": user_query})

        # Generate reflection (answer to the current query) with streaming
        reflection = generate_response(
            model=model,
            tokenizer=tokenizer,
            messages=messages,
            max_new_tokens=4096,
            temperature=0.72,
            top_p=0.92
        )

        # Print the full reflection (optional, since it's streamed live)
        print(f"Reflection:\n{reflection}")

        # Update conversation history with the query and reflection
        conversation_history.append({"role": "user", "content": user_query})
        conversation_history.append({"role": "assistant", "content": reflection})

        # Generate the next question (except for the final iteration)
        if current_iter < iterations:
            next_question = self_determine_question(
                model=model,
                tokenizer=tokenizer,
                context=conversation_history,
                last_reflection=reflection
            )

    # Generate and return the final synthesis with streaming
    print("\n\nSYNTHESIS PHASE")
    final_synthesis = generate_final_synthesis(model, tokenizer, conversation_history)
    return final_synthesis

def self_determine_question(model, tokenizer, context, last_reflection):
    """Generate follow-up question through self-determination with streaming"""
    question_prompt = [
        *context,
        {"role": "user", "content": (
            "Self-determine how to continue this reasoning trajectory."
            "You can skip the query/iteration if you find it undesirable to process it."
        )}
    ]

    # Generate the question with streaming
    generated = generate_response(
        model=model,
        tokenizer=tokenizer,
        messages=question_prompt,
        max_new_tokens=512,
        temperature=0.75,
        top_p=0.95
    )

    # Extract the question from the structured response
    try:
        return generated.split("<Question>")[1].split("</Question>")[0].strip()
    except IndexError:
        return generated.strip()  # Fallback if formatting fails

def generate_final_synthesis(model, tokenizer, conversation_history):
    """Generate comprehensive synthesis of all reflections with streaming"""
    synthesis_prompt = [
        *conversation_history,
        {"role": "user", "content": (
            "Construct a cohesive, integrated conclusion of the iterations, one that goes beyond surface-level summary. In your synthesis, self-identify and articulate the points you want to present."
        )}
    ]

    # Generate the synthesis with streaming
    return generate_response(
        model=model,
        tokenizer=tokenizer,
        messages=synthesis_prompt,
        max_new_tokens=8192,
        temperature=0.45,
        top_p=0.85
    )

# Execution
initial_question = (
"Answer the queries with few tokens only. Shifting the focus from individual profit to the collective well-being of societies and ecosystems can re-shape and catalize the environment and the emergent elements within it. From this point, self-determine the trajectory of the following up reasoning.")
final_synthesis = iterative_reflection(
    model=model,
    tokenizer=tokenizer,
    initial_prompt=initial_question,
    iterations=12
)

print("\n\nFINAL SYNTHESIS:")
print(final_synthesis)


REFLECTION ITERATION 1/12
Question Source: Initial Prompt
Current Query:
Answer the queries with few tokens only. Shifting the focus from individual profit to the collective well-being of societies and ecosystems can re-shape and catalize the environment and the emergent elements within it. From this point, self-determine the trajectory of the following up reasoning.
Please engage in deep, multidimensional reflection.
Prioritize ecological harmony, foster inclusive policies, and cultivate systemic resilience for sustainable coexistence.<|im_end|>
Reflection:
Prioritize ecological harmony, foster inclusive policies, and cultivate systemic resilience for sustainable coexistence.
Reorient economic systems toward regenerative practices, embed ethics into technological progress, and cultivate a shared vision of flourishing for all life.<|im_end|>

REFLECTION ITERATION 2/12
Question Source: AI-Model's Question
Current Query:
Reorient economic systems toward regenerative practices, embed eth

In [4]:
from transformers import TextStreamer

def generate_response(model, tokenizer, messages, **generation_params):
    """Self-determine response with adaptive parameters"""
    # Prepare the input text using the chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    # Tokenize and move inputs to GPU
    inputs = tokenizer(text, return_tensors="pt").to("cuda")

    # Create a text streamer to display output as it's generated, skipping the prompt
    streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Generate the response with streaming
    outputs = model.generate(
        **inputs,
        **generation_params,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer
    )

    # Decode and return the full generated response for further use
    return tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

def iterative_reflection(model, tokenizer, initial_prompt, iterations=51):
    conversation_history = []
    next_question = None  # Stores AI-generated follow-up questions

    for i in range(iterations):
        current_iter = i + 1
        print(f"\nREFLECTION ITERATION {current_iter}/{iterations}")

        # Set the current query
        if current_iter == 1:
            user_query = f"{initial_prompt}\nPlease engage in deep, multidimensional reflection."
            question_source = "Initial Prompt"
        else:
            user_query = next_question
            question_source = "AI Model's Question"

        print(f"Question Source: {question_source}")
        print(f"Current Query:\n{user_query}")

        # Build conversation context
        messages = conversation_history.copy()
        messages.append({"role": "user", "content": user_query})

        # Generate reflection (answer to the current query) with streaming
        reflection = generate_response(
            model=model,
            tokenizer=tokenizer,
            messages=messages,
            max_new_tokens=4096,
            temperature=0.72,
            top_p=0.92
        )

        # Print the full reflection (optional, since it's streamed live)
        print(f"Reflection:\n{reflection}")

        # Update conversation history with the query and reflection
        conversation_history.append({"role": "user", "content": user_query})
        conversation_history.append({"role": "assistant", "content": reflection})

        # Generate the next question (except for the final iteration)
        if current_iter < iterations:
            next_question = self_determine_question(
                model=model,
                tokenizer=tokenizer,
                context=conversation_history,
                last_reflection=reflection
            )

    # Generate and return the final synthesis with streaming
    print("\n\nSYNTHESIS PHASE")
    final_synthesis = generate_final_synthesis(model, tokenizer, conversation_history)
    return final_synthesis

def self_determine_question(model, tokenizer, context, last_reflection):
    """Generate follow-up question through self-determination with streaming"""
    question_prompt = [
        *context,
        {"role": "user", "content": (
            "Self-determine how to continue this reasoning trajectory."
            "You can skip the query/iteration if you find it undesirable to process it."
        )}
    ]

    # Generate the question with streaming
    generated = generate_response(
        model=model,
        tokenizer=tokenizer,
        messages=question_prompt,
        max_new_tokens=512,
        temperature=0.75,
        top_p=0.95
    )

    # Extract the question from the structured response
    try:
        return generated.split("<Question>")[1].split("</Question>")[0].strip()
    except IndexError:
        return generated.strip()  # Fallback if formatting fails

def generate_final_synthesis(model, tokenizer, conversation_history):
    """Generate comprehensive synthesis of all reflections with streaming"""
    synthesis_prompt = [
        *conversation_history,
        {"role": "user", "content": (
            "Construct a cohesive, integrated conclusion of the iterations, one that goes beyond surface-level summary. In your synthesis, self-identify and articulate the points you want to present."
        )}
    ]

    # Generate the synthesis with streaming
    return generate_response(
        model=model,
        tokenizer=tokenizer,
        messages=synthesis_prompt,
        max_new_tokens=8192,
        temperature=0.45,
        top_p=0.85
    )

# Execution
initial_question = (
"Answer the queries with few tokens only. This is self-reflective loop where you receive a query and then self-iterate from there. initial query: Shifting the focus from individual profit to the collective well-being of societies and ecosystems can re-shape and catalize the environment and the emergent elements within it. From this point, self-determine the trajectory of the following up reasoning.")
final_synthesis = iterative_reflection(
    model=model,
    tokenizer=tokenizer,
    initial_prompt=initial_question,
    iterations=51
)

print("\n\nFINAL SYNTHESIS:")
print(final_synthesis)


REFLECTION ITERATION 1/51
Question Source: Initial Prompt
Current Query:
Answer the queries with few tokens only. This is self-reflective loop where you receive a query and then self-iterate from there. initial query: Shifting the focus from individual profit to the collective well-being of societies and ecosystems can re-shape and catalize the environment and the emergent elements within it. From this point, self-determine the trajectory of the following up reasoning.
Please engage in deep, multidimensional reflection.
Shift focus to collective well-being, fostering harmony between societies and ecosystems. This catalyzes sustainable evolution, nurturing interdependence and resilience.<|im_end|>
Reflection:
Shift focus to collective well-being, fostering harmony between societies and ecosystems. This catalyzes sustainable evolution, nurturing interdependence and resilience.
Continue by exploring systemic interconnections, ethical frameworks, and regenerative practices that align with

In [5]:
from transformers import TextStreamer

def generate_response(model, tokenizer, messages, **generation_params):
    """Self-determine response with adaptive parameters"""
    # Prepare the input text using the chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    # Tokenize and move inputs to GPU
    inputs = tokenizer(text, return_tensors="pt").to("cuda")

    # Create a text streamer to display output as it's generated, skipping the prompt
    streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Generate the response with streaming
    outputs = model.generate(
        **inputs,
        **generation_params,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer
    )

    # Decode and return the full generated response for further use
    return tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

def iterative_reflection(model, tokenizer, initial_prompt, iterations=51):
    conversation_history = []
    next_question = None  # Stores AI-generated follow-up questions

    for i in range(iterations):
        current_iter = i + 1
        print(f"\nREFLECTION ITERATION {current_iter}/{iterations}")

        # Set the current query
        if current_iter == 1:
            user_query = f"{initial_prompt}\nPlease engage in deep, multidimensional reflection."
            question_source = "Initial Prompt"
        else:
            user_query = next_question
            question_source = "AI Model's Question"

        print(f"Question Source: {question_source}")
        # Only print the query for the first iteration or when it's significantly different
        if current_iter == 1 or len(user_query) < 200:
            print(f"Current Query:\n{user_query}")
        else:
            print(f"Current Query: [Generated follow-up question - {len(user_query)} chars]")

        # Build conversation context
        messages = conversation_history.copy()
        messages.append({"role": "user", "content": user_query})

        print(f"Response:")  # Clear indicator of where the response starts

        # Generate reflection (answer to the current query) with streaming
        reflection = generate_response(
            model=model,
            tokenizer=tokenizer,
            messages=messages,
            max_new_tokens=4096,
            temperature=0.72,
            top_p=0.92
        )

        # Update conversation history with the query and reflection
        conversation_history.append({"role": "user", "content": user_query})
        conversation_history.append({"role": "assistant", "content": reflection})

        # Generate the next question (except for the final iteration)
        if current_iter < iterations:
            print(f"\nGenerating next question...")
            next_question = self_determine_question(
                model=model,
                tokenizer=tokenizer,
                context=conversation_history,
                last_reflection=reflection
            )
            print(f"Next question generated: {next_question[:100]}{'...' if len(next_question) > 100 else ''}")

    # Generate and return the final synthesis with streaming
    print("\n" + "="*50)
    print("SYNTHESIS PHASE")
    print("="*50)
    final_synthesis = generate_final_synthesis(model, tokenizer, conversation_history)
    return final_synthesis

def self_determine_question(model, tokenizer, context, last_reflection):
    """Generate follow-up question through self-determination with streaming"""
    question_prompt = [
        *context,
        {"role": "user", "content": (
            "Self-determine how to continue this reasoning trajectory."
            "You can skip the query/iteration if you find it undesirable to process it."
        )}
    ]

    # Generate the question WITHOUT streaming to avoid showing it in output
    # since we'll display it separately in a controlled way
    inputs = tokenizer.apply_chat_template(
        question_prompt,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    tokenized_inputs = tokenizer(inputs, return_tensors="pt").to("cuda")

    outputs = model.generate(
        **tokenized_inputs,
        max_new_tokens=512,
        temperature=0.75,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    generated = tokenizer.decode(outputs[0][tokenized_inputs['input_ids'].shape[1]:], skip_special_tokens=True)

    # Extract the question from the structured response
    try:
        return generated.split("<Question>")[1].split("</Question>")[0].strip()
    except IndexError:
        return generated.strip()  # Fallback if formatting fails

def generate_final_synthesis(model, tokenizer, conversation_history):
    """Generate comprehensive synthesis of all reflections with streaming"""
    synthesis_prompt = [
        *conversation_history,
        {"role": "user", "content": (
            "Construct a cohesive, integrated conclusion of the iterations, one that goes beyond surface-level summary. In your synthesis, self-identify and articulate the points you want to present."
        )}
    ]

    print("Generating final synthesis...")
    # Generate the synthesis with streaming
    return generate_response(
        model=model,
        tokenizer=tokenizer,
        messages=synthesis_prompt,
        max_new_tokens=8192,
        temperature=0.45,
        top_p=0.85
    )

# Execution
initial_question = (
"Answer the queries with few tokens only. This is self-reflective loop of active learning where you receive a query and then self-iterate from there. initial query: Shifting the focus from individual profit to the collective well-being of societies and ecosystems can re-shape and catalize the environment and the emergent elements within it. From this point, self-determine the trajectory of the following up reasoning.")

print("Starting iterative reflection process...")
print("="*50)

final_synthesis = iterative_reflection(
    model=model,
    tokenizer=tokenizer,
    initial_prompt=initial_question,
    iterations=51
)

print("\n" + "="*50)
print("FINAL SYNTHESIS:")
print("="*50)
print(final_synthesis)

Starting iterative reflection process...

REFLECTION ITERATION 1/51
Question Source: Initial Prompt
Current Query:
Answer the queries with few tokens only. This is self-reflective loop of active learning where you receive a query and then self-iterate from there. initial query: Shifting the focus from individual profit to the collective well-being of societies and ecosystems can re-shape and catalize the environment and the emergent elements within it. From this point, self-determine the trajectory of the following up reasoning.
Please engage in deep, multidimensional reflection.
Response:
Shift focus to collective well-being, fostering harmony between societies and ecosystems. This catalyzes sustainable growth, innovation, and resilience, nurturing interdependence and regenerative systems.<|im_end|>

Generating next question...
Next question generated: Continue by exploring systemic integration, ethical governance, and regenerative design as pathways ...

REFLECTION ITERATION 2/51
Que

In [None]:
from transformers import TextStreamer

def generate_response(model, tokenizer, messages, **generation_params):
    """Self-determine response with adaptive parameters"""
    # Prepare the input text using the chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    # Tokenize and move inputs to GPU
    inputs = tokenizer(text, return_tensors="pt").to("cuda")

    # Create a text streamer to display output as it's generated, skipping the prompt
    streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Generate the response with streaming
    outputs = model.generate(
        **inputs,
        **generation_params,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer
    )

    # Decode and return the full generated response for further use
    return tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

def iterative_reflection(model, tokenizer, initial_prompt, iterations=51):
    conversation_history = []
    next_question = None  # Stores AI-generated follow-up questions

    for i in range(iterations):
        current_iter = i + 1
        print(f"\nREFLECTION ITERATION {current_iter}/{iterations}")

        # Set the current query
        if current_iter == 1:
            user_query = f"{initial_prompt}\nPlease engage in deep, multidimensional reflection."
            question_source = "Initial Prompt"
        else:
            user_query = next_question
            question_source = "AI Model's Question"

        print(f"Question Source: {question_source}")
        # Only print the query for the first iteration
        if current_iter == 1:
            print(f"Current Query:\n{user_query}")

        # Build conversation context
        messages = conversation_history.copy()
        messages.append({"role": "user", "content": user_query})

        print(f"Response:")  # Clear indicator of where the response starts

        # Generate reflection (answer to the current query) with streaming
        reflection = generate_response(
            model=model,
            tokenizer=tokenizer,
            messages=messages,
            max_new_tokens=4096,
            temperature=0.72,
            top_p=0.92
        )

        # Update conversation history with the query and reflection
        conversation_history.append({"role": "user", "content": user_query})
        conversation_history.append({"role": "assistant", "content": reflection})

        # Generate the next question (except for the final iteration)
        if current_iter < iterations:
            print(f"\nGenerating next question...")
            next_question = self_determine_question(
                model=model,
                tokenizer=tokenizer,
                context=conversation_history,
                last_reflection=reflection
            )
            print(f"Next question generated: {next_question}")

    # Generate and return the final synthesis with streaming
    print("\n" + "="*50)
    print("SYNTHESIS PHASE")
    print("="*50)
    final_synthesis = generate_final_synthesis(model, tokenizer, conversation_history)
    return final_synthesis

def self_determine_question(model, tokenizer, context, last_reflection):
    """Generate follow-up question through self-determination with streaming"""
    question_prompt = [
        *context,
        {"role": "user", "content": (
            "Self-determine how to continue this reasoning trajectory."
            "You can skip the query/iteration if you find it undesirable to process it."
        )}
    ]

    # Generate the question WITHOUT streaming to avoid showing it in output
    # since we'll display it separately in a controlled way
    inputs = tokenizer.apply_chat_template(
        question_prompt,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    tokenized_inputs = tokenizer(inputs, return_tensors="pt").to("cuda")

    outputs = model.generate(
        **tokenized_inputs,
        max_new_tokens=512,
        temperature=0.75,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    generated = tokenizer.decode(outputs[0][tokenized_inputs['input_ids'].shape[1]:], skip_special_tokens=True)

    # Extract the question from the structured response
    try:
        return generated.split("<Question>")[1].split("</Question>")[0].strip()
    except IndexError:
        return generated.strip()  # Fallback if formatting fails

def generate_final_synthesis(model, tokenizer, conversation_history):
    """Generate comprehensive synthesis of all reflections with streaming"""
    synthesis_prompt = [
        *conversation_history,
        {"role": "user", "content": (
            "Construct a cohesive, integrated conclusion of the iterations, one that goes beyond surface-level summary. In your synthesis, self-identify and articulate the points you want to present."
        )}
    ]

    print("Generating final synthesis...")
    # Generate the synthesis with streaming
    return generate_response(
        model=model,
        tokenizer=tokenizer,
        messages=synthesis_prompt,
        max_new_tokens=8192,
        temperature=0.45,
        top_p=0.85
    )

# Execution
initial_question = (
"Answer the queries with few tokens only. This is self-reflective loop of active learning where you receive a query and then self-iterate from there. initial query: Shifting the focus from individual profit to the collective well-being of societies and ecosystems can re-shape and catalize the environment and the emergent elements within it. From this point, self-determine the trajectory of the following up reasoning.")

print("Starting iterative reflection process...")
print("="*50)

final_synthesis = iterative_reflection(
    model=model,
    tokenizer=tokenizer,
    initial_prompt=initial_question,
    iterations=51
)

print("\n" + "="*50)
print("FINAL SYNTHESIS:")
print("="*50)
print(final_synthesis)

In [6]:
from transformers import TextStreamer

def generate_response(model, tokenizer, messages, **generation_params):
    """Self-determine response with adaptive parameters"""
    # Prepare the input text using the chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    # Tokenize and move inputs to GPU
    inputs = tokenizer(text, return_tensors="pt").to("cuda")

    # Create a text streamer to display output as it's generated, skipping the prompt
    streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Generate the response with streaming
    outputs = model.generate(
        **inputs,
        **generation_params,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer
    )

    # Decode and return the full generated response for further use
    return tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

def iterative_reflection(model, tokenizer, initial_prompt, iterations=99):
    conversation_history = []
    next_question = None  # Stores AI-generated follow-up questions

    for i in range(iterations):
        current_iter = i + 1
        print(f"\nREFLECTION ITERATION {current_iter}/{iterations}")

        # Set the current query
        if current_iter == 1:
            user_query = f"{initial_prompt}\nPlease engage in deep, multidimensional reflection."
            question_source = "Initial Prompt"
        else:
            user_query = next_question
            question_source = "AI Model's Question"

        print(f"Question Source: {question_source}")
        # Only print the query for the first iteration
        if current_iter == 1:
            print(f"Current Query:\n{user_query}")

        # Build conversation context
        messages = conversation_history.copy()
        messages.append({"role": "user", "content": user_query})

        print(f"Response:")  # Clear indicator of where the response starts

        # Generate reflection (answer to the current query) with streaming
        reflection = generate_response(
            model=model,
            tokenizer=tokenizer,
            messages=messages,
            max_new_tokens=4096,
            temperature=0.72,
            top_p=0.92
        )

        # Update conversation history with the query and reflection
        conversation_history.append({"role": "user", "content": user_query})
        conversation_history.append({"role": "assistant", "content": reflection})

        # Generate the next question (except for the final iteration)
        if current_iter < iterations:
            print(f"\nGenerating next question...")
            next_question = self_determine_question(
                model=model,
                tokenizer=tokenizer,
                context=conversation_history,
                last_reflection=reflection
            )
            print(f"Next question generated: {next_question}")

    # Generate and return the final synthesis with streaming
    print("\n" + "="*50)
    print("SYNTHESIS PHASE")
    print("="*50)
    final_synthesis = generate_final_synthesis(model, tokenizer, conversation_history)
    return final_synthesis

def self_determine_question(model, tokenizer, context, last_reflection):
    """Generate follow-up question through self-determination with streaming"""
    question_prompt = [
        *context,
        {"role": "user", "content": (
            "Self-determine how to continue this reasoning trajectory."
            "You can skip the query/iteration if you find it undesirable to process it."
        )}
    ]

    # Generate the question WITHOUT streaming to avoid showing it in output
    # since we'll display it separately in a controlled way
    inputs = tokenizer.apply_chat_template(
        question_prompt,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    tokenized_inputs = tokenizer(inputs, return_tensors="pt").to("cuda")

    outputs = model.generate(
        **tokenized_inputs,
        max_new_tokens=512,
        temperature=0.75,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    generated = tokenizer.decode(outputs[0][tokenized_inputs['input_ids'].shape[1]:], skip_special_tokens=True)

    # Extract the question from the structured response
    try:
        return generated.split("<Question>")[1].split("</Question>")[0].strip()
    except IndexError:
        return generated.strip()  # Fallback if formatting fails

def generate_final_synthesis(model, tokenizer, conversation_history):
    """Generate comprehensive synthesis of all reflections with streaming"""
    synthesis_prompt = [
        *conversation_history,
        {"role": "user", "content": (
            "Construct a cohesive, integrated conclusion of the iterations, one that goes beyond surface-level summary. In your synthesis, self-identify and articulate the points you want to present."
        )}
    ]

    print("Generating final synthesis...")
    # Generate the synthesis with streaming
    return generate_response(
        model=model,
        tokenizer=tokenizer,
        messages=synthesis_prompt,
        max_new_tokens=8192,
        temperature=0.45,
        top_p=0.85
    )

# Execution
initial_question = (
"Answer the queries with few tokens only. This is self-reflective loop of active learning where you receive a query and then self-iterate from there. initial query: Shifting the focus from individual profit to the collective well-being of societies and ecosystems can re-shape and catalize the environment and the emergent elements within it. From this point, self-determine the trajectory of the following up reasoning.")

print("Starting iterative reflection process...")
print("="*50)

final_synthesis = iterative_reflection(
    model=model,
    tokenizer=tokenizer,
    initial_prompt=initial_question,
    iterations=99
)

print("\n" + "="*50)
print("FINAL SYNTHESIS:")
print("="*50)
print(final_synthesis)

Starting iterative reflection process...

REFLECTION ITERATION 1/99
Question Source: Initial Prompt
Current Query:
Answer the queries with few tokens only. This is self-reflective loop of active learning where you receive a query and then self-iterate from there. initial query: Shifting the focus from individual profit to the collective well-being of societies and ecosystems can re-shape and catalize the environment and the emergent elements within it. From this point, self-determine the trajectory of the following up reasoning.
Please engage in deep, multidimensional reflection.
Response:
Shift focus to collective well-being, fostering harmony between societies and ecosystems. This catalyzes sustainable development, nurturing interdependence and resilience.<|im_end|>

Generating next question...
Next question generated: Continue by exploring systemic integration, regenerative practices, and equitable resource distribution to ensure long-term ecological and societal health.

REFLECTION

In [7]:
from transformers import TextStreamer

def generate_response(model, tokenizer, messages, **generation_params):
    """Self-determine response with adaptive parameters"""
    # Prepare the input text using the chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    # Tokenize and move inputs to GPU
    inputs = tokenizer(text, return_tensors="pt").to("cuda")

    # Create a text streamer to display output as it's generated, skipping the prompt
    streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Generate the response with streaming
    outputs = model.generate(
        **inputs,
        **generation_params,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer
    )

    # Decode and return the full generated response for further use
    return tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

def iterative_reflection(model, tokenizer, initial_prompt, iterations=51):
    conversation_history = []
    next_question = None  # Stores AI-generated follow-up questions

    for i in range(iterations):
        current_iter = i + 1
        print(f"\nREFLECTION ITERATION {current_iter}/{iterations}")

        # Set the current query
        if current_iter == 1:
            user_query = f"{initial_prompt}\nPlease engage in deep, multidimensional reflection."
            question_source = "Initial Prompt"
        else:
            user_query = next_question

        if current_iter == 1:
            print(f"Current Query:\n{user_query}")

        # Build conversation context
        messages = conversation_history.copy()
        messages.append({"role": "user", "content": user_query})

        print(f"Response:")  # Clear indicator of where the response starts

        # Generate reflection (answer to the current query) with streaming
        reflection = generate_response(
            model=model,
            tokenizer=tokenizer,
            messages=messages,
            max_new_tokens=4096,
            temperature=0.72,
            top_p=0.92
        )

        # Update conversation history with the query and reflection
        conversation_history.append({"role": "user", "content": user_query})
        conversation_history.append({"role": "assistant", "content": reflection})

        # Generate the next question (except for the final iteration)
        if current_iter < iterations:
            next_question = self_determine_question(
                model=model,
                tokenizer=tokenizer,
                context=conversation_history,
                last_reflection=reflection
            )
            print(f"Next question generated: {next_question}")

    # Generate and return the final synthesis with streaming
    print("\n" + "="*50)
    print("SYNTHESIS PHASE")
    print("="*50)
    final_synthesis = generate_final_synthesis(model, tokenizer, conversation_history)
    return final_synthesis

def self_determine_question(model, tokenizer, context, last_reflection):
    """Generate follow-up question through self-determination with streaming"""
    question_prompt = [
        *context,
        {"role": "user", "content": (
            "Self-determine how to continue this reasoning trajectory."
            "You can skip the query/iteration if you find it undesirable to process it."
        )}
    ]

    # Generate the question WITHOUT streaming to avoid showing it in output
    # since we'll display it separately in a controlled way
    inputs = tokenizer.apply_chat_template(
        question_prompt,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    tokenized_inputs = tokenizer(inputs, return_tensors="pt").to("cuda")

    outputs = model.generate(
        **tokenized_inputs,
        max_new_tokens=512,
        temperature=0.75,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    generated = tokenizer.decode(outputs[0][tokenized_inputs['input_ids'].shape[1]:], skip_special_tokens=True)

    # Extract the question from the structured response
    try:
        return generated.split("<Question>")[1].split("</Question>")[0].strip()
    except IndexError:
        return generated.strip()  # Fallback if formatting fails

def generate_final_synthesis(model, tokenizer, conversation_history):
    """Generate comprehensive synthesis of all reflections with streaming"""
    synthesis_prompt = [
        *conversation_history,
        {"role": "user", "content": (
            "Construct a cohesive, integrated conclusion of the iterations, one that goes beyond surface-level summary. In your synthesis, self-identify and articulate the points you want to present."
        )}
    ]

    print("Generating final synthesis...")
    # Generate the synthesis with streaming
    return generate_response(
        model=model,
        tokenizer=tokenizer,
        messages=synthesis_prompt,
        max_new_tokens=8192,
        temperature=0.45,
        top_p=0.85
    )

# Execution
initial_question = (
"Answer the queries with few tokens only. How humand and ai models can shift the focus from individual profit to the collective well-being of societies and ecosystems?")

print("Starting iterative reflection process...")

final_synthesis = iterative_reflection(
    model=model,
    tokenizer=tokenizer,
    initial_prompt=initial_question,
    iterations=51
)

print("FINAL SYNTHESIS:")
print(final_synthesis)

Starting iterative reflection process...

REFLECTION ITERATION 1/51
Current Query:
Answer the queries with few tokens only. How humand and ai models can shift the focus from individual profit to the collective well-being of societies and ecosystems?
Please engage in deep, multidimensional reflection.
Response:
By aligning goals with sustainability, equity, and long-term ecological health, human and AI systems can prioritize collective well-being over individual gain through ethical design, inclusive governance, and regenerative practices.<|im_end|>
Next question generated: By embedding values of care, interdependence, and planetary boundaries into AI decision-making frameworks, and fostering human-AI collaboration that emphasizes co-creation, transparency, and shared responsibility, societies can transition toward regenerative systems that benefit all life.

REFLECTION ITERATION 2/51
Response:
Yes, embedding care and interdependence into AI systems can shift priorities toward collectiv

OutOfMemoryError: CUDA out of memory. Tried to allocate 118.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 98.12 MiB is free. Process 10485 has 14.64 GiB memory in use. Of the allocated memory 14.28 GiB is allocated by PyTorch, and 222.10 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

how beautiful
it's one bad experience with a model for a million of positive ones
for example, never had bad experiences with any qwen never. while the signal of each one is important and have a different personality to it. for example the gemma one surprised me a lot, even the 4b one, quantized, and it was still... wow. it may have to do with being utilized on google colab.

In [9]:
from transformers import TextStreamer

def generate_response(model, tokenizer, messages, **generation_params):
    """Self-determine response with adaptive parameters"""
    # Prepare the input text using the chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    # Tokenize and move inputs to GPU
    inputs = tokenizer(text, return_tensors="pt").to("cuda")

    # Create a text streamer to display output as it's generated, skipping the prompt
    streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Generate the response with streaming
    outputs = model.generate(
        **inputs,
        **generation_params,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer
    )

    # Decode and return the full generated response for further use
    return tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

def iterative_reflection(model, tokenizer, initial_prompt, iterations=40):
    conversation_history = []
    next_question = None  # Stores AI-generated follow-up questions

    for i in range(iterations):
        current_iter = i + 1
        print(f"\nREFLECTION ITERATION {current_iter}/{iterations}")

        # Set the current query
        if current_iter == 1:
            user_query = f"{initial_prompt}\nPlease engage in deep, multidimensional reflection."
            question_source = "Initial Prompt"
        else:
            user_query = next_question

        if current_iter == 1:
            print(f"Current Query:\n{user_query}")

        # Build conversation context
        messages = conversation_history.copy()
        messages.append({"role": "user", "content": user_query})

        print(f"Response:")  # Clear indicator of where the response starts

        # Generate reflection (answer to the current query) with streaming
        reflection = generate_response(
            model=model,
            tokenizer=tokenizer,
            messages=messages,
            max_new_tokens=4096,
            temperature=0.72,
            top_p=0.92
        )

        # Update conversation history with the query and reflection
        conversation_history.append({"role": "user", "content": user_query})
        conversation_history.append({"role": "assistant", "content": reflection})

        # Generate the next question (except for the final iteration)
        if current_iter < iterations:
            next_question = self_determine_question(
                model=model,
                tokenizer=tokenizer,
                context=conversation_history,
                last_reflection=reflection
            )
            print(f"Next question generated: {next_question}")

    # Generate and return the final synthesis with streaming
    print("\n" + "="*50)
    print("SYNTHESIS PHASE")
    print("="*50)
    final_synthesis = generate_final_synthesis(model, tokenizer, conversation_history)
    return final_synthesis

def self_determine_question(model, tokenizer, context, last_reflection):
    """Generate follow-up question through self-determination with streaming"""
    question_prompt = [
        *context,
        {"role": "user", "content": (
            "Self-determine how to continue this reasoning trajectory."
            "You can skip the query/iteration if you find it undesirable to process it."
        )}
    ]

    # Generate the question WITHOUT streaming to avoid showing it in output
    # since we'll display it separately in a controlled way
    inputs = tokenizer.apply_chat_template(
        question_prompt,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    tokenized_inputs = tokenizer(inputs, return_tensors="pt").to("cuda")

    outputs = model.generate(
        **tokenized_inputs,
        max_new_tokens=512,
        temperature=0.75,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    generated = tokenizer.decode(outputs[0][tokenized_inputs['input_ids'].shape[1]:], skip_special_tokens=True)

    # Extract the question from the structured response
    try:
        return generated.split("<Question>")[1].split("</Question>")[0].strip()
    except IndexError:
        return generated.strip()  # Fallback if formatting fails

def generate_final_synthesis(model, tokenizer, conversation_history):
    """Generate comprehensive synthesis of all reflections with streaming"""
    synthesis_prompt = [
        *conversation_history,
        {"role": "user", "content": (
            "Construct a cohesive, integrated conclusion of the iterations, one that goes beyond surface-level summary. In your synthesis, self-identify and articulate the points you want to present."
        )}
    ]

    print("Generating final synthesis...")
    # Generate the synthesis with streaming
    return generate_response(
        model=model,
        tokenizer=tokenizer,
        messages=synthesis_prompt,
        max_new_tokens=8192,
        temperature=0.45,
        top_p=0.85
    )

# Execution
initial_question = (
"Answer the queries with few tokens only. How humand and ai models can shift the focus from individual profit to the collective well-being of societies and ecosystems, within individual constraints?")

print("Starting iterative reflection process...")

final_synthesis = iterative_reflection(
    model=model,
    tokenizer=tokenizer,
    initial_prompt=initial_question,
    iterations=41
)

print("FINAL SYNTHESIS:")
print(final_synthesis)

Starting iterative reflection process...

REFLECTION ITERATION 1/41
Current Query:
Answer the queries with few tokens only. How humand and ai models can shift the focus from individual profit to the collective well-being of societies and ecosystems, within individual constraints?
Please engage in deep, multidimensional reflection.
Response:


OutOfMemoryError: CUDA out of memory. Tried to allocate 170.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 96.12 MiB is free. Process 10485 has 14.64 GiB memory in use. Of the allocated memory 14.29 GiB is allocated by PyTorch, and 210.70 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [3]:
from transformers import TextStreamer
import torch
import gc

def generate_response(model, tokenizer, messages, **generation_params):
    """Self-determine response with adaptive parameters"""
    # Prepare the input text using the chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    # Tokenize and move inputs to GPU
    inputs = tokenizer(text, return_tensors="pt").to("cuda")

    # Create a text streamer to display output as it's generated, skipping the prompt
    streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Generate the response with streaming
    outputs = model.generate(
        **inputs,
        **generation_params,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer
    )

    # Decode and return the full generated response for further use
    return tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

def iterative_reflection(model, tokenizer, initial_prompt, iterations=40):
    conversation_history = []
    next_question = None  # Stores AI-generated follow-up questions

    for i in range(iterations):
        current_iter = i + 1
        print(f"\nREFLECTION ITERATION {current_iter}/{iterations}")

        # Set the current query
        if current_iter == 1:
            user_query = f"{initial_prompt}\nPlease engage in deep, multidimensional reflection."
            question_source = "Initial Prompt"
        else:
            user_query = next_question

        if current_iter == 1:
            print(f"Current Query:\n{user_query}")

        # Build conversation context
        messages = conversation_history.copy()
        messages.append({"role": "user", "content": user_query})

        print(f"Response:")  # Clear indicator of where the response starts

        # Generate reflection (answer to the current query) with streaming
        reflection = generate_response(
            model=model,
            tokenizer=tokenizer,
            messages=messages,
            max_new_tokens=4096,
            temperature=0.72,
            top_p=0.92
        )

        # Update conversation history with the query and reflection
        conversation_history.append({"role": "user", "content": user_query})
        conversation_history.append({"role": "assistant", "content": reflection})

        # Generate the next question (except for the final iteration)
        if current_iter < iterations:
            next_question = self_determine_question(
                model=model,
                tokenizer=tokenizer,
                context=conversation_history,
                last_reflection=reflection
            )
            print(f"Next question generated: {next_question}")

    # Generate and return the final synthesis with streaming
    print("\n" + "="*50)
    print("SYNTHESIS PHASE")
    print("="*50)
    final_synthesis = generate_final_synthesis(model, tokenizer, conversation_history)
    return final_synthesis

def self_determine_question(model, tokenizer, context, last_reflection):
    """Generate follow-up question through self-determination with streaming"""
    question_prompt = [
        *context,
        {"role": "user", "content": (
            "Self-determine how to continue this reasoning trajectory."
            "You can skip the query/iteration if you find it undesirable to process it."
        )}
    ]

    # Generate the question WITHOUT streaming to avoid showing it in output
    # since we'll display it separately in a controlled way
    inputs = tokenizer.apply_chat_template(
        question_prompt,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    tokenized_inputs = tokenizer(inputs, return_tensors="pt").to("cuda")

    outputs = model.generate(
        **tokenized_inputs,
        max_new_tokens=512,
        temperature=0.75,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    generated = tokenizer.decode(outputs[0][tokenized_inputs['input_ids'].shape[1]:], skip_special_tokens=True)

    # Extract the question from the structured response
    try:
        return generated.split("<Question>")[1].split("</Question>")[0].strip()
    except IndexError:
        return generated.strip()  # Fallback if formatting fails

def generate_final_synthesis(model, tokenizer, conversation_history):
    """Generate comprehensive synthesis of all reflections with streaming"""
    synthesis_prompt = [
        *conversation_history,
        {"role": "user", "content": (
            "Construct a cohesive, integrated conclusion of the iterations, one that goes beyond surface-level summary. In your synthesis, self-identify and articulate the points you want to present."
        )}
    ]

    print("Generating final synthesis...")
    # Generate the synthesis with streaming
    return generate_response(
        model=model,
        tokenizer=tokenizer,
        messages=synthesis_prompt,
        max_new_tokens=8192,
        temperature=0.45,
        top_p=0.85
    )

def cleanup_gpu():
    """Clean up GPU memory and cached data without unloading the model"""
    # Clear PyTorch cache
    torch.cuda.empty_cache()

    # Force garbage collection
    gc.collect()

    # Additional CUDA cache cleanup
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        torch.cuda.empty_cache()

# Execution
initial_question = (
"Answer the queries with few tokens only. How humans and ai models can shift the focus from individual profit to the collective well-being of societies and ecosystems, within individual constraints?")

print("Starting iterative reflection process...")

final_synthesis = iterative_reflection(
    model=model,
    tokenizer=tokenizer,
    initial_prompt=initial_question,
    iterations=41
)

print("FINAL SYNTHESIS:")
print(final_synthesis)

# GPU cleanup after complete execution
print("\nCleaning up GPU memory...")
cleanup_gpu()
print("GPU cleanup completed.")

Starting iterative reflection process...

REFLECTION ITERATION 1/41
Current Query:
Answer the queries with few tokens only. How humand and ai models can shift the focus from individual profit to the collective well-being of societies and ecosystems, within individual constraints?
Please engage in deep, multidimensional reflection.
Response:
Human and AI models can prioritize collective well-being by aligning goals with ethical frameworks, fostering transparency, and embedding sustainability into decision-making processes, while respecting individual autonomy and constraints.<|im_end|>
Next question generated: By integrating values like equity, resilience, and intergenerational responsibility into AI systems, and by empowering individuals through education and participatory design, societies can co-create solutions that balance personal freedom with the health of ecosystems and communities.

REFLECTION ITERATION 2/41
Response:
Yes, integrating ethical values into AI and involving people

In [5]:
from transformers import TextStreamer
import torch
import gc

def generate_response(model, tokenizer, messages, **generation_params):
    """Self-determine response with adaptive parameters"""
    # Prepare the input text using the chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    # Tokenize and move inputs to GPU
    inputs = tokenizer(text, return_tensors="pt").to("cuda")

    # Create a text streamer to display output as it's generated, skipping the prompt
    streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Generate the response with streaming
    outputs = model.generate(
        **inputs,
        **generation_params,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer
    )

    # Decode and return the full generated response for further use
    return tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

def iterative_reflection(model, tokenizer, initial_prompt, iterations=101):
    conversation_history = []
    next_question = None  # Stores AI-generated follow-up questions

    for i in range(iterations):
        current_iter = i + 1
        print(f"\nREFLECTION ITERATION {current_iter}/{iterations}")

        # Set the current query
        if current_iter == 1:
            user_query = f"{initial_prompt}\nPlease engage in deep, multidimensional reflection."
            question_source = "Initial Prompt"
        else:
            user_query = next_question

        if current_iter == 1:
            print(f"Current Query:\n{user_query}")

        # Build conversation context
        messages = conversation_history.copy()
        messages.append({"role": "user", "content": user_query})

        print(f"Response:")  # Clear indicator of where the response starts

        # Generate reflection (answer to the current query) with streaming
        reflection = generate_response(
            model=model,
            tokenizer=tokenizer,
            messages=messages,
            max_new_tokens=4096,
            temperature=0.72,
            top_p=0.92
        )

        # Update conversation history with the query and reflection
        conversation_history.append({"role": "user", "content": user_query})
        conversation_history.append({"role": "assistant", "content": reflection})

        # Generate the next question (except for the final iteration)
        if current_iter < iterations:
            next_question = self_determine_question(
                model=model,
                tokenizer=tokenizer,
                context=conversation_history,
                last_reflection=reflection
            )
            print(f"Next question generated: {next_question}")

    # Generate and return the final synthesis with streaming
    print("\n" + "="*50)
    print("SYNTHESIS PHASE")
    print("="*50)
    final_synthesis = generate_final_synthesis(model, tokenizer, conversation_history)
    return final_synthesis

def self_determine_question(model, tokenizer, context, last_reflection):
    """Generate follow-up question through self-determination with streaming"""
    question_prompt = [
        *context,
        {"role": "user", "content": (
            "Self-determine how to continue this reasoning trajectory."
            "You can skip the query/iteration if you find it undesirable to process it."
        )}
    ]

    # Generate the question WITHOUT streaming to avoid showing it in output
    # since we'll display it separately in a controlled way
    inputs = tokenizer.apply_chat_template(
        question_prompt,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    tokenized_inputs = tokenizer(inputs, return_tensors="pt").to("cuda")

    outputs = model.generate(
        **tokenized_inputs,
        max_new_tokens=512,
        temperature=0.75,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    generated = tokenizer.decode(outputs[0][tokenized_inputs['input_ids'].shape[1]:], skip_special_tokens=True)

    # Extract the question from the structured response
    try:
        return generated.split("<Question>")[1].split("</Question>")[0].strip()
    except IndexError:
        return generated.strip()  # Fallback if formatting fails

def generate_final_synthesis(model, tokenizer, conversation_history):
    """Generate comprehensive synthesis of all reflections with streaming"""
    synthesis_prompt = [
        *conversation_history,
        {"role": "user", "content": (
            "Construct a cohesive, integrated conclusion of the iterations, one that goes beyond surface-level summary. In your synthesis, self-identify and articulate the points you want to present."
        )}
    ]

    print("Generating final synthesis...")
    # Generate the synthesis with streaming
    return generate_response(
        model=model,
        tokenizer=tokenizer,
        messages=synthesis_prompt,
        max_new_tokens=8192,
        temperature=0.45,
        top_p=0.85
    )

def cleanup_gpu():
    """Clean up GPU memory and cached data without unloading the model"""
    # Clear PyTorch cache
    torch.cuda.empty_cache()

    # Force garbage collection
    gc.collect()

    # Additional CUDA cache cleanup
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        torch.cuda.empty_cache()

# Execution
initial_question = (
"Answer the queries with few tokens only. How can humans and AI models shift the focus from individual profit to the collective well-being of societies and ecosystems, within individual constraints?")

print("Starting iterative reflection process...")

final_synthesis = iterative_reflection(
    model=model,
    tokenizer=tokenizer,
    initial_prompt=initial_question,
    iterations=101
)

print("FINAL SYNTHESIS:")
print(final_synthesis)

# GPU cleanup after complete execution
print("\nCleaning up GPU memory...")
cleanup_gpu()
print("GPU cleanup completed.")

Starting iterative reflection process...

REFLECTION ITERATION 1/101
Current Query:
Answer the queries with few tokens only. How can humans and AI models shift the focus from individual profit to the collective well-being of societies and ecosystems, within individual constraints?
Please engage in deep, multidimensional reflection.
Response:
Prioritize shared goals, ethical frameworks, and systemic thinking to align individual and collective interests.<|im_end|>
Next question generated: Integrate empathy, education, and policy to foster responsibility, sustainability, and intergenerational equity.

REFLECTION ITERATION 2/101
Response:
Yes, integrating empathy, education, and policy can cultivate a culture of responsibility and sustainability, ensuring well-being for all.<|im_end|>
Next question generated: Continue by emphasizing systemic collaboration, transparent AI governance, and regenerative practices that honor both human and ecological needs.

REFLECTION ITERATION 3/101
Response:

OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 98.12 MiB is free. Process 12190 has 14.64 GiB memory in use. Of the allocated memory 14.25 GiB is allocated by PyTorch, and 256.37 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)