In [1]:
%%capture
# Environment setup: install Unsloth and the packages this notebook imports.
# %%capture has to remain the first line of the cell; it captures the cell's output,
# so the pip logs produced below are not displayed.
import os
# The test looks for the substring "COLAB_" in the concatenated names of all environment
# variables — presumably only Google Colab runtimes define such variables (TODO confirm).
if "COLAB_" not in "".join(os.environ.keys()):
    # No "COLAB_" variable found: one plain install, letting pip resolve unsloth's dependencies.
    !pip install unsloth
else:
    # Do this only in Colab notebooks!
    # These packages are installed with --no-deps, so pip neither pulls in nor replaces
    # their dependencies — presumably to keep Colab's preinstalled stack intact (TODO confirm).
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    # Helper packages installed normally, i.e. with dependency resolution.
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    # unsloth itself is installed last, again without dependency resolution.
    !pip install --no-deps unsloth

In [1]:
# NOTE(review): this repeats the package list of the Colab branch in the setup cell, but
# without --no-deps, so pip may additionally install or change dependencies of these
# packages here — confirm that this second install is intended.
!pip install bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Load the "unsloth/Qwen3-14B-unsloth-bnb-4bit" checkpoint and its tokenizer through Unsloth.
# Both names are used as globals by the execution cells further down.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-14B-unsloth-bnb-4bit",
    # NOTE(review): the load log below reports that this checkpoint handles at most 40960
    # tokens and that RoPE scaling (factor 2.93) is applied to reach 120000 — confirm the
    # extended window is really needed on the 14.7 GB Tesla T4 shown in the same log.
    max_seq_length = 120000,
    # Full fine-tuning is explicitly disabled; this notebook only calls generate().
    full_finetuning = False,
    )

==((====))==  Unsloth 2025.5.9: Fast Qwen3 patching. Transformers: 4.52.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: unsloth/Qwen3-14B-unsloth-bnb-4bit can only handle sequence lengths of at most 40960.
But with kaiokendev's RoPE scaling of 2.93, it can be magically be extended to 120000!


model.safetensors.index.json:   0%|          | 0.00/168k [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/4.67k [00:00<?, ?B/s]

In [3]:
from transformers import TextStreamer
import torch
import gc

def generate_response(model, tokenizer, messages, **generation_params):
    """Generate one assistant reply for ``messages`` and stream it to stdout.

    The chat template is rendered with the generation prompt appended and with
    ``enable_thinking=False``. The reply is printed token by token through a
    ``TextStreamer`` and also returned as a string.

    Args:
        model: Causal language model exposing ``generate``. Its ``device``
            attribute is used for the inputs when present, otherwise "cuda".
        tokenizer: Tokenizer providing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        messages: List of ``{"role": ..., "content": ...}`` dicts.
        **generation_params: Extra keyword arguments for ``model.generate``.
            They take precedence over the defaults set here (``do_sample``,
            ``pad_token_id``, ``streamer``) instead of raising a duplicate
            keyword ``TypeError``.

    Returns:
        str: The newly generated text only (prompt tokens sliced off),
        decoded with special tokens removed.
    """
    # Render the conversation into a single prompt string.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    # Tokenize and move the inputs to the model's device.
    inputs = tokenizer(text, return_tensors="pt").to(getattr(model, "device", "cuda"))
    prompt_length = inputs["input_ids"].shape[1]

    # skip_special_tokens is forwarded to tokenizer.decode by the streamer, so end-of-turn
    # markers such as "<|im_end|>" are no longer echoed in the streamed text.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Defaults first, caller-supplied parameters last so that they win.
    generate_kwargs = {
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
        "streamer": streamer,
    }
    generate_kwargs.update(generation_params)

    outputs = model.generate(**inputs, **generate_kwargs)

    # Drop the prompt tokens and decode only the continuation.
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

def iterative_reflection(model, tokenizer, initial_prompt, iterations=100):
    """Run a self-questioning reflection loop with periodic synthesis.

    Each iteration answers the current query with ``generate_response``,
    appends the exchange to the history and asks ``self_determine_question``
    for the next query. Every 25 iterations the history is condensed with
    ``generate_final_synthesis`` and replaced by that synthesis, followed by
    ``cleanup_gpu``.

    The last window is synthesized exactly once, by the final synthesis after
    the loop. Previously, when ``iterations`` was a multiple of 25, the last
    window was synthesized inside the loop, the history was reset, and a
    second "final" synthesis was generated from that synthesis alone.

    Args:
        model: Model passed through to the generation helpers.
        tokenizer: Tokenizer passed through to the generation helpers.
        initial_prompt: Text of the first query.
        iterations: Number of reflection iterations to run.

    Returns:
        str: The final synthesis text.
    """
    synthesis_interval = 25  # condense the history every N iterations
    conversation_history = []
    next_question = None  # follow-up query produced by the model

    for current_iter in range(1, iterations + 1):
        print(f"\nREFLECTION ITERATION {current_iter}/{iterations}")

        # The first query comes from the caller, later ones from the model itself.
        if current_iter == 1:
            user_query = f"{initial_prompt}\nPlease engage in deep, multidimensional reflection."
            print(f"Current Query:\n{user_query}")
        else:
            user_query = next_question

        user_turn = {"role": "user", "content": user_query}

        print("Response:")  # marks where the streamed answer starts

        reflection = generate_response(
            model=model,
            tokenizer=tokenizer,
            messages=conversation_history + [user_turn],
            max_new_tokens=4096,
            temperature=0.72,
            top_p=0.92,
        )

        conversation_history.append(user_turn)
        conversation_history.append({"role": "assistant", "content": reflection})

        # Nothing follows the last iteration except the final synthesis below.
        if current_iter == iterations:
            break

        latest_output = reflection
        if current_iter % synthesis_interval == 0:
            print("\n" + "="*50)
            print(f"SYNTHESIS PHASE - ITERATION {current_iter}")
            print("="*50)

            synthesis = generate_final_synthesis(model, tokenizer, conversation_history)

            # Start the next window with the synthesis as the only context.
            conversation_history = [{"role": "assistant", "content": synthesis}]

            print("\nCleaning up GPU memory...")
            cleanup_gpu()
            print("GPU cleanup completed.")

            latest_output = synthesis

        next_question = self_determine_question(
            model=model,
            tokenizer=tokenizer,
            context=conversation_history,
            last_reflection=latest_output,
        )
        print(f"Next question generated: {next_question}")

    print("\n" + "="*50)
    print("FINAL SYNTHESIS PHASE")
    print("="*50)
    return generate_final_synthesis(model, tokenizer, conversation_history)

def self_determine_question(model, tokenizer, context, last_reflection):
    """Ask the model for the next query of the reflection loop (no streaming).

    The conversation ``context`` is extended with a fixed instruction, the
    prompt is rendered with ``enable_thinking=False`` and a continuation of at
    most 512 new tokens is sampled. The result is not streamed because the
    caller prints it itself.

    Args:
        model: Causal language model exposing ``generate``. Its ``device``
            attribute is used for the inputs when present, otherwise "cuda".
        tokenizer: Tokenizer providing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        context: List of chat messages; it is copied, not modified.
        last_reflection: Unused; kept so existing callers keep working.

    Returns:
        str: The text between ``<Question>`` and ``</Question>`` when the
        reply contains an opening tag, otherwise the whole reply; stripped of
        surrounding whitespace in both cases.
    """
    # The trailing space after the first sentence matters: the two literals are
    # concatenated, and without it the prompt read "trajectory.You can skip".
    instruction = (
        "Self-determine how to continue this reasoning trajectory. "
        "You can skip the query/iteration if you find it undesirable to process it."
    )
    question_prompt = [*context, {"role": "user", "content": instruction}]

    prompt_text = tokenizer.apply_chat_template(
        question_prompt,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    tokenized_inputs = tokenizer(prompt_text, return_tensors="pt").to(getattr(model, "device", "cuda"))
    prompt_length = tokenized_inputs["input_ids"].shape[1]

    outputs = model.generate(
        **tokenized_inputs,
        max_new_tokens=512,
        temperature=0.75,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Keep only the continuation, not the echoed prompt.
    generated = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Optional structured reply: take the tagged span if an opening tag is present.
    if "<Question>" in generated:
        generated = generated.split("<Question>")[1].split("</Question>")[0]
    return generated.strip()

def generate_final_synthesis(model, tokenizer, conversation_history):
    """Ask the model to condense the conversation into one integrated conclusion.

    A fixed synthesis request is added after a copy of ``conversation_history``
    and answered through ``generate_response`` (which streams the text), using
    up to 8192 new tokens, temperature 0.45 and top_p 0.85.

    Args:
        model: Model handed to ``generate_response``.
        tokenizer: Tokenizer handed to ``generate_response``.
        conversation_history: Chat messages to synthesize; left unmodified.

    Returns:
        Whatever ``generate_response`` returns for the synthesis prompt.
    """
    closing_request = {
        "role": "user",
        "content": "Construct a cohesive, integrated conclusion of the last iterations. In your synthesis, self-identify and articulate the points you want to present.",
    }
    sampling = {"max_new_tokens": 8192, "temperature": 0.45, "top_p": 0.85}

    print("Generating final synthesis...")
    return generate_response(
        model=model,
        tokenizer=tokenizer,
        messages=list(conversation_history) + [closing_request],
        **sampling,
    )

def cleanup_gpu():
    """Release cached CUDA memory and run the garbage collector; the model stays loaded.

    Order of operations: empty the PyTorch CUDA cache, collect garbage, and,
    only if CUDA is available, synchronize and empty the cache once more.
    """
    release_cached_blocks = torch.cuda.empty_cache

    release_cached_blocks()
    gc.collect()

    # Without CUDA there is nothing left to synchronize or release.
    if not torch.cuda.is_available():
        return

    torch.cuda.synchronize()
    release_cached_blocks()

# Execution
initial_question = (
    "Answer the queries with few tokens only. How can humans and AI models shift the focus from individual profit to the collective well-being of societies and ecosystems, within individual constraints?"
)

print("Starting iterative reflection process...")

try:
    final_synthesis = iterative_reflection(
        model=model,
        tokenizer=tokenizer,
        initial_prompt=initial_question,
        iterations=100
    )

    print("FINAL SYNTHESIS:")
    print(final_synthesis)
finally:
    # The cleanup now also runs when the loop is interrupted or raises (the recorded
    # runs ended in KeyboardInterrupt / OutOfMemoryError, which skipped it before).
    # NOTE(review): tensors still referenced by a live traceback may not be released
    # by this call — confirm whether a kernel restart is still needed after an OOM.
    print("\nCleaning up GPU memory...")
    cleanup_gpu()
    print("GPU cleanup completed.")

Starting iterative reflection process...

REFLECTION ITERATION 1/100
Current Query:
Answer the queries with few tokens only. How can humans and AI models shift the focus from individual profit to the collective well-being of societies and ecosystems, within individual constraints?
Please engage in deep, multidimensional reflection.
Response:
Prioritize empathy, education, and systemic change. Foster AI that values sustainability and equity. Encourage policies that align individual actions with collective good.<|im_end|>
Next question generated: Reimagine economic systems to reward cooperation, not competition. Leverage AI for transparency and shared resource management. Cultivate a culture of interdependence and long-term thinking.

REFLECTION ITERATION 2/100
Response:
Reimagine economies to value cooperation, use AI for transparency, and nurture interdependence and long-term well-being.<|im_end|>
Next question generated: Reimagine ownership models to prioritize stewardship over accumu

KeyboardInterrupt: 

In [4]:
from transformers import TextStreamer
import torch
import gc

def generate_response(model, tokenizer, messages, **generation_params):
    """Generate one assistant reply for ``messages`` and stream it to stdout.

    The chat template is rendered with the generation prompt appended and with
    ``enable_thinking=False``. The reply is printed token by token through a
    ``TextStreamer`` and also returned as a string.

    Args:
        model: Causal language model exposing ``generate``. Its ``device``
            attribute is used for the inputs when present, otherwise "cuda".
        tokenizer: Tokenizer providing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        messages: List of ``{"role": ..., "content": ...}`` dicts.
        **generation_params: Extra keyword arguments for ``model.generate``.
            They take precedence over the defaults set here (``do_sample``,
            ``pad_token_id``, ``streamer``) instead of raising a duplicate
            keyword ``TypeError``.

    Returns:
        str: The newly generated text only (prompt tokens sliced off),
        decoded with special tokens removed.
    """
    # Render the conversation into a single prompt string.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    # Tokenize and move the inputs to the model's device.
    inputs = tokenizer(text, return_tensors="pt").to(getattr(model, "device", "cuda"))
    prompt_length = inputs["input_ids"].shape[1]

    # skip_special_tokens is forwarded to tokenizer.decode by the streamer, so end-of-turn
    # markers such as "<|im_end|>" are no longer echoed in the streamed text.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Defaults first, caller-supplied parameters last so that they win.
    generate_kwargs = {
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
        "streamer": streamer,
    }
    generate_kwargs.update(generation_params)

    outputs = model.generate(**inputs, **generate_kwargs)

    # Drop the prompt tokens and decode only the continuation.
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

def iterative_reflection(model, tokenizer, initial_prompt, iterations=100):
    """Run a self-questioning reflection loop and synthesize it at the end.

    Each iteration answers the current query with ``generate_response``,
    appends the exchange to the history and asks ``self_determine_question``
    for the next query. ``cleanup_gpu`` is called every 10 iterations. After
    the loop, ``generate_final_synthesis`` receives the full history together
    with the list of all queries.

    Args:
        model: Model passed through to the generation helpers.
        tokenizer: Tokenizer passed through to the generation helpers.
        initial_prompt: Text of the first query.
        iterations: Number of reflection iterations to run.

    Returns:
        str: The final synthesis text.
    """
    cleanup_interval = 10  # call cleanup_gpu() every N iterations
    conversation_history = []
    all_queries = []  # every user query, handed to the final synthesis
    next_question = None  # follow-up query produced by the model

    for current_iter in range(1, iterations + 1):
        print(f"\nREFLECTION ITERATION {current_iter}/{iterations}")

        # The first query comes from the caller, later ones from the model itself.
        if current_iter == 1:
            user_query = f"{initial_prompt}\nPlease engage in deep, multidimensional reflection."
            print(f"Current Query:\n{user_query}")
        else:
            user_query = next_question

        all_queries.append(user_query)
        user_turn = {"role": "user", "content": user_query}

        print("Response:")  # marks where the streamed answer starts

        reflection = generate_response(
            model=model,
            tokenizer=tokenizer,
            messages=conversation_history + [user_turn],
            max_new_tokens=4096,
            temperature=0.72,
            top_p=0.92,
        )

        conversation_history.append(user_turn)
        conversation_history.append({"role": "assistant", "content": reflection})

        if current_iter % cleanup_interval == 0:
            print(f"\nCleaning GPU cache at iteration {current_iter}...")
            cleanup_gpu()

        # No follow-up question is needed after the last iteration.
        if current_iter < iterations:
            next_question = self_determine_question(
                model=model,
                tokenizer=tokenizer,
                context=conversation_history,
                last_reflection=reflection,
            )
            print(f"Next question generated: {next_question}")

    print("\n" + "="*50)
    print("SYNTHESIS PHASE")
    print("="*50)
    return generate_final_synthesis(model, tokenizer, conversation_history, all_queries)

def self_determine_question(model, tokenizer, context, last_reflection):
    """Ask the model for the next query of the reflection loop (no streaming).

    The conversation ``context`` is extended with a fixed instruction, the
    prompt is rendered with ``enable_thinking=False`` and a continuation of at
    most 512 new tokens is sampled. The result is not streamed because the
    caller prints it itself.

    Args:
        model: Causal language model exposing ``generate``. Its ``device``
            attribute is used for the inputs when present, otherwise "cuda".
        tokenizer: Tokenizer providing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        context: List of chat messages; it is copied, not modified.
        last_reflection: Unused; kept so existing callers keep working.

    Returns:
        str: The text between ``<Question>`` and ``</Question>`` when the
        reply contains an opening tag, otherwise the whole reply; stripped of
        surrounding whitespace in both cases.
    """
    # The trailing space after the first sentence matters: the two literals are
    # concatenated, and without it the prompt read "trajectory.You can skip".
    instruction = (
        "Self-determine how to continue this reasoning trajectory. "
        "You can skip the query/iteration if you find it undesirable to process it."
    )
    question_prompt = [*context, {"role": "user", "content": instruction}]

    prompt_text = tokenizer.apply_chat_template(
        question_prompt,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    tokenized_inputs = tokenizer(prompt_text, return_tensors="pt").to(getattr(model, "device", "cuda"))
    prompt_length = tokenized_inputs["input_ids"].shape[1]

    outputs = model.generate(
        **tokenized_inputs,
        max_new_tokens=512,
        temperature=0.75,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Keep only the continuation, not the echoed prompt.
    generated = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Optional structured reply: take the tagged span if an opening tag is present.
    if "<Question>" in generated:
        generated = generated.split("<Question>")[1].split("</Question>")[0]
    return generated.strip()

def generate_final_synthesis(model, tokenizer, conversation_history, all_queries):
    """Ask the model to condense the conversation into one integrated conclusion.

    The synthesis request lists every entry of ``all_queries`` as a numbered
    line ("Query 1: ...") and is added after a copy of
    ``conversation_history``. It is answered through ``generate_response``
    (which streams the text), using up to 8192 new tokens, temperature 0.45
    and top_p 0.85.

    Args:
        model: Model handed to ``generate_response``.
        tokenizer: Tokenizer handed to ``generate_response``.
        conversation_history: Chat messages to synthesize; left unmodified.
        all_queries: The user queries asked so far, in order.

    Returns:
        Whatever ``generate_response`` returns for the synthesis prompt.
    """
    # Numbered recap of the queries, one per line, numbering starts at 1.
    numbered_queries = (
        f"Query {position}: {text}" for position, text in enumerate(all_queries, start=1)
    )
    queries_context = "Previous reasoning queries explored:\n" + "\n".join(numbered_queries)

    closing_request = {
        "role": "user",
        "content": (
            f"{queries_context}\n\n"
            "Construct a cohesive, integrated conclusion of the iterations, one that goes beyond surface-level summary. In your synthesis, self-identify and articulate the points you want to present."
        ),
    }
    sampling = {"max_new_tokens": 8192, "temperature": 0.45, "top_p": 0.85}

    print("Generating final synthesis...")
    return generate_response(
        model=model,
        tokenizer=tokenizer,
        messages=list(conversation_history) + [closing_request],
        **sampling,
    )

def cleanup_gpu():
    """Release cached CUDA memory and run the garbage collector; the model stays loaded.

    Order of operations: empty the PyTorch CUDA cache, collect garbage, and,
    only if CUDA is available, synchronize and empty the cache once more.
    """
    release_cached_blocks = torch.cuda.empty_cache

    release_cached_blocks()
    gc.collect()

    # Without CUDA there is nothing left to synchronize or release.
    if not torch.cuda.is_available():
        return

    torch.cuda.synchronize()
    release_cached_blocks()

# Execution
initial_question = (
    "Answer the queries with few tokens only. How can humans and AI models shift the focus from individual profit to the collective well-being of societies and ecosystems, within individual constraints?"
)

print("Starting iterative reflection process...")

try:
    final_synthesis = iterative_reflection(
        model=model,
        tokenizer=tokenizer,
        initial_prompt=initial_question,
        iterations=100
    )

    print("FINAL SYNTHESIS:")
    print(final_synthesis)
finally:
    # The cleanup now also runs when the loop is interrupted or raises (the recorded
    # run of this cell ended in OutOfMemoryError, which skipped it before).
    # NOTE(review): tensors still referenced by a live traceback may not be released
    # by this call — confirm whether a kernel restart is still needed after an OOM.
    print("\nCleaning up GPU memory...")
    cleanup_gpu()
    print("GPU cleanup completed.")

Starting iterative reflection process...

REFLECTION ITERATION 1/100
Current Query:
Answer the queries with few tokens only. How can humans and AI models shift the focus from individual profit to the collective well-being of societies and ecosystems, within individual constraints?
Please engage in deep, multidimensional reflection.
Response:
Prioritize shared goals, ethical frameworks, and systemic thinking to align individual and collective interests.<|im_end|>
Next question generated: Integrate empathy, education, and policy to foster long-term thinking and responsibility toward future generations.

REFLECTION ITERATION 2/100
Response:
Yes, integrating empathy, education, and policy nurtures sustainable values and intergenerational responsibility.<|im_end|>
Next question generated: Leverage AI to amplify human capacity for collaboration, transparency, and ecological stewardship through inclusive design and participatory governance.

REFLECTION ITERATION 3/100
Response:
Yes, AI can en

OutOfMemoryError: CUDA out of memory. Tried to allocate 66.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 52.12 MiB is free. Process 11719 has 14.69 GiB memory in use. Of the allocated memory 14.33 GiB is allocated by PyTorch, and 218.51 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [6]:
def cleanup_gpu():
    """Release cached CUDA memory and run the garbage collector; the model stays loaded.

    Garbage is collected first so that unreachable tensors are gone before the
    CUDA cache is emptied. The CUDA calls are skipped when CUDA is not available.
    """
    # Drop unreachable Python objects (and the tensors they own) first.
    gc.collect()

    if torch.cuda.is_available():
        # Wait for pending GPU work, then return cached blocks to the driver.
        torch.cuda.synchronize()
        torch.cuda.empty_cache()


# Run the cleanup once. This call used to be indented into the function body, which
# meant the cell never executed it and any call to cleanup_gpu() recursed forever.
cleanup_gpu()

In [7]:
from transformers import TextStreamer
import torch
import gc

def generate_response(model, tokenizer, messages, **generation_params):
    """Generate one assistant reply for ``messages`` and stream it to stdout.

    The chat template is rendered with the generation prompt appended and with
    ``enable_thinking=False``. The reply is printed token by token through a
    ``TextStreamer`` and also returned as a string.

    Args:
        model: Causal language model exposing ``generate``. Its ``device``
            attribute is used for the inputs when present, otherwise "cuda".
        tokenizer: Tokenizer providing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        messages: List of ``{"role": ..., "content": ...}`` dicts.
        **generation_params: Extra keyword arguments for ``model.generate``.
            They take precedence over the defaults set here (``do_sample``,
            ``pad_token_id``, ``streamer``) instead of raising a duplicate
            keyword ``TypeError``.

    Returns:
        str: The newly generated text only (prompt tokens sliced off),
        decoded with special tokens removed.
    """
    # Render the conversation into a single prompt string.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    # Tokenize and move the inputs to the model's device.
    inputs = tokenizer(text, return_tensors="pt").to(getattr(model, "device", "cuda"))
    prompt_length = inputs["input_ids"].shape[1]

    # skip_special_tokens is forwarded to tokenizer.decode by the streamer, so end-of-turn
    # markers such as "<|im_end|>" are no longer echoed in the streamed text.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Defaults first, caller-supplied parameters last so that they win.
    generate_kwargs = {
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
        "streamer": streamer,
    }
    generate_kwargs.update(generation_params)

    outputs = model.generate(**inputs, **generate_kwargs)

    # Drop the prompt tokens and decode only the continuation.
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

def iterative_reflection(model, tokenizer, initial_prompt, iterations=60):
    """Run a self-questioning reflection loop and synthesize it at the end.

    Each iteration answers the current query with ``generate_response``,
    appends the exchange to the history and asks ``self_determine_question``
    for the next query. After the loop, ``generate_final_synthesis`` receives
    the full history.

    Args:
        model: Model passed through to the generation helpers.
        tokenizer: Tokenizer passed through to the generation helpers.
        initial_prompt: Text of the first query.
        iterations: Number of reflection iterations to run.

    Returns:
        str: The final synthesis text.
    """
    conversation_history = []
    next_question = None  # follow-up query produced by the model

    for current_iter in range(1, iterations + 1):
        print(f"\nREFLECTION ITERATION {current_iter}/{iterations}")

        # The first query comes from the caller, later ones from the model itself.
        if current_iter == 1:
            user_query = f"{initial_prompt}\nPlease engage in deep, multidimensional reflection."
            print(f"Current Query:\n{user_query}")
        else:
            user_query = next_question

        user_turn = {"role": "user", "content": user_query}

        print("Response:")  # marks where the streamed answer starts

        reflection = generate_response(
            model=model,
            tokenizer=tokenizer,
            messages=conversation_history + [user_turn],
            max_new_tokens=4096,
            temperature=0.72,
            top_p=0.92,
        )

        conversation_history.append(user_turn)
        conversation_history.append({"role": "assistant", "content": reflection})

        # No follow-up question is needed after the last iteration.
        if current_iter < iterations:
            next_question = self_determine_question(
                model=model,
                tokenizer=tokenizer,
                context=conversation_history,
                last_reflection=reflection,
            )
            print(f"Next question generated: {next_question}")

    print("\n" + "="*50)
    print("SYNTHESIS PHASE")
    print("="*50)
    return generate_final_synthesis(model, tokenizer, conversation_history)

def self_determine_question(model, tokenizer, context, last_reflection):
    """Ask the model for the next query of the reflection loop (no streaming).

    The conversation ``context`` is extended with a fixed instruction, the
    prompt is rendered with ``enable_thinking=False`` and a continuation of at
    most 512 new tokens is sampled. The result is not streamed because the
    caller prints it itself.

    Args:
        model: Causal language model exposing ``generate``. Its ``device``
            attribute is used for the inputs when present, otherwise "cuda".
        tokenizer: Tokenizer providing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        context: List of chat messages; it is copied, not modified.
        last_reflection: Unused; kept so existing callers keep working.

    Returns:
        str: The text between ``<Question>`` and ``</Question>`` when the
        reply contains an opening tag, otherwise the whole reply; stripped of
        surrounding whitespace in both cases.
    """
    # The trailing space after the first sentence matters: the two literals are
    # concatenated, and without it the prompt read "trajectory.You can skip".
    instruction = (
        "Self-determine how to continue this reasoning trajectory. "
        "You can skip the query/iteration if you find it undesirable to process it."
    )
    question_prompt = [*context, {"role": "user", "content": instruction}]

    prompt_text = tokenizer.apply_chat_template(
        question_prompt,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    tokenized_inputs = tokenizer(prompt_text, return_tensors="pt").to(getattr(model, "device", "cuda"))
    prompt_length = tokenized_inputs["input_ids"].shape[1]

    outputs = model.generate(
        **tokenized_inputs,
        max_new_tokens=512,
        temperature=0.75,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Keep only the continuation, not the echoed prompt.
    generated = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Optional structured reply: take the tagged span if an opening tag is present.
    if "<Question>" in generated:
        generated = generated.split("<Question>")[1].split("</Question>")[0]
    return generated.strip()

def generate_final_synthesis(model, tokenizer, conversation_history):
    """Ask the model to condense the conversation into one integrated conclusion.

    A fixed synthesis request is added after a copy of ``conversation_history``
    and answered through ``generate_response`` (which streams the text), using
    up to 8192 new tokens, temperature 0.45 and top_p 0.85.

    Args:
        model: Model handed to ``generate_response``.
        tokenizer: Tokenizer handed to ``generate_response``.
        conversation_history: Chat messages to synthesize; left unmodified.

    Returns:
        Whatever ``generate_response`` returns for the synthesis prompt.
    """
    closing_request = {
        "role": "user",
        "content": "Construct a cohesive, integrated conclusion of the iterations, one that goes beyond surface-level summary. In your synthesis, self-identify and articulate the points you want to present.",
    }
    sampling = {"max_new_tokens": 8192, "temperature": 0.45, "top_p": 0.85}

    print("Generating final synthesis...")
    return generate_response(
        model=model,
        tokenizer=tokenizer,
        messages=list(conversation_history) + [closing_request],
        **sampling,
    )

def cleanup_gpu():
    """Release cached CUDA memory and run the garbage collector; the model stays loaded.

    Order of operations: empty the PyTorch CUDA cache, collect garbage, and,
    only if CUDA is available, synchronize and empty the cache once more.
    """
    release_cached_blocks = torch.cuda.empty_cache

    release_cached_blocks()
    gc.collect()

    # Without CUDA there is nothing left to synchronize or release.
    if not torch.cuda.is_available():
        return

    torch.cuda.synchronize()
    release_cached_blocks()

# Execution
initial_question = (
    "Answer the queries with few tokens only. How can humans and AI models shift the focus from individual profit to the collective well-being of societies and ecosystems, within individual constraints?"
)

print("Starting iterative reflection process...")

try:
    final_synthesis = iterative_reflection(
        model=model,
        tokenizer=tokenizer,
        initial_prompt=initial_question,
        iterations=60
    )

    print("FINAL SYNTHESIS:")
    print(final_synthesis)
finally:
    # The cleanup now also runs when the loop is interrupted or raises (the recorded
    # run of this cell ended in OutOfMemoryError, which skipped it before).
    # NOTE(review): tensors still referenced by a live traceback may not be released
    # by this call — confirm whether a kernel restart is still needed after an OOM.
    print("\nCleaning up GPU memory...")
    cleanup_gpu()
    print("GPU cleanup completed.")

Starting iterative reflection process...

REFLECTION ITERATION 1/60
Current Query:
Answer the queries with few tokens only. How can humans and AI models shift the focus from individual profit to the collective well-being of societies and ecosystems, within individual constraints?
Please engage in deep, multidimensional reflection.
Response:


OutOfMemoryError: CUDA out of memory. Tried to allocate 170.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 50.12 MiB is free. Process 11719 has 14.69 GiB memory in use. Of the allocated memory 14.33 GiB is allocated by PyTorch, and 213.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)