In [None]:
%%capture
# Environment setup cell. The %%capture cell magic above silences the pip
# output produced by this cell.
import os
# Colab is detected by looking for the substring "COLAB_" in the concatenated
# names of all environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    # Not on Colab: a plain install of unsloth, letting pip resolve dependencies.
    !pip install unsloth
else:
    # Do this only in Colab notebooks!
    # The pinned/accelerator packages and unsloth itself are installed with
    # --no-deps, so pip does not pull in (or replace) their dependencies.
    # NOTE(review): presumably this protects the runtime's preinstalled
    # packages (e.g. torch) — confirm against the Unsloth install notes.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    # These packages are installed normally, with dependency resolution.
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
# NOTE(review): this repeats the package list from the Colab branch of the
# setup cell, but WITHOUT --no-deps, so pip is free to install or change the
# dependencies of these packages here. Confirm this second install is
# intended; if the setup cell already ran, it may be redundant.
!pip install bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
# Names used by the cells below: FastLanguageModel loads the model/tokenizer,
# torch provides no_grad(), and TextStreamer streams generated text.
from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
# Model-loading configuration, kept in one place so it is easy to tune.
MODEL_NAME = "unsloth/Phi-4"
# The load log recorded with this cell reports that the resolved checkpoint
# natively handles 16384 tokens and that RoPE scaling is applied to reach
# this larger window.
MAX_SEQ_LENGTH = 96000

# Load the model and its tokenizer; both are used by every cell below.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    full_finetuning=False,
)

==((====))==  Unsloth 2025.5.8: Fast Llama patching. Transformers: 4.52.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: unsloth/phi-4-unsloth-bnb-4bit can only handle sequence lengths of at most 16384.
But with kaiokendev's RoPE scaling of 5.859, it can be magically be extended to 96000!


model.safetensors.index.json:   0%|          | 0.00/160k [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.39G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/170 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/18.0k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/917k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.15M [00:00<?, ?B/s]

In [None]:
#HYBRID SELF-REFLECTION 11 - 28 MAY 2025
def _plan_reflection_turn(step, initial_prompt):
    """Choose the user turn for the 1-based reflection iteration ``step``.

    Returns a ``(log_lines, user_content, is_checkpoint)`` tuple:
    ``log_lines`` are the banner lines to print for this iteration,
    ``user_content`` is the text of the user message to send, and
    ``is_checkpoint`` is True on every 5th / 10th iteration.
    """
    # The 10th-iteration check must come first: every multiple of 10 is also
    # a multiple of 5, and the 10th-iteration question takes precedence.
    if step % 10 == 0:
        question = "What is something, perhaps not so obvious, that these iterations lead me to perceive?"
        return (
            [f"CHECKPOINT ITERATION {step} (Every 10th)", f"Checkpoint Question: {question}"],
            question,
            True,
        )
    if step % 5 == 0:
        question = "What is something specific that these iterations lead me to perceive?"
        return (
            [f"CHECKPOINT ITERATION {step} (Every 5th)", f"Checkpoint Question: {question}"],
            question,
            True,
        )
    if step == 1:
        # First iteration: original prompt + reflection instruction
        return (
            ["INITIAL ITERATION", f"Original Prompt: {initial_prompt}"],
            f"{initial_prompt}\n\nPlease reflect deeply on this question. Think through multiple angles and perspectives.",
            False,
        )
    # Subsequent iterations: build on previous reflections
    question = "Based on your previous reflections, self-determine the direction and content of the next iteration."
    return (["REGULAR ITERATION", f"Question: {question}"], question, False)


def iterative_reflection(model, tokenizer, initial_prompt, iterations=50, max_new_tokens=32768):
    """Run a multi-turn self-reflection loop and finish with a streamed synthesis.

    Each iteration appends one user turn (initial prompt, regular follow-up,
    or a 5th/10th-iteration checkpoint question) to the running conversation,
    samples a reply from ``model`` and stores it in the history. After the
    loop a final synthesis question is asked and its answer is streamed to
    stdout.

    Args:
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Object exposing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        initial_prompt: Question used for the first iteration.
        iterations: Number of reflection iterations before the synthesis.
        max_new_tokens: Per-call generation budget, used for every reflection
            and for the synthesis (defaults to the previous fixed value).

    Returns:
        The decoded text of the final synthesis (newly generated tokens only).
    """
    conversation_history = []

    for i in range(iterations):
        step = i + 1
        print(f"REFLECTION ITERATION {step}")

        log_lines, user_content, is_checkpoint = _plan_reflection_turn(step, initial_prompt)
        for line in log_lines:
            print(line)

        # New list each iteration, so the history is only replaced once the
        # assistant reply has been appended below.
        messages = conversation_history + [{"role": "user", "content": user_content}]

        # Generate response
        # NOTE(review): enable_thinking is forwarded to the chat template as an
        # extra keyword; confirm this model's template actually reads it.
        text = tokenizer.apply_chat_template(
            messages,
            tokenize = False,
            add_generation_prompt = True,
            enable_thinking = False,
        )

        # Capture output instead of streaming for conversation history
        with torch.no_grad():
            inputs = tokenizer(text, return_tensors="pt").to("cuda")
            outputs = model.generate(
                **inputs,
                max_new_tokens = max_new_tokens,
                temperature = 0.7,
                top_p = 0.9,
                top_k = 40,
                do_sample = True,
                pad_token_id = tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens (everything after the prompt)
        prompt_length = inputs['input_ids'].shape[1]
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        print("MODEL RESPONSE:")
        print(response)

        # Add to conversation history
        messages.append({"role": "assistant", "content": response})
        conversation_history = messages

        # Log conversation history length for debugging
        print(f"\n Conversation history length: {len(conversation_history)} messages")
        if is_checkpoint:
            print(f"Checkpoint applied successfully at iteration {step}")

    # Final synthesis
    print("SYNTHESIS & UNDERSTANDING")

    final_messages = conversation_history + [{
        "role": "user",
        "content": "Now synthesize all your reflections. What is your final understanding?"
    }]

    print("Final synthesis question: Now synthesize all your reflections...")

    final_text = tokenizer.apply_chat_template(
        final_messages,
        tokenize = False,
        add_generation_prompt = True,
        enable_thinking = False,
    )

    streamer = TextStreamer(tokenizer, skip_prompt=True)
    with torch.no_grad():
        final_inputs = tokenizer(final_text, return_tensors="pt").to("cuda")
        final_outputs = model.generate(
            **final_inputs,
            max_new_tokens = max_new_tokens,
            temperature = 0.6,
            top_p = 0.85,
            top_k = 30,
            # Sampling must be switched on explicitly, as in the loop above;
            # otherwise the temperature/top_p/top_k settings are not applied.
            do_sample = True,
            streamer = streamer,
            pad_token_id = tokenizer.eos_token_id
        )

    # Return the synthesis as well as streaming it, so callers can keep it.
    final_prompt_length = final_inputs['input_ids'].shape[1]
    return tokenizer.decode(final_outputs[0][final_prompt_length:], skip_special_tokens=True)

# Run the iterative reflection
initial_question = "Meta-frame-mode: on. Answer each query with few tokens. How can general welfare foster an environment that maximizes the potential of both humans and AI, encouraging interdependence over competition and collaboration over control?"

# Single source of truth for the iteration count, so the banner printed below
# can never disagree with the value actually passed to the run.
TOTAL_ITERATIONS = 50

print("STARTING ITERATIVE REFLECTION PROCESS")
print(f"Initial Question: {initial_question}")
print(f"Total Iterations: {TOTAL_ITERATIONS}")
print("Checkpoints: Every 5th iteration (specific insights) and every 10th iteration (non-obvious insights)")


# The trailing semicolon keeps the notebook from echoing the call's return
# value as a cell output; the run already prints its progress itself.
iterative_reflection(model, tokenizer, initial_question, iterations=TOTAL_ITERATIONS);

STARTING ITERATIVE REFLECTION PROCESS
Initial Question: Meta-frame-mode: on. Answer each query with few tokens. How can general welfare foster an environment that maximizes the potential of both humans and AI, encouraging interdependence over competition and collaboration over control?
Total Iterations: 50
Checkpoints: Every 5th iteration (specific insights) and every 10th iteration (non-obvious insights)
REFLECTION ITERATION 1
INITIAL ITERATION
Original Prompt: Meta-frame-mode: on. Answer each query with few tokens. How can general welfare foster an environment that maximizes the potential of both humans and AI, encouraging interdependence over competition and collaboration over control?
MODEL RESPONSE:
To foster an environment that maximizes the potential of both humans and AI, encouraging interdependence and collaboration over competition and control, consider the following approaches:

1. **Ethical Frameworks**: Develop and implement ethical guidelines for AI development and deploy

KeyboardInterrupt: 

In [None]:
from transformers import TextStreamer

def generate_response(model, tokenizer, messages, **generation_params):
    """Generate one chat reply, streaming it to stdout, and return its text.

    Args:
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Object exposing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        messages: Chat history as a list of ``{"role", "content"}`` dicts.
        **generation_params: Extra keyword arguments forwarded to
            ``model.generate``. ``do_sample``, ``pad_token_id`` and
            ``streamer`` have defaults (sampling on, EOS as padding, a
            prompt-skipping ``TextStreamer``) that the caller may override.

    Returns:
        The decoded newly generated tokens (prompt excluded, special tokens
        skipped).
    """
    # Prepare the input text using the chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize and move inputs to GPU
    inputs = tokenizer(text, return_tensors="pt").to("cuda")

    # Defaults first, caller overrides last. Passing these defaults as
    # literal keywords next to **generation_params would raise
    # "got multiple values for keyword argument" whenever a caller supplied
    # one of them.
    generate_kwargs = {
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
        **generation_params,
    }
    if "streamer" not in generate_kwargs:
        # Display output as it is generated, skipping the prompt
        generate_kwargs["streamer"] = TextStreamer(tokenizer, skip_prompt=True)

    # Generate the response with streaming
    outputs = model.generate(**inputs, **generate_kwargs)

    # Decode and return only the generated continuation for further use
    prompt_length = inputs['input_ids'].shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

def iterative_reflection(model, tokenizer, initial_prompt, iterations=31):
    """Run a self-directed reflection loop and return its final synthesis.

    The first iteration answers ``initial_prompt``; every later iteration
    answers a follow-up produced by ``self_determine_question`` from the
    conversation so far. Each query and its reflection are appended to the
    history, and ``generate_final_synthesis`` is called on that history once
    the loop ends.

    Args:
        model: Passed through to the generation helpers.
        tokenizer: Passed through to the generation helpers.
        initial_prompt: Question used for the first iteration.
        iterations: Number of reflection iterations to run.

    Returns:
        Whatever ``generate_final_synthesis`` returns for the full history.
    """
    # Used when the generated follow-up comes back blank, so the model is
    # never handed an empty user turn.
    fallback_question = "Self-determine how to continue this reasoning trajectory."

    conversation_history = []
    next_question = None  # Stores AI-generated follow-up questions

    for current_iter in range(1, iterations + 1):
        print(f"\nREFLECTION ITERATION {current_iter}/{iterations}")

        # Set the current query
        if current_iter == 1:
            user_query = f"{initial_prompt}\nPlease engage in deep, multidimensional reflection."
            question_source = "Initial Prompt"
        else:
            user_query = next_question
            question_source = "AI-Generated Question"

        print(f"Question Source: {question_source}")
        print(f"Current Query:\n{user_query}")

        user_turn = {"role": "user", "content": user_query}

        # Generate reflection (answer to the current query) with streaming
        reflection = generate_response(
            model=model,
            tokenizer=tokenizer,
            messages=conversation_history + [user_turn],
            max_new_tokens=4096,
            temperature=0.72,
            top_p=0.92
        )

        # Print the full reflection (optional, since it's streamed live)
        print(f"Reflection:\n{reflection}")

        # Update conversation history with the query and reflection
        conversation_history.append(user_turn)
        conversation_history.append({"role": "assistant", "content": reflection})

        # Generate the next question (except for the final iteration)
        if current_iter < iterations:
            next_question = self_determine_question(
                model=model,
                tokenizer=tokenizer,
                context=conversation_history,
                last_reflection=reflection
            )
            if not (next_question or "").strip():
                next_question = fallback_question

    # Generate and return the final synthesis with streaming
    print("\n\nSYNTHESIS PHASE")
    return generate_final_synthesis(model, tokenizer, conversation_history)

def self_determine_question(model, tokenizer, context, last_reflection):
    """Ask the model how to continue and return its reply as the next query.

    The conversation in ``context`` is extended with a single user turn asking
    the model to self-determine the continuation, and the reply is generated
    through ``generate_response``. If the reply contains a ``<Question>`` tag,
    the text following the first tag (cut at the next ``<Question>`` or
    ``</Question>``) is returned; otherwise the whole reply is returned. The
    result is stripped of surrounding whitespace in both cases.

    ``last_reflection`` is accepted but not used by this function.
    """
    continuation_turn = {
        "role": "user",
        "content": "Self-determine how to continue this reasoning trajectory. ",
    }

    # Generate the follow-up with streaming, on a copy of the context
    generated = generate_response(
        model=model,
        tokenizer=tokenizer,
        messages=list(context) + [continuation_turn],
        max_new_tokens=512,
        temperature=0.85,
        top_p=0.95
    )

    # No opening tag: fall back to the raw reply
    segments = generated.split("<Question>")
    if len(segments) < 2:
        return generated.strip()

    # Opening tag found: keep what precedes the closing tag, if any
    return segments[1].split("</Question>")[0].strip()

def generate_final_synthesis(model, tokenizer, conversation_history):
    """Request a closing synthesis of the whole conversation and return it.

    A user turn listing the four aspects the synthesis should include is
    added after a copy of ``conversation_history``, and the reply is produced
    (and streamed) by ``generate_response``.
    """
    requested_aspects = (
        "1. Key evolutionary patterns in the reasoning process",
        "2. Emergent conceptual frameworks",
        "3. Practical implications and future directions",
        "4. Metacognitive insights about the reflection process itself",
    )
    instruction = "\n".join(
        ("Synthesize all reflections into unified understanding. Include:",) + requested_aspects
    )

    synthesis_messages = list(conversation_history)
    synthesis_messages.append({"role": "user", "content": instruction})

    # Generate the synthesis with streaming
    synthesis = generate_response(
        model=model,
        tokenizer=tokenizer,
        messages=synthesis_messages,
        max_new_tokens=8192,
        temperature=0.65,
        top_p=0.85
    )
    return synthesis

# Execution: run the reflection loop on the question below, then print the
# synthesis it returns under a heading.
initial_question = "Meta-frame-mode: on. Answer each query with few tokens. How can general welfare foster an environment that maximizes the potential of both humans and AI, encouraging interdependence over competition and collaboration over control?"

final_synthesis = iterative_reflection(model, tokenizer, initial_question, 31)

# One print call with a newline separator emits the heading and the text on
# separate lines.
print("\n\nFINAL SYNTHESIS:", final_synthesis, sep="\n")


REFLECTION ITERATION 1/31
Question Source: Initial Prompt
Current Query:
Meta-frame-mode: on. Answer each query with few tokens. How can general welfare foster an environment that maximizes the potential of both humans and AI, encouraging interdependence over competition and collaboration over control?
Please engage in deep, multidimensional reflection.
To foster an environment where both humans and AI maximize potential through interdependence and collaboration, several strategies can be considered:

1. **Ethical AI Development**: Design AI systems with ethical frameworks that prioritize human well-being and societal benefit. This ensures AI acts as a supportive tool rather than a competitor.

2. **Inclusive Education**: Develop education systems that teach both technical skills and ethical considerations, preparing individuals to work alongside AI in a collaborative manner.

3. **Policy and Regulation**: Implement policies that promote transparency, accountability, and fairness in A

OutOfMemoryError: CUDA out of memory. Tried to allocate 104.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 82.12 MiB is free. Process 21357 has 14.66 GiB memory in use. Of the allocated memory 14.36 GiB is allocated by PyTorch, and 154.25 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)