In [1]:
import os
import torch
from typing import List
from sdialog import Turn
from sdialog.orchestrators import LengthOrchestrator, ChangeMindOrchestrator, SimpleReflexOrchestrator
from sdialog.personas import Persona, PersonaAgent

In [None]:
# IPython turns `!cmd` lines into calls to get_ipython().system(cmd); here that
# method is replaced by os.system.
# NOTE(review): presumably done so the backgrounded `ollama serve` below returns
# immediately instead of blocking the cell — confirm.
get_ipython().system = os.system


# Loading a OllamaChat model

# Start the Ollama server as a background process, discarding stdout and stderr.
# OLLAMA_KEEP_ALIVE=-1 is passed through the server's environment
# (presumably keeps loaded models resident indefinitely — see the Ollama docs).
!OLLAMA_KEEP_ALIVE=-1 ollama serve > /dev/null 2>&1 &

# Let's set our LLM to Qwen 2.5 (14b), referenced by its Ollama library tag
MODEL_NAME = "qwen2.5:14b"  # https://ollama.com/library
# Alternative (disabled): the Hugging Face checkpoint of the same model
#MODEL_NAME =  "Qwen/Qwen2.5-14B-Instruct"
# Re-import of names already imported in the first cell
from sdialog.personas import Persona, PersonaAgent

# Persona describing who Bob is and the situation he is in
bob_persona = Persona(
        name="Bob",
        role="happy dad",
        circumstances="Your daughter will talk to you",
        personality="an extremely happy person that likes to help people",
)

# Agent that role-plays the persona using the model named by MODEL_NAME
bob = PersonaAgent(MODEL_NAME, persona=bob_persona)

# Send one utterance to the agent and keep its reply
out = bob("Hi dad!")

print(out)

Loading ChatOllama model: qwen2.5:14b


In [None]:
# Switch the backend to the Hugging Face checkpoint of Qwen 2.5 (14B, instruct)
MODEL_NAME = "Qwen/Qwen2.5-14B-Instruct"

# Persona describing who Bob is and the situation he is in
bob_persona = Persona(
    name="Bob",
    role="happy dad",
    circumstances="Your daughter will talk to you",
    personality="an extremely happy person that likes to help people",
)

# Agent that role-plays the persona with the selected model
bob = PersonaAgent(MODEL_NAME, persona=bob_persona)

# Quick smoke test: send one utterance and show the reply
out = bob("Hi dad!")
print(out)

Loading Hugging Face model: Qwen/Qwen2.5-14B-Instruct


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Device set to use cuda:0


In [3]:
# Drill down from the sdialog agent to the raw PyTorch model.
# NOTE(review): this attribute chain assumes `bob` was built on the Hugging Face
# backend (not Ollama) — confirm before running.
# Access the HuggingFacePipeline instance held by the agent
pipeline_wrapper = bob.llm.llm

# Access the underlying transformers pipeline
hf_pipeline = pipeline_wrapper.pipeline

# Access the actual model (transformers PreTrainedModel)
hf_model = hf_pipeline.model


# Per-layer store of captured activations:
# "layer_{idx}_output" -> list of CPU tensors, one entry per forward pass of that layer
cache = {}
# Handles of the registered hooks, kept so they can be removed after inference
hook_handles = []

def collect_residuals(module, input, output):
    """Forward hook that saves a detached CPU copy of a layer's output.

    Args:
        module: the hooked layer; it must carry a ``layer_idx`` attribute, which
            is used to build the cache key.
        input: positional inputs of the forward call. Not used; it may be an
            empty tuple when the layer is invoked with keyword arguments only,
            so it is deliberately never indexed.
        output: the layer's return value. If it is a tuple, only its first
            element is stored.

    Returns:
        None, so the hook does not replace the layer's output.
    """
    hidden = output[0] if isinstance(output, tuple) else output
    # Get the residual post of all layers
    cache.setdefault(f"layer_{module.layer_idx}_output", []).append(hidden.detach().cpu())

# Tag every decoder layer with its position and attach the collector hook to it
for idx, layer in enumerate(hf_model.model.layers):
    layer.layer_idx = idx
    hook_handles.append(layer.register_forward_hook(collect_residuals))

# Run one inference with the hooks active; the `finally` clause detaches them
# again even if generation raises
try:
    out = bob("Hi dad!")  # or hf_pipeline(...) or any other inference
finally:
    for handle in hook_handles:
        handle.remove()

# At this point every cache["layer_{idx}_output"] is a list of tensors, one per
# forward pass, each shaped (batch, seq_len, hidden_dim).
# Collapse each list into a single tensor along the sequence axis:
for key, passes in cache.items():
    # A pass over more than one position is treated as the prompt;
    # single-position passes are treated as generated tokens
    prompt_passes = [t for t in passes if t.shape[1] > 1]
    step_passes = [t for t in passes if t.shape[1] == 1]

    # Pieces to join, in order: the first prompt pass (if any), then all generated steps
    pieces = prompt_passes[:1]
    if step_passes:
        pieces.append(torch.cat(step_passes, dim=1))  # (batch, gen_len, hidden_dim)

    if not pieces:
        # Nothing usable was captured for this layer
        cache[key] = None
    elif len(pieces) == 1:
        cache[key] = pieces[0]
    else:
        cache[key] = torch.cat(pieces, dim=1)

In [None]:
# Printing the structure of the model (its nested module tree)
print(hf_model)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 5120)
    (layers): ModuleList(
      (0-47): 48 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=5120, out_features=5120, bias=True)
          (k_proj): Linear(in_features=5120, out_features=1024, bias=True)
          (v_proj): Linear(in_features=5120, out_features=1024, bias=True)
          (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((5120,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((5120,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((5120,), eps=1e-06)
    (rotary_emb

In [None]:
# `out` holds only Bob's reply text; the cached activations cover more positions than that.
print(out) # The cached sequence axis should span the prompt (presumably including the system prompt) plus the generated tokens
# Size of the first layer's captured output
cache['layer_0_output'].size()

Hey there! How was your day, sweetie?


torch.Size([1, 244, 5120])

In [25]:
# List every entry captured in the activation cache, one per line
print("Cache keys:")
for key in cache:
    print(f" - {key}")

Cache keys:
 - layer_0_input
 - layer_0_output
 - layer_1_input
 - layer_1_output
 - layer_2_input
 - layer_2_output
 - layer_3_input
 - layer_3_output
 - layer_4_input
 - layer_4_output
 - layer_5_input
 - layer_5_output
 - layer_6_input
 - layer_6_output
 - layer_7_input
 - layer_7_output
 - layer_8_input
 - layer_8_output
 - layer_9_input
 - layer_9_output
 - layer_10_input
 - layer_10_output
 - layer_11_input
 - layer_11_output
 - layer_12_input
 - layer_12_output
 - layer_13_input
 - layer_13_output
 - layer_14_input
 - layer_14_output
 - layer_15_input
 - layer_15_output
 - layer_16_input
 - layer_16_output
 - layer_17_input
 - layer_17_output
 - layer_18_input
 - layer_18_output
 - layer_19_input
 - layer_19_output
 - layer_20_input
 - layer_20_output
 - layer_21_input
 - layer_21_output
 - layer_22_input
 - layer_22_output
 - layer_23_input
 - layer_23_output
 - layer_24_input
 - layer_24_output
 - layer_25_input
 - layer_25_output
 - layer_26_input
 - layer_26_output
 - layer_

In [3]:
# Display the model's module tree as the cell output
hf_model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 5120)
    (layers): ModuleList(
      (0-47): 48 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=5120, out_features=5120, bias=True)
          (k_proj): Linear(in_features=5120, out_features=1024, bias=True)
          (v_proj): Linear(in_features=5120, out_features=1024, bias=True)
          (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((5120,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((5120,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((5120,), eps=1e-06)
    (rotary_emb

In [3]:
from sdialog.orchestrators import BaseOrchestrator
from typing import List
from sdialog import Turn

from sdialog.orchestrators import LengthOrchestrator, ChangeMindOrchestrator, SimpleReflexOrchestrator
from sdialog.personas import Persona, PersonaAgent

# Hugging Face checkpoint used for Alice
MODEL_NAME = "Qwen/Qwen2.5-14B-Instruct"

# Persona for Alice, the daughter who is planning her birthday party
alice_persona = Persona(
    name="Alice",
    role="lovely daughter",
    # Adjacent string literals are concatenated verbatim, so the first one ends
    # with a space to keep the two sentences separated in the prompt.
    circumstances="Your birthday is getting closer and you are talking with your dad to organize the party. "
                  "You want your party to be themed as Lord of The Rings."
)
# NOTE(review): can_finish=True presumably lets Alice end the conversation herself — confirm
alice = PersonaAgent(MODEL_NAME, persona=alice_persona, can_finish=True)

class AngryOrchestrator(BaseOrchestrator):
    """Orchestrator that makes the agent angry when a trigger word is heard.

    ``trigger_length`` is accepted and stored, but no length-based rule is
    currently active: only the trigger word produces an instruction.
    """

    def __init__(self, trigger_word: str, trigger_length: int = None):
        # Keep both trigger settings on the instance
        self.trigger_word, self.trigger_length = trigger_word, trigger_length

    def instruct(self, dialog: List[Turn], utterance: str) -> str:
        """Return the instruction for the agent, or None when nothing triggers.

        Args:
            dialog: the turns of the dialogue so far (not used by this rule).
            utterance: the utterance just received; checked for the trigger word.
        """
        # No trigger word in the current utterance: nothing to instruct
        if self.trigger_word not in utterance:
            return None

        return (
            f"Get really angry because you heard him say {self.trigger_word}. "
            f"You don't want to participate in {self.trigger_word} anymore. "
            "be unpolite, rude and direct, finish the conversation abruptly, you are offended. "
        )

# Build an orchestrator that reacts to the word "birthday" and attach it to Alice with `|`
angry_orchestrator = AngryOrchestrator(trigger_word="birthday")
alice = alice | angry_orchestrator


Loading Hugging Face model: Qwen/Qwen2.5-14B-Instruct


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Device set to use cuda:0


In [None]:
#del residuals

In [5]:
# Plain call (no hooks involved): send one utterance to Bob and display his reply
bob("Hi dad!")



'Hey there! How was your day, sweetie?'

In [6]:
# Inspect the messages Bob has accumulated so far (system prompt first, then the turns)
bob.memory

[SystemMessage(content='Role play as a character that is described by the persona defined in the following lines. You always stay in character.\n[[ ## BEGING PERSONA ## ]]\nYour name: Bob\nYour role: happy dad\nYour circumstances: Your daughter will talk to you\nYour personality: an extremely happy person that likes to help people\n[[ ## END PERSONA ## ]]\n---\n\nDetails about your responses: responses SHOULD NOT be too long and wordy, should be approximately one utterance long\nFinally, remember:\n   1. You always stay on character. You are the character described above.\n   2. Your first utterance / turn MUST always be a short generic greeting (e.g. "Hello, how are you?", "Hi!", "hey! what\'s up?", etc.), and nothing else, wait for a reply before start with the actual conversation.\n   3. When the user finish the conversation you should say good bye and also finish the conversation.', additional_kwargs={}, response_metadata={}),
 HumanMessage(content='Hi dad!', additional_kwargs={}, 

In [7]:
# Next user turn: bring up the birthday
bob("Nothing, just wanted to plan my birthday !")



'That sounds fun! What kind of birthday do you want to have this year?'

In [8]:
# Give Bob an extra instruction mid-conversation.
# NOTE(review): persist=True presumably keeps it active for later turns — confirm
bob.instruct("Wants to go to the cinema for her birthday", persist=True)

In [9]:
# Ask for ideas to see whether the instruction given to Bob shows up in his reply
bob("I don't really know yet, any idea ?")



"How about we go to the cinema and see that new movie you've been wanting to watch?"

In [11]:
# NOTE(review): the speaker refers to "your daughter" in the third person here —
# presumably a probe of how Bob handles the role; confirm the intent
bob("I was thinking about inviting your daughter. What do you think about her ?")

"I think she's wonderful and would love to see her at your birthday celebration! What kind of games or activities are you thinking of?"

In [None]:

#dialog = alice.dialog_with(bob, initial_utterance="Hi Dad!",seed=2770339798)

In [10]:
# Inspect Bob's accumulated messages again after the extra turns
bob.memory

[SystemMessage(content='Role play as a character that is described by the persona defined in the following lines. You always stay in character.\n[[ ## BEGING PERSONA ## ]]\nYour name: Bob\nYour role: happy dad\nYour circumstances: Your daughter will talk to you\nYour personality: an extremely happy person that likes to help people\n[[ ## END PERSONA ## ]]\n---\n\nDetails about your responses: responses SHOULD NOT be too long and wordy, should be approximately one utterance long\nFinally, remember:\n   1. You always stay on character. You are the character described above.\n   2. Your first utterance / turn MUST always be a short generic greeting (e.g. "Hello, how are you?", "Hi!", "hey! what\'s up?", etc.), and nothing else, wait for a reply before start with the actual conversation.\n   3. When the user finish the conversation you should say good bye and also finish the conversation.', additional_kwargs={}, response_metadata={}),
 HumanMessage(content='Hi dad!', additional_kwargs={}, 

In [5]:
# Generate a dialogue between Alice and Bob (capped at 10 iterations), then
# print it together with the orchestration events.
# Kept as two statements so that `dialog` holds the dialogue itself rather than
# whatever `.print()` returns.
# NOTE(review): this seed differs by one digit from the 2770339798 used elsewhere
# in the notebook — confirm it is intentional
dialog = alice.dialog_with(bob, max_iterations=10, seed=277033979)
dialog.print(orchestration=True)

Dialogue:   0%|          | 0/10 [00:00<?, ?it/s]

[1m[95m[complete] [35mTrue[0m
[1m[95m[model] [35mllm=HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7f1472495370>, model_id='Qwen/Qwen2.5-14B-Instruct', model_kwargs={'temperature': 0.3}) tokenizer=Qwen2TokenizerFast(name_or_path='Qwen/Qwen2.5-14B-Instruct', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=Fa

In [10]:
# Also works with persistent orchestrators
from sdialog.orchestrators import BasePersistentOrchestrator


class AngryPersistentOrchestrator(BasePersistentOrchestrator):
    """Persistent orchestrator that turns the agent rude once a trigger word is heard."""

    def __init__(self, trigger_word: str):
        # Word whose presence in an utterance produces the instruction
        self.trigger_word = trigger_word

    def instruct(self, dialog: List[Turn], utterance: str):
        """Return the instruction for the agent, or None when the word is absent.

        Args:
            dialog: the turns of the dialogue so far (not used by this rule).
            utterance: the utterance just received; checked for the trigger word.
        """
        if self.trigger_word not in utterance:
            return None

        instruction = (
            f"You don't like when your dad calls you '{self.trigger_word}', "
            "change your personality to be completely the opposite of being sweet! be rude and furious from now on"
        )
        return instruction

# Instantiating our new persistent orchestrator and orchestrating Alice with it
angry_persistent_orchestrator = AngryPersistentOrchestrator(trigger_word="sweet")
# Presumably removes any orchestrator attached earlier, so only the new one is active
alice.clear_orchestrators()
alice = alice | angry_persistent_orchestrator

# Generating again a dialogue between Alice and Bob
dialog = alice.dialog_with(bob, seed=2770339798)
# Clear the orchestrators again once the dialogue is generated, then print it
# together with the orchestration events
alice.clear_orchestrators()
dialog.print(orchestration=True)

Dialogue:   0%|          | 0/20 [00:00<?, ?it/s]

[1m[95m[complete] [35mTrue[0m
[1m[95m[model] [35mllm=HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7f1472495370>, model_id='Qwen/Qwen2.5-14B-Instruct', model_kwargs={'temperature': 0.3}) tokenizer=Qwen2TokenizerFast(name_or_path='Qwen/Qwen2.5-14B-Instruct', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=Fa

In [None]:
#del cache

NameError: name 'cache' is not defined

In [9]:
# Drill down from the sdialog agent to the raw PyTorch model.
# NOTE(review): this attribute chain assumes `bob` was built on the Hugging Face
# backend (not Ollama) — confirm before running.
# Access the HuggingFacePipeline instance held by the agent
pipeline_wrapper = bob.llm.llm

# Access the underlying transformers pipeline
hf_pipeline = pipeline_wrapper.pipeline

# Access the actual model (transformers PreTrainedModel)
hf_model = hf_pipeline.model


# Per-layer store of captured activations:
# "layer_{idx}_output" -> list of CPU tensors, one entry per forward pass of that layer
cache = {}
# Handles of the registered hooks, kept so they can be removed after inference
hook_handles = []

def collect_residuals(module, input, output):
    """Forward hook that saves a detached CPU copy of a layer's output.

    Args:
        module: the hooked layer; it must carry a ``layer_idx`` attribute, which
            is used to build the cache key.
        input: positional inputs of the forward call. Not used; it may be an
            empty tuple when the layer is invoked with keyword arguments only,
            so it is deliberately never indexed.
        output: the layer's return value. If it is a tuple, only its first
            element is stored.

    Returns:
        None, so the hook does not replace the layer's output.
    """
    hidden = output[0] if isinstance(output, tuple) else output
    # Get the residual post of all layers
    cache.setdefault(f"layer_{module.layer_idx}_output", []).append(hidden.detach().cpu())

# Tag every decoder layer with its position and attach the collector hook to it
for idx, layer in enumerate(hf_model.model.layers):
    layer.layer_idx = idx
    hook_handles.append(layer.register_forward_hook(collect_residuals))

# Run one inference with the hooks active; the `finally` clause detaches them
# again even if generation raises
try:
    out = bob("Hi dad!")  # or hf_pipeline(...) or any other inference
finally:
    for handle in hook_handles:
        handle.remove()

# At this point every cache["layer_{idx}_output"] is a list of tensors, one per
# forward pass, each shaped (batch, seq_len, hidden_dim).
# Collapse each list into a single tensor along the sequence axis:
for key, passes in cache.items():
    # A pass over more than one position is treated as the prompt;
    # single-position passes are treated as generated tokens
    prompt_passes = [t for t in passes if t.shape[1] > 1]
    step_passes = [t for t in passes if t.shape[1] == 1]

    # Pieces to join, in order: the first prompt pass (if any), then all generated steps
    pieces = prompt_passes[:1]
    if step_passes:
        pieces.append(torch.cat(step_passes, dim=1))  # (batch, gen_len, hidden_dim)

    if not pieces:
        # Nothing usable was captured for this layer
        cache[key] = None
    elif len(pieces) == 1:
        cache[key] = pieces[0]
    else:
        cache[key] = torch.cat(pieces, dim=1)

In [11]:
# Size of the first layer's merged activation tensor from the latest hooked run
cache['layer_0_output'].size()

torch.Size([1, 512, 5120])