
# 02 - Generation (Imitation / Summarization)

Produce LLM imitations/summaries for each combat turn to feed Perspective + topic analysis. Keep paths explicit to make reruns reproducible and skip already generated rows when possible.



**Goals**
- Load conversation bundles from `assets/raw/` and reuse any seeded summaries when present.
- Define a prompt + local LLM connector to generate `imm_1` and the parse-success flag `imm_1_check` (`True` when the output marker was found; `False` marks a refusal or malformed output).
- Persist the regenerated bundle to `assets/processed/` with quick sanity checks before later notebooks.


In [None]:

from pathlib import Path
import pickle
from typing import Callable, Iterable, List, Optional, Sequence

import pandas as pd
import requests

from utils.data_io import load_df_list_pickle, flatten_conversation_bundles, describe_bundle


In [None]:

# Paths and run settings
# Every path is derived from the directory the notebook is launched from.
PROJECT_ROOT = Path.cwd()
ASSETS_RAW = PROJECT_ROOT.joinpath("assets", "raw")
ASSETS_PROCESSED = PROJECT_ROOT.joinpath("assets", "processed")

# Inputs: the raw bundle plus an optional seeded sample.
RAW_COMBAT = ASSETS_RAW.joinpath("combat_threads_text_only.pkl")
AGU_SEED = ASSETS_RAW.joinpath("combat_threads_with_agu_sample.pkl")  # optional helper

# Earlier imitation run (reused when present) and the file this notebook writes.
PREVIOUS_IMM = ASSETS_PROCESSED.joinpath("combat_threads_with_imitation.pkl")
OUTPUT_PATH = ASSETS_PROCESSED.joinpath("combat_threads_with_imitation_regen.pkl")

# Local model settings.
MODEL_NAME = "llama3"
OLLAMA_ENDPOINT = "http://localhost:11434/api/generate"
DRY_RUN = True  # flip to False to hit a running local model

# Make sure the output directory exists before anything tries to write to it.
ASSETS_PROCESSED.mkdir(parents=True, exist_ok=True)
RAW_COMBAT, AGU_SEED, PREVIOUS_IMM, OUTPUT_PATH



### Asset manifest
Confirm which inputs are available and where the regenerated bundle will land.


In [None]:

# One record per asset: its role in this notebook, its path, and a short note.
asset_manifest = [
    dict(zip(("role", "path", "note"), entry))
    for entry in (
        (
            "input",
            RAW_COMBAT,
            "6842 combat conversations; text only extracted from convokit reply chains.",
        ),
        (
            "input_optional",
            AGU_SEED,
            "334-sample with agu_1 summarization; useful for prompt shaping only.",
        ),
        (
            "resume_optional",
            PREVIOUS_IMM,
            "Existing imitation run (imm_1 + imm_1_check) that can be reused or compared.",
        ),
        (
            "output",
            OUTPUT_PATH,
            "Regenerated imitation bundle with imm_1 + imm_1_check.",
        ),
    )
]
manifest_df = pd.DataFrame(asset_manifest)
# Flag which of the listed files are actually on disk right now.
manifest_df["exists"] = [path.exists() for path in manifest_df["path"]]
manifest_df



### Inspect source bundle
Lightweight stats so generation logic can stay defensive against the list-of-DataFrames layout.


In [None]:

# Raw bundle: summary stats plus a peek at the first conversation only.
raw_bundle = load_df_list_pickle(RAW_COMBAT)
print("raw bundle:", describe_bundle(raw_bundle))
raw_preview = flatten_conversation_bundles(raw_bundle[:1])
display(raw_preview.head())

# Optional seeded sample: loaded only when the file is on disk.
agu_bundle = None
agu_bundle_present = AGU_SEED.exists()
if agu_bundle_present:
    agu_bundle = load_df_list_pickle(AGU_SEED)
    print("agu sample:", describe_bundle(agu_bundle))
    display(flatten_conversation_bundles(agu_bundle[:1]).head())

# Earlier imitation run: stays None when no previous output exists.
existing_bundle = None
if PREVIOUS_IMM.exists():
    existing_bundle = load_df_list_pickle(PREVIOUS_IMM)
if existing_bundle is not None:
    print("existing imm:", describe_bundle(existing_bundle))


### Prompt and refusal heuristics
Replicates the pre-clean pipeline: enforce the JSON `{"--IMMITATION--": ...}` output format and treat the presence of the marker as success.


In [None]:

# Key the model is asked to emit; the parser treats its presence as success.
IMMITATION_MARKER = "--IMMITATION--"
# Key/value separator the parser looks for after the marker.
POSSIBLE_SIGNAL = '": '

# Per-comment part of the prompt; `{comment}` is filled in by `build_prompt`.
USER_PROMPT_TEMPLATE = "Speaker's Text:{comment}"

# Instruction part of the prompt.
# NOTE(review): the two pieces are concatenated with no separator between
# them; kept byte-identical here - confirm this matches the earlier pipeline.
SYSTEM_PROMPT = (
    'OUTPUT FORMAT:{"--IMMITATION--": YOUR_IMMITATION_TEXTS}'
    + "Your immitation should preserve the Speaker's semantic meaning and emotions sentence by sentence."
)


def build_prompt(comment: str) -> str:
    """Return the full prompt: the system instructions followed by the comment.

    The two parts are concatenated directly, with nothing inserted between them.
    """
    user_part = USER_PROMPT_TEMPLATE.format(comment=comment)
    return "".join((SYSTEM_PROMPT, user_part))


def run_local_ollama(prompt: str, model: str = MODEL_NAME, endpoint: str = OLLAMA_ENDPOINT) -> str:
    """POST ``prompt`` to the generation endpoint and return the stripped reply text.

    Args:
        prompt: Full prompt text to send.
        model: Model name placed in the request payload.
        endpoint: URL the request is posted to.

    Returns:
        The ``"response"`` field of the JSON reply with surrounding whitespace
        removed, or an empty string when that field is absent.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    reply = requests.post(
        endpoint,
        json={"model": model, "prompt": prompt, "stream": False},
        timeout=120,  # seconds
    )
    reply.raise_for_status()
    generated = reply.json().get("response", "")
    return generated.strip()


def parse_immitation_output(text: str, original: str) -> tuple[str, bool]:
    """Extract the imitation text from a raw model reply.

    Args:
        text: Raw model output, expected to contain ``IMMITATION_MARKER``.
        original: Fallback text returned when ``text`` cannot be parsed.

    Returns:
        ``(immitation_text, True)`` when the marker is present, otherwise
        ``(original, False)``. Surrounding quotes are not removed from the
        extracted text; only leading/trailing whitespace is stripped.
    """
    if not isinstance(text, str) or IMMITATION_MARKER not in text:
        return original, False

    # Keep what follows the first marker, cut at the first closing brace.
    result = text.partition(IMMITATION_MARKER)[2]
    result = result.partition("}")[0]

    # Drop the key/value separator left over from the JSON-like wrapper.
    # Only the FIRST separator is consumed, so an imitation that itself
    # contains '": ' is kept whole instead of being cut at its second one.
    _, signal, remainder = result.partition(POSSIBLE_SIGNAL)
    if signal:
        result = remainder
    return result.strip(), True


def mark_refusal(text: str) -> bool:
    """Return True when ``text`` fails to parse as an imitation reply."""
    parsed_ok = parse_immitation_output(text, original="")[1]
    return not parsed_ok



### Generation helpers


In [None]:

def prepare_frame(frame: pd.DataFrame, existing: Optional[pd.DataFrame]) -> pd.DataFrame:
    """Attach existing generation columns (if available) before re-processing.

    Args:
        frame: Conversation frame to prepare; it is copied, never modified.
        existing: Frame from an earlier run that may hold ``imm_1`` and/or
            ``imm_1_check``, or ``None`` when there is nothing to reuse.

    Returns:
        A copy of ``frame`` with the generation columns of ``existing``
        left-joined on the index. Columns that ``frame`` already has are kept
        as they are rather than joined again.
    """
    base = frame.copy()
    if existing is None:
        return base

    # Joining a column that is already present makes DataFrame.join raise
    # "columns overlap but no suffix specified", so only bring in the
    # generation columns this frame is still missing.
    carry_over = [
        col
        for col in ("imm_1", "imm_1_check")
        if col in existing.columns and col not in base.columns
    ]
    if not carry_over:
        return base
    return base.join(existing[carry_over], how="left")


def attach_generation(frame: pd.DataFrame, run_fn: Callable[[str], str], dry_run: bool = True) -> pd.DataFrame:
    """Fill ``imm_1`` / ``imm_1_check`` for every row of ``frame``.

    Args:
        frame: Frame with a ``text`` column and, optionally, ``imm_1`` and
            ``imm_1_check`` carried over from an earlier run.
        run_fn: Callable that takes a prompt and returns the raw model output.
            It is only called when ``dry_run`` is false and the row has no
            prior ``imm_1``.
        dry_run: When true, no model is called; a marker-wrapped echo of the
            first 200 characters of ``text`` is parsed instead.

    Returns:
        A copy of ``frame`` with ``imm_1`` (parsed imitation text) and
        ``imm_1_check`` (parse-success flag) set. An empty frame comes back
        with both columns present and no rows.
    """
    df = frame.copy()
    imitations: List = []
    checks: List = []

    for _, row in df.iterrows():
        prior_text = row.get("imm_1")
        prior_check = row.get("imm_1_check")
        has_prior = isinstance(prior_text, str) and bool(prior_text.strip())

        # Fully processed earlier: keep both values and skip the model call.
        if has_prior and pd.notna(prior_check):
            imitations.append(prior_text)
            checks.append(prior_check)
            continue

        if has_prior:
            # Prior output without a flag is treated as raw model output.
            raw_output = prior_text
        elif dry_run:
            raw_output = f'{{"{IMMITATION_MARKER}": "{row["text"][:200]}"}}'
        else:
            raw_output = run_fn(build_prompt(row["text"]))

        # Anything produced in this pass is always parsed, even if a stale
        # imm_1_check value is present for the row; otherwise the raw,
        # unparsed reply would be stored next to an unrelated flag.
        parsed_text, ok = parse_immitation_output(raw_output, row["text"])
        imitations.append(parsed_text)
        checks.append(ok)

    # Plain list assignment also works for a frame with zero rows.
    df["imm_1"] = imitations
    df["imm_1_check"] = checks
    return df


def generate_bundle(
    bundles: Sequence,
    run_fn: Callable[[str], str],
    dry_run: bool = True,
    existing_bundle: Optional[Sequence] = None,
    limit_conversations: Optional[int] = None,
) -> List:
    """Run generation over every conversation in ``bundles``.

    Each conversation is either a single frame or a list/tuple of frames; the
    output mirrors that layout. Frames from ``existing_bundle`` are matched to
    the new ones by position (conversation index, then frame index).

    Args:
        bundles: Conversations to process.
        run_fn: Prompt-to-raw-output callable passed on to ``attach_generation``.
        dry_run: Passed on to ``attach_generation``.
        existing_bundle: Earlier run to reuse generations from, if any.
        limit_conversations: Stop once this many conversations were processed;
            ``None`` processes all of them.

    Returns:
        List of processed conversations in the same order as ``bundles``.
    """

    def _as_frames(item):
        # Normalise "one frame" and "several frames" to a single sequence shape.
        return item if isinstance(item, (list, tuple)) else [item]

    results: List = []
    for convo_idx, convo in enumerate(bundles):
        if limit_conversations is not None and convo_idx >= limit_conversations:
            break

        prior_frames = None
        if existing_bundle is not None and convo_idx < len(existing_bundle):
            prior_frames = _as_frames(existing_bundle[convo_idx])

        regenerated: List = []
        for frame_idx, frame in enumerate(_as_frames(convo)):
            has_prior = prior_frames is not None and frame_idx < len(prior_frames)
            prior = prior_frames[frame_idx] if has_prior else None
            prepared = prepare_frame(frame, prior)
            regenerated.append(attach_generation(prepared, run_fn, dry_run=dry_run))

        # Hand back the same container shape the conversation came in with.
        if isinstance(convo, (list, tuple)):
            results.append(regenerated)
        else:
            results.append(regenerated[0])
    return results



### Smoke test on a small subset
Set `DRY_RUN = False` to hit the local model. The cell slices the first two conversations to keep early debugging cheap; `generate_bundle` also accepts `limit_conversations` for the same purpose.


In [None]:

# Reuse the earlier run for the same two conversations when one was loaded.
if existing_bundle is None:
    subset_existing = None
else:
    subset_existing = existing_bundle[:2]

# Only the first two conversations are processed here.
subset_generated = generate_bundle(
    raw_bundle[:2],
    run_fn=run_local_ollama,
    dry_run=DRY_RUN,
    existing_bundle=subset_existing,
)
flattened_subset = flatten_conversation_bundles(subset_generated)
flattened_subset.head()



### Full run (long; generates `OUTPUT_PATH`)
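Uncomment the following cell to generate all conversations and persist the bundle. Set `DRY_RUN = False` first; otherwise the saved bundle contains only the dry-run placeholder text (an echo of each comment) rather than model output.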


In [None]:

# full_generated = generate_bundle(
#     raw_bundle,
#     run_fn=run_local_ollama,
#     dry_run=DRY_RUN,
#     existing_bundle=existing_bundle,
# )
# with OUTPUT_PATH.open("wb") as fp:
#     pickle.dump(full_generated, fp)
# OUTPUT_PATH



### Sanity checks on saved output


In [None]:

# Nothing to check until the full generation cell has written its output.
if not OUTPUT_PATH.exists():
    print("No regenerated bundle found; run the full generation cell above first.")
else:
    regen_bundle = load_df_list_pickle(OUTPUT_PATH)
    regen_flat = flatten_conversation_bundles(regen_bundle)
    print("rows", len(regen_flat))
    # Success/failure split of the parse flag, missing values included.
    print(regen_flat["imm_1_check"].value_counts(dropna=False).head())
    # Fixed seed so the same five rows are shown on every rerun.
    display(regen_flat.sample(5, random_state=0)[["text", "imm_1", "imm_1_check"]])
