In [5]:
from __future__ import annotations

import json

from pathlib import Path
from collections import defaultdict
import numpy as np
import re
from graph.graph import AssociatedGraph
from utils.preprocessing import create_graphs
from itertools import combinations
import pandas as pd
from typing import Dict, Tuple, List, Literal, Optional


In [6]:
class obj:
    """Plain attribute container: every key of the given mapping becomes an attribute."""

    def __init__(self, dict1):
        # vars(self) is the instance __dict__, so this copies all entries in one go.
        vars(self).update(dict1)
 
def dict2obj(dict1):
    """Convert a (possibly nested) dict into nested `obj` instances.

    The dict is serialised to JSON and parsed back with `obj` as the
    object hook, so every JSON object in the payload becomes an `obj`.
    """
    serialized = json.dumps(dict1)
    return json.loads(serialized, object_hook=obj)

# Argparse-style settings exposed as attributes (args.check_rsa, ...).
args = dict2obj(dict(
    folder_path="Analysis/selected_strs_renumber/without_TCR",
    manifest="manifest_analysis.json",
    serd_config=None,
    files_name=None,
    output_path=None,
    run_name=None,
    check_depth=False,
    check_rsa=True,
    centroid_threshold=8.5,
    distance_diff_threshold=3,
    depth_filter=None,
    depth_bins=3,
    rsa_filter=0.1,
    rsa_bins=3,
    distance_bins=3,
    classes_path=None,
    exclude_waters=False,
))

# Which per-residue checks are switched on.
checks = dict(depth=args.check_depth, rsa=args.check_rsa)

# Association settings assembled from `args`.
association_config = dict(
    centroid_threshold=args.centroid_threshold,
    distance_diff_threshold=args.distance_diff_threshold,
    rsa_filter=args.rsa_filter,
    depth_filter=args.depth_filter,
    rsa_bins=args.rsa_bins,
    depth_bins=args.depth_bins,
    distance_bins=args.distance_bins,
    checks=checks,
    classes_path=args.classes_path,
)



In [7]:
# Base manifest shared by every run. _build_associated_graph (defined later in
# this notebook) fills in "inputs", settings["run_name"] and
# settings["output_path"] for each run and reads the remaining settings to
# build the association config.
manifest = {
  "settings": {
    # Both are overwritten per run by _build_associated_graph.
    "run_name": None,
    "output_path": None,
    "debug": True,
    "track_steps": True,
    "centroid_threshold": 8.5,
    "centroid_granularity": "ca_only",
    "exclude_waters": False,
    "check_rsa": True,
    "check_depth": False,
    "rsa_filter": 0.1,
    "depth_filter": 10.0,
    "distance_diff_threshold": 3,

    "distance_bins": 3,
    "rsa_bins": 3,
    "depth_bins": 3,

    "serd_config": None,

    # NOTE(review): this key is "classes_" (trailing underscore), while
    # _build_associated_graph looks up settings.get("classes", {}), so this
    # grouping is not picked up there. Confirm whether that is intentional.
    "classes_": {
      "residues": {
        "HID": ["ALA", "VAL", "LEU", "ILE", "MET"],
        "POL": ["SER", "THR", "ASN", "GLN", "CYS"],
        "POS": ["LYS", "ARG", "HIS"],
        "NEG": ["ASP", "GLU"],
        "ARO": ["PHE", "TYR", "TRP"],
        "ESP": ["GLY", "PRO"]
      }
    }
  },

  # Replaced on every run by _build_associated_graph.
  "inputs": [],
  # Named constraint sets; _build_associated_graph selects "MHC1" by name.
  "constrains": {
    "MHC1": {
      "chains": ["C"],
      "residues": {
        "A": [18, 19, 42, 43, 44, 54, 55, 56, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 79, 80, 81, 83, 84, 85, 89, 108, 109, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171]
      }
    },
    "MHC2": {
      "chains": ["C"],
      "residues": {
        "A": [39, 53, 54, 55, 57, 58, 60, 61, 62, 64, 65, 67, 68, 69, 71]
      }
    }
  }
}


In [8]:
# Load the three processed cross-reactivity tables, then the raw table.
diffMHC_diffPep, diffMHC_SamePep, sameMHC_diffPep = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Analysis/crossreact_processed_diff_MHC_diff_pep_helder.csv",
        "Analysis/crossreact_processed_diff_MHC_same_pep_helder.csv",
        "Analysis/crossreact_processed_same_MHC_diff_pep_helder.csv",
    )
)
rawCross = pd.read_csv("Analysis/crossreact_tcrs_v4.csv")

In [9]:
# Display the raw cross-reactivity table loaded from crossreact_tcrs_v4.csv.
rawCross

Unnamed: 0,TCR_ID,TRA,TRB,peptide,MHCseq,MHCseq_ref,allele,allele_blast,mismatches,Score,...,TRA_ref,TRB_ref,TCR_pair_id,MHC_allele_id,MHC_allele_id_interface,peptide_id,pMHC_id,pep_crossreact,mhc_crossreact,pmhc_crossreact
0,PDB4ms8,AQSVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGLQM...,EAAVTQSPRNKVTVTGGNVTLSCRQTNSHNYMYWYRQDTGHGLRLI...,SPAEAGFFL,MGPHSMRYYETATSRRGLGEPRYTSVGYVDDKEFVRFDSDAENPRY...,GPHSMRYYETATSRRGLGEPRYTSVGYVDDKEFVRFDSDAENPRYE...,,H-2Ld,"F9Y,V13T,P16R,I24T,N31D,A50V,K132R",3,...,AQSVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGLQM...,EAAVTQSPRNKVTVTGGNVTLSCRQTNSHNYMYWYRQDTGHGLRLI...,0,0,0,3,3,True,False,True
1,PDB3tjh,AQSVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGLQM...,EAAVTQSPRNKVTVTGGNVTLSCRQTNSHNYMYWYRQDTGHGLRLI...,SPLDSLWWI,MGPHSMRYYETATSRRGLGEPRYTSVGYVDDKEFVRFDSDAENPRY...,GPHSMRYYETATSRRGLGEPRYTSVGYVDDKEFVRFDSDAENPRYE...,,H-2Ld,"F9Y,V13T,P16R,I24T,N31D,A50V,I67V,W98R,K132R",3,...,AQSVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGLQM...,EAAVTQSPRNKVTVTGGNVTLSCRQTNSHNYMYWYRQDTGHGLRLI...,0,5,1,11,11,True,False,True
2,PDB4mxq,AQSVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGLQM...,EAAVTQSPRNKVTVTGGNVTLSCRQTNSHNYMYWYRQDTGHGLRLI...,SPAPRPLDL,MGPHSMRYYETATSRRGLGEPRYTSVGYVDDKEFVRFDSDAENPRY...,GPHSMRYYETATSRRGLGEPRYTSVGYVDDKEFVRFDSDAENPRYE...,,H-2Ld,"F9Y,V13T,P16R,I24T,N31D,A50V,K132R",3,...,AQSVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGLQM...,EAAVTQSPRNKVTVTGGNVTLSCRQTNSHNYMYWYRQDTGHGLRLI...,0,0,0,18,19,True,False,True
3,PDB3tfk,AQSVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGLQM...,EAAVTQSPRNKVTVTGGNVTLSCRQTNSHNYMYWYRQDTGHGLRLI...,QLSDVPMDL,MGPHSMRYYETATSRRGLGEPRYTSVGYVDDKEFVRFDSDAENPRY...,GPHSMRYYETATSRRGLGEPRYTSVGYVDDKEFVRFDSDAENPRYE...,,H-2Ld,"F9Y,V13T,P16R,I24T,N31D,A50V,I67V,W98R,K132R",3,...,AQSVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGLQM...,EAAVTQSPRNKVTVTGGNVTLSCRQTNSHNYMYWYRQDTGHGLRLI...,0,5,1,89,99,True,False,True
4,PDB4n0c,AQSVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGLQM...,EAAVTQSPRNKVTVTGGNVTLSCRQTNSHNYMYWYRQDTGHGLRLI...,MPAGRPWDL,MGPHSMRYYETATSRRGLGEPRYTSVGYVDDKEFVRFDSDAENPRY...,GPHSMRYYETATSRRGLGEPRYTSVGYVDDKEFVRFDSDAENPRYE...,,H-2Ld,"F9Y,V13T,P16R,I24T,N31D,A50V,K132R",3,...,AQSVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGLQM...,EAAVTQSPRNKVTVTGGNVTLSCRQTNSHNYMYWYRQDTGHGLRLI...,0,0,0,57,61,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,PDB7dzm,DAKTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQY...,AGVIQSPRHEVTEMGQEVTLRCKPISGHNSLFWYRQTMMRGLELLI...,TPQDLNTML,GGSHSMRYFYTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRE...,GGSHSMRYFYTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRE...,,B*81:02,A1G,3,...,DAKTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQY...,AGVIQSPRHEVTEMGQEVTLRCKPISGHNSLFWYRQTMMRGLELLI...,34,25,24,74,81,False,True,False
111,PDB7dzn,DAKTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQY...,DAGVIQSPRHEVTEMGQEVTLRCKPISGHNSLFWYRQTMMRGLELL...,TPQDLNTML,MGGSHSMRYFYTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPR...,GGSHSMRYFYTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPRE...,,B*42:30,A2G,3,...,DAKTTQPPSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQY...,AGVIQSPRHEVTEMGQEVTLRCKPISGHNSLFWYRQTMMRGLELLI...,34,27,25,74,88,False,True,False
112,PDB8eo8,GENVEQHPSTLSVQEGDSAVIKCTYSDSASNYFPWYKQELGKRPQL...,AVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLM...,LPFDKATIM,GSHSMRYFYTAMSRPGRGEPRFIAVGYVDDTQFVRFDSDAASPRTE...,MRVTAPRTVLLLLWGAVALTETWAGSHSMRYFYTAMSRPGRGEPRF...,B*35:01:01:01,,,3,...,GENVEQHPSTLSVQEGDSAVIKCTYSDSASNYFPWYKQELGKRPQL...,VVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMA...,35,13,9,87,97,True,False,False
113,PDB8enh,GENVEQHPSTLSVQEGDSAVIKCTYSDSASNYFPWYKQELGKRPQL...,AVVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLM...,LPFEKSTIM,GSHSMRYFYTAMSRPGRGEPRFIAVGYVDDTQFVRFDSDAASPRTE...,MRVTAPRTVLLLLWGAVALTETWAGSHSMRYFYTAMSRPGRGEPRF...,B*35:01:01:01,,,3,...,GENVEQHPSTLSVQEGDSAVIKCTYSDSASNYFPWYKQELGKRPQL...,VVSQHPSRVICKSGTSVKIECRSLDFQATTMFWYRQFPKQSLMLMA...,35,13,9,67,74,True,False,False


In [10]:
# Keep only the identifier columns; the run_* functions group on TCR_pair_id
# and read PDB_ID.
crossDf = rawCross.loc[
    :,
    ["TCR_ID", "TCR_pair_id", "MHC_allele_id", "peptide_id", "pMHC_id", "PDB_ID"],
]
crossDf

Unnamed: 0,TCR_ID,TCR_pair_id,MHC_allele_id,peptide_id,pMHC_id,PDB_ID
0,PDB4ms8,0,0,3,3,4MS8
1,PDB3tjh,0,5,11,11,3TJH
2,PDB4mxq,0,0,18,19,4MXQ
3,PDB3tfk,0,5,89,99,3TFK
4,PDB4n0c,0,0,57,61,4N0C
...,...,...,...,...,...,...
110,PDB7dzm,34,25,74,81,7DZM
111,PDB7dzn,34,27,74,88,7DZN
112,PDB8eo8,35,13,87,97,8EO8
113,PDB8enh,35,13,67,74,8ENH


In [11]:
def get_protein_keys(original_graphs: dict):
    """Return the keys of `original_graphs` in protein order.

    When every key is a string of digits the keys are sorted numerically and
    re-emitted as strings; otherwise they are returned in insertion order.
    """
    keys = [*original_graphs]
    for candidate in keys:
        if not (isinstance(candidate, str) and candidate.isdigit()):
            # At least one non-numeric key: keep the mapping's own order.
            return keys
    numeric = sorted(int(k) for k in keys)
    return [str(value) for value in numeric]

# ---------- projection helpers ----------
def project_nodes_unique(frame_nodes, p):
    """Return the distinct entries found at position `p` of each associated node."""
    return {node[p] for node in frame_nodes}

def project_nodes_instances(frame_nodes, p):
    """Return the entry at position `p` of every associated node, duplicates kept, in order."""
    projected = []
    for node in frame_nodes:
        projected.append(node[p])
    return projected

def chain_signature(node_tuple):
    """Build the chain signature of an associated node, e.g. ('A:ARG:23','A:ARG:34') -> 'AA'.

    Each label contributes the text before its first ':' (or the whole label
    when it has no ':'). Non-string labels are converted with str() first.
    """
    return "".join(
        (label if isinstance(label, str) else str(label)).partition(":")[0]
        for label in node_tuple
    )

def unique_chain_signatures(frame_nodes):
    """Return the sorted list of distinct chain signatures found in a frame's nodes."""
    seen = set()
    for node in frame_nodes:
        seen.add(chain_signature(node))
    return sorted(seen)

def chain_combo_key(node_tuple) -> str:
    """Concatenate the chain prefix of every label, e.g. ('A:ARG:23','A:ARG:34') -> 'AA'."""
    prefixes = []
    for part in node_tuple:
        # Everything before the first ':' of the label's string form.
        prefixes.append(str(part).split(":", 1)[0])
    return "".join(prefixes)

# ---------- per-protein node metrics ----------
def node_similarity_for_protein(frame, original_graphs, protein_keys, p):
    """Compute node coverage and duplication metrics for protein `p` in one frame.

    Parameters
    ----------
    frame : dict
        Frame record; its "nodes" entry lists the associated nodes.
    original_graphs : dict
        Maps protein key -> record with "nodes" and optionally "name".
    protein_keys : list
        Protein keys; position `p` selects the protein.
    p : int
        Position of the protein inside each associated node.

    Returns
    -------
    dict or None
        One row of metrics, or None when the frame has no nodes.
    """
    assoc_nodes = frame.get("nodes", [])
    if not assoc_nodes:
        return None

    key = protein_keys[p]
    graph_entry = original_graphs[key]
    display_name = graph_entry.get("name", key)

    original_count = len(set(graph_entry["nodes"]))

    projected = project_nodes_instances(assoc_nodes, p)
    instance_count = len(projected)
    distinct_count = len(set(projected))

    coverage = distinct_count / original_count if original_count else 0.0
    dup_ratio = distinct_count / instance_count if instance_count else 1.0
    if distinct_count:
        multiplicity = instance_count / distinct_count
    else:
        multiplicity = float('inf')

    # chain signature of the whole associated node -> distinct residues of protein p
    residues_by_chain = {}
    for node in assoc_nodes:
        chain_key = ''.join(str(part).split(':', 1)[0] for part in node)
        residues_by_chain.setdefault(chain_key, set()).add(node[p])
    per_chain_counts = {chain: len(members) for chain, members in residues_by_chain.items()}

    return {
        "protein_index": p,
        "protein_key": key,
        "protein_name": display_name,
        "total_nodes_associated": len(assoc_nodes),
        "total_nodes_original": original_count,
        "frame_nodes_instances": instance_count,
        "frame_nodes_unique": distinct_count,
        "node_coverage": coverage,
        "duplication_ratio": dup_ratio,
        "duplication_rate": 1.0 - dup_ratio,
        "avg_multiplicity": multiplicity,
        # JSON text so the per-chain counts survive a CSV round trip.
        "unique_nodes_per_chain": json.dumps(per_chain_counts, ensure_ascii=False),
    }

def wmean(x, w):
    """Weighted mean of `x` with weights `w`; NaN unless the weights sum to a positive value."""
    values = np.asarray(x, float)
    weights = np.asarray(w, float)
    total = weights.sum()
    if total > 0:
        return float(np.sum(values * weights) / total)
    return np.nan

def wstd(x, w):
    """Weighted standard deviation around the weighted mean; NaN unless weights sum to > 0."""
    values = np.asarray(x, float)
    weights = np.asarray(w, float)
    center = wmean(values, weights)
    total = weights.sum()
    if not total > 0:
        return np.nan
    variance = np.sum(weights * (values - center) ** 2) / total
    return float(np.sqrt(variance))

def wmedian(x, w):
    """Weighted median: the smallest value whose cumulative weight share reaches 0.5.

    Returns NaN when the weights sum to zero.
    """
    values = np.asarray(x, float)
    weights = np.asarray(w, float)
    if weights.sum() == 0:
        return np.nan
    rank = np.argsort(values)
    sorted_values = values[rank]
    sorted_weights = weights[rank]
    cumulative_share = np.cumsum(sorted_weights) / sorted_weights.sum()
    position = np.searchsorted(cumulative_share, 0.5)
    return float(sorted_values[position])

def wtrimmed_mean(x, w, trim=0.10):
    """Symmetric weighted trimmed mean.

    After sorting by value, each observation occupies a slice of the
    cumulative weight axis [0, 1]. A share `trim` is cut from each end and
    every observation is weighted by the part of its slice that falls inside
    [trim, 1 - trim], so an observation straddling a cut point is kept in part.

    The previous mask-based version compared only the right edge of each slice
    with the cut points. That always discarded the largest observation and
    almost always kept the smallest, so with two observations it returned
    their minimum.

    Parameters
    ----------
    x : array-like
        Values.
    w : array-like
        Non-negative weights, same length as `x`.
    trim : float, default 0.10
        Weight share removed from each tail.

    Returns
    -------
    float
        The trimmed mean. NaN when the weights do not sum to a positive value.
        If trimming would remove all weight (e.g. trim >= 0.5), the plain
        weighted mean is returned instead.
    """
    x = np.asarray(x, float)
    w = np.asarray(w, float)
    total = w.sum()
    if not total > 0:
        return np.nan

    order = np.argsort(x)
    x = x[order]
    w = w[order]

    upper = np.cumsum(w) / total      # right edge of each observation's slice
    lower = upper - w / total         # left edge of each observation's slice
    kept = np.clip(np.minimum(upper, 1.0 - trim) - np.maximum(lower, trim), 0.0, None)

    mass = kept.sum()
    if not mass > 0:
        # Nothing survives the trim: fall back to the untrimmed weighted mean.
        kept, mass = w, total
    return float(np.sum(x * kept) / mass)

def ivw_mean_proportions(cov, n):
    """Inverse-variance-weighted mean of smoothed proportions.

    Each proportion `cov` observed on `n` items is smoothed to
    (cov*n + 0.5) / (n + 1). Its variance is taken as p(1-p)/(n+1) plus a tiny
    constant that keeps the weight finite, and the result is the mean of the
    smoothed proportions weighted by 1/variance.
    """
    coverage = np.asarray(cov, float)
    sizes = np.asarray(n, float)
    smoothed = ((coverage * sizes) + 0.5) / (sizes + 1.0)
    variance = smoothed * (1.0 - smoothed) / (sizes + 1.0) + 1e-12
    return wmean(smoothed, 1.0 / variance)

In [9]:
# ---------- frame-level summary (nodes only) ----------
def summarize_frame_nodes(df_fp_nodes_for_frame):
    """Summarise per-protein node coverage for one frame.

    Coverage values are weighted by each protein's original node count.
    Returns an empty dict for an empty input frame.
    """
    if df_fp_nodes_for_frame.empty:
        return {}

    coverage = df_fp_nodes_for_frame["node_coverage"].values
    graph_sizes = df_fp_nodes_for_frame["total_nodes_original"].values
    weights = graph_sizes
    # Falls back to a single-NaN series when the column is missing.
    dup_rates = df_fp_nodes_for_frame.get("duplication_rate", pd.Series([np.nan]))
    p10, p50, p90 = (float(np.percentile(coverage, q)) for q in (10, 50, 90))

    summary = {}
    summary["node_cov_wmean"] = wmean(coverage, weights)
    summary["node_cov_wmedian"] = wmedian(coverage, weights)
    summary["node_cov_wtrimmed"] = wtrimmed_mean(coverage, weights, trim=0.10)
    summary["node_cov_ivw_meta"] = ivw_mean_proportions(coverage, graph_sizes)
    summary["node_cov_wstd"] = wstd(coverage, weights)
    summary["node_cov_p10"] = p10
    summary["node_cov_p50"] = p50
    summary["node_cov_p90"] = p90
    summary["n_proteins"] = int(len(coverage))
    summary["mean_dup_rate"] = float(dup_rates.mean())
    summary["mean_graph_size"] = float(np.mean(graph_sizes))
    summary["sum_graph_size"] = int(np.sum(graph_sizes))
    return summary



# ---------- evaluate a single frame ----------
def evaluate_frame_nodes(component_id, frame_id, data):
    """Evaluate node metrics for every protein of a single frame.

    Parameters
    ----------
    component_id, frame_id
        Identifiers; their string forms index `data[...]["frames"][...]`.
    data : dict
        Parsed graph JSON holding "original_graphs" and one entry per component.

    Returns
    -------
    (df, summary)
        `df` has one row per protein. `summary` is a dict of frame-level
        statistics, or {} when no rows were produced.
    """
    frame = data[str(component_id)]["frames"][str(frame_id)]

    original_graphs = data["original_graphs"]
    protein_keys = get_protein_keys(original_graphs)

    assoc_nodes = frame.get("nodes", [])
    signatures = unique_chain_signatures(assoc_nodes)

    # The width of the first associated node gives the number of proteins.
    n_proteins = len(assoc_nodes[0]) if assoc_nodes else 0
    per_protein = (
        node_similarity_for_protein(frame, original_graphs, protein_keys, idx)
        for idx in range(n_proteins)
    )
    rows = [row for row in per_protein if row is not None]

    df = pd.DataFrame(rows)
    if df.empty:
        return df, {}

    coverage = df["node_coverage"]
    summary = {
        "component": component_id,
        "frame": frame_id,
        "n_proteins": len(df),
        "node_cov_mean": float(coverage.mean()),
        "node_cov_median": float(coverage.median()),
        "node_cov_min": float(coverage.min()),
        "node_cov_std": float(coverage.std(ddof=0) if len(df) > 1 else 0.0),
        "dup_ratio_mean": float(df["duplication_ratio"].mean()),
        "unique_chain_signatures": signatures,           # list like ['AA','AC',...]
        "n_unique_chain_signatures": len(signatures),    # integer count
    }
    return df, summary

# ---------- evaluate ALL frames (nodes only) ----------
def evaluate_all_frames_nodes(json_path):
    """Evaluate node metrics for every frame of every component in a graph JSON.

    Parameters
    ----------
    json_path : str or pathlib.Path
        File holding "original_graphs" plus one entry per component, each with
        a "frames" mapping.

    Returns
    -------
    (df_fp_nodes, df_frames_nodes)
        Per-protein rows for all frames, and one summary row per frame sorted
        from best to worst coverage. Both are empty DataFrames when no frame
        produced any row.
    """
    def _numeric_order(keys):
        # Sort ids numerically when they all parse as integers; otherwise keep
        # the order they had in the file.
        try:
            return sorted(keys, key=int)
        except (TypeError, ValueError):
            return list(keys)

    data = json.loads(Path(json_path).read_text())
    component_ids = _numeric_order([k for k in data.keys() if k != "original_graphs"])

    all_fp, summaries = [], []
    for comp_id in component_ids:
        for frm_id in _numeric_order(data[comp_id]["frames"].keys()):
            df_fp, summ = evaluate_frame_nodes(comp_id, frm_id, data)
            if not df_fp.empty:
                df_fp.insert(0, "component_id", comp_id)
                df_fp.insert(1, "frame_id", frm_id)
                all_fp.append(df_fp)
            if summ:
                summaries.append(summ)

    df_fp_nodes = pd.concat(all_fp, ignore_index=True) if all_fp else pd.DataFrame()

    df_frames_nodes = pd.DataFrame(summaries)
    # With no summaries the frame has no columns, and sort_values on the
    # coverage columns would raise KeyError.
    if not df_frames_nodes.empty:
        df_frames_nodes = df_frames_nodes.sort_values(
            ["node_cov_mean", "node_cov_median", "node_cov_min", "node_cov_std"],
            ascending=[False, False, False, True]
        )
    return df_fp_nodes, df_frames_nodes

def evaluate_all_frames_nodes_weighted(json_path):
    """Evaluate all frames and build the weighted per-frame summary table.

    Returns
    -------
    (df_fp_nodes, df_frames_nodes_w)
        Per-protein rows, and one weighted summary row per
        (component_id, frame_id) sorted from best to worst. The second frame
        is empty when there are no per-protein rows.
    """
    per_protein, _unweighted = evaluate_all_frames_nodes(json_path)
    if per_protein.empty:
        return per_protein, pd.DataFrame()

    id_columns = ["component_id", "frame_id"]
    records = []
    for (comp_id, frame_id), frame_rows in per_protein.groupby(id_columns, dropna=False):
        record = summarize_frame_nodes(frame_rows)
        record["component_id"] = comp_id
        record["frame_id"] = frame_id
        records.append(record)

    weighted = pd.DataFrame(records)
    # Identifier columns first, metrics after.
    metric_columns = [c for c in weighted.columns if c not in ("component_id", "frame_id")]
    weighted = weighted[id_columns + metric_columns]

    ranked = weighted.sort_values(
        ["node_cov_wmean","node_cov_wmedian","node_cov_p10","node_cov_wstd"],
        ascending=[False, False, False, True]
    )
    return per_protein, ranked

# --- Helpers ---------------------------------------------------------------


def _make_json_from_associated_graph(G, out_json: Path) -> None:
    """
    Serialize an AssociatedGraph into the expected JSON format.

    Parameters
    ----------
    G : AssociatedGraph
        Instance already built.
    out_json : pathlib.Path
        Output path (will be created/overwritten).
    """
    graphs_raw = G.graph_data
    payload: Dict = {"original_graphs": {}}

    for graph_raw in graphs_raw:
        pdb_file = graph_raw["pdb_file"]
        _id = graph_raw["id"]

        m = re.search(r'noTCR_([A-Za-z0-9]{4})\.trunc', pdb_file, re.IGNORECASE)
        name = m[1] if m else f"id{_id}"

        nodes = list(graph_raw["graph"].nodes)
        edges = list(graph_raw["graph"].edges)
        neighbors = {str(n): [str(nb) for nb in graph_raw["graph"].neighbors(n)] for n in nodes}

        payload["original_graphs"][_id] = {
            "name": name,
            "nodes": nodes,
            "edges": edges,
            "neighbors": neighbors,
        }

    for j, comps in enumerate(G.associated_graphs):
        payload[j] = {"comp": j, "frames": {}}
        for i in range(len(comps[0])):
            nodes = list(comps[0][i].nodes)
            edges = list(comps[0][i].edges)
            neighbors = {str(n): [str(nb) for nb in comps[0][i].neighbors(n)] for n in nodes}
            payload[j]["frames"][i] = {"nodes": nodes, "edges": edges, "neighbors": neighbors}

    out_json.parent.mkdir(parents=True, exist_ok=True)
    with open(out_json, "w") as f:
        json.dump(payload, f, indent=4)


def _save_eval_tables(out_dir: Path, df_fp_nodes: pd.DataFrame, df_frames_nodes_w: pd.DataFrame) -> None:
    """
    Save per-run evaluation tables.

    Parameters
    ----------
    out_dir : pathlib.Path
        Destination directory.
    df_fp_nodes : pandas.DataFrame
    df_frames_nodes_w : pandas.DataFrame
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    if not df_fp_nodes.empty:
        df_fp_nodes.to_csv(out_dir / "nodes_per_protein.csv", index=False)
    if not df_frames_nodes_w.empty:
        cols = list(df_frames_nodes_w.columns)
        for lead in ["component_id", "frame_id"]:
            if lead in cols:
                cols = [lead] + [c for c in cols if c != lead]
        df_frames_nodes_w[cols].to_csv(out_dir / "nodes_summary_weighted.csv", index=False)


def _build_associated_graph(files_name: str, run_name: str, out_dir: Path, manifest: Dict):
    """
    Build graphs + AssociatedGraph for one run and persist its JSON.

    The caller's `manifest` is not modified: the run-specific "inputs",
    settings["run_name"] and settings["output_path"] are set on a per-run copy,
    so values from one run do not leak into the shared manifest.

    Parameters
    ----------
    files_name : str
        Comma-separated file list, stored as the "path" of the single input entry.
    run_name : str
        Run identifier.
    out_dir : pathlib.Path
        Output directory.
    manifest : dict
        Base manifest with a "settings" mapping. Passed (as a per-run copy) to
        `create_graphs`, and its settings feed the association config.

    Returns
    -------
    (G, json_path) : tuple
        AssociatedGraph instance and JSON path written.
    """
    # Copy the top level and the settings mapping; deeper values are only read here.
    run_manifest = dict(manifest)
    run_manifest["settings"] = dict(manifest["settings"])
    run_manifest["inputs"] = [
        {
        "path": files_name,
        "enable_tui": False,
        "extensions": [".pdb", ".pdb.gz", ".cif"],
        "constrains": [
            { "name": "MHC1"}
        ]
        }
    ]
    run_manifest["settings"]["run_name"] = run_name
    run_manifest["settings"]["output_path"] = str(out_dir)

    graphs = create_graphs(run_manifest)
    S = run_manifest["settings"]
    checks = {
        "depth": S.get("check_depth"),
        "rsa":   S.get("check_rsa"),
    }
    association_config = {
        "centroid_threshold":          S.get("centroid_threshold"),
        "distance_diff_threshold":     S.get("distance_diff_threshold"),
        "rsa_filter":                  S.get("rsa_filter"),
        "depth_filter":                S.get("depth_filter"),
        "rsa_bins":                    S.get("rsa_bins"),
        "depth_bins":                  S.get("depth_bins"),
        "distance_bins":               S.get("distance_bins"),
        "checks":                      checks,
        "exclude_waters":              S.get("exclude_waters"),
        # NOTE(review): the manifest defined in this notebook uses the key
        # "classes_" (trailing underscore), so this lookup yields {} for it.
        # Confirm whether that is intended.
        "classes": S.get("classes", {}),
    }
    G = AssociatedGraph(
        graphs=graphs,
        output_path=S["output_path"],
        run_name=S["run_name"],
        association_config=association_config
    )
    # Optional figures
    G.draw_graph_interactive(show=False, save=True)
    G.align_all_frames()
    G.create_pdb_per_protein()

    json_path = out_dir / f"graph_{run_name}.json"
    _make_json_from_associated_graph(G, json_path)
    return G, json_path


# --- Public API ------------------------------------------------------------

def run_allxall_per_group(
    cross_df: pd.DataFrame,
    manifest: dict,
    root: str = "Analysis/CrossGraphs",
    structures_dir: str = "Analysis/selected_strs_renumber/without_TCR",
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Run the "All × All" flow once per TCR_pair_id and aggregate outputs.

    For each group, all of its structures are associated in a single run
    written to `<root>/<pair_id>/All`. The per-group tables are saved once
    through `_save_eval_tables`; earlier they were also written inline and
    then overwritten by that call.

    Parameters
    ----------
    cross_df : pandas.DataFrame
        Must contain columns 'TCR_pair_id' and 'PDB_ID'.
    manifest : dict
        Base manifest forwarded to `_build_associated_graph`.
    root : str, default "Analysis/CrossGraphs"
        Root output directory.
    structures_dir : str, default "Analysis/selected_strs_renumber/without_TCR"
        Folder holding the `noTCR_<pdb>.trunc.fit_renum.pdb` input files.

    Returns
    -------
    (df_all_fp, df_all_frames) : tuple of DataFrame
        Global aggregates across groups (can be empty DataFrames).
    """
    root = Path(root)
    all_fp, all_frames = [], []

    for pair_id, group in cross_df.groupby("TCR_pair_id"):
        group_dir = root / str(pair_id) / "All"
        pdb_ids = [str(x).strip() for x in group["PDB_ID"]]
        files = [f"{structures_dir}/noTCR_{pid.lower()}.trunc.fit_renum.pdb" for pid in pdb_ids]

        files_name = ",".join(files)
        run_name = str(pair_id)

        # Best effort: a failing group is reported and skipped so the others still run.
        try:
            _, json_path = _build_associated_graph(files_name, run_name, group_dir, manifest)
        except Exception as e:
            print(f"[SKIP] Could not build group {pair_id}: {e}")
            continue

        try:
            df_fp_nodes, df_frames_nodes_w = evaluate_all_frames_nodes_weighted(str(json_path))
            # Backfill the column when the evaluator did not provide it.
            if "frame_nodes_unique" in df_fp_nodes.columns and "total_nodes_associated" not in df_fp_nodes.columns:
                df_fp_nodes["total_nodes_associated"] = df_fp_nodes["frame_nodes_unique"]

        except Exception as e:
            print(f"[SKIP] Evaluation failed for {pair_id}: {e}")
            continue

        if not df_fp_nodes.empty:
            df_fp_nodes.insert(0, "pair_id", pair_id)
            all_fp.append(df_fp_nodes)
        if not df_frames_nodes_w.empty:
            df_frames_nodes_w.insert(0, "pair_id", pair_id)
            all_frames.append(df_frames_nodes_w)

        # Single write of the per-group tables. They include pair_id, as the
        # files left on disk by the earlier double write did.
        _save_eval_tables(group_dir, df_fp_nodes, df_frames_nodes_w)

    df_all_fp = pd.concat(all_fp, ignore_index=True) if all_fp else pd.DataFrame()
    df_all_frames = pd.concat(all_frames, ignore_index=True) if all_frames else pd.DataFrame()

    if not df_all_fp.empty:
        df_all_fp.to_csv(root / "ALL_nodes_per_protein.csv", index=False)
    if not df_all_frames.empty:
        lead = ["pair_id", "component_id", "frame_id"]
        cols = lead + [c for c in df_all_frames.columns if c not in lead]
        df_all_frames = df_all_frames[cols]
        df_all_frames.to_csv(root / "ALL_nodes_summary_weighted.csv", index=False)

    print("All×All completed.")
    return df_all_fp, df_all_frames


def run_pairwise_per_group(
    cross_df: pd.DataFrame,
    manifest: dict,
    root: str = "Analysis/CrossGraphs",
    score_column: str = "node_cov_wmean",
    structures_dir: str = "Analysis/selected_strs_renumber/without_TCR",
) -> Dict[str, Dict[str, pd.DataFrame]]:
    """
    Run the Pairwise flow (within each TCR_pair_id) and build similarity matrices.
    For each pair, also save the standard per-run evaluation tables.

    Changes from the earlier version:
    - input files are passed with their folder, as in the All × All flow
      (bare file names were passed before);
    - the diagonal is 1.0 for every structure, including groups with a single
      structure and structures whose pairs were all skipped;
    - repeated PDB ids in a group are collapsed, keeping the first occurrence.

    Parameters
    ----------
    cross_df : pandas.DataFrame
        Must contain columns 'TCR_pair_id' and 'PDB_ID'.
    manifest : dict
        Base manifest forwarded to `_build_associated_graph`.
    root : str, default "Analysis/CrossGraphs"
        Root output directory.
    score_column : str, default "node_cov_wmean"
        Frame-level column used to score a pair.
    structures_dir : str, default "Analysis/selected_strs_renumber/without_TCR"
        Folder holding the `noTCR_<pdb>.trunc.fit_renum.pdb` input files.

    Returns
    -------
    dict
        Mapping pair_id -> {"max": DataFrame, "mean": DataFrame}
    """
    out: Dict[str, Dict[str, pd.DataFrame]] = {}

    for pair_id, group in cross_df.groupby("TCR_pair_id"):
        # dict.fromkeys drops repeated ids while keeping their first-seen order.
        refs = list(dict.fromkeys(str(x).strip() for x in group["PDB_ID"]))
        files_map = {r: f"{structures_dir}/noTCR_{r.lower()}.trunc.fit_renum.pdb" for r in refs}

        idx = refs
        M_max = pd.DataFrame(0.0, index=idx, columns=idx)
        M_mean = pd.DataFrame(0.0, index=idx, columns=idx)
        # A structure is identical to itself, whether or not any pair run succeeds.
        for r in refs:
            M_max.loc[r, r] = 1.0
            M_mean.loc[r, r] = 1.0

        group_dir = Path(root) / str(pair_id) / "Pairs"
        group_dir.mkdir(parents=True, exist_ok=True)

        for r1, r2 in combinations(refs, 2):
            files_name = ",".join([files_map[r1], files_map[r2]])
            run_name = f"{pair_id}_{r1}_{r2}"
            out_dir = group_dir / f"{r1}_{r2}"

            # Best effort: a failing pair is reported and its off-diagonal cells stay 0.0.
            try:
                _, json_path = _build_associated_graph(files_name, run_name, out_dir, manifest)
            except Exception as e:
                print(f"[SKIP] Could not build pair ({r1},{r2}): {e}")
                continue

            try:
                df_fp_nodes, df_frames_nodes_w = evaluate_all_frames_nodes_weighted(str(json_path))
                # Backfill the column when the evaluator did not provide it.
                if "frame_nodes_unique" in df_fp_nodes.columns and "total_nodes_associated" not in df_fp_nodes.columns:
                    df_fp_nodes["total_nodes_associated"] = df_fp_nodes["frame_nodes_unique"]

            except Exception as e:
                print(f"[SKIP] Evaluation failed for pair ({r1},{r2}): {e}")
                continue

            # Save standard per-run outputs for the pair
            _save_eval_tables(out_dir, df_fp_nodes, df_frames_nodes_w)

            # Pair score from frames
            if not df_frames_nodes_w.empty and score_column in df_frames_nodes_w.columns:
                max_score = float(df_frames_nodes_w[score_column].max())
                mean_score = float(df_frames_nodes_w[score_column].mean())
            else:
                max_score = 0.0
                mean_score = 0.0

            M_max.loc[r1, r2] = M_max.loc[r2, r1] = max_score
            M_mean.loc[r1, r2] = M_mean.loc[r2, r1] = mean_score

        # Save matrices for the group
        M_max.to_csv(group_dir / f"matrix_{score_column}_MAX.csv")
        M_mean.to_csv(group_dir / f"matrix_{score_column}_MEAN.csv")
        print(f"Pairwise matrices saved in {group_dir}")

        out[str(pair_id)] = {"max": M_max, "mean": M_mean}

    print("Pairwise completed.")
    return out


def run_cross_analysis(
    mode: Literal["all", "pairwise", "both"],
    cross_df: pd.DataFrame,
    manifest: dict,
    root: str = "Analysis/CrossGraphs",
    score_column: str = "node_cov_wmean"
):
    """
    Unified entry point for running All×All, Pairwise, or both.

    Parameters
    ----------
    mode : {"all", "pairwise", "both"}
        Which analysis to run.
    cross_df : pandas.DataFrame
        Must contain columns 'TCR_pair_id' and 'PDB_ID'.
    manifest : dict
        Base manifest forwarded to the per-group runners.
    root : str, default "Analysis/CrossGraphs"
        Root output directory.
    score_column : str, default "node_cov_wmean"
        Frame-level column used to score a pair (Pairwise mode).

    Returns
    -------
    dict
        Results container. Keys present depend on the selected mode:
        - "all": {"df_all_fp": DataFrame, "df_all_frames": DataFrame}
        - "pairwise": {"matrices": dict(pair_id -> {"max": df, "mean": df})}
        - "both": union of the above.

    Raises
    ------
    ValueError
        If `mode` is not one of "all", "pairwise", "both". Such a value used
        to return an empty dict without running anything.
    """
    valid_modes = ("all", "pairwise", "both")
    if mode not in valid_modes:
        raise ValueError(f"mode must be one of {valid_modes}, got {mode!r}")

    results: Dict[str, object] = {}

    if mode in ("all", "both"):
        df_all_fp, df_all_frames = run_allxall_per_group(cross_df, manifest, root=root)
        results["all"] = {"df_all_fp": df_all_fp, "df_all_frames": df_all_frames}

    if mode in ("pairwise", "both"):
        matrices = run_pairwise_per_group(cross_df, manifest, root=root, score_column=score_column)
        results["pairwise"] = {"matrices": matrices}

    return results


In [10]:
# Run both flows (All×All, then Pairwise) on the identifier table; outputs go
# under the given root folder.
res_both = run_cross_analysis(
    "both",
    crossDf,
    manifest,
    root="Analysis/CrossGraphs_10_CA_NewMethod",
)

  centroids = df.groupby("node_id", sort=False, group_keys=False).apply(_centroid_for_group)


Subgraph exposed_residues created with success!


  centroids = df.groupby("node_id", sort=False, group_keys=False).apply(_centroid_for_group)


Subgraph exposed_residues created with success!


  centroids = df.groupby("node_id", sort=False, group_keys=False).apply(_centroid_for_group)


Subgraph exposed_residues created with success!


  centroids = df.groupby("node_id", sort=False, group_keys=False).apply(_centroid_for_group)


Subgraph exposed_residues created with success!


  centroids = df.groupby("node_id", sort=False, group_keys=False).apply(_centroid_for_group)


Subgraph exposed_residues created with success!


  centroids = df.groupby("node_id", sort=False, group_keys=False).apply(_centroid_for_group)


Subgraph exposed_residues created with success!


  centroids = df.groupby("node_id", sort=False, group_keys=False).apply(_centroid_for_group)


Subgraph exposed_residues created with success!


  centroids = df.groupby("node_id", sort=False, group_keys=False).apply(_centroid_for_group)


Subgraph exposed_residues created with success!


KeyboardInterrupt: 

In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# Residue tracker for CrossSteps/<run_id> artifacts (read‑only)
# ──────────────────────────────────────────────────────────────────────────────

from __future__ import annotations
import re, json, pickle
from pathlib import Path
from typing import Any, Dict, List, Tuple, Iterable, Optional

import numpy as np
import pandas as pd
import networkx as nx

# ──────────────────────────────────────────────────────────────────────────────
# Small loaders & format helpers
# ──────────────────────────────────────────────────────────────────────────────

def _load_pickle(p: Path):
    with open(p, "rb") as f:
        return pickle.load(f)

def _load_npy(p: Path) -> np.ndarray:
    return np.load(p)

def _sorted_run_files(run_dir: Path) -> List[Path]:
    def key(p: Path):
        m = re.match(r"^(\d{3})_", p.name)
        return int(m.group(1)) if m else 999999
    return sorted([p for p in run_dir.iterdir() if p.is_file() and not p.name.startswith("_")], key=key)

def _parse_res_string(s: str) -> Tuple[str, str, str]:
    # "C:ASP:4" -> (chain, resname, resnum_str)
    ch, resname, resnum = s.split(":")
    return ch, resname, str(resnum)

def _idx_to_res_string(idx: int, residue_maps_unique: Dict[int, Tuple[str, str, str]]) -> str:
    # residue_maps_unique[idx] = (chain, resnum_str, resname)
    ch, resnum_str, resname = residue_maps_unique[idx]
    return f"{ch}:{resname}:{resnum_str}"

# ──────────────────────────────────────────────────────────────────────────────
# Reader for CrossSteps/<run_id> drops
# ──────────────────────────────────────────────────────────────────────────────

class RunReader:
    """
    Minimal reader for a CrossSteps/<run_id> directory.
    Only reads; never writes.

    Files are indexed in ``by_key`` under their stem with the ``NNN_`` step
    prefix removed (``001_association_product_maps.pkl`` ->
    ``association_product_maps``). The "core blob" accessors load their file on
    first use and keep the result on the instance; the per-component accessors
    load from disk on every call.
    """

    def __init__(self, run_dir: str | Path):
        """Index the files in ``run_dir``.

        Raises:
            FileNotFoundError: ``run_dir`` does not exist.
            NotADirectoryError: ``run_dir`` exists but is not a directory.
        """
        self.base = Path(run_dir).resolve()
        if not self.base.exists():
            raise FileNotFoundError(self.base)
        if not self.base.is_dir():
            # Fail here with a clear error instead of deep inside iterdir().
            raise NotADirectoryError(self.base)
        self.files = _sorted_run_files(self.base)

        # index by "short key" (strip the NNN_ prefix and extension);
        # if two files share a short key, the later one in self.files wins
        self.by_key: Dict[str, Path] = {}
        for p in self.files:
            stem = re.sub(r"^\d{3}_", "", p.stem)
            self.by_key[stem] = p

        # lazily filled caches for the core blobs
        self._maps: Optional[Dict[str, Any]] = None
        self._inv_maps: Optional[Dict[int, Dict[Tuple[str, str, str], int]]] = None
        self._graph_collection: Optional[Dict[str, Any]] = None
        self._cross_combos: Optional[Dict[Any, Any]] = None
        self._graphs_bundle: Optional[Any] = None
        self._dm_thresh: Optional[np.ndarray] = None
        self._dm_prune: Optional[np.ndarray] = None

    # internal helpers --------------------------------------------------------

    def _require_path(self, key: str, ext: str = ".pkl", note: str = "") -> Path:
        """Return the indexed path for ``key`` or raise ``RuntimeError``.

        ``ext`` and ``note`` only shape the error message
        (``"<key><ext> not found<note>"``).
        """
        p = self.by_key.get(key)
        if p is None:
            raise RuntimeError(f"{key}{ext} not found{note}")
        return p

    def _load_required(self, key: str, ext: str = ".pkl", note: str = ""):
        """Load the artifact indexed under ``key`` (``.npy`` via NumPy, else pickle)."""
        path = self._require_path(key, ext, note)
        if ext == ".npy":
            return _load_npy(path)
        return _load_pickle(path)

    # core blobs ---------------------------------------------------------------

    def maps(self) -> Dict[str, Any]:
        """Return the cached ``association_product_maps`` pickle."""
        if self._maps is None:
            self._maps = self._load_required("association_product_maps")
        return self._maps

    def inv_maps(self) -> Dict[int, Dict[Tuple[str, str, str], int]]:
        """Return the cached ``association_product_inv_maps`` pickle."""
        if self._inv_maps is None:
            self._inv_maps = self._load_required("association_product_inv_maps")
        return self._inv_maps

    def graph_collection(self) -> Dict[str, Any]:
        """Return the cached ``association_product_graph_collection`` pickle."""
        if self._graph_collection is None:
            self._graph_collection = self._load_required("association_product_graph_collection")
        return self._graph_collection

    def cross_combos(self) -> Dict[Any, Any]:
        """Return the cached ``association_product_cross_combos`` pickle."""
        if self._cross_combos is None:
            self._cross_combos = self._load_required("association_product_cross_combos")
        return self._cross_combos

    def graphs_bundle(self):
        """Return the cached ``association_product_Graphs`` pickle."""
        if self._graphs_bundle is None:
            self._graphs_bundle = self._load_required("association_product_Graphs")
        return self._graphs_bundle

    def dm_thresholded(self) -> np.ndarray:
        """Return the cached ``association_product_dm_thresh`` array."""
        if self._dm_thresh is None:
            self._dm_thresh = self._load_required("association_product_dm_thresh", ".npy")
        return self._dm_thresh

    def dm_pruned(self) -> np.ndarray:
        """Return the cached ``association_product_dm_prune`` array."""
        if self._dm_prune is None:
            self._dm_prune = self._load_required("association_product_dm_prune", ".npy")
        return self._dm_prune

    # utilities ---------------------------------------------------------------

    def list_components(self) -> List[int]:
        """Return the component id of every ``(graphs, comp_id)`` pair in the bundle, in bundle order."""
        return [comp_id for (_graphs, comp_id) in self.graphs_bundle()]

    def components_with_nodes_indices(self) -> list[int]:
        """Return, sorted, the bundle component ids that have a ``comp_id_<id>_nodes_indices`` file."""
        return sorted(
            cid for cid in self.list_components()
            if f"comp_id_{cid}_nodes_indices" in self.by_key
        )

    def nodes_indices(self, comp_id: int) -> list[list[int]]:
        """Load ``comp_id_<comp_id>_nodes_indices``; raise ``RuntimeError`` if it is absent."""
        return self._load_required(
            f"comp_id_{comp_id}_nodes_indices",
            note=". comp_id_0 never has nodes_indices; use an id >= 1.",
        )

    def comp_matrices(self, comp_id: int) -> Dict[str, np.ndarray]:
        """Load ``comp_id_<comp_id>_matrices_mul``; raise ``RuntimeError`` if it is absent."""
        return self._load_required(f"comp_id_{comp_id}_matrices_mul")

    def comp_maps(self, comp_id: int) -> Dict[str, Any]:
        """Load ``comp_id_<comp_id>_maps_mul``; raise ``RuntimeError`` if it is absent."""
        return self._load_required(f"comp_id_{comp_id}_maps_mul")

# ──────────────────────────────────────────────────────────────────────────────
# Residue‑centric tracker
# ──────────────────────────────────────────────────────────────────────────────

class ResidueTracker:
    """
    High-level read-only inspector for one or more residues.

    Residues are addressed by their ``"chain:resname:resnum"`` label (for
    example ``"C:ASP:4"``). Every table-returning method yields a DataFrame
    with a fixed column layout, including when there is nothing to report, so
    callers can select columns without first checking ``.empty``.
    """

    # Fixed column layouts (same order as the row dicts built below).
    _PRESENCE_COLUMNS = ["residue", "protein", "present", "local_index", "global_index", "in_nodes_graph"]
    _NODE_METRIC_COLUMNS = ["residue", "protein", "local_index", "rsa", "depth", "centroid_xyz"]
    _NEIGHBOR_COLUMNS = ["residue", "protein", "global_index", "neighbor_global", "neighbor_label", "dm_value"]
    _TRIAD_COLUMNS = ["protein", "token", "n_hits", "triads_full"]
    _COMBO_COLUMNS = ["token", "n_combos", "combos"]
    _FRAME_COLUMNS = ["component", "frame", "occurrences"]
    _BATCH_COLUMNS = ["residue", "proteins_found", "in_frames", "n_triads_hits", "n_cross_combos"]

    def __init__(self, reader: RunReader):
        """Load the shared blobs from ``reader`` once and keep them on the instance."""
        self.r = reader
        self._maps = self.r.maps()
        self._inv = self.r.inv_maps()
        self._gc = self.r.graph_collection()
        self._dmT = self.r.dm_thresholded()
        self._dmP = self.r.dm_pruned()
        self._runiq = self._maps["residue_maps_unique"]  # global_index -> (chain, resnum_str, resname)
        self._rfull = self._maps["full_residue_maps"]    # per-protein dicts: (chain,resnum,resname)->local

    # -- presence & indices ---------------------------------------------------

    def residue_indices(self, residue: str) -> pd.DataFrame:
        """
        For "C:ASP:4", report where it exists, one row per protein:
          - protein index
          - local index (position stored in that protein's residue map)
          - global index (flattened across proteins; used to index dm_* matrices)
          - whether the label is listed in that protein's ``nodes_graphs`` entry

        Raises ``ValueError`` if ``residue`` is not a three-part ``:`` label.
        """
        ch, resname, resnum = _parse_res_string(residue)
        triple = (ch, resnum, resname)  # key order used by full_residue_maps

        rows = []
        # preferred source of global indices: per-protein local->global maps
        uniq_break = self._maps.get("residue_maps_unique_break")
        # fallback: per-protein (start, ...) ranges, looked up by protein index.
        # NOTE(review): `p in ranges` is a key test only for mappings; if
        # ranges_graph is stored as a list this falls through to the summed sizes.
        ranges = self._maps.get("ranges_graph") or self._gc.get("ranges_graph") or {}

        for p, local_map in enumerate(self._rfull):
            present = triple in local_map
            local_idx = local_map.get(triple, None)

            if uniq_break and p in uniq_break:
                gmap = uniq_break[p]
                gidx = gmap.get(local_idx, None) if local_idx is not None else None
            else:
                # derive global by offset
                if ranges and p in ranges:
                    offset = ranges[p][0]
                else:
                    # compute offset by summing sizes of the preceding maps
                    offset = sum(len(m) for m in self._rfull[:p])
                gidx = (offset + local_idx) if local_idx is not None else None

            in_graph_nodes = residue in set(self._gc["nodes_graphs"][p])

            rows.append({
                "residue": residue,
                "protein": p,
                "present": bool(present),
                "local_index": local_idx,
                "global_index": gidx,
                "in_nodes_graph": bool(in_graph_nodes),
            })

        return pd.DataFrame(rows, columns=self._PRESENCE_COLUMNS)

    # -- node-level extras: RSA, depth, coordinates ---------------------------

    def residue_node_metrics(self, residue: str) -> pd.DataFrame:
        """
        For each protein where the residue exists, return:
          - rsa value (if available)
          - depth value (if available)
          - (x, y, z) read from the node attributes of the protein graph (if available)

        Each lookup is best-effort: any failure leaves that field as ``None``
        instead of aborting the whole report.
        """
        ch, resname, resnum = _parse_res_string(residue)
        triple = (ch, resnum, resname)

        rows = []
        for p, local_map in enumerate(self._rfull):
            if triple not in local_map:
                continue
            li = local_map[triple]

            # RSA: only read when the map is an ndarray and li is in bounds
            rsa_val = None
            try:
                rsa_map: np.ndarray = self._gc["rsa_maps"][p]
                if isinstance(rsa_map, np.ndarray) and 0 <= li < rsa_map.shape[0]:
                    rsa_val = float(rsa_map[li])
            except Exception:
                # deliberate best-effort: missing/odd RSA data -> None
                pass

            # Depth: same guards as RSA; "depths_maps" may be absent altogether
            depth_val = None
            try:
                depths_list: List[np.ndarray] = self._gc.get("depths_maps", [])
                if p < len(depths_list):
                    dvec = depths_list[p]
                    if isinstance(dvec, np.ndarray) and 0 <= li < dvec.shape[0]:
                        depth_val = float(dvec[li])
            except Exception:
                # deliberate best-effort: missing/odd depth data -> None
                pass

            # Coordinates: taken from the node's "x"/"y"/"z" attributes
            coords = None
            try:
                G = self._gc["graphs"][p]
                label = residue  # node label is the "A:GLU:154" string
                if label in G:
                    attrs = G.nodes[label]
                    if "x" in attrs and "y" in attrs and "z" in attrs:
                        coords = (float(attrs["x"]), float(attrs["y"]), float(attrs["z"]))
            except Exception:
                # deliberate best-effort: no graph / no coordinates -> None
                pass

            rows.append({
                "residue": residue,
                "protein": p,
                "local_index": li,
                "rsa": rsa_val,
                "depth": depth_val,
                "centroid_xyz": coords,
            })

        return pd.DataFrame(rows, columns=self._NODE_METRIC_COLUMNS)

    # -- DM hits (global space) -----------------------------------------------

    def dm_neighbors(self, residue: str, *, matrix: str = "thresholded") -> pd.DataFrame:
        """
        Using global indices, list the residues whose entry in the chosen
        global matrix row is strictly positive.

        ``matrix="thresholded"`` selects the thresholded matrix; any other
        value selects the pruned one.
        """
        pres = self.residue_indices(residue)
        pres = pres[pres["global_index"].notna()]
        if pres.empty:
            return pd.DataFrame(columns=self._NEIGHBOR_COLUMNS)

        M = self._dmT if matrix == "thresholded" else self._dmP
        rows = []
        for _, row in pres.iterrows():
            gi = int(row["global_index"])
            vec = M[gi]
            # NaN compares False against 0, so NaN entries are not neighbours
            for j in np.where(vec > 0)[0]:
                rows.append({
                    "residue": residue,
                    "protein": int(row["protein"]),
                    "global_index": gi,
                    "neighbor_global": int(j),
                    "neighbor_label": _idx_to_res_string(int(j), self._runiq),
                    "dm_value": float(M[gi, j]),
                })
        return pd.DataFrame(rows, columns=self._NEIGHBOR_COLUMNS)

    # -- triads & cross-combos participation ----------------------------------

    def triads_hits(self, residue: str) -> pd.DataFrame:
        """
        For each protein, list the tokens whose ``triads_full`` entries contain
        the residue among their first three elements, with the matching triads.
        """
        out_rows = []
        # parsing validates the label; the target is the re-assembled label
        ch, resname, resnum = _parse_res_string(residue)
        target = f"{ch}:{resname}:{resnum}"

        triads_list = self._gc["triads"]  # list[dict] per protein
        for p, triads in enumerate(triads_list):
            for token, payload in triads.items():
                # payload: {"count": int, "triads_full": [triplet tuples ...]}
                hits = [tri for tri in payload.get("triads_full", []) if target in tri[:3]]
                if not hits:
                    continue
                out_rows.append({
                    "protein": p,
                    "token": token,
                    "n_hits": len(hits),
                    "triads_full": hits,
                })
        return pd.DataFrame(out_rows, columns=self._TRIAD_COLUMNS)

    def cross_combos_hits(self, residue: str) -> pd.DataFrame:
        """
        In cross_combos (token -> list of combinations), list combinations where
        any triad includes the residue in its first 3 elements.

        ``n_combos`` counts every match; ``combos`` holds at most the first 50.
        """
        ch, resname, resnum = _parse_res_string(residue)
        target = f"{ch}:{resname}:{resnum}"

        cc = self.r.cross_combos()
        rows = []
        for token, combos in cc.items():
            # combo is a tuple of triads
            matches = [combo for combo in combos if any(target in tri[:3] for tri in combo)]
            if matches:
                rows.append({
                    "token": token,
                    "n_combos": len(matches),
                    "combos": matches[:50],  # keep preview manageable
                })
        return pd.DataFrame(rows, columns=self._COMBO_COLUMNS)

    # -- components / frames participation ------------------------------------

    def components_frames(self, residue: str) -> pd.DataFrame:
        """
        For every component other than 0, and every frame graph inside it,
        report how many tuple nodes contain the residue (frames with zero
        occurrences are omitted).
        """
        bundle = self.r.graphs_bundle()  # list of (list[nx.Graph], comp_id)
        rows = []
        for graphs, cid in bundle:
            if cid == 0:
                continue
            for frame_id, G in enumerate(graphs):
                if not isinstance(G, nx.Graph):
                    continue
                # a node counts once even if the residue appears in it twice
                count = sum(
                    1
                    for node in G.nodes()
                    if isinstance(node, tuple) and any(str(residue) == str(x) for x in node)
                )
                if count:
                    rows.append({"component": cid, "frame": frame_id, "occurrences": count})
        return pd.DataFrame(rows, columns=self._FRAME_COLUMNS)

    # -- variation context in component matrices ------------------------------

    def component_variation_context(self, residue: str) -> Dict[int, Dict[str, Any]]:
        """
        For each component listed by ``components_with_nodes_indices``:
          - rows in maps_mul['possible_nodes'] where one of the residue's
            global indices appears
          - the corresponding rows of dm_possible_nodes and adj_possible_nodes
        Returns comp_id -> {"rows": [(row_idx, [global idx ...]), ...],
                            "dm_rows": [...], "adj_rows": [...]}.
        Components without any hit are left out.
        """
        # collect all global indices this residue maps to
        pres = self.residue_indices(residue)
        wanted = {int(x) for x in pres["global_index"].dropna().unique().tolist()}

        out: Dict[int, Dict[str, Any]] = {}
        if not wanted:
            # nothing can match; skip loading any per-component artifact
            return out

        for cid in self.r.components_with_nodes_indices():
            cmap = self.r.comp_maps(cid)       # contains "possible_nodes": {row_idx -> global indices}

            poss = cmap.get("possible_nodes", {})
            hits = []
            for k, v in poss.items():
                # normalize to int row index -> list[int]
                row_idx = k if isinstance(k, int) else int(k)
                members = list(v) if isinstance(v, (list, tuple, np.ndarray)) else [int(v)]
                glob_list = [int(x) for x in members]
                if wanted.intersection(glob_list):
                    hits.append((int(row_idx), glob_list))

            if not hits:
                continue

            # matrices are only unpickled once a hit is known
            mats = self.r.comp_matrices(cid)   # {"dm_possible_nodes", "adj_possible_nodes"}
            dm = mats["dm_possible_nodes"]
            adj = mats["adj_possible_nodes"]

            out[cid] = {
                "rows": hits,
                "dm_rows": [dm[row_idx, :] for row_idx, _ in hits],
                "adj_rows": [adj[row_idx, :] for row_idx, _ in hits],
            }
        return out

    # -- one-shot convenience -------------------------------------------------

    def track_one(self, residue: str) -> Dict[str, Any]:
        """Run every per-residue report and return them in one dict."""
        pres   = self.residue_indices(residue)
        node   = self.residue_node_metrics(residue)
        triads = self.triads_hits(residue)
        combos = self.cross_combos_hits(residue)
        frames = self.components_frames(residue)
        varctx = self.component_variation_context(residue)
        return {
            "presence": pres,
            "node_metrics": node,
            "triads_df": triads,
            "cross_combos_df": combos,
            "frames_df": frames,
            "variation_context": varctx,
        }

    def track_batch(self, residues: Iterable[str]) -> pd.DataFrame:
        """
        Build a one-row-per-residue overview, sorted descending by
        proteins_found, in_frames, n_cross_combos, n_triads_hits.

        An empty ``residues`` iterable yields an empty frame that still has
        the overview columns.
        """
        rows = []
        for r in residues:
            d = self.track_one(r)
            # quick overview row
            rows.append({
                "residue": r,
                "proteins_found": d["presence"]["present"].sum(),
                "in_frames": int(d["frames_df"]["occurrences"].sum()) if not d["frames_df"].empty else 0,
                "n_triads_hits": int(d["triads_df"]["n_hits"].sum()) if not d["triads_df"].empty else 0,
                "n_cross_combos": int(d["cross_combos_df"]["n_combos"].sum()) if not d["cross_combos_df"].empty else 0,
            })
        # explicit columns keep sort_values from raising KeyError on no rows
        overview = pd.DataFrame(rows, columns=self._BATCH_COLUMNS)
        return overview.sort_values(
            ["proteins_found", "in_frames", "n_cross_combos", "n_triads_hits"],
            ascending=False,
        ).reset_index(drop=True)




In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# Residue tracking over CrossSteps artifacts — read‑only (no writes)
# ──────────────────────────────────────────────────────────────────────────────
from __future__ import annotations
import re, json, pickle
from pathlib import Path
from typing import Any, Dict, List, Tuple, Iterable, Optional
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import networkx as nx

# ──────────────────────────────────────────────────────────────────────────────
# Basic loaders
# ──────────────────────────────────────────────────────────────────────────────

def _load_pickle(p: Path):
    with open(p, "rb") as f:
        return pickle.load(f)

def _load_npy(p: Path) -> np.ndarray:
    return np.load(p)

def _sorted_run_files(run_dir: Path) -> List[Path]:
    def key(p: Path):
        m = re.match(r"^(\d{3})_", p.name)
        return int(m.group(1)) if m else 999999
    return sorted([p for p in run_dir.iterdir() if p.is_file()], key=key)

def _idx_to_res_string(idx: int, residue_maps_unique: Dict[int, Tuple[str, str, str]]) -> str:
    ch, resnum_str, resname = residue_maps_unique[idx]
    return f"{ch}:{resname}:{resnum_str}"

def _res_string_to_tuple(s: str) -> Tuple[str,str,str]:
    ch, resname, resnum = s.split(":")
    return (ch, resnum, resname)  # your inv_maps key order

# ──────────────────────────────────────────────────────────────────────────────
# RunReader: read only from CrossSteps/<run_id>
# ──────────────────────────────────────────────────────────────────────────────

class RunReader:
    """
    Thin reader for a CrossSteps/<run_id> directory.
    It builds a quick index ("by_key") using filenames without the numeric prefix.

    Required artifacts (maps, inv_maps, graph_collection, Graphs, dm_*) raise
    ``RuntimeError`` when missing and are cached after the first load.
    Optional artifacts fall back to an empty value or ``None``; of those only
    ``cross_combos`` is cached.
    """
    def __init__(self, run_dir: str | Path):
        """Index the files in ``run_dir``.

        Raises:
            FileNotFoundError: ``run_dir`` does not exist.
            NotADirectoryError: ``run_dir`` exists but is not a directory.
        """
        self.base = Path(run_dir).resolve()
        if not self.base.exists():
            raise FileNotFoundError(self.base)
        if not self.base.is_dir():
            # Fail here with a clear error instead of deep inside iterdir().
            raise NotADirectoryError(self.base)
        self.files = _sorted_run_files(self.base)

        # stem without the NNN_ prefix -> path; on duplicate stems the later
        # file in self.files wins
        self.by_key: Dict[str, Path] = {}
        for p in self.files:
            stem = re.sub(r"^\d{3}_", "", p.stem)
            self.by_key[stem] = p

        # lazily filled caches
        self._maps: Optional[Dict[str, Any]] = None
        self._inv_maps: Optional[Dict[int, Dict[Tuple[str,str,str], int]]] = None
        self._gc: Optional[Dict[str, Any]] = None
        self._cross_combos: Optional[Dict[Any, Any]] = None
        self._graphs_bundle: Optional[Any] = None
        self._dm_thresh: Optional[np.ndarray] = None
        self._dm_prune: Optional[np.ndarray] = None

    # internal helpers ---------------------------------------------------------

    def _load_required(self, key: str, ext: str = ".pkl"):
        """Load the artifact indexed under ``key``; raise ``RuntimeError`` if absent.

        ``ext`` selects the loader (``.npy`` -> NumPy, anything else -> pickle)
        and is echoed in the error message.
        """
        p = self.by_key.get(key)
        if p is None:
            raise RuntimeError(f"{key}{ext} not found")
        if ext == ".npy":
            return _load_npy(p)
        return _load_pickle(p)

    def _load_optional(self, key: str):
        """Unpickle the artifact indexed under ``key``, or return ``None`` if absent."""
        p = self.by_key.get(key)
        return _load_pickle(p) if p else None

    # run-level artifacts --------------------------------------------------------

    def meta(self) -> Dict[str, Any]:
        """Parse ``_meta.json`` from the run directory; ``{}`` when the file is absent."""
        p = self.base / "_meta.json"
        # JSON text is read as UTF-8 rather than the platform default encoding
        return json.loads(p.read_text(encoding="utf-8")) if p.exists() else {}

    def maps(self) -> Dict[str, Any]:
        """Return the cached ``association_product_maps`` pickle."""
        if self._maps is None:
            self._maps = self._load_required("association_product_maps")
        return self._maps

    def inv_maps(self) -> Dict[int, Dict[Tuple[str,str,str], int]]:
        """Return the cached ``association_product_inv_maps`` pickle."""
        if self._inv_maps is None:
            self._inv_maps = self._load_required("association_product_inv_maps")
        return self._inv_maps

    def graph_collection(self) -> Dict[str, Any]:
        """Return the cached ``association_product_graph_collection`` pickle."""
        if self._gc is None:
            self._gc = self._load_required("association_product_graph_collection")
        return self._gc

    def cross_combos(self) -> Dict[Any, Any]:
        """Return ``association_product_cross_combos``, or ``{}`` when it is absent.

        The result is cached so repeated per-residue scans do not unpickle the
        file again on every call.
        """
        if self._cross_combos is None:
            p = self.by_key.get("association_product_cross_combos")
            self._cross_combos = _load_pickle(p) if p else {}
        return self._cross_combos

    def triad_graph_edges(self) -> Optional[set]:
        """Return ``association_product_triad_graph``, or ``None`` when it is absent (not cached)."""
        return self._load_optional("association_product_triad_graph")

    def graphs_bundle(self):
        """Return the cached ``association_product_Graphs`` pickle."""
        if self._graphs_bundle is None:
            self._graphs_bundle = self._load_required("association_product_Graphs")
        return self._graphs_bundle

    def dm_thresh(self) -> np.ndarray:
        """Return the cached ``association_product_dm_thresh`` array."""
        if self._dm_thresh is None:
            self._dm_thresh = self._load_required("association_product_dm_thresh", ".npy")
        return self._dm_thresh

    def dm_prune(self) -> np.ndarray:
        """Return the cached ``association_product_dm_prune`` array."""
        if self._dm_prune is None:
            self._dm_prune = self._load_required("association_product_dm_prune", ".npy")
        return self._dm_prune

    def components(self) -> List[int]:
        """
        Return sorted component IDs that have per-component artifacts, i.e.
        every ``N`` for which some indexed key starts with ``comp_id_N_``.
        comp_id_0 is the "base" and intentionally has no nodes_indices/matrices_mul.
        """
        out = set()
        for k in self.by_key:
            m = re.match(r"comp_id_(\d+)_", k)
            if m:
                out.add(int(m.group(1)))
        return sorted(out)

    # Per-component helpers (skip comp_id_0 for nodes_indices / matrices_mul)
    def comp_nodes_indices(self, comp_id: int) -> Optional[List[List[int]]]:
        """Load ``comp_id_<comp_id>_nodes_indices``; ``None`` when absent."""
        return self._load_optional(f"comp_id_{comp_id}_nodes_indices")

    def comp_matrices_mul(self, comp_id: int) -> Optional[Dict[str, np.ndarray]]:
        """Load ``comp_id_<comp_id>_matrices_mul``; ``None`` when absent."""
        return self._load_optional(f"comp_id_{comp_id}_matrices_mul")

    def comp_maps_mul(self, comp_id: int) -> Optional[Dict[str, Any]]:
        """Load ``comp_id_<comp_id>_maps_mul``; ``None`` when absent."""
        return self._load_optional(f"comp_id_{comp_id}_maps_mul")

# ──────────────────────────────────────────────────────────────────────────────
# ResidueTracker: read-only analytics for one or many residues
# ──────────────────────────────────────────────────────────────────────────────

class ResidueTracker:
    """
    Track a residue like 'C:ASP:4' across:
      • presence in each source graph
      • global index (inv_maps)
      • contact-map distances (dm_thresh/dm_prune) against all residues
      • RSA per source graph
      • component+frame appearances (association_product_Graphs)
      • variation context from per-component matrices (dm_possible_nodes/adj_possible_nodes)
      • combo/triad hits (cross_combos)
    """
    def __init__(self, reader: RunReader):
        self.r = reader
        self._maps = self.r.maps()
        self._inv = self.r.inv_maps()
        self._gc   = self.r.graph_collection()
        self._rmap_unique: Dict[int, Tuple[str,str,str]] = self._maps["residue_maps_unique"]  # global→(chain,resnum,resname)

        # quick "residue string" sets per protein graph
        self._nodes_per_graph: List[List[str]] = self._gc["nodes_graphs"]

        # RSA arrays per graph (in same node order as nodes_graphs)
        self._rsa_maps: List[np.ndarray] = self._gc["rsa_maps"]

        # contact maps per graph (already embedded globally in dm_* via indices + ranges)
        self._ranges = self._maps["ranges_graph"] if "ranges_graph" in self._maps else self._infer_ranges()

    def _infer_ranges(self) -> List[Tuple[int,int]]:
        # Fallback if ranges weren't persisted inside maps (you do persist in metadata)
        # Use metadata from matrices_dict in your pipeline if present; otherwise derive from nodes_graphs lengths
        sizes = [len(nodes) for nodes in self._nodes_per_graph]
        out, cur = [], 0
        for n in sizes:
            out.append((cur, cur+n))
            cur += n
        return out

    def _resolve(self, residue: str) -> Dict[str, Any]:
        """
        Look up a label such as 'C:ASP:4' and return a dict with:
          - "tuple": the (chain, resnum, resname) key built from the label
          - "global_idx": {k: index} for every inv_maps[k] that knows the key
          - "present": {protein: bool}, True when the label is in that
            protein's node list
          - "per_protein_idx": {protein: position} for the proteins where it is
        """
        key = _res_string_to_tuple(residue)

        # global indices come from the inverse maps
        global_idx = {}
        for k, inverse in self._inv.items():
            found = inverse.get(key)
            if found is not None:
                global_idx[k] = found

        # presence / position come from the original node lists
        present = {}
        per_protein_idx = {}
        for k, labels in enumerate(self._nodes_per_graph):
            is_there = residue in labels
            present[k] = is_there
            if is_there:
                per_protein_idx[k] = labels.index(residue)

        return {
            "tuple": key,
            "per_protein_idx": per_protein_idx,
            "global_idx": global_idx,
            "present": present,
        }

    # — Presence, RSA, distances (global matrix views) --------------------------------

    def presence_table(self, residue: str) -> pd.DataFrame:
        res = self._resolve(residue)
        rows = []
        for k in range(len(self._nodes_per_graph)):
            rows.append({
                "residue": residue,
                "protein": k,
                "present": bool(res["present"].get(k, False)),
                "global_index": res["global_idx"].get(k, np.nan),
                "per_protein_index": res["per_protein_idx"].get(k, np.nan),
            })
        return pd.DataFrame(rows)

    def rsa_rows(self, residue: str) -> pd.DataFrame:
        """
        Return RSA values for the residue in each source protein (if present).
        """
        res = self._resolve(residue)
        out = []
        for k, nodes in enumerate(self._nodes_per_graph):
            if residue in nodes:
                idx = nodes.index(residue)
                rsa = float(self._rsa_maps[k][idx])
                out.append({"protein": k, "residue": residue, "rsa": rsa})
        return pd.DataFrame(out)

    def distances_rows(self, residue: str) -> pd.DataFrame:
        """
        Use global dm_thresh / dm_prune as a full distance lookup against every residue.
        Rows are (protein, other_residue, dm_thresh, dm_prune). NaNs are preserved.
        """
        dmT = self.r.dm_thresh()
        dmP = self.r.dm_prune()
        res = self._resolve(residue)
        rows = []

        if not res["global_idx"]:
            return pd.DataFrame(columns=["protein","residue","other_residue","dm_thresh","dm_prune"])

        for k, g_idx in res["global_idx"].items():
            start, end = self._ranges[k]
            for other_g in range(start, end):
                other_res = _idx_to_res_string(other_g, self._rmap_unique)
                rows.append({
                    "protein": k,
                    "residue": residue,
                    "other_residue": other_res,
                    "dm_thresh": dmT[g_idx, other_g],
                    "dm_prune":  dmP[g_idx, other_g],
                })
        return pd.DataFrame(rows)

    # — Combo / Triad hits ------------------------------------------------------

    def triad_combo_hits(self, residue: str) -> pd.DataFrame:
        """
        Scan cross_combos for any triad where residue appears.
        Each combo is a tuple of triads (one per protein):
           tri = (r1, r2, r3, ..., distances...)
        We check tri[:3] only for residue membership.
        """
        cc = self.r.cross_combos()

        if not cc:
            return pd.DataFrame(columns=["token","combo_idx","protein","triad_3res"])

        res_hits = []
        target = residue
        for token, combos in cc.items():
            for ci, combo in enumerate(combos):
                for pi, tri in enumerate(combo):
                    tri3 = tri[:3]
                    if target in tri3:
                        res_hits.append({
                            "token": token,
                            "combo_idx": ci,
                            "protein": pi,
                            "triad_3res": tri3
                        })
        return pd.DataFrame(res_hits)

    # — Frames / Associated graphs ---------------------------------------------

    def frames_rows(self, residue: str) -> pd.DataFrame:
        """
        Find all (component, frame) graph nodes containing this residue.
        Nodes in the final graphs are residue tuples (one per protein).
        """
        bundle = self.r.graphs_bundle()   # list of (list[nx.Graph], comp_id)
        rows = []
        for graphs, comp_id in bundle:
            # comp_id_0 is the base combo graph; its nodes are usually not residue tuples for frames
            for frame_id, G in enumerate(graphs):
                for node in G.nodes():
                    if isinstance(node, tuple) and any(
                        (isinstance(x, str) and x == residue) for x in node
                    ):
                        rows.append({"component": comp_id, "frame": frame_id, "node": node})
        return pd.DataFrame(rows)

    # — Variation context from per-component matrices --------------------------

    def _component_possibles_for_residue(self, comp_id: int, residue: str) -> List[Tuple[int, List[int]]]:
        """
        For comp_id>=1, find rows in maps_mul['possible_nodes'] that include the residue’s global index (per protein).
        Returns list of (row_idx, [global_idx_per_protein]).
        """
        maps_mul = self.r.comp_maps_mul(comp_id)
        if not maps_mul:
            return []
        poss = maps_mul.get("possible_nodes", {})
        res = self._resolve(residue)
        gidxs = [res["global_idx"].get(p) for p in sorted(res["global_idx"].keys())]
        gidxs = [g for g in gidxs if g is not None]
        if not gidxs:
            return []

        hits = []
        for row_idx, glob_list in poss.items():
            # poss values are lists of global indices for that candidate node
            if isinstance(glob_list, list) and any(g in glob_list for g in gidxs):
                hits.append((int(row_idx), list(map(int, glob_list))))
        return hits

    def variation_context(self, residue: str) -> Dict[int, Dict[str, Any]]:
        """
        Summarise, for every component except ``0``, how ``residue`` relates to the
        per-component "variation" matrices.

        Args:
            residue: Residue label forwarded to ``_component_possibles_for_residue``.

        Returns:
            A dict keyed by component id. Each value holds:
              • ``possible_rows_hit``: rows of ``possible_nodes`` containing the residue,
                as ``(row_idx, [global idx list])`` tuples;
              • ``dm_possible_nodes_shape`` / ``adj_possible_nodes_shape``: the shapes
                (not the matrices themselves) of the matching entries returned by
                ``self.r.comp_matrices_mul``; ``(0,)`` when an entry is missing.
        """
        placeholder = np.empty((0,))  # stands in for a missing matrix → shape (0,)

        def _describe(cid):
            mats = self.r.comp_matrices_mul(cid) or {}
            rows_hit = self._component_possibles_for_residue(cid, residue)
            return {
                "possible_rows_hit": rows_hit,
                "dm_possible_nodes_shape": tuple(mats.get("dm_possible_nodes", placeholder).shape),
                "adj_possible_nodes_shape": tuple(mats.get("adj_possible_nodes", placeholder).shape),
            }

        return {cid: _describe(cid) for cid in self.r.components() if cid != 0}

    # — One-call report ---------------------------------------------------------

    def track_one(self, residue: str) -> Dict[str, Any]:
        """
        Build the complete tracking report for a single residue.

        Every per-aspect helper of the tracker is called once with ``residue`` and the
        results are bundled into one dict. A "partners" summary is additionally derived
        from the distance rows: for each protein index ``k`` in
        ``range(len(self._nodes_per_graph))`` it counts how often each ``other_residue``
        appears with a ``dm_thresh`` value that is not NaN and greater than zero.

        The summary is computed only when the distance table actually carries the
        ``protein``, ``other_residue`` and ``dm_thresh`` columns. A column-less frame
        (e.g. ``pd.DataFrame([])`` for a residue that matched nothing) therefore yields
        an empty summary instead of raising ``KeyError``.

        Args:
            residue: Residue label, forwarded unchanged to every helper.

        Returns:
            Dict with the keys ``presence``, ``rsa``, ``distances``, ``frames``,
            ``combos``, ``variation_context`` and ``partners_summary``. The last one is
            a DataFrame with columns ``protein``, ``partner`` and
            ``count_thresh_contact``.
        """
        pres = self.presence_table(residue)
        rsa = self.rsa_rows(residue)
        dist = self.distances_rows(residue)
        frames = self.frames_rows(residue)
        combos = self.triad_combo_hits(residue)
        varcx = self.variation_context(residue)

        summary_cols = ["protein", "partner", "count_thresh_contact"]
        required_cols = {"protein", "other_residue", "dm_thresh"}

        partners = []
        # Guard: an empty distance table may have no columns at all.
        if dist is not None and required_cols.issubset(dist.columns):
            for k in range(len(self._nodes_per_graph)):
                dfk = dist[dist["protein"] == k]
                if dfk.empty:
                    continue
                # mark as contact if dm_thresh is finite and >0 (the pipeline uses 0/NaN for "no edge")
                mask = dfk["dm_thresh"].notna() & (dfk["dm_thresh"] > 0)
                counts = dfk.loc[mask, "other_residue"].value_counts()
                if counts.empty:
                    # Nothing to report for this protein; also keeps empty frames out of pd.concat.
                    continue
                partners.append(pd.DataFrame({
                    "protein": k,
                    "partner": counts.index,
                    "count_thresh_contact": counts.values,
                }))

        if partners:
            partners_df = pd.concat(partners, ignore_index=True)
        else:
            partners_df = pd.DataFrame(columns=summary_cols)

        return {
            "presence": pres,                    # per source graph presence/global idx
            "rsa": rsa,                          # RSA per protein (if present)
            "distances": dist,                   # distance rows as returned by distances_rows
            "frames": frames,                    # where it appears in final graphs
            "combos": combos,                    # cross_combos occurrences
            "variation_context": varcx,          # shapes + rows hit in per‑component matrices
            "partners_summary": partners_df,     # quick contact counts
        }

    # — Batch ---------------------------------------------------------------

    def track_many(self, residues: Iterable[str]) -> Dict[str, Dict[str, Any]]:
        """
        Run ``track_one`` for each residue and collect the reports.

        Args:
            residues: Iterable of residue labels; consumed once, in order.

        Returns:
            Dict mapping each label to its ``track_one`` report. A label that occurs
            more than once keeps the report from its last occurrence.
        """
        reports: Dict[str, Dict[str, Any]] = {}
        for label in residues:
            reports[label] = self.track_one(label)
        return reports

# ──────────────────────────────────────────────────────────────────────────────
# Example usage (Jupyter)
# ──────────────────────────────────────────────────────────────────────────────

# Point to a finished run directory (must contain the association_product_* files)
# NOTE(review): the path is relative, so it resolves against the kernel's working directory.
RUN_DIR = "CrossSteps/3tjh_3tfk"    # <- change to your run folder
rr = RunReader(RUN_DIR)
rt = ResidueTracker(rr)

# Single residue
# The label below looks like "<chain>:<residue name>:<number>" — confirm against ResidueTracker._resolve.
res = "A:VAL:66"
# track_one returns a dict of report sections; each section is shown separately below.
out = rt.track_one(res)
display(out["presence"])
display(out["rsa"].head())
display(out["distances"].head())
display(out["frames"].head())
display(out["combos"].head())
# variation_context is a plain dict keyed by component id, so it is printed rather than displayed.
print(out["variation_context"])
display(out["partners_summary"].head())

# Many residues
# targets = ["A:ALA:45", "C:GLU:2", "C:ASP:4"]
# bundle = rt.track_many(targets)
# # to peek one of them:
# display(bundle["A:ALA:45"]["presence"])


In [None]:
data = {('ALA', 'GLU', 'GLY', 0, 0, 0, 2, 2, 2, 3, 3, 2): [(('A:ALA:158', 'A:GLU:161', 'A:GLY:162', 0, 0, 0, 2, 2, 2, 3, 3, 2), ('A:ALA:158', 'A:GLU:161', 'A:GLY:162', 0, 0, 0, 2, 2, 2, 3, 3, 2)), (('A:ALA:158', 'A:GLU:163', 'A:GLY:162', 0, 0, 0, 2, 2, 2, 3, 3, 2), ('A:ALA:158', 'A:GLU:161', 'A:GLY:162', 0, 0, 0, 2, 2, 2, 3, 3, 2))], ('GLY', 'GLU', 'TYR', 0, 0, 0, 1, 2, 2, 2, 3, 2): [(('A:GLY:151', 'A:GLU:154', 'A:TYR:155', 0, 0, 0, 1, 2, 2, 2, 3, 2), ('A:GLY:151', 'A:GLU:154', 'A:TYR:155', 0, 0, 0, 1, 2, 2, 2, 3, 2))], ('GLU', 'VAL', 'TRP', 0, 0, 0, 2, 1, 1, 3, 5, 3): [(('A:GLU:161', 'A:VAL:165', 'A:TRP:167', 0, 0, 0, 2, 1, 1, 3, 5, 3), ('A:GLU:161', 'A:VAL:165', 'A:TRP:167', 0, 0, 0, 2, 1, 1, 3, 5, 3))], ('ARG', 'ALA', 'TYR', 0, 0, 0, 1, 2, 2, 2, 3, 3): [(('A:ARG:157', 'A:ALA:158', 'A:TYR:155', 0, 0, 0, 1, 2, 2, 2, 3, 3), ('A:ARG:157', 'A:ALA:158', 'A:TYR:155', 0, 0, 0, 1, 2, 2, 2, 3, 3))], ('GLN', 'ARG', 'GLU', 0, 0, 0, 2, 1, 1, 3, 4, 3): [(('A:GLN:65', 'A:ARG:62', 'A:GLU:58', 0, 0, 0, 2, 1, 1, 3, 4, 3), ('A:GLN:72', 'A:ARG:75', 'A:GLU:19', 0, 0, 0, 2, 1, 1, 3, 4, 3))], ('ARG', 'THR', 'VAL', 0, 0, 0, 2, 1, 1, 2, 3, 3): [(('A:ARG:79', 'A:THR:80', 'A:VAL:76', 0, 0, 0, 2, 1, 1, 2, 3, 3), ('A:ARG:79', 'A:THR:80', 'A:VAL:76', 0, 0, 0, 2, 1, 1, 2, 3, 3))], ('ARG', 'VAL', 'GLU', 0, 0, 0, 2, 1, 1, 3, 5, 2): [(('A:ARG:108', 'A:VAL:165', 'A:GLU:166', 0, 0, 0, 2, 1, 1, 3, 5, 2), ('A:ARG:108', 'A:VAL:165', 'A:GLU:166', 0, 0, 0, 2, 1, 1, 3, 5, 2))], ('ARG', 'GLU', 'GLY', 0, 0, 0, 1, 2, 2, 3, 4, 2): [(('A:ARG:157', 'A:GLU:161', 'A:GLY:162', 0, 0, 0, 1, 2, 2, 3, 4, 2), ('A:ARG:157', 'A:GLU:161', 'A:GLY:162', 0, 0, 0, 1, 2, 2, 3, 4, 2)), (('A:ARG:157', 'A:GLU:161', 'A:GLY:162', 0, 0, 0, 1, 2, 2, 3, 4, 2), ('A:ARG:75', 'A:GLU:19', 'A:GLY:18', 0, 0, 0, 1, 2, 2, 3, 4, 2)), (('A:ARG:157', 'A:GLU:154', 'A:GLY:151', 0, 0, 0, 1, 2, 2, 3, 4, 2), ('A:ARG:157', 'A:GLU:161', 'A:GLY:162', 0, 0, 0, 1, 2, 2, 3, 4, 2)), (('A:ARG:157', 'A:GLU:154', 'A:GLY:151', 0, 0, 0, 1, 2, 2, 3, 4, 2), 
('A:ARG:75', 'A:GLU:19', 'A:GLY:18', 0, 0, 0, 1, 2, 2, 3, 4, 2)), (('A:ARG:75', 'A:GLU:19', 'A:GLY:18', 0, 0, 0, 1, 2, 2, 3, 4, 2), ('A:ARG:157', 'A:GLU:161', 'A:GLY:162', 0, 0, 0, 1, 2, 2, 3, 4, 2)), (('A:ARG:75', 'A:GLU:19', 'A:GLY:18', 0, 0, 0, 1, 2, 2, 3, 4, 2), ('A:ARG:75', 'A:GLU:19', 'A:GLY:18', 0, 0, 0, 1, 2, 2, 3, 4, 2))], ('TRP', 'GLU', 'VAL', 0, 0, 0, 1, 2, 2, 2, 3, 2): [(('A:TRP:167', 'A:GLU:166', 'A:VAL:165', 0, 0, 0, 1, 2, 2, 2, 3, 2), ('A:TRP:167', 'A:GLU:166', 'A:VAL:165', 0, 0, 0, 1, 2, 2, 2, 3, 2))], ('GLU', 'TRP', 'VAL', 0, 0, 0, 2, 1, 1, 2, 2, 3): [(('A:GLU:166', 'A:TRP:167', 'A:VAL:165', 0, 0, 0, 2, 1, 1, 2, 2, 3), ('A:GLU:166', 'A:TRP:167', 'A:VAL:165', 0, 0, 0, 2, 1, 1, 2, 2, 3))], ('ARG', 'VAL', 'GLU', 0, 0, 0, 2, 1, 1, 3, 4, 3): [(('A:ARG:108', 'A:VAL:165', 'A:GLU:161', 0, 0, 0, 2, 1, 1, 3, 4, 3), ('A:ARG:108', 'A:VAL:165', 'A:GLU:161', 0, 0, 0, 2, 1, 1, 3, 4, 3))], ('ALA', 'GLU', 'ARG', 0, 0, 0, 2, 2, 2, 3, 2, 3): [(('A:ALA:158', 'A:GLU:161', 'A:ARG:157', 0, 0, 0, 2, 2, 2, 3, 2, 3), ('A:ALA:158', 'A:GLU:161', 'A:ARG:157', 0, 0, 0, 2, 2, 2, 3, 2, 3)), (('A:ALA:158', 'A:GLU:154', 'A:ARG:157', 0, 0, 0, 2, 2, 2, 3, 2, 3), ('A:ALA:158', 'A:GLU:161', 'A:ARG:157', 0, 0, 0, 2, 2, 2, 3, 2, 3))], ('ALA', 'GLU', 'GLY', 0, 0, 0, 2, 2, 2, 3, 4, 2): [(('A:ALA:158', 'A:GLU:154', 'A:GLY:151', 0, 0, 0, 2, 2, 2, 3, 4, 2), ('A:ALA:158', 'A:GLU:154', 'A:GLY:151', 0, 0, 0, 2, 2, 2, 3, 4, 2))], ('GLN', 'GLU', 'LYS', 0, 0, 0, 2, 1, 1, 2, 3, 3): [(('A:GLN:72', 'A:GLU:71', 'A:LYS:68', 0, 0, 0, 2, 1, 1, 2, 3, 3), ('A:GLN:72', 'A:GLU:71', 'A:LYS:68', 0, 0, 0, 2, 1, 1, 2, 3, 3))], ('ARG', 'VAL', 'GLU', 0, 0, 0, 2, 1, 1, 3, 6, 3): [(('A:ARG:108', 'A:VAL:165', 'A:GLU:163', 0, 0, 0, 2, 1, 1, 3, 6, 3), ('A:ARG:108', 'A:VAL:165', 'A:GLU:163', 0, 0, 0, 2, 1, 1, 3, 6, 3))], ('GLY', 'GLN', 'LYS', 0, 0, 0, 1, 2, 2, 2, 2, 3): [(('A:GLY:69', 'A:GLN:72', 'A:LYS:68', 0, 0, 0, 1, 2, 2, 2, 2, 3), ('A:GLY:69', 'A:GLN:72', 'A:LYS:68', 0, 0, 0, 1, 2, 2, 2, 2, 3))], ('ARG', 'ILE', 
'LYS', 0, 0, 0, 1, 1, 1, 3, 3, 3): [(('A:ARG:144', 'A:ILE:142', 'A:LYS:146', 0, 0, 0, 1, 1, 1, 3, 3, 3), ('A:ARG:144', 'A:ILE:142', 'A:LYS:146', 0, 0, 0, 1, 1, 1, 3, 3, 3))], ('GLU', 'TYR', 'GLY', 0, 0, 0, 2, 1, 1, 2, 2, 3): [(('A:GLU:154', 'A:TYR:155', 'A:GLY:151', 0, 0, 0, 2, 1, 1, 2, 2, 3), ('A:GLU:154', 'A:TYR:155', 'A:GLY:151', 0, 0, 0, 2, 1, 1, 2, 2, 3))], ('ALA', 'GLY', 'GLU', 0, 0, 0, 2, 1, 1, 3, 3, 2): [(('A:ALA:158', 'A:GLY:162', 'A:GLU:161', 0, 0, 0, 2, 1, 1, 3, 3, 2), ('A:ALA:158', 'A:GLY:162', 'A:GLU:161', 0, 0, 0, 2, 1, 1, 3, 3, 2)), (('A:ALA:158', 'A:GLY:162', 'A:GLU:163', 0, 0, 0, 2, 1, 1, 3, 3, 2), ('A:ALA:158', 'A:GLY:162', 'A:GLU:161', 0, 0, 0, 2, 1, 1, 3, 3, 2))], ('ALA', 'ARG', 'GLU', 0, 0, 0, 2, 1, 1, 2, 3, 3): [(('A:ALA:158', 'A:ARG:157', 'A:GLU:161', 0, 0, 0, 2, 1, 1, 2, 3, 3), ('A:ALA:158', 'A:ARG:157', 'A:GLU:161', 0, 0, 0, 2, 1, 1, 2, 3, 3)), (('A:ALA:158', 'A:ARG:157', 'A:GLU:154', 0, 0, 0, 2, 1, 1, 2, 3, 3), ('A:ALA:158', 'A:ARG:157', 'A:GLU:161', 0, 0, 0, 2, 1, 1, 2, 3, 3))], ('ALA', 'GLU', 'VAL', 0, 0, 0, 2, 2, 2, 3, 5, 3): [(('A:ALA:158', 'A:GLU:161', 'A:VAL:165', 0, 0, 0, 2, 2, 2, 3, 5, 3), ('A:ALA:158', 'A:GLU:161', 'A:VAL:165', 0, 0, 0, 2, 2, 2, 3, 5, 3)), (('A:ALA:158', 'A:GLU:163', 'A:VAL:165', 0, 0, 0, 2, 2, 2, 3, 5, 3), ('A:ALA:158', 'A:GLU:161', 'A:VAL:165', 0, 0, 0, 2, 2, 2, 3, 5, 3))], ('ARG', 'ARG', 'VAL', 0, 0, 0, 2, 1, 1, 3, 3, 2): [(('A:ARG:79', 'A:ARG:75', 'A:VAL:76', 0, 0, 0, 2, 1, 1, 3, 3, 2), ('A:ARG:79', 'A:ARG:75', 'A:VAL:76', 0, 0, 0, 2, 1, 1, 3, 3, 2))], ('GLU', 'ALA', 'TYR', 0, 0, 0, 2, 2, 2, 3, 5, 3): [(('A:GLU:161', 'A:ALA:158', 'A:TYR:155', 0, 0, 0, 2, 2, 2, 3, 5, 3), ('A:GLU:161', 'A:ALA:158', 'A:TYR:155', 0, 0, 0, 2, 2, 2, 3, 5, 3)), (('A:GLU:163', 'A:ALA:158', 'A:TYR:155', 0, 0, 0, 2, 2, 2, 3, 5, 3), ('A:GLU:161', 'A:ALA:158', 'A:TYR:155', 0, 0, 0, 2, 2, 2, 3, 5, 3))], ('THR', 'ARG', 'VAL', 0, 0, 0, 1, 2, 2, 2, 3, 3): [(('A:THR:80', 'A:ARG:79', 'A:VAL:76', 0, 0, 0, 1, 2, 2, 2, 3, 3), ('A:THR:80', 
'A:ARG:79', 'A:VAL:76', 0, 0, 0, 1, 2, 2, 2, 3, 3))], ('ARG', 'ARG', 'GLU', 0, 0, 0, 2, 1, 1, 3, 5, 3): [(('A:ARG:79', 'A:ARG:75', 'A:GLU:71', 0, 0, 0, 2, 1, 1, 3, 5, 3), ('A:ARG:79', 'A:ARG:75', 'A:GLU:71', 0, 0, 0, 2, 1, 1, 3, 5, 3)), (('A:ARG:79', 'A:ARG:75', 'A:GLU:71', 0, 0, 0, 2, 1, 1, 3, 5, 3), ('A:ARG:79', 'A:ARG:75', 'A:GLU:19', 0, 0, 0, 2, 1, 1, 3, 5, 3)), (('A:ARG:79', 'A:ARG:75', 'A:GLU:19', 0, 0, 0, 2, 1, 1, 3, 5, 3), ('A:ARG:79', 'A:ARG:75', 'A:GLU:71', 0, 0, 0, 2, 1, 1, 3, 5, 3)), (('A:ARG:79', 'A:ARG:75', 'A:GLU:19', 0, 0, 0, 2, 1, 1, 3, 5, 3), ('A:ARG:79', 'A:ARG:75', 'A:GLU:19', 0, 0, 0, 2, 1, 1, 3, 5, 3))], ('GLU', 'ALA', 'GLU', 0, 0, 0, 2, 2, 2, 3, 4, 3): [(('A:GLU:161', 'A:ALA:158', 'A:GLU:163', 0, 0, 0, 2, 2, 2, 3, 4, 3), ('A:GLU:154', 'A:ALA:158', 'A:GLU:161', 0, 0, 0, 2, 2, 2, 3, 4, 3)), (('A:GLU:154', 'A:ALA:158', 'A:GLU:161', 0, 0, 0, 2, 2, 2, 3, 4, 3), ('A:GLU:154', 'A:ALA:158', 'A:GLU:161', 0, 0, 0, 2, 2, 2, 3, 4, 3))], ('GLU', 'GLY', 'GLU', 0, 0, 0, 2, 1, 1, 2, 4, 2): [(('A:GLU:161', 'A:GLY:162', 'A:GLU:163', 0, 0, 0, 2, 1, 1, 2, 4, 2), ('A:GLU:161', 'A:GLY:162', 'A:GLU:163', 0, 0, 0, 2, 1, 1, 2, 4, 2)), (('A:GLU:161', 'A:GLY:162', 'A:GLU:163', 0, 0, 0, 2, 1, 1, 2, 4, 2), ('A:GLU:161', 'A:GLY:162', 'A:GLU:166', 0, 0, 0, 2, 1, 1, 2, 4, 2))], ('GLU', 'ARG', 'TYR', 0, 0, 0, 2, 1, 1, 3, 5, 3): [(('A:GLU:161', 'A:ARG:157', 'A:TYR:155', 0, 0, 0, 2, 1, 1, 3, 5, 3), ('A:GLU:161', 'A:ARG:157', 'A:TYR:155', 0, 0, 0, 2, 1, 1, 3, 5, 3))], ('GLN', 'VAL', 'THR', 0, 0, 0, 2, 1, 1, 3, 5, 3): [(('A:GLN:72', 'A:VAL:76', 'A:THR:80', 0, 0, 0, 2, 1, 1, 3, 5, 3), ('A:GLN:72', 'A:VAL:76', 'A:THR:80', 0, 0, 0, 2, 1, 1, 3, 5, 3))], ('GLN', 'GLY', 'GLU', 0, 0, 0, 2, 1, 1, 3, 5, 3): [(('A:GLN:54', 'A:GLY:56', 'A:GLU:58', 0, 0, 0, 2, 1, 1, 3, 5, 3), ('A:GLN:54', 'A:GLY:56', 'A:GLU:58', 0, 0, 0, 2, 1, 1, 3, 5, 3))], ('GLU', 'GLY', 'GLU', 0, 0, 0, 2, 1, 1, 3, 4, 2): [(('A:GLU:148', 'A:GLY:151', 'A:GLU:154', 0, 0, 0, 2, 1, 1, 3, 4, 2), ('A:GLU:148', 'A:GLY:151', 
'A:GLU:154', 0, 0, 0, 2, 1, 1, 3, 4, 2))], ('GLU', 'ARG', 'ILE', 0, 0, 0, 2, 1, 1, 3, 4, 3): [(('A:GLU:148', 'A:ARG:144', 'A:ILE:142', 0, 0, 0, 2, 1, 1, 3, 4, 3), ('A:GLU:148', 'A:ARG:144', 'A:ILE:142', 0, 0, 0, 2, 1, 1, 3, 4, 3))], ('ALA', 'GLY', 'VAL', 0, 0, 0, 2, 1, 1, 3, 5, 2): [(('A:ALA:158', 'A:GLY:162', 'A:VAL:165', 0, 0, 0, 2, 1, 1, 3, 5, 2), ('A:ALA:158', 'A:GLY:162', 'A:VAL:165', 0, 0, 0, 2, 1, 1, 3, 5, 2))], ('GLY', 'TYR', 'THR', 0, 0, 0, 2, 1, 1, 2, 3, 3): [(('A:GLY:83', 'A:TYR:84', 'A:THR:80', 0, 0, 0, 2, 1, 1, 2, 3, 3), ('A:GLY:83', 'A:TYR:84', 'A:THR:80', 0, 0, 0, 2, 1, 1, 2, 3, 3))], ('ARG', 'VAL', 'THR', 0, 0, 0, 2, 1, 1, 3, 2, 3): [(('A:ARG:79', 'A:VAL:76', 'A:THR:80', 0, 0, 0, 2, 1, 1, 3, 2, 3), ('A:ARG:79', 'A:VAL:76', 'A:THR:80', 0, 0, 0, 2, 1, 1, 3, 2, 3))], ('GLY', 'THR', 'TYR', 0, 0, 0, 2, 1, 1, 3, 2, 3): [(('A:GLY:83', 'A:THR:80', 'A:TYR:84', 0, 0, 0, 2, 1, 1, 3, 2, 3), ('A:GLY:83', 'A:THR:80', 'A:TYR:84', 0, 0, 0, 2, 1, 1, 3, 2, 3))], ('GLY', 'TYR', 'TYR', 0, 0, 0, 2, 1, 1, 2, 3, 2): [(('A:GLY:83', 'A:TYR:84', 'A:TYR:85', 0, 0, 0, 2, 1, 1, 2, 3, 2), ('A:GLY:83', 'A:TYR:84', 'A:TYR:85', 0, 0, 0, 2, 1, 1, 2, 3, 2))], ('ARG', 'ILE', 'ARG', 0, 0, 0, 1, 1, 1, 3, 2, 3): [(('A:ARG:144', 'A:ILE:142', 'A:ARG:145', 0, 0, 0, 1, 1, 1, 3, 2, 3), ('A:ARG:144', 'A:ILE:142', 'A:ARG:145', 0, 0, 0, 1, 1, 1, 3, 2, 3))], ('ARG', 'GLU', 'VAL', 0, 0, 0, 1, 2, 2, 3, 5, 3): [(('A:ARG:157', 'A:GLU:161', 'A:VAL:165', 0, 0, 0, 1, 2, 2, 3, 5, 3), ('A:ARG:157', 'A:GLU:161', 'A:VAL:165', 0, 0, 0, 1, 2, 2, 3, 5, 3))], ('TYR', 'GLY', 'TYR', 0, 0, 0, 1, 2, 2, 2, 2, 3): [(('A:TYR:84', 'A:GLY:83', 'A:TYR:85', 0, 0, 0, 1, 2, 2, 2, 2, 3), ('A:TYR:84', 'A:GLY:83', 'A:TYR:85', 0, 0, 0, 1, 2, 2, 2, 2, 3))], ('GLY', 'ALA', 'TYR', 0, 0, 0, 1, 2, 2, 3, 5, 3): [(('A:GLY:162', 'A:ALA:158', 'A:TYR:155', 0, 0, 0, 1, 2, 2, 3, 5, 3), ('A:GLY:162', 'A:ALA:158', 'A:TYR:155', 0, 0, 0, 1, 2, 2, 3, 5, 3))], ('ALA', 'GLU', 'TYR', 0, 0, 0, 2, 2, 2, 3, 3, 2): [(('A:ALA:158', 'A:GLU:154', 
'A:TYR:155', 0, 0, 0, 2, 2, 2, 3, 3, 2), ('A:ALA:158', 'A:GLU:154', 'A:TYR:155', 0, 0, 0, 2, 2, 2, 3, 3, 2))], ('GLY', 'GLN', 'VAL', 0, 0, 0, 1, 2, 2, 2, 4, 3): [(('A:GLY:69', 'A:GLN:72', 'A:VAL:76', 0, 0, 0, 1, 2, 2, 2, 4, 3), ('A:GLY:69', 'A:GLN:72', 'A:VAL:76', 0, 0, 0, 1, 2, 2, 2, 4, 3))], ('GLU', 'VAL', 'HIS', 0, 0, 0, 2, 1, 1, 3, 5, 3): [(('A:GLU:161', 'A:VAL:165', 'A:HIS:169', 0, 0, 0, 2, 1, 1, 3, 5, 3), ('A:GLU:161', 'A:VAL:165', 'A:HIS:169', 0, 0, 0, 2, 1, 1, 3, 5, 3))], ('GLU', 'GLN', 'LYS', 0, 0, 0, 1, 2, 2, 2, 3, 3): [(('A:GLU:71', 'A:GLN:72', 'A:LYS:68', 0, 0, 0, 1, 2, 2, 2, 3, 3), ('A:GLU:71', 'A:GLN:72', 'A:LYS:68', 0, 0, 0, 1, 2, 2, 2, 3, 3))], ('GLU', 'VAL', 'GLU', 0, 0, 0, 2, 1, 1, 3, 4, 3): [(('A:GLU:161', 'A:VAL:165', 'A:GLU:163', 0, 0, 0, 2, 1, 1, 3, 4, 3), ('A:GLU:161', 'A:VAL:165', 'A:GLU:163', 0, 0, 0, 2, 1, 1, 3, 4, 3))], ('ARG', 'GLU', 'GLY', 0, 0, 0, 1, 2, 2, 3, 4, 3): [(('A:ARG:144', 'A:GLU:148', 'A:GLY:151', 0, 0, 0, 1, 2, 2, 3, 4, 3), ('A:ARG:62', 'A:GLU:58', 'A:GLY:56', 0, 0, 0, 1, 2, 2, 3, 4, 3)), (('A:ARG:144', 'A:GLU:148', 'A:GLY:151', 0, 0, 0, 1, 2, 2, 3, 4, 3), ('A:ARG:144', 'A:GLU:148', 'A:GLY:151', 0, 0, 0, 1, 2, 2, 3, 4, 3))], ('ARG', 'ARG', 'THR', 0, 0, 0, 1, 2, 2, 3, 4, 2): [(('A:ARG:75', 'A:ARG:79', 'A:THR:80', 0, 0, 0, 1, 2, 2, 3, 4, 2), ('A:ARG:75', 'A:ARG:79', 'A:THR:80', 0, 0, 0, 1, 2, 2, 3, 4, 2))], ('ARG', 'ILE', 'TYR', 0, 0, 0, 2, 1, 1, 3, 5, 3): [(('A:ARG:145', 'A:ILE:142', 'A:TYR:84', 0, 0, 0, 2, 1, 1, 3, 5, 3), ('A:ARG:145', 'A:ILE:142', 'A:TYR:84', 0, 0, 0, 2, 1, 1, 3, 5, 3))], ('GLY', 'TYR', 'ILE', 0, 0, 0, 2, 1, 1, 2, 4, 3): [(('A:GLY:83', 'A:TYR:84', 'A:ILE:142', 0, 0, 0, 2, 1, 1, 2, 4, 3), ('A:GLY:83', 'A:TYR:84', 'A:ILE:142', 0, 0, 0, 2, 1, 1, 2, 4, 3))], ('GLU', 'GLY', 'VAL', 0, 0, 0, 2, 1, 1, 2, 3, 2): [(('A:GLU:161', 'A:GLY:162', 'A:VAL:165', 0, 0, 0, 2, 1, 1, 2, 3, 2), ('A:GLU:161', 'A:GLY:162', 'A:VAL:165', 0, 0, 0, 2, 1, 1, 2, 3, 2)), (('A:GLU:163', 'A:GLY:162', 'A:VAL:165', 0, 0, 0, 2, 1, 1, 2, 3, 2), 
('A:GLU:161', 'A:GLY:162', 'A:VAL:165', 0, 0, 0, 2, 1, 1, 2, 3, 2))], ('GLN', 'ARG', 'GLU', 0, 0, 0, 2, 1, 1, 2, 4, 3): [(('A:GLN:72', 'A:ARG:75', 'A:GLU:19', 0, 0, 0, 2, 1, 1, 2, 4, 3), ('A:GLN:65', 'A:ARG:62', 'A:GLU:58', 0, 0, 0, 2, 1, 1, 2, 4, 3))], ('ALA', 'GLU', 'LEU', 0, 0, 0, 2, 2, 2, 3, 5, 3): [(('A:ALA:158', 'A:GLU:161', 'A:LEU:109', 0, 0, 0, 2, 2, 2, 3, 5, 3), ('A:ALA:158', 'A:GLU:161', 'A:LEU:109', 0, 0, 0, 2, 2, 2, 3, 5, 3))], ('ARG', 'VAL', 'TRP', 0, 0, 0, 2, 1, 1, 3, 6, 3): [(('A:ARG:108', 'A:VAL:165', 'A:TRP:167', 0, 0, 0, 2, 1, 1, 3, 6, 3), ('A:ARG:108', 'A:VAL:165', 'A:TRP:167', 0, 0, 0, 2, 1, 1, 3, 6, 3))], ('ARG', 'ARG', 'ILE', 0, 0, 0, 1, 2, 2, 2, 3, 3): [(('A:ARG:144', 'A:ARG:145', 'A:ILE:142', 0, 0, 0, 1, 2, 2, 2, 3, 3), ('A:ARG:144', 'A:ARG:145', 'A:ILE:142', 0, 0, 0, 1, 2, 2, 2, 3, 3))], ('GLU', 'LYS', 'ILE', 0, 0, 0, 2, 1, 1, 3, 4, 3): [(('A:GLU:148', 'A:LYS:146', 'A:ILE:142', 0, 0, 0, 2, 1, 1, 3, 4, 3), ('A:GLU:148', 'A:LYS:146', 'A:ILE:142', 0, 0, 0, 2, 1, 1, 3, 4, 3))], ('ARG', 'LYS', 'ILE', 0, 0, 0, 2, 1, 1, 2, 3, 3): [(('A:ARG:145', 'A:LYS:146', 'A:ILE:142', 0, 0, 0, 2, 1, 1, 2, 3, 3), ('A:ARG:145', 'A:LYS:146', 'A:ILE:142', 0, 0, 0, 2, 1, 1, 2, 3, 3))], ('ARG', 'VAL', 'THR', 0, 0, 0, 1, 1, 1, 2, 4, 3): [(('A:ARG:75', 'A:VAL:76', 'A:THR:80', 0, 0, 0, 1, 1, 1, 2, 4, 3), ('A:ARG:75', 'A:VAL:76', 'A:THR:80', 0, 0, 0, 1, 1, 1, 2, 4, 3))], ('GLU', 'ALA', 'GLY', 0, 0, 0, 2, 2, 2, 3, 5, 3): [(('A:GLU:154', 'A:ALA:158', 'A:GLY:162', 0, 0, 0, 2, 2, 2, 3, 5, 3), ('A:GLU:154', 'A:ALA:158', 'A:GLY:162', 0, 0, 0, 2, 2, 2, 3, 5, 3))], ('GLN', 'GLY', 'LYS', 0, 0, 0, 2, 1, 1, 2, 3, 2): [(('A:GLN:72', 'A:GLY:69', 'A:LYS:68', 0, 0, 0, 2, 1, 1, 2, 3, 2), ('A:GLN:72', 'A:GLY:69', 'A:LYS:68', 0, 0, 0, 2, 1, 1, 2, 3, 2))], ('ARG', 'ARG', 'GLN', 0, 0, 0, 1, 2, 2, 2, 4, 3): [(('A:ARG:144', 'A:ARG:145', 'A:GLN:149', 0, 0, 0, 1, 2, 2, 2, 4, 3), ('A:ARG:144', 'A:ARG:145', 'A:GLN:149', 0, 0, 0, 1, 2, 2, 2, 4, 3))], ('ARG', 'ILE', 'TYR', 0, 0, 0, 1, 1, 1, 3, 5, 
3): [(('A:ARG:144', 'A:ILE:142', 'A:TYR:84', 0, 0, 0, 1, 1, 1, 3, 5, 3), ('A:ARG:144', 'A:ILE:142', 'A:TYR:84', 0, 0, 0, 1, 1, 1, 3, 5, 3))], ('GLU', 'ARG', 'VAL', 0, 0, 0, 1, 1, 1, 3, 4, 2): [(('A:GLU:71', 'A:ARG:75', 'A:VAL:76', 0, 0, 0, 1, 1, 1, 3, 4, 2), ('A:GLU:71', 'A:ARG:75', 'A:VAL:76', 0, 0, 0, 1, 1, 1, 3, 4, 2))], ('GLY', 'VAL', 'TRP', 0, 0, 0, 1, 1, 1, 2, 4, 3): [(('A:GLY:162', 'A:VAL:165', 'A:TRP:167', 0, 0, 0, 1, 1, 1, 2, 4, 3), ('A:GLY:162', 'A:VAL:165', 'A:TRP:167', 0, 0, 0, 1, 1, 1, 2, 4, 3))], ('ARG', 'ALA', 'GLY', 0, 0, 0, 1, 2, 2, 2, 4, 3): [(('A:ARG:157', 'A:ALA:158', 'A:GLY:162', 0, 0, 0, 1, 2, 2, 2, 4, 3), ('A:ARG:157', 'A:ALA:158', 'A:GLY:162', 0, 0, 0, 1, 2, 2, 2, 4, 3))], ('GLU', 'GLY', 'TYR', 0, 0, 0, 2, 1, 1, 3, 5, 3): [(('A:GLU:148', 'A:GLY:151', 'A:TYR:155', 0, 0, 0, 2, 1, 1, 3, 5, 3), ('A:GLU:148', 'A:GLY:151', 'A:TYR:155', 0, 0, 0, 2, 1, 1, 3, 5, 3))], ('ARG', 'GLU', 'GLN', 0, 0, 0, 1, 2, 2, 3, 4, 2): [(('A:ARG:144', 'A:GLU:148', 'A:GLN:149', 0, 0, 0, 1, 2, 2, 3, 4, 2), ('A:ARG:144', 'A:GLU:148', 'A:GLN:149', 0, 0, 0, 1, 2, 2, 3, 4, 2))], ('GLY', 'GLN', 'LYS', 0, 0, 0, 1, 2, 2, 3, 2, 2): [(('A:GLY:69', 'A:GLN:65', 'A:LYS:68', 0, 0, 0, 1, 2, 2, 3, 2, 2), ('A:GLY:69', 'A:GLN:65', 'A:LYS:68', 0, 0, 0, 1, 2, 2, 3, 2, 2))], ('ARG', 'GLU', 'LYS', 0, 0, 0, 1, 2, 2, 3, 3, 3): [(('A:ARG:144', 'A:GLU:148', 'A:LYS:146', 0, 0, 0, 1, 2, 2, 3, 3, 3), ('A:ARG:144', 'A:GLU:148', 'A:LYS:146', 0, 0, 0, 1, 2, 2, 3, 3, 3))], ('GLU', 'ALA', 'GLY', 0, 0, 0, 2, 2, 2, 3, 2, 3): [(('A:GLU:161', 'A:ALA:158', 'A:GLY:162', 0, 0, 0, 2, 2, 2, 3, 2, 3), ('A:GLU:161', 'A:ALA:158', 'A:GLY:162', 0, 0, 0, 2, 2, 2, 3, 2, 3)), (('A:GLU:163', 'A:ALA:158', 'A:GLY:162', 0, 0, 0, 2, 2, 2, 3, 2, 3), ('A:GLU:161', 'A:ALA:158', 'A:GLY:162', 0, 0, 0, 2, 2, 2, 3, 2, 3))], ('GLN', 'GLY', 'GLN', 0, 0, 0, 2, 1, 1, 3, 5, 2): [(('A:GLN:65', 'A:GLY:69', 'A:GLN:72', 0, 0, 0, 2, 1, 1, 3, 5, 2), ('A:GLN:65', 'A:GLY:69', 'A:GLN:72', 0, 0, 0, 2, 1, 1, 3, 5, 2))], ('ALA', 'TYR', 'ARG', 0, 0, 
0, 2, 1, 1, 3, 2, 3): [(('A:ALA:158', 'A:TYR:155', 'A:ARG:157', 0, 0, 0, 2, 1, 1, 3, 2, 3), ('A:ALA:158', 'A:TYR:155', 'A:ARG:157', 0, 0, 0, 2, 1, 1, 3, 2, 3))], ('THR', 'TYR', 'TYR', 0, 0, 0, 1, 1, 1, 3, 4, 2): [(('A:THR:80', 'A:TYR:84', 'A:TYR:85', 0, 0, 0, 1, 1, 1, 3, 4, 2), ('A:THR:80', 'A:TYR:84', 'A:TYR:85', 0, 0, 0, 1, 1, 1, 3, 4, 2))], ('ARG', 'TYR', 'GLY', 0, 0, 0, 1, 1, 1, 3, 4, 3): [(('A:ARG:157', 'A:TYR:155', 'A:GLY:151', 0, 0, 0, 1, 1, 1, 3, 4, 3), ('A:ARG:157', 'A:TYR:155', 'A:GLY:151', 0, 0, 0, 1, 1, 1, 3, 4, 3))], ('ARG', 'ILE', 'LYS', 0, 0, 0, 2, 1, 1, 3, 2, 3): [(('A:ARG:145', 'A:ILE:142', 'A:LYS:146', 0, 0, 0, 2, 1, 1, 3, 2, 3), ('A:ARG:145', 'A:ILE:142', 'A:LYS:146', 0, 0, 0, 2, 1, 1, 3, 2, 3))], ('ALA', 'TYR', 'GLU', 0, 0, 0, 2, 1, 1, 3, 3, 2): [(('A:ALA:158', 'A:TYR:155', 'A:GLU:154', 0, 0, 0, 2, 1, 1, 3, 3, 2), ('A:ALA:158', 'A:TYR:155', 'A:GLU:154', 0, 0, 0, 2, 1, 1, 3, 3, 2))], ('THR', 'GLY', 'TYR', 0, 0, 0, 1, 2, 2, 3, 3, 2): [(('A:THR:80', 'A:GLY:83', 'A:TYR:84', 0, 0, 0, 1, 2, 2, 3, 3, 2), ('A:THR:80', 'A:GLY:83', 'A:TYR:84', 0, 0, 0, 1, 2, 2, 3, 3, 2))], ('ILE', 'TYR', 'THR', 0, 0, 0, 1, 1, 1, 3, 4, 3): [(('A:ILE:142', 'A:TYR:84', 'A:THR:80', 0, 0, 0, 1, 1, 1, 3, 4, 3), ('A:ILE:142', 'A:TYR:84', 'A:THR:80', 0, 0, 0, 1, 1, 1, 3, 4, 3))], ('ARG', 'ARG', 'ILE', 0, 0, 0, 2, 1, 1, 2, 3, 3): [(('A:ARG:145', 'A:ARG:144', 'A:ILE:142', 0, 0, 0, 2, 1, 1, 2, 3, 3), ('A:ARG:145', 'A:ARG:144', 'A:ILE:142', 0, 0, 0, 2, 1, 1, 2, 3, 3))], ('GLU', 'VAL', 'TRP', 0, 0, 0, 2, 1, 1, 2, 2, 3): [(('A:GLU:166', 'A:VAL:165', 'A:TRP:167', 0, 0, 0, 2, 1, 1, 2, 2, 3), ('A:GLU:166', 'A:VAL:165', 'A:TRP:167', 0, 0, 0, 2, 1, 1, 2, 2, 3))], ('ILE', 'TYR', 'TYR', 0, 0, 0, 1, 1, 1, 3, 4, 2): [(('A:ILE:142', 'A:TYR:84', 'A:TYR:85', 0, 0, 0, 1, 1, 1, 3, 4, 2), ('A:ILE:142', 'A:TYR:84', 'A:TYR:85', 0, 0, 0, 1, 1, 1, 3, 4, 2))], ('ARG', 'GLU', 'LEU', 0, 0, 0, 1, 2, 2, 3, 4, 3): [(('A:ARG:157', 'A:GLU:161', 'A:LEU:109', 0, 0, 0, 1, 2, 2, 3, 4, 3), ('A:ARG:157', 'A:GLU:161', 
'A:LEU:109', 0, 0, 0, 1, 2, 2, 3, 4, 3))], ('THR', 'GLY', 'TYR', 0, 0, 0, 1, 2, 2, 3, 4, 3): [(('A:THR:80', 'A:GLY:83', 'A:TYR:85', 0, 0, 0, 1, 2, 2, 3, 4, 3), ('A:THR:80', 'A:GLY:83', 'A:TYR:85', 0, 0, 0, 1, 2, 2, 3, 4, 3))], ('GLY', 'GLU', 'LYS', 0, 0, 0, 1, 2, 2, 3, 4, 3): [(('A:GLY:151', 'A:GLU:148', 'A:LYS:146', 0, 0, 0, 1, 2, 2, 3, 4, 3), ('A:GLY:151', 'A:GLU:148', 'A:LYS:146', 0, 0, 0, 1, 2, 2, 3, 4, 3))], ('GLY', 'VAL', 'HIS', 0, 0, 0, 1, 1, 1, 2, 4, 3): [(('A:GLY:162', 'A:VAL:165', 'A:HIS:169', 0, 0, 0, 1, 1, 1, 2, 4, 3), ('A:GLY:162', 'A:VAL:165', 'A:HIS:169', 0, 0, 0, 1, 1, 1, 2, 4, 3))], ('ILE', 'ARG', 'LYS', 0, 0, 0, 1, 2, 2, 3, 3, 2): [(('A:ILE:142', 'A:ARG:145', 'A:LYS:146', 0, 0, 0, 1, 2, 2, 3, 3, 2), ('A:ILE:142', 'A:ARG:145', 'A:LYS:146', 0, 0, 0, 1, 2, 2, 3, 3, 2))], ('ARG', 'LEU', 'GLU', 0, 0, 0, 2, 1, 1, 2, 4, 3): [(('A:ARG:108', 'A:LEU:109', 'A:GLU:161', 0, 0, 0, 2, 1, 1, 2, 4, 3), ('A:ARG:108', 'A:LEU:109', 'A:GLU:161', 0, 0, 0, 2, 1, 1, 2, 4, 3))], ('GLU', 'VAL', 'GLU', 0, 0, 0, 2, 1, 1, 3, 4, 2): [(('A:GLU:161', 'A:VAL:165', 'A:GLU:166', 0, 0, 0, 2, 1, 1, 3, 4, 2), ('A:GLU:161', 'A:VAL:165', 'A:GLU:166', 0, 0, 0, 2, 1, 1, 3, 4, 2))], ('GLU', 'VAL', 'GLY', 0, 0, 0, 2, 1, 1, 3, 2, 2): [(('A:GLU:161', 'A:VAL:165', 'A:GLY:162', 0, 0, 0, 2, 1, 1, 3, 2, 2), ('A:GLU:161', 'A:VAL:165', 'A:GLY:162', 0, 0, 0, 2, 1, 1, 3, 2, 2)), (('A:GLU:163', 'A:VAL:165', 'A:GLY:162', 0, 0, 0, 2, 1, 1, 3, 2, 2), ('A:GLU:161', 'A:VAL:165', 'A:GLY:162', 0, 0, 0, 2, 1, 1, 3, 2, 2))], ('ARG', 'ALA', 'GLU', 0, 0, 0, 1, 2, 2, 2, 3, 3): [(('A:ARG:157', 'A:ALA:158', 'A:GLU:161', 0, 0, 0, 1, 2, 2, 2, 3, 3), ('A:ARG:157', 'A:ALA:158', 'A:GLU:161', 0, 0, 0, 1, 2, 2, 2, 3, 3)), (('A:ARG:157', 'A:ALA:158', 'A:GLU:154', 0, 0, 0, 1, 2, 2, 2, 3, 3), ('A:ARG:157', 'A:ALA:158', 'A:GLU:161', 0, 0, 0, 1, 2, 2, 2, 3, 3))], ('ARG', 'THR', 'GLY', 0, 0, 0, 2, 1, 1, 2, 3, 3): [(('A:ARG:79', 'A:THR:80', 'A:GLY:83', 0, 0, 0, 2, 1, 1, 2, 3, 3), ('A:ARG:79', 'A:THR:80', 'A:GLY:83', 0, 0, 0, 2, 
1, 1, 2, 3, 3))], ('GLU', 'GLY', 'TYR', 0, 0, 0, 2, 1, 1, 2, 2, 3): [(('A:GLU:154', 'A:GLY:151', 'A:TYR:155', 0, 0, 0, 2, 1, 1, 2, 2, 3), ('A:GLU:154', 'A:GLY:151', 'A:TYR:155', 0, 0, 0, 2, 1, 1, 2, 2, 3))], ('TYR', 'THR', 'VAL', 0, 0, 0, 1, 1, 1, 3, 5, 3): [(('A:TYR:84', 'A:THR:80', 'A:VAL:76', 0, 0, 0, 1, 1, 1, 3, 5, 3), ('A:TYR:84', 'A:THR:80', 'A:VAL:76', 0, 0, 0, 1, 1, 1, 3, 5, 3))], ('GLU', 'GLN', 'VAL', 0, 0, 0, 1, 2, 2, 2, 4, 3): [(('A:GLU:71', 'A:GLN:72', 'A:VAL:76', 0, 0, 0, 1, 2, 2, 2, 4, 3), ('A:GLU:71', 'A:GLN:72', 'A:VAL:76', 0, 0, 0, 1, 2, 2, 2, 4, 3))], ('GLU', 'ALA', 'TYR', 0, 0, 0, 2, 2, 2, 3, 2, 3): [(('A:GLU:154', 'A:ALA:158', 'A:TYR:155', 0, 0, 0, 2, 2, 2, 3, 2, 3), ('A:GLU:154', 'A:ALA:158', 'A:TYR:155', 0, 0, 0, 2, 2, 2, 3, 2, 3))], ('ALA', 'TYR', 'GLY', 0, 0, 0, 2, 1, 1, 3, 4, 3): [(('A:ALA:158', 'A:TYR:155', 'A:GLY:151', 0, 0, 0, 2, 1, 1, 3, 4, 3), ('A:ALA:158', 'A:TYR:155', 'A:GLY:151', 0, 0, 0, 2, 1, 1, 3, 4, 3))], ('ARG', 'VAL', 'ARG', 0, 0, 0, 1, 1, 1, 2, 3, 3): [(('A:ARG:75', 'A:VAL:76', 'A:ARG:79', 0, 0, 0, 1, 1, 1, 2, 3, 3), ('A:ARG:75', 'A:VAL:76', 'A:ARG:79', 0, 0, 0, 1, 1, 1, 2, 3, 3))], ('ALA', 'ARG', 'TYR', 0, 0, 0, 2, 1, 1, 2, 3, 3): [(('A:ALA:158', 'A:ARG:157', 'A:TYR:155', 0, 0, 0, 2, 1, 1, 2, 3, 3), ('A:ALA:158', 'A:ARG:157', 'A:TYR:155', 0, 0, 0, 2, 1, 1, 2, 3, 3))], ('GLY', 'TYR', 'TYR', 0, 0, 0, 2, 1, 1, 3, 2, 2): [(('A:GLY:83', 'A:TYR:85', 'A:TYR:84', 0, 0, 0, 2, 1, 1, 3, 2, 2), ('A:GLY:83', 'A:TYR:85', 'A:TYR:84', 0, 0, 0, 2, 1, 1, 3, 2, 2))], ('ARG', 'ARG', 'LYS', 0, 0, 0, 1, 2, 2, 2, 3, 2): [(('A:ARG:144', 'A:ARG:145', 'A:LYS:146', 0, 0, 0, 1, 2, 2, 2, 3, 2), ('A:ARG:144', 'A:ARG:145', 'A:LYS:146', 0, 0, 0, 1, 2, 2, 2, 3, 2))], ('GLY', 'GLU', 'VAL', 0, 0, 0, 1, 2, 2, 2, 2, 3): [(('A:GLY:162', 'A:GLU:161', 'A:VAL:165', 0, 0, 0, 1, 2, 2, 2, 2, 3), ('A:GLY:162', 'A:GLU:161', 'A:VAL:165', 0, 0, 0, 1, 2, 2, 2, 2, 3)), (('A:GLY:162', 'A:GLU:163', 'A:VAL:165', 0, 0, 0, 1, 2, 2, 2, 2, 3), ('A:GLY:162', 'A:GLU:161', 
'A:VAL:165', 0, 0, 0, 1, 2, 2, 2, 2, 3))], ('GLN', 'GLY', 'LYS', 0, 0, 0, 2, 1, 1, 3, 2, 2): [(('A:GLN:65', 'A:GLY:69', 'A:LYS:68', 0, 0, 0, 2, 1, 1, 3, 2, 2), ('A:GLN:65', 'A:GLY:69', 'A:LYS:68', 0, 0, 0, 2, 1, 1, 3, 2, 2))], ('GLY', 'THR', 'VAL', 0, 0, 0, 2, 1, 1, 3, 5, 3): [(('A:GLY:83', 'A:THR:80', 'A:VAL:76', 0, 0, 0, 2, 1, 1, 3, 5, 3), ('A:GLY:83', 'A:THR:80', 'A:VAL:76', 0, 0, 0, 2, 1, 1, 3, 5, 3))], ('ARG', 'THR', 'TYR', 0, 0, 0, 2, 1, 1, 2, 4, 3): [(('A:ARG:79', 'A:THR:80', 'A:TYR:84', 0, 0, 0, 2, 1, 1, 2, 4, 3), ('A:ARG:79', 'A:THR:80', 'A:TYR:84', 0, 0, 0, 2, 1, 1, 2, 4, 3))], ('ARG', 'ARG', 'VAL', 0, 0, 0, 1, 2, 2, 3, 2, 3): [(('A:ARG:75', 'A:ARG:79', 'A:VAL:76', 0, 0, 0, 1, 2, 2, 3, 2, 3), ('A:ARG:75', 'A:ARG:79', 'A:VAL:76', 0, 0, 0, 1, 2, 2, 3, 2, 3))]}

In [None]:
def find_residue(data, residue):
    """
    Select the entries of ``data`` that mention ``residue``.

    An entry is kept when ``residue`` is a member of its key, or when at least one of
    its pairs has ``residue`` as a member of either tuple. Membership is tested with
    ``in``, so inside a tuple the match must be exact (e.g. ``'A:GLY:162'``, not
    ``'GLY'``).

    Args:
        data (dict): Mapping from a key tuple to a list of ``(tuple, tuple)`` pairs.
        residue (str): Value to look for (e.g. ``'GLY'`` or ``'A:GLY:162'``).

    Returns:
        dict: Sub-dictionary with the matching keys. Each value holds only the pairs
        that contain ``residue``; when just the key matches, the original list of
        pairs is returned unchanged.
    """
    def _mentions(pair):
        return residue in pair[0] or residue in pair[1]

    matches = {}
    for key, pairs in data.items():
        # Does the key itself contain the residue?
        in_key = residue in key
        # Keep only the pairs where one of the two tuples contains the residue.
        hits = list(filter(_mentions, pairs))

        if hits:
            matches[key] = hits
        elif in_key:
            matches[key] = pairs

    return matches


# Example usage:
# A full residue label is matched against the entries inside each paired tuple;
# the keys of `data` hold bare residue names, so they are not expected to match it.
residue = "A:ALA:158"
filtered = find_residue(data, residue)

# Print every matching key followed by its (filtered) pairs, one pair per line.
for k, v in filtered.items():
    print(f"Chave: {k}")
    for pair in v:
        print("   ", pair)





DEBUG:CRSProtein:Found triad: ('ASP', 'LEU', 'TRP', 0, 0, 0, 1, 1, 1, 3, 4, 2) | ('C:ASP:4', 'C:LEU:6', 'C:TRP:7', 0, 0, 0, 1, 1, 1, 3, 4, 2)

DEBUG:CRSProtein:Found triad: ('ASP', 'LEU', 'TRP', 0, 0, 0, 1, 1, 1, 3, 5, 3) | ('C:ASP:4', 'C:LEU:6', 'C:TRP:8', 0, 0, 0, 1, 1, 1, 3, 5, 3)

INFO:root:N Nodes: 49 | N Edges: 148 | N Triad: 349 | Unique Triad: 332
DEBUG:root:Counters: {1: 316, 2: 15, 3: 1}
DEBUG:CRSProtein:Found triad: ('TYR', 'ASP', 'VAL', 0, 0, 0, 1, 2, 2, 3, 6, 3) | ('A:TYR:155', 'C:ASP:4', 'A:VAL:66', 0, 0, 0, 1, 2, 2, 3, 6, 3)

DEBUG:CRSProtein:Found triad: ('TYR', 'ASP', 'VAL', 0, 0, 0, 1, 2, 2, 3, 4, 2) | ('A:TYR:155', 'C:ASP:4', 'C:VAL:5', 0, 0, 0, 1, 2, 2, 3, 4, 2)

DEBUG:CRSProtein:Found triad: ('VAL', 'ASP', 'VAL', 0, 0, 0, 1, 2, 2, 3, 3, 2) | ('A:VAL:66', 'C:ASP:4', 'C:VAL:5', 0, 0, 0, 1, 2, 2, 3, 3, 2)

DEBUG:CRSProtein:Found triad: ('VAL', 'VAL', 'ASP', 0, 0, 0, 1, 1, 1, 3, 3, 2) | ('A:VAL:66', 'C:VAL:5', 'C:ASP:4', 0, 0, 0, 1, 1, 1, 3, 3, 2)

DEBUG:CRSProtein:Found triad: ('ASP', 'VAL', 'MET', 0, 0, 0, 2, 1, 1, 2, 4, 3) | ('C:ASP:4', 'C:VAL:5', 'C:MET:7', 0, 0, 0, 2, 1, 1, 2, 4, 3)

DEBUG:CRSProtein:Found triad: ('GLY', 'VAL', 'ASP', 0, 0, 0, 1, 1, 1, 3, 4, 2) | ('A:GLY:69', 'C:VAL:5', 'C:ASP:4', 0, 0, 0, 1, 1, 1, 3, 4, 2)

DEBUG:CRSProtein:Found triad: ('GLY', 'TYR', 'ASP', 0, 0, 0, 1, 1, 1, 3, 5, 3) | ('A:GLY:151', 'A:TYR:155', 'C:ASP:4', 0, 0, 0, 1, 1, 1, 3, 5, 3)

DEBUG:CRSProtein:Found triad: ('ASP', 'TYR', 'MET', 0, 0, 0, 2, 1, 1, 3, 4, 3) | ('C:ASP:4', 'A:TYR:155', 'C:MET:7', 0, 0, 0, 2, 1, 1, 3, 4, 3)

DEBUG:CRSProtein:Found triad: ('ALA', 'TYR', 'ASP', 0, 0, 0, 2, 1, 1, 3, 3, 3) | ('A:ALA:158', 'A:TYR:155', 'C:ASP:4', 0, 0, 0, 2, 1, 1, 3, 3, 3)

DEBUG:CRSProtein:Found triad: ('GLU', 'TYR', 'ASP', 0, 0, 0, 2, 1, 1, 2, 5, 3) | ('A:GLU:154', 'A:TYR:155', 'C:ASP:4', 0, 0, 0, 2, 1, 1, 2, 5, 3)

DEBUG:CRSProtein:Found triad: ('ARG', 'TYR', 'ASP', 0, 0, 0, 1, 1, 1, 3, 5, 3) | ('A:ARG:157', 'A:TYR:155', 'C:ASP:4', 0, 0, 0, 1, 1, 1, 3, 5, 3)

DEBUG:CRSProtein:Found triad: ('GLN', 'VAL', 'ASP', 0, 0, 0, 2, 1, 1, 2, 5, 3) | ('A:GLN:65', 'A:VAL:66', 'C:ASP:4', 0, 0, 0, 2, 1, 1, 2, 5, 3)

DEBUG:CRSProtein:Found triad: ('ASP', 'VAL', 'VAL', 0, 0, 0, 2, 1, 1, 3, 2, 3) | ('C:ASP:4', 'A:VAL:66', 'C:VAL:5', 0, 0, 0, 2, 1, 1, 3, 2, 3)

DEBUG:CRSProtein:Found triad: ('GLY', 'VAL', 'ASP', 0, 0, 0, 1, 1, 1, 3, 4, 3) | ('A:GLY:69', 'A:VAL:66', 'C:ASP:4', 0, 0, 0, 1, 1, 1, 3, 4, 3)

DEBUG:CRSProtein:Found triad: ('ARG', 'VAL', 'ASP', 0, 0, 0, 1, 1, 1, 3, 5, 3) | ('A:ARG:62', 'A:VAL:66', 'C:ASP:4', 0, 0, 0, 1, 1, 1, 3, 5, 3)

DEBUG:CRSProtein:Found triad: ('LYS', 'VAL', 'ASP', 0, 0, 0, 2, 1, 1, 3, 5, 3) | ('A:LYS:68', 'A:VAL:66', 'C:ASP:4', 0, 0, 0, 2, 1, 1, 3, 5, 3)