In [2]:
import sys; print(sys.executable)

/Users/sebastien/miniforge3/envs/klifs/bin/python


In [3]:
from bravado.client import SwaggerClient
from bravado.requests_client import RequestsClient

http = RequestsClient()
client = SwaggerClient.from_url(
    "https://klifs.net/swagger_v2/swagger.json",
    http_client=http,
    config={"also_return_response": False}
)

# Quick call: list AURKB
kinases = client.kinases.get_kinases_list(kinase_name="AURKB", species="Human").result()
print(kinases[0]["kinase.klifs_id"], kinases[0]["kinase.name"])

AttributeError: Resource kinases not found. Available resources: Information, Interactions, Ligands, Structures

In [4]:
import sys, pkgutil
print("Python:", sys.version)
print("Executable:", sys.executable)
print("Have importlib_resources?", "importlib_resources" in {m.name for m in pkgutil.iter_modules()})

import subprocess
subprocess.run([sys.executable, "-m", "pip", "show", "bravado"])
subprocess.run([sys.executable, "-m", "pip", "show", "swagger-spec-validator"])
subprocess.run([sys.executable, "-m", "pip", "show", "importlib_resources"])

Python: 3.11.13 | packaged by conda-forge | (main, Jun  4 2025, 14:52:34) [Clang 18.1.8 ]
Executable: /Users/sebastien/miniforge3/envs/klifs/bin/python
Have importlib_resources? True
Name: bravado
Version: 11.0.3
Summary: Library for accessing Swagger-enabled API's
Home-page: https://github.com/Yelp/bravado
Author: Digium, Inc. and Yelp, Inc.
Author-email: opensource+bravado@yelp.com
License: BSD 3-Clause License
Location: /Users/sebastien/miniforge3/envs/klifs/lib/python3.11/site-packages
Requires: bravado-core, monotonic, msgpack, python-dateutil, pyyaml, requests, simplejson, six, typing-extensions
Required-by: 
Name: swagger-spec-validator
Version: 3.0.4
Summary: Validation of Swagger specifications
Home-page: http://github.com/Yelp/swagger_spec_validator
Author: Yelp
Author-email: core-services@yelp.com
License: Apache License, Version 2.0
Location: /Users/sebastien/miniforge3/envs/klifs/lib/python3.11/site-packages
Requires: importlib-resources, jsonschema, pyyaml, typing-extensi

CompletedProcess(args=['/Users/sebastien/miniforge3/envs/klifs/bin/python', '-m', 'pip', 'show', 'importlib_resources'], returncode=0)

In [13]:
from bravado.client import SwaggerClient
from bravado.requests_client import RequestsClient
import pandas as pd
import numpy as np

# --- client (validation off) ---
http = RequestsClient()
client = SwaggerClient.from_url(
    "https://klifs.net/swagger_v2/swagger.json",
    http_client=http,
    config={
        "also_return_response": False,
        "validate_swagger_spec": False,
        "validate_responses": False,
        "validate_requests": False,
        "use_models": False,
    },
)

# --- ops we will use ---
kinase_id_op       = client.Information.get_kinase_ID
structures_list_op = client.Structures.get_structures_list

def call_autolist(op, **kwargs):
    specs = {p["name"]: p for p in op.operation.op_spec.get("parameters", [])}
    fixed = {}
    for k, v in kwargs.items():
        p = specs.get(k)
        ptype = p.get("type") if p else None
        if not ptype and p and isinstance(p.get("schema"), dict):
            ptype = p["schema"].get("type")
        if ptype == "array" and not isinstance(v, (list, tuple)):
            v = [v]
        fixed[k] = v
    return op(**fixed).result()

def kinase_id(kinase_name, species="Human"):
    rows = call_autolist(kinase_id_op, kinase_name=kinase_name, species=species)
    df = pd.DataFrame(rows)
    for col in ("kinase.klifs_id","kinase_ID","kinase.id","kinase.klifs_ID"):
        if col in df.columns:
            return int(df.iloc[0][col])
    id_like = [c for c in df.columns if "klifs" in c.lower() and "id" in c.lower()]
    return int(df.iloc[0][id_like[0]])

def structures_for_kinase(kid):
    rows = call_autolist(structures_list_op, kinase_ID=int(kid))
    return pd.DataFrame(rows)

def classify_from_pocket_strings(pocket_strings):
    """
    pocket_strings: list of 85-aa pocket strings (with '-' where absent in that structure)
    Returns: pandas Series indexed 1..85 with 'deletion/short-loop' or 'unresolved-in-some-structures'
    """
    # keep only valid 85-length strings
    seqs = [s for s in pocket_strings if isinstance(s, str) and len(s) == 85]
    if not seqs:
        raise ValueError("No valid 85-aa pocket strings found for this kinase.")

    # presence matrix: rows = 85 positions, cols = structures (True=present)
    M = np.column_stack([[c != "-" for c in s] for s in seqs])
    # deletion if absent in ALL structures at that position
    status = np.where(M.any(axis=1), "unresolved-in-some-structures", "deletion/short-loop")
    return pd.Series(status, index=pd.RangeIndex(1, 86), name="status")

# -------- run for your kinases --------
for kinase in ["AURKB", "BUB1"]:
    kid = kinase_id(kinase, species="Human")
    S = structures_for_kinase(kid)
    # your server provides the 85-aa pocket in S['pocket']
    pockets = S["pocket"].dropna().tolist()
    print(f"{kinase}: {len(pockets)} pocket strings found.")
    status = classify_from_pocket_strings(pockets)
    print(status.value_counts())
    deletions = sorted(status[status=="deletion/short-loop"].index.tolist())
    print("True deletions at KLIFS positions:", deletions)





AURKB: 1 pocket strings found.
status
unresolved-in-some-structures    85
Name: count, dtype: int64
True deletions at KLIFS positions: []
BUB1: 7 pocket strings found.
status
unresolved-in-some-structures    85
Name: count, dtype: int64
True deletions at KLIFS positions: []


In [None]:
from bravado.client import SwaggerClient
from bravado.requests_client import RequestsClient
import pandas as pd, os, time

# =========================
# CONFIG (edit if needed)
# =========================
INPUT_XLSX         = "data/human_kinase_pocket_sequences.xlsx"  # your uploaded file
UNIPROT_COL_GUESS  = ["uniprot", "uniprot_id", "uniprotID", "UniProt", "UniProtKB", "UniProtKB_AC"]  # auto-detect
SHEET_NAME         = 0     # index or string; 0 = first sheet
OUTPUT_CSV         = "klifs_pocket_sequences_human_wt_from_uniprot.csv"
ERRORS_CSV         = "klifs_extractor_errors_from_uniprot.csv"
CHECKPOINT_EVERY   = 25    # write out every N kinases (safe to re-run)
SLEEP_BETWEEN_CALLS = 0.05 # be gentle with the API

# =========================
# Build KLIFS v2 client
# =========================
http = RequestsClient()
client = SwaggerClient.from_url(
    "https://klifs.net/swagger_v2/swagger.json",
    http_client=http,
    config={
        "also_return_response": False,
        "validate_swagger_spec": False,  # KLIFS can be slightly off-spec
        "validate_responses": False,
        "validate_requests": False,
        "use_models": False,
    },
)

# Endpoints your server exposes (per your earlier introspection)
info_get_kinase_information   = client.Information.get_kinase_information
structures_get_structures     = client.Structures.get_structures_list
structures_get_mod_res        = client.Structures.get_structure_modified_residues

def is_api_error(obj):
    """KLIFS sometimes returns [code, 'message'] rather than a JSON object on errors."""
    return isinstance(obj, list) and len(obj) >= 1 and isinstance(obj[0], int) and obj[0] >= 400

def call_autolist(op, **kwargs):
    """
    Wrap scalars into lists when the Swagger spec marks a parameter as type=array,
    and normalize KLIFS-style list errors into exceptions.
    """
    specs = {p["name"]: p for p in op.operation.op_spec.get("parameters", [])}
    fixed = {}
    for k, v in kwargs.items():
        p = specs.get(k)
        ptype = p.get("type") if p else None
        if not ptype and p and isinstance(p.get("schema"), dict):
            ptype = p["schema"].get("type")
        if ptype == "array" and not isinstance(v, (list, tuple)):
            v = [v]
        fixed[k] = v
    out = op(**fixed).result()
    if is_api_error(out):
        code = out[0]
        msg  = out[1] if len(out) > 1 else "KLIFS error"
        raise RuntimeError(f"KLIFS {code}: {msg}")
    return out

# -------------------------
# 1) Load UniProt IDs from Excel
# -------------------------
def load_uniprot_ids(path=INPUT_XLSX, sheet=SHEET_NAME):
    df = pd.read_excel(path, sheet_name=sheet)
    col = None
    for cand in UNIPROT_COL_GUESS:
        for c in df.columns:
            if c.strip().lower() == cand.lower():
                col = c
                break
        if col:
            break
    if col is None:
        raise ValueError(f"Could not find a UniProt column. Looked for: {UNIPROT_COL_GUESS}. "
                         f"Available columns: {df.columns.tolist()}")
    # Normalize IDs (strip, uppercase) and drop empties
    ids = (df[col].astype(str).str.strip().str.upper())
    ids = ids[ids.str.len() > 0]
    return sorted(set(ids.tolist()))

uniprot_ids = load_uniprot_ids()
print(f"UniProt IDs in file: {len(uniprot_ids)}")

# -------------------------
# 2) Build UniProt -> KLIFS mapping from Information.get_kinase_information
# -------------------------
def fetch_kinase_info_table():
    rows = call_autolist(info_get_kinase_information)
    info = pd.DataFrame(rows)
    # Try to standardize a few anticipated columns
    if info.empty:
        raise RuntimeError("KLIFS returned empty kinase information table.")
    # Identify columns
    # KLIFS ID
    kid_col = next((c for c in info.columns
                    if c in ("kinase.klifs_id","kinase_ID","kinase.id","kinase.klifs_ID","klifs_id","id")), None)
    if kid_col is None:
        raise KeyError(f"No kinase ID column in kinase information table: {info.columns.tolist()}")
    # UniProt
    uniprot_col = next((c for c in info.columns
                        if "uniprot" in c.lower() and "id" in c.lower() or c.strip().lower() in ("uniprot","uniprot_id")), None)
    if uniprot_col is None:
        raise KeyError(f"No UniProt column in kinase information table: {info.columns.tolist()}")
    # Species
    species_col = next((c for c in info.columns if c.strip().lower() == "species" or "species" in c.lower()), None)
    if species_col is None:
        # not fatal; create a placeholder
        species_col = "species"
        info[species_col] = None

    # Name (for reporting)
    name_col = next((c for c in info.columns if c.strip().lower() in ("kinase","kinase_name","name")), None)
    if name_col is None:
        name_col = "kinase"

    # Normalize basic fields
    info = info[[kid_col, name_col, uniprot_col, species_col]].copy()
    info.columns = ["kinase_klifs_id", "kinase_name", "uniprot_id", "species"]
    # sanitize
    info["uniprot_id"] = info["uniprot_id"].astype(str).str.strip().str.upper()
    return info

info_tbl = fetch_kinase_info_table()
info_human = info_tbl[info_tbl["species"].astype(str).str.contains("Human", case=False, na=False)].copy()
map_uniprot_to_kinase = (
    info_human.sort_values("kinase_klifs_id")
              .drop_duplicates(subset=["uniprot_id"])
              .set_index("uniprot_id")[["kinase_klifs_id","kinase_name"]]
)

print(f"Human entries in KLIFS info: {info_human.shape[0]}")
print(f"Unique UniProt → KLIFS mappings: {map_uniprot_to_kinase.shape[0]}")

# -------------------------
# 3) Structure listing + wild-type filter
# -------------------------
def list_structures(kinase_id_int: int) -> pd.DataFrame:
    rows = call_autolist(structures_get_structures, kinase_ID=int(kinase_id_int))
    S = pd.DataFrame(rows)
    if S.empty:
        return S
    # Ensure expected columns exist (fill if missing)
    if "structure_ID" not in S.columns:
        for cand in ("structure.klifs_id","structure.klifs_ID","klifs_id","id"):
            if cand in S.columns:
                S = S.rename(columns={cand:"structure_ID"})
                break
    for col in ["pocket","pdb","chain","alt","species","resolution","quality_score",
                "missing_residues","missing_atoms","DFG","aC_helix",
                "ligand","ligand_ID","allosteric_ligand","allosteric_ligand_ID"]:
        if col not in S.columns:
            S[col] = None
    return S

def is_wild_type(structure_id: int) -> bool:
    """
    True if KLIFS reports no modified residues for this structure.
    If the endpoint returns None/''/[] → treat as WT; any non-empty list → not WT.
    """
    try:
        rows = call_autolist(structures_get_mod_res, structure_ID=int(structure_id))
        return (rows in (None, "", [], ()))
    except Exception:
        # If the endpoint misbehaves, keep it but you can flip to False to be stricter.
        return True

# -------------------------
# 4) Main extraction loop (by UniProt)
# -------------------------
records = []
errors  = []
done    = 0

# Resume if file exists
done_uniprots = set()
if os.path.exists(OUTPUT_CSV):
    prev = pd.read_csv(OUTPUT_CSV)
    if "uniprot_id" in prev.columns:
        done_uniprots = set(prev["uniprot_id"].astype(str).str.upper())
        records.extend(prev.to_dict("records"))
        print(f"Resuming: {len(done_uniprots)} UniProt IDs already saved.")

for up in uniprot_ids:
    if up in done_uniprots:
        continue

    # Lookup KLIFS mapping
    if up not in map_uniprot_to_kinase.index:
        errors.append((up, "uniprot_not_found_in_klifs_info"))
        print(f" - SKIP (UniProt not in KLIFS Human info): {up}")
        continue

    kid  = int(map_uniprot_to_kinase.loc[up, "kinase_klifs_id"])
    name = str(map_uniprot_to_kinase.loc[up, "kinase_name"])

    try:
        S = list_structures(kid)
        if S.empty:
            print(f" - No structures: {name} ({up})")
            continue

        # Keep Human only (defensive)
        if "species" in S.columns:
            S = S[S["species"].astype(str).str.contains("Human", case=False, na=False)]
        if S.empty:
            print(f" - No Human structures: {name} ({up})")
            continue

        # Wild-type only
        keep_ids = []
        for sid in pd.to_numeric(S["structure_ID"], errors="coerce").dropna().astype(int):
            try:
                if is_wild_type(sid):
                    keep_ids.append(sid)
            except Exception as e:
                # keep it but log the check failure
                keep_ids.append(sid)
                errors.append((f"{up}:{sid}", f"modres_check_failed:{e}"))
        S = S[S["structure_ID"].astype(int).isin(keep_ids)]

        if S.empty:
            print(f" - No WT Human structures after filtering: {name} ({up})")
            continue

        # Build rows
        for _, row in S.iterrows():
            pocket = row["pocket"]
            records.append({
                "uniprot_id": up,
                "kinase_name": name,
                "kinase_klifs_id": kid,
                "structure_id": int(row["structure_ID"]),
                "pdb_id": row["pdb"],
                "chain": row["chain"],
                "alt_loc": row["alt"],
                "species": row.get("species", None),
                "resolution": row["resolution"],
                "quality_score": row["quality_score"],
                "missing_residues": row["missing_residues"],
                "missing_atoms": row["missing_atoms"],
                "DFG_state": row["DFG"],
                "alphaC_state": row["aC_helix"],
                "ligand": row["ligand"],
                "ligand_id": row["ligand_ID"],
                "allosteric_ligand": row["allosteric_ligand"],
                "allosteric_ligand_id": row["allosteric_ligand_ID"],
                "pocket_85aa": pocket,
                "dash_count": (pocket.count("-") if isinstance(pocket, str) else None),
            })

        done += 1
        if done % CHECKPOINT_EVERY == 0:
            df_ckpt = pd.DataFrame.from_records(records)
            df_ckpt.sort_values(["uniprot_id","structure_id"], inplace=True)
            df_ckpt.to_csv(OUTPUT_CSV, index=False)
            print(f"   ✔ checkpoint: {len(df_ckpt)} rows written to {OUTPUT_CSV}")

        time.sleep(SLEEP_BETWEEN_CALLS)

    except Exception as e:
        print(f" ! ERROR on {name} ({up}): {e}")
        errors.append((f"{up}", str(e)))
        continue

# Final save
df_out = pd.DataFrame.from_records(records)
if not df_out.empty:
    df_out.sort_values(["uniprot_id","structure_id"], inplace=True)
    df_out.to_csv(OUTPUT_CSV, index=False)
    print(f"\nWrote {len(df_out)} rows to: {os.path.abspath(OUTPUT_CSV)}  (rows: {len(df_out)})")
else:
    print("\nNo rows collected.")

# Error log
if errors:
    err_df = pd.DataFrame(errors, columns=["item","error"])
    err_df.to_csv(ERRORS_CSV, index=False)
    print(f"Logged {len(errors)} issues to {ERRORS_CSV}")



Kinases in file: 299
 - AAK1: kept 11 Human wild-type structures
 - ABL2: kept 10 Human wild-type structures
 - ACVR1: kept 202 Human wild-type structures
 ! ERROR on ACVR1B: Expected type to be dict for value [400, 'KLIFS error: An unknown kinase ID was provided'] to unmarshal to a <class 'dict'>.Was <class 'list'> instead.
 - ACVR2A: kept 12 Human wild-type structures
 - ACVRL1: kept 24 Human wild-type structures
 - AKT1: kept 24 Human wild-type structures
 - AKT2: kept 8 Human wild-type structures
 ! ERROR on AKT3: Expected type to be dict for value [400, 'KLIFS error: An unknown kinase ID was provided'] to unmarshal to a <class 'dict'>.Was <class 'list'> instead.
 - ALK: kept 95 Human wild-type structures
 - SKIP (no Human entry): AMPK-alpha1
 - SKIP (no Human entry): ARK5
 - SKIP (no Human entry): ASK1
 - SKIP (no Human entry): ASK2


KeyboardInterrupt: 