In [2]:
import sys; print(sys.executable)

/Users/sebastien/miniforge3/envs/klifs/bin/python


In [3]:
from bravado.client import SwaggerClient
from bravado.requests_client import RequestsClient

http = RequestsClient()
client = SwaggerClient.from_url(
    "https://klifs.net/swagger_v2/swagger.json",
    http_client=http,
    config={"also_return_response": False}
)

# Quick call: list AURKB
kinases = client.kinases.get_kinases_list(kinase_name="AURKB", species="Human").result()
print(kinases[0]["kinase.klifs_id"], kinases[0]["kinase.name"])

AttributeError: Resource kinases not found. Available resources: Information, Interactions, Ligands, Structures

In [4]:
import sys, pkgutil
print("Python:", sys.version)
print("Executable:", sys.executable)
print("Have importlib_resources?", "importlib_resources" in {m.name for m in pkgutil.iter_modules()})

import subprocess
subprocess.run([sys.executable, "-m", "pip", "show", "bravado"])
subprocess.run([sys.executable, "-m", "pip", "show", "swagger-spec-validator"])
subprocess.run([sys.executable, "-m", "pip", "show", "importlib_resources"])

Python: 3.11.13 | packaged by conda-forge | (main, Jun  4 2025, 14:52:34) [Clang 18.1.8 ]
Executable: /Users/sebastien/miniforge3/envs/klifs/bin/python
Have importlib_resources? True
Name: bravado
Version: 11.0.3
Summary: Library for accessing Swagger-enabled API's
Home-page: https://github.com/Yelp/bravado
Author: Digium, Inc. and Yelp, Inc.
Author-email: opensource+bravado@yelp.com
License: BSD 3-Clause License
Location: /Users/sebastien/miniforge3/envs/klifs/lib/python3.11/site-packages
Requires: bravado-core, monotonic, msgpack, python-dateutil, pyyaml, requests, simplejson, six, typing-extensions
Required-by: 
Name: swagger-spec-validator
Version: 3.0.4
Summary: Validation of Swagger specifications
Home-page: http://github.com/Yelp/swagger_spec_validator
Author: Yelp
Author-email: core-services@yelp.com
License: Apache License, Version 2.0
Location: /Users/sebastien/miniforge3/envs/klifs/lib/python3.11/site-packages
Requires: importlib-resources, jsonschema, pyyaml, typing-extensi

CompletedProcess(args=['/Users/sebastien/miniforge3/envs/klifs/bin/python', '-m', 'pip', 'show', 'importlib_resources'], returncode=0)

In [13]:
from bravado.client import SwaggerClient
from bravado.requests_client import RequestsClient
import pandas as pd
import numpy as np

# --- client (validation off) ---
http = RequestsClient()
client = SwaggerClient.from_url(
    "https://klifs.net/swagger_v2/swagger.json",
    http_client=http,
    config={
        "also_return_response": False,
        "validate_swagger_spec": False,
        "validate_responses": False,
        "validate_requests": False,
        "use_models": False,
    },
)

# --- ops we will use ---
kinase_id_op       = client.Information.get_kinase_ID
structures_list_op = client.Structures.get_structures_list

def call_autolist(op, **kwargs):
    specs = {p["name"]: p for p in op.operation.op_spec.get("parameters", [])}
    fixed = {}
    for k, v in kwargs.items():
        p = specs.get(k)
        ptype = p.get("type") if p else None
        if not ptype and p and isinstance(p.get("schema"), dict):
            ptype = p["schema"].get("type")
        if ptype == "array" and not isinstance(v, (list, tuple)):
            v = [v]
        fixed[k] = v
    return op(**fixed).result()

def kinase_id(kinase_name, species="Human"):
    rows = call_autolist(kinase_id_op, kinase_name=kinase_name, species=species)
    df = pd.DataFrame(rows)
    for col in ("kinase.klifs_id","kinase_ID","kinase.id","kinase.klifs_ID"):
        if col in df.columns:
            return int(df.iloc[0][col])
    id_like = [c for c in df.columns if "klifs" in c.lower() and "id" in c.lower()]
    return int(df.iloc[0][id_like[0]])

def structures_for_kinase(kid):
    rows = call_autolist(structures_list_op, kinase_ID=int(kid))
    return pd.DataFrame(rows)

def classify_from_pocket_strings(pocket_strings):
    """
    pocket_strings: list of 85-aa pocket strings (with '-' where absent in that structure)
    Returns: pandas Series indexed 1..85 with 'deletion/short-loop' or 'unresolved-in-some-structures'
    """
    # keep only valid 85-length strings
    seqs = [s for s in pocket_strings if isinstance(s, str) and len(s) == 85]
    if not seqs:
        raise ValueError("No valid 85-aa pocket strings found for this kinase.")

    # presence matrix: rows = 85 positions, cols = structures (True=present)
    M = np.column_stack([[c != "-" for c in s] for s in seqs])
    # deletion if absent in ALL structures at that position
    status = np.where(M.any(axis=1), "unresolved-in-some-structures", "deletion/short-loop")
    return pd.Series(status, index=pd.RangeIndex(1, 86), name="status")

# -------- run for your kinases --------
for kinase in ["AURKB", "BUB1"]:
    kid = kinase_id(kinase, species="Human")
    S = structures_for_kinase(kid)
    # your server provides the 85-aa pocket in S['pocket']
    pockets = S["pocket"].dropna().tolist()
    print(f"{kinase}: {len(pockets)} pocket strings found.")
    status = classify_from_pocket_strings(pockets)
    print(status.value_counts())
    deletions = sorted(status[status=="deletion/short-loop"].index.tolist())
    print("True deletions at KLIFS positions:", deletions)





AURKB: 1 pocket strings found.
status
unresolved-in-some-structures    85
Name: count, dtype: int64
True deletions at KLIFS positions: []
BUB1: 7 pocket strings found.
status
unresolved-in-some-structures    85
Name: count, dtype: int64
True deletions at KLIFS positions: []


In [None]:
from bravado.client import SwaggerClient
from bravado.requests_client import RequestsClient
import pandas as pd, os, time

# =========================
# CONFIG (edit if needed)
# =========================
INPUT_XLSX         = "data/human_kinase_pocket_sequences.xlsx"  # your uploaded file
OUTPUT_CSV         = "klifs_pocket_sequences_human_wt_from_uniprot.csv"
ERRORS_CSV         = "klifs_extractor_errors_from_uniprot.csv"
SHEET_NAME         = 0
UNIPROT_COL_NAME   = None   # set to exact header if you know it; else None to auto-detect common ones
CHECKPOINT_EVERY   = 25
SLEEP_BETWEEN_CALLS = 0.05

http = RequestsClient()
client = SwaggerClient.from_url(
    "https://klifs.net/swagger_v2/swagger.json",
    http_client=http,
    config={
        "also_return_response": False,
        "validate_swagger_spec": False,
        "validate_responses": False,
        "validate_requests": False,
        "use_models": False,
    },
)

# ----- endpoints -----
info_get_kinase_information   = client.Information.get_kinase_information
structures_get_structures     = client.Structures.get_structures_list
structures_get_mod_res        = client.Structures.get_structure_modified_residues

# ----- helpers -----
def safe_call(op, **kwargs):
    """Call a KLIFS op safely:
       - auto-wrap scalars into arrays when needed,
       - if server returns [400, '...'] or bravado raises, return ('ERROR', code, msg)."""
    # wrap array params
    specs = {p["name"]: p for p in op.operation.op_spec.get("parameters", [])}
    fixed = {}
    for k,v in kwargs.items():
        p = specs.get(k)
        ptype = p.get("type") if p else None
        if (not ptype) and p and isinstance(p.get("schema"), dict):
            ptype = p["schema"].get("type")
        if ptype == "array" and not isinstance(v, (list, tuple)):
            v = [v]
        fixed[k] = v
    try:
        out = op(**fixed).result()
        # KLIFS sometimes returns [code, 'message'] instead of an object
        if isinstance(out, list) and out and isinstance(out[0], int) and out[0] >= 400:
            return ("ERROR", out[0], out[1] if len(out) > 1 else "KLIFS error")
        return out
    except Exception as e:
        msg = str(e)
        # normalize to ERROR tuple; use 400 if message mentions “unknown kinase id”
        code = 400 if "unknown kinase id" in msg.lower() else 500
        return ("ERROR", code, msg)

def load_uniprot_ids(path=INPUT_XLSX, sheet=SHEET_NAME):
    df = pd.read_excel(path, sheet_name=sheet)
    if UNIPROT_COL_NAME is None:
        guesses = ["uniprot", "uniprot_id", "uniprotID", "UniProt", "UniProtKB", "UniProtKB_AC", "ACC"]
        col = next((c for c in df.columns if c.strip().lower() in [g.lower() for g in guesses]), None)
        if col is None:
            raise ValueError(f"Could not find a UniProt column. Columns: {df.columns.tolist()}")
    else:
        col = UNIPROT_COL_NAME
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found. Columns: {df.columns.tolist()}")
    ids = (df[col].astype(str).str.strip().str.upper())
    return sorted(set(ids[ids.str.len() > 0].tolist()))

def fetch_info_table():
    out = safe_call(info_get_kinase_information)
    if isinstance(out, tuple) and out[0] == "ERROR":
        raise RuntimeError(f"Failed to retrieve kinase information: {out[1]} {out[2]}")
    info = pd.DataFrame(out)
    # columns we need
    kid_col = next((c for c in info.columns if c in ("kinase.klifs_id","kinase_ID","kinase.id","kinase.klifs_ID","klifs_id","id")), None)
    unp_col = next((c for c in info.columns if "uniprot" in c.lower()), None)
    species_col = next((c for c in info.columns if "species" in c.lower()), None)
    name_col = next((c for c in info.columns if c.strip().lower() in ("kinase","kinase_name","name")), None)
    if not (kid_col and unp_col):
        raise KeyError(f"Missing expected columns in kinase information: {info.columns.tolist()}")
    if species_col is None:
        info["species"] = None
        species_col = "species"
    if name_col is None:
        name_col = unp_col  # fallback
    info = info[[kid_col, name_col, unp_col, species_col]].copy()
    info.columns = ["kinase_klifs_id", "kinase_name", "uniprot_id", "species"]
    info["uniprot_id"] = info["uniprot_id"].astype(str).str.strip().str.upper()
    return info

def list_structures(kid: int) -> pd.DataFrame:
    out = safe_call(structures_get_structures, kinase_ID=int(kid))
    if isinstance(out, tuple) and out[0] == "ERROR":
        # treat any structure-list error as "no structures"
        return pd.DataFrame()
    S = pd.DataFrame(out)
    if S.empty:
        return S
    # normalize columns
    if "structure_ID" not in S.columns:
        for c in ("structure.klifs_id","structure.klifs_ID","klifs_id","id"):
            if c in S.columns:
                S = S.rename(columns={c:"structure_ID"})
                break
    for col in ["pocket","pdb","chain","alt","species","resolution","quality_score",
                "missing_residues","missing_atoms","DFG","aC_helix",
                "ligand","ligand_ID","allosteric_ligand","allosteric_ligand_ID"]:
        if col not in S.columns: S[col] = None
    return S

def is_wild_type(structure_id: int) -> bool:
    out = safe_call(structures_get_mod_res, structure_ID=int(structure_id))
    if isinstance(out, tuple) and out[0] == "ERROR":
        # if mod-res lookup fails, keep the structure (or flip to False to be stricter)
        return True
    return (out in (None, "", [], ()))

# ----- main -----
uniprot_ids = load_uniprot_ids()
print("UniProt IDs in file:", len(uniprot_ids))

info_tbl = fetch_info_table()
info_human = info_tbl[info_tbl["species"].astype(str).str.contains("Human", case=False, na=False)]
map_unp = (info_human.sort_values("kinase_klifs_id")
                    .drop_duplicates(subset=["uniprot_id"])
                    .set_index("uniprot_id")[["kinase_klifs_id","kinase_name"]])
print("Human entries in KLIFS info:", info_human.shape[0])
print("Unique UniProt → KLIFS mappings:", map_unp.shape[0])

# resume support
records, errors = [], []
done_uniprots = set()
if os.path.exists(OUTPUT_CSV):
    prev = pd.read_csv(OUTPUT_CSV)
    if not prev.empty:
        records.extend(prev.to_dict("records"))
        done_uniprots = set(prev["uniprot_id"].astype(str).str.upper())
        print(f"Resuming: {len(done_uniprots)} UniProt IDs already saved.")

done = 0
for up in uniprot_ids:
    if up in done_uniprots: 
        continue
    if up not in map_unp.index:
        errors.append((up, "uniprot_not_in_klifs_human_information"))
        print(f" - SKIP (not in KLIFS Human info): {up}")
        continue

    kid  = int(map_unp.loc[up, "kinase_klifs_id"])
    name = str(map_unp.loc[up, "kinase_name"])

    S = list_structures(kid)
    if S.empty:
        print(f" - No structures: {name} ({up})")
        continue

    # human only (defensive)
    if "species" in S.columns:
        S = S[S["species"].astype(str).str.contains("Human", case=False, na=False)]
    if S.empty:
        print(f" - No Human structures: {name} ({up})")
        continue

    # WT only
    keep = []
    for sid in pd.to_numeric(S["structure_ID"], errors="coerce").dropna().astype(int):
        try:
            if is_wild_type(sid): keep.append(sid)
        except Exception as e:
            # keep but log
            keep.append(sid)
            errors.append((f"{up}:{sid}", f"modres_check_failed:{e}"))
    S = S[S["structure_ID"].astype(int).isin(keep)]

    if S.empty:
        print(f" - No WT Human structures: {name} ({up})")
        continue

    # rows
    kept = 0
    for _, r in S.iterrows():
        pocket = r["pocket"]
        records.append({
            "uniprot_id": up,
            "kinase_name": name,
            "kinase_klifs_id": kid,
            "structure_id": int(r["structure_ID"]),
            "pdb_id": r["pdb"],
            "chain": r["chain"],
            "alt_loc": r["alt"],
            "species": r.get("species", None),
            "resolution": r["resolution"],
            "quality_score": r["quality_score"],
            "missing_residues": r["missing_residues"],
            "missing_atoms": r["missing_atoms"],
            "DFG_state": r["DFG"],
            "alphaC_state": r["aC_helix"],
            "ligand": r["ligand"],
            "ligand_id": r["ligand_ID"],
            "allosteric_ligand": r["allosteric_ligand"],
            "allosteric_ligand_id": r["allosteric_ligand_ID"],
            "pocket_85aa": pocket,
            "dash_count": (pocket.count("-") if isinstance(pocket, str) else None),
        })
        kept += 1

    print(f" - {name} ({up}): kept {kept} WT Human structures")

    done += 1
    if done % CHECKPOINT_EVERY == 0:
        df_ckpt = pd.DataFrame.from_records(records)
        df_ckpt.sort_values(["uniprot_id","structure_id"], inplace=True)
        df_ckpt.to_csv(OUTPUT_CSV, index=False)
        print(f"   ✔ checkpoint: {len(df_ckpt)} rows → {OUTPUT_CSV}")

    time.sleep(SLEEP_BETWEEN_CALLS)

# final save
df_out = pd.DataFrame.from_records(records)
if not df_out.empty:
    df_out.sort_values(["uniprot_id","structure_id"], inplace=True)
    df_out.to_csv(OUTPUT_CSV, index=False)
    print(f"\nWrote {len(df_out)} rows to {OUTPUT_CSV}")
else:
    print("\nNo rows collected.")

if errors:
    pd.DataFrame(errors, columns=["item","error"]).to_csv(ERRORS_CSV, index=False)
    print(f"Logged {len(errors)} issues to {ERRORS_CSV}")




UniProt IDs in file: 508
Human entries in KLIFS info: 555
Unique UniProt → KLIFS mappings: 542
 - No structures: PI4KAP2 (A4QPH2)
 - SGK1 (O00141): kept 4 WT Human structures
 - BMPR1B (O00238): kept 4 WT Human structures
