In [2]:
import sys; print(sys.executable)

/Users/sebastien/miniforge3/envs/klifs/bin/python


In [3]:
from bravado.client import SwaggerClient
from bravado.requests_client import RequestsClient

http = RequestsClient()
client = SwaggerClient.from_url(
    "https://klifs.net/swagger_v2/swagger.json",
    http_client=http,
    config={"also_return_response": False}
)

# Quick call: list AURKB
kinases = client.kinases.get_kinases_list(kinase_name="AURKB", species="Human").result()
print(kinases[0]["kinase.klifs_id"], kinases[0]["kinase.name"])

AttributeError: Resource kinases not found. Available resources: Information, Interactions, Ligands, Structures

In [4]:
import sys, pkgutil
print("Python:", sys.version)
print("Executable:", sys.executable)
print("Have importlib_resources?", "importlib_resources" in {m.name for m in pkgutil.iter_modules()})

import subprocess
subprocess.run([sys.executable, "-m", "pip", "show", "bravado"])
subprocess.run([sys.executable, "-m", "pip", "show", "swagger-spec-validator"])
subprocess.run([sys.executable, "-m", "pip", "show", "importlib_resources"])

Python: 3.11.13 | packaged by conda-forge | (main, Jun  4 2025, 14:52:34) [Clang 18.1.8 ]
Executable: /Users/sebastien/miniforge3/envs/klifs/bin/python
Have importlib_resources? True
Name: bravado
Version: 11.0.3
Summary: Library for accessing Swagger-enabled API's
Home-page: https://github.com/Yelp/bravado
Author: Digium, Inc. and Yelp, Inc.
Author-email: opensource+bravado@yelp.com
License: BSD 3-Clause License
Location: /Users/sebastien/miniforge3/envs/klifs/lib/python3.11/site-packages
Requires: bravado-core, monotonic, msgpack, python-dateutil, pyyaml, requests, simplejson, six, typing-extensions
Required-by: 
Name: swagger-spec-validator
Version: 3.0.4
Summary: Validation of Swagger specifications
Home-page: http://github.com/Yelp/swagger_spec_validator
Author: Yelp
Author-email: core-services@yelp.com
License: Apache License, Version 2.0
Location: /Users/sebastien/miniforge3/envs/klifs/lib/python3.11/site-packages
Requires: importlib-resources, jsonschema, pyyaml, typing-extensi

CompletedProcess(args=['/Users/sebastien/miniforge3/envs/klifs/bin/python', '-m', 'pip', 'show', 'importlib_resources'], returncode=0)

In [13]:
from bravado.client import SwaggerClient
from bravado.requests_client import RequestsClient
import pandas as pd
import numpy as np

# --- client (validation off) ---
http = RequestsClient()
client = SwaggerClient.from_url(
    "https://klifs.net/swagger_v2/swagger.json",
    http_client=http,
    config={
        "also_return_response": False,
        "validate_swagger_spec": False,
        "validate_responses": False,
        "validate_requests": False,
        "use_models": False,
    },
)

# --- ops we will use ---
kinase_id_op       = client.Information.get_kinase_ID
structures_list_op = client.Structures.get_structures_list

def call_autolist(op, **kwargs):
    specs = {p["name"]: p for p in op.operation.op_spec.get("parameters", [])}
    fixed = {}
    for k, v in kwargs.items():
        p = specs.get(k)
        ptype = p.get("type") if p else None
        if not ptype and p and isinstance(p.get("schema"), dict):
            ptype = p["schema"].get("type")
        if ptype == "array" and not isinstance(v, (list, tuple)):
            v = [v]
        fixed[k] = v
    return op(**fixed).result()

def kinase_id(kinase_name, species="Human"):
    rows = call_autolist(kinase_id_op, kinase_name=kinase_name, species=species)
    df = pd.DataFrame(rows)
    for col in ("kinase.klifs_id","kinase_ID","kinase.id","kinase.klifs_ID"):
        if col in df.columns:
            return int(df.iloc[0][col])
    id_like = [c for c in df.columns if "klifs" in c.lower() and "id" in c.lower()]
    return int(df.iloc[0][id_like[0]])

def structures_for_kinase(kid):
    rows = call_autolist(structures_list_op, kinase_ID=int(kid))
    return pd.DataFrame(rows)

def classify_from_pocket_strings(pocket_strings):
    """
    pocket_strings: list of 85-aa pocket strings (with '-' where absent in that structure)
    Returns: pandas Series indexed 1..85 with 'deletion/short-loop' or 'unresolved-in-some-structures'
    """
    # keep only valid 85-length strings
    seqs = [s for s in pocket_strings if isinstance(s, str) and len(s) == 85]
    if not seqs:
        raise ValueError("No valid 85-aa pocket strings found for this kinase.")

    # presence matrix: rows = 85 positions, cols = structures (True=present)
    M = np.column_stack([[c != "-" for c in s] for s in seqs])
    # deletion if absent in ALL structures at that position
    status = np.where(M.any(axis=1), "unresolved-in-some-structures", "deletion/short-loop")
    return pd.Series(status, index=pd.RangeIndex(1, 86), name="status")

# -------- run for your kinases --------
for kinase in ["AURKB", "BUB1"]:
    kid = kinase_id(kinase, species="Human")
    S = structures_for_kinase(kid)
    # your server provides the 85-aa pocket in S['pocket']
    pockets = S["pocket"].dropna().tolist()
    print(f"{kinase}: {len(pockets)} pocket strings found.")
    status = classify_from_pocket_strings(pockets)
    print(status.value_counts())
    deletions = sorted(status[status=="deletion/short-loop"].index.tolist())
    print("True deletions at KLIFS positions:", deletions)





AURKB: 1 pocket strings found.
status
unresolved-in-some-structures    85
Name: count, dtype: int64
True deletions at KLIFS positions: []
BUB1: 7 pocket strings found.
status
unresolved-in-some-structures    85
Name: count, dtype: int64
True deletions at KLIFS positions: []


In [1]:
from bravado.client import SwaggerClient
from bravado.requests_client import RequestsClient
import pandas as pd, os, time

# =========================
# CONFIG (edit if needed)
# =========================
INPUT_XLSX         = "data/human_kinase_pocket_sequences.xlsx"  # your uploaded file
OUTPUT_CSV         = "klifs_pocket_sequences_human_wt_from_uniprot.csv"
ERRORS_CSV         = "klifs_extractor_errors_from_uniprot.csv"
SHEET_NAME         = 0
UNIPROT_COL_NAME   = None   # set to exact header if you know it; else None to auto-detect common ones
CHECKPOINT_EVERY   = 25
SLEEP_BETWEEN_CALLS = 0.05

http = RequestsClient()
client = SwaggerClient.from_url(
    "https://klifs.net/swagger_v2/swagger.json",
    http_client=http,
    config={
        "also_return_response": False,
        "validate_swagger_spec": False,
        "validate_responses": False,
        "validate_requests": False,
        "use_models": False,
    },
)

# ----- endpoints -----
info_get_kinase_information   = client.Information.get_kinase_information
structures_get_structures     = client.Structures.get_structures_list
structures_get_mod_res        = client.Structures.get_structure_modified_residues

# ----- helpers -----
def safe_call(op, **kwargs):
    """Call a KLIFS op safely:
       - auto-wrap scalars into arrays when needed,
       - if server returns [400, '...'] or bravado raises, return ('ERROR', code, msg)."""
    # wrap array params
    specs = {p["name"]: p for p in op.operation.op_spec.get("parameters", [])}
    fixed = {}
    for k,v in kwargs.items():
        p = specs.get(k)
        ptype = p.get("type") if p else None
        if (not ptype) and p and isinstance(p.get("schema"), dict):
            ptype = p["schema"].get("type")
        if ptype == "array" and not isinstance(v, (list, tuple)):
            v = [v]
        fixed[k] = v
    try:
        out = op(**fixed).result()
        # KLIFS sometimes returns [code, 'message'] instead of an object
        if isinstance(out, list) and out and isinstance(out[0], int) and out[0] >= 400:
            return ("ERROR", out[0], out[1] if len(out) > 1 else "KLIFS error")
        return out
    except Exception as e:
        msg = str(e)
        # normalize to ERROR tuple; use 400 if message mentions “unknown kinase id”
        code = 400 if "unknown kinase id" in msg.lower() else 500
        return ("ERROR", code, msg)

def load_uniprot_ids(path=INPUT_XLSX, sheet=SHEET_NAME):
    df = pd.read_excel(path, sheet_name=sheet)
    if UNIPROT_COL_NAME is None:
        guesses = ["uniprot", "uniprot_id", "uniprotID", "UniProt", "UniProtKB", "UniProtKB_AC", "ACC"]
        col = next((c for c in df.columns if c.strip().lower() in [g.lower() for g in guesses]), None)
        if col is None:
            raise ValueError(f"Could not find a UniProt column. Columns: {df.columns.tolist()}")
    else:
        col = UNIPROT_COL_NAME
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found. Columns: {df.columns.tolist()}")
    ids = (df[col].astype(str).str.strip().str.upper())
    return sorted(set(ids[ids.str.len() > 0].tolist()))

def fetch_info_table():
    out = safe_call(info_get_kinase_information)
    if isinstance(out, tuple) and out[0] == "ERROR":
        raise RuntimeError(f"Failed to retrieve kinase information: {out[1]} {out[2]}")
    info = pd.DataFrame(out)
    # columns we need
    kid_col = next((c for c in info.columns if c in ("kinase.klifs_id","kinase_ID","kinase.id","kinase.klifs_ID","klifs_id","id")), None)
    unp_col = next((c for c in info.columns if "uniprot" in c.lower()), None)
    species_col = next((c for c in info.columns if "species" in c.lower()), None)
    name_col = next((c for c in info.columns if c.strip().lower() in ("kinase","kinase_name","name")), None)
    if not (kid_col and unp_col):
        raise KeyError(f"Missing expected columns in kinase information: {info.columns.tolist()}")
    if species_col is None:
        info["species"] = None
        species_col = "species"
    if name_col is None:
        name_col = unp_col  # fallback
    info = info[[kid_col, name_col, unp_col, species_col]].copy()
    info.columns = ["kinase_klifs_id", "kinase_name", "uniprot_id", "species"]
    info["uniprot_id"] = info["uniprot_id"].astype(str).str.strip().str.upper()
    return info

def list_structures(kid: int) -> pd.DataFrame:
    out = safe_call(structures_get_structures, kinase_ID=int(kid))
    if isinstance(out, tuple) and out[0] == "ERROR":
        # treat any structure-list error as "no structures"
        return pd.DataFrame()
    S = pd.DataFrame(out)
    if S.empty:
        return S
    # normalize columns
    if "structure_ID" not in S.columns:
        for c in ("structure.klifs_id","structure.klifs_ID","klifs_id","id"):
            if c in S.columns:
                S = S.rename(columns={c:"structure_ID"})
                break
    for col in ["pocket","pdb","chain","alt","species","resolution","quality_score",
                "missing_residues","missing_atoms","DFG","aC_helix",
                "ligand","ligand_ID","allosteric_ligand","allosteric_ligand_ID"]:
        if col not in S.columns: S[col] = None
    return S

def is_wild_type(structure_id: int) -> bool:
    out = safe_call(structures_get_mod_res, structure_ID=int(structure_id))
    if isinstance(out, tuple) and out[0] == "ERROR":
        # if mod-res lookup fails, keep the structure (or flip to False to be stricter)
        return True
    return (out in (None, "", [], ()))

# ----- main -----
uniprot_ids = load_uniprot_ids()
print("UniProt IDs in file:", len(uniprot_ids))

info_tbl = fetch_info_table()
info_human = info_tbl[info_tbl["species"].astype(str).str.contains("Human", case=False, na=False)]
map_unp = (info_human.sort_values("kinase_klifs_id")
                    .drop_duplicates(subset=["uniprot_id"])
                    .set_index("uniprot_id")[["kinase_klifs_id","kinase_name"]])
print("Human entries in KLIFS info:", info_human.shape[0])
print("Unique UniProt → KLIFS mappings:", map_unp.shape[0])

# resume support
records, errors = [], []
done_uniprots = set()
if os.path.exists(OUTPUT_CSV):
    prev = pd.read_csv(OUTPUT_CSV)
    if not prev.empty:
        records.extend(prev.to_dict("records"))
        done_uniprots = set(prev["uniprot_id"].astype(str).str.upper())
        print(f"Resuming: {len(done_uniprots)} UniProt IDs already saved.")

done = 0
for up in uniprot_ids:
    if up in done_uniprots: 
        continue
    if up not in map_unp.index:
        errors.append((up, "uniprot_not_in_klifs_human_information"))
        print(f" - SKIP (not in KLIFS Human info): {up}")
        continue

    kid  = int(map_unp.loc[up, "kinase_klifs_id"])
    name = str(map_unp.loc[up, "kinase_name"])

    S = list_structures(kid)
    if S.empty:
        print(f" - No structures: {name} ({up})")
        continue

    # human only (defensive)
    if "species" in S.columns:
        S = S[S["species"].astype(str).str.contains("Human", case=False, na=False)]
    if S.empty:
        print(f" - No Human structures: {name} ({up})")
        continue

    # WT only
    keep = []
    for sid in pd.to_numeric(S["structure_ID"], errors="coerce").dropna().astype(int):
        try:
            if is_wild_type(sid): keep.append(sid)
        except Exception as e:
            # keep but log
            keep.append(sid)
            errors.append((f"{up}:{sid}", f"modres_check_failed:{e}"))
    S = S[S["structure_ID"].astype(int).isin(keep)]

    if S.empty:
        print(f" - No WT Human structures: {name} ({up})")
        continue

    # rows
    kept = 0
    for _, r in S.iterrows():
        pocket = r["pocket"]
        records.append({
            "uniprot_id": up,
            "kinase_name": name,
            "kinase_klifs_id": kid,
            "structure_id": int(r["structure_ID"]),
            "pdb_id": r["pdb"],
            "chain": r["chain"],
            "alt_loc": r["alt"],
            "species": r.get("species", None),
            "resolution": r["resolution"],
            "quality_score": r["quality_score"],
            "missing_residues": r["missing_residues"],
            "missing_atoms": r["missing_atoms"],
            "DFG_state": r["DFG"],
            "alphaC_state": r["aC_helix"],
            "ligand": r["ligand"],
            "ligand_id": r["ligand_ID"],
            "allosteric_ligand": r["allosteric_ligand"],
            "allosteric_ligand_id": r["allosteric_ligand_ID"],
            "pocket_85aa": pocket,
            "dash_count": (pocket.count("-") if isinstance(pocket, str) else None),
        })
        kept += 1

    print(f" - {name} ({up}): kept {kept} WT Human structures")

    done += 1
    if done % CHECKPOINT_EVERY == 0:
        df_ckpt = pd.DataFrame.from_records(records)
        df_ckpt.sort_values(["uniprot_id","structure_id"], inplace=True)
        df_ckpt.to_csv(OUTPUT_CSV, index=False)
        print(f"   ✔ checkpoint: {len(df_ckpt)} rows → {OUTPUT_CSV}")

    time.sleep(SLEEP_BETWEEN_CALLS)

# final save
df_out = pd.DataFrame.from_records(records)
if not df_out.empty:
    df_out.sort_values(["uniprot_id","structure_id"], inplace=True)
    df_out.to_csv(OUTPUT_CSV, index=False)
    print(f"\nWrote {len(df_out)} rows to {OUTPUT_CSV}")
else:
    print("\nNo rows collected.")

if errors:
    pd.DataFrame(errors, columns=["item","error"]).to_csv(ERRORS_CSV, index=False)
    print(f"Logged {len(errors)} issues to {ERRORS_CSV}")




UniProt IDs in file: 508
Human entries in KLIFS info: 555
Unique UniProt → KLIFS mappings: 542
Resuming: 25 UniProt IDs already saved.
 - No structures: PI4KAP2 (A4QPH2)
 - No structures: PIK3C2A (O00443)
 - No structures: PIK3C2B (O00750)
 - No structures: CRIK (O14578)
 - No structures: RIOK3 (O14730)
 - No structures: MAST4 (O15021)
 - No structures: MUSK (O15146)
 - No structures: EphB6 (O15197)
 - No structures: IRAK2 (O43187)
 - No structures: LZK (O43283)
 - No WT Human structures: DYRK3 (O43781)
 - No structures: PRKY (O43930)
 - No structures: Trad (O60229)
 - No structures: NuaK1 (O60285)
 - No structures: MAST3 (O60307)
 - No WT Human structures: ULK1 (O75385)
 - No structures: MSK2 (O75676)
 - MPSK1 (O75716): kept 2 WT Human structures
 - No structures: PIK3C2G (O75747)
 - PAK3 (O75914): kept 2 WT Human structures
 - No structures: Trio (O75962)
 - CDKL5 (O76039): kept 3 WT Human structures
 - DRAK2 (O94768): kept 20 WT Human structures
 - LOK (O94804): kept 39 WT Human str

In [2]:
import pandas as pd, numpy as np

CSV = "klifs_pocket_sequences_human_wt_from_uniprot.csv"
df = pd.read_csv(CSV)

# quick look
display(df.head())
print("Rows:", len(df))
print("Kinases (UniProt):", df["uniprot_id"].nunique())
print("Structures:", df["structure_id"].nunique())

# basic QC distributions
print("Resolution (Å):", df["resolution"].describe())
print("Quality score:", df["quality_score"].describe())

# how many dashes per pocket
df["dash_count"] = df["pocket_85aa"].fillna("").str.count("_")
print("Dash count:", df["dash_count"].describe())

# rows with malformed pockets (not 85 chars)
bad = df[~df["pocket_85aa"].astype(str).map(lambda s: len(s)==85)]
print("Non-85-char pocket rows:", len(bad))

Unnamed: 0,uniprot_id,kinase_name,kinase_klifs_id,structure_id,pdb_id,chain,alt_loc,species,resolution,quality_score,missing_residues,missing_atoms,DFG_state,alphaC_state,ligand,ligand_id,allosteric_ligand,allosteric_ligand_id,pocket_85aa,dash_count
0,O00141,SGK1,64,3270,3hdn,A,,Human,3.1,6.2,7,2,in,na,GMG,1003,0,0,KVIGKGSFGKVLLYAVKVL_______VLLKNVPFLVGLHFSYFVLD...,0
1,O00141,SGK1,64,3271,3hdm,A,,Human,2.6,5.9,7,5,in,na,MMG,1004,0,0,KVIGKGSFGKVLLYAVKVL_______VLLKNVPFLVGLHFSYFVLD...,0
2,O00141,SGK1,64,6849,2r5t,A,,Human,1.9,6.2,7,2,in,na,ANP,64,0,0,KVIGKGSFGKVLLYAVKVL_______VLLKNVPFLVGLHFSYFVLD...,0
3,O00141,SGK1,64,14012,7pue,A,,Human,2.506,5.1,9,9,in,na,86H,4157,0,0,KVIGKG__GKVLLYAVKVL_______VLLKNVPFLVGLHFSYFVLD...,0
4,O00238,BMPR1B,520,1500,3mdy,C,B,Human,2.05,9.6,0,4,in,in,LDN,321,0,0,KQIGKGRYGEVWMVAVKVFSWFRETEIYQTVLENILGFIAAYLITD...,0


Rows: 10411
Kinases (UniProt): 283
Structures: 10411
Resolution (Å): count    10411.000000
mean         2.219712
std          0.692025
min          0.000000
25%          1.850000
50%          2.190000
75%          2.600000
max          9.800000
Name: resolution, dtype: float64
Quality score: count    10411.000000
mean         7.729661
std          1.234665
min          0.000000
25%          7.600000
50%          8.000000
75%          8.000000
max          9.900000
Name: quality_score, dtype: float64
Dash count: count    10411.000000
mean         1.849102
std          3.976346
min          0.000000
25%          0.000000
50%          1.000000
75%          2.000000
max         78.000000
Name: dash_count, dtype: float64
Non-85-char pocket rows: 61


In [4]:
# how many structures per UniProt
per_u = (df.groupby("uniprot_id")
           .agg(n_structures=("structure_id","nunique"),
                kinase_name=("kinase_name","first"),
                median_res=("resolution","median"),
                median_q=("quality_score","median"))
           .sort_values("n_structures", ascending=False))
display(per_u.head(20))

# pick a "best representative" structure per UniProt:
# 1) fewest dashes  2) highest quality_score  3) lowest resolution
rep = (df.sort_values(["uniprot_id","dash_count","quality_score","resolution"],
                      ascending=[True, True, False, True])
         .groupby("uniprot_id").head(1)
         .reset_index(drop=True))
print("Representative structures:", len(rep))
display(rep[["uniprot_id","kinase_name","structure_id","pdb_id","chain","dash_count","quality_score","resolution"]].head(20))

# save representatives if useful
rep.to_csv("klifs_representatives.csv", index=False)


Unnamed: 0_level_0,n_structures,kinase_name,median_res,median_q
uniprot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
P24941,653,CDK2,1.86,8.0
P68400,593,CK2a1,1.85,8.0
P00533,565,EGFR,2.4,8.0
Q16539,348,p38a,2.1,7.6
Q06187,264,BTK,1.675,8.0
P28482,261,Erk2,1.9,8.0
O14757,237,CHK1,2.0,8.0
P15056,220,BRAF,2.88,7.7
P11362,216,FGFR1,2.23,8.0
Q99986,209,VRK1,1.9,7.6


Representative structures: 283


Unnamed: 0,uniprot_id,kinase_name,structure_id,pdb_id,chain,dash_count,quality_score,resolution
0,O00141,SGK1,6849,2r5t,A,7,6.2,1.9
1,O00238,BMPR1B,1500,3mdy,C,0,9.6,2.05
2,O00311,CDC7,13841,6ya6,A,3,8.0,1.44
3,O00329,p110d,9093,5t8f,A,2,8.7,2.91
4,O00444,PLK4,1814,3cok,A,0,8.8,2.25
5,O00506,YSK1,464,4nzw,B,0,8.0,3.58
6,O14733,MAP2K7,12695,6yg0,A,0,9.6,2.0
7,O14757,CHK1,13492,7ako,B,0,9.9,1.8
8,O14920,IKKb,291,4e3c,D,0,8.0,3.98
9,O14936,CASK,13583,7oai,A,0,9.7,2.3


In [5]:
def consensus_from_strings(strings):
    """strings: iterable of 85-char pocket strings; ignores '-' in voting."""
    strings = [s for s in strings if isinstance(s,str) and len(s)==85]
    if not strings:
        return None, None, None  # consensus, coverage, entropy
    arr = np.array([list(s) for s in strings])  # shape: (n, 85)
    cons, cov, ent = [], [], []
    for i in range(85):
        col = arr[:,i]
        aa = col[col!="_"]
        if len(aa)==0:
            cons.append("_")
            cov.append(0)
            ent.append(0.0)
            continue
        # frequency
        vals, counts = np.unique(aa, return_counts=True)
        winner = vals[counts.argmax()]
        cons.append(winner)
        cov.append(len(aa))  # how many structures contributed at this position
        # Shannon entropy (variability)
        p = counts / counts.sum()
        ent.append(-(p*np.log2(p)).sum())
    return "".join(cons), np.array(cov), np.array(ent)

rows = []
for unp, g in df.groupby("uniprot_id"):
    cons, cov, ent = consensus_from_strings(g["pocket_85aa"])
    rows.append({
        "uniprot_id": unp,
        "kinase_name": g["kinase_name"].iloc[0],
        "n_structures": g["structure_id"].nunique(),
        "consensus_85aa": cons,
        "mean_coverage": None if cov is None else float(np.mean(cov)),
        "min_coverage": None if cov is None else int(np.min(cov)),
        "max_coverage": None if cov is None else int(np.max(cov)),
        "mean_entropy": None if ent is None else float(np.mean(ent)),
    })
cons_df = pd.DataFrame(rows).sort_values(["n_structures","uniprot_id"], ascending=[False,True])
display(cons_df.head(20))

# save the consensus table
cons_df.to_csv("klifs_consensus_per_uniprot.csv", index=False)

# optional: export consensus as FASTA
with open("klifs_consensus_per_uniprot.fasta","w") as fh:
    for r in rows:
        if r["consensus_85aa"]:
            fh.write(f">{r['uniprot_id']}|{r['kinase_name']}|n={r['n_structures']}\n{r['consensus_85aa']}\n")


Unnamed: 0,uniprot_id,kinase_name,n_structures,consensus_85aa,mean_coverage,min_coverage,max_coverage,mean_entropy
69,P24941,CDK2,653,EKIGEGTYGVVYKVALKKITAIREISLLKELNPNIVKLLDVYLVFE...,642.388235,0.0,653.0,0.005626
135,P68400,CK2a1,593,RKLGRGKYSEVFEVVVKILKIKREIKILENLRPNIITLADIALVFE...,570.352941,0.0,593.0,0.007497
35,P00533,EGFR,565,KVLGSGAFGTVYKVAIKELEILDEAYVMASVDPHVCRLLGIQLIMQ...,561.188235,515.0,565.0,0.076839
185,Q16539,p38a,348,SPVGSGAYGSVCAVAVKKLRTYRELRLLKHMKENVIGLLDVYLVTH...,333.235294,0.0,348.0,0.023669
151,Q06187,BTK,264,KELGTGQFGVVKYVAIKMIEFIEEAKVMMNLSEKLVQLYGVFIITE...,261.729412,239.0,264.0,0.001896
75,P28482,Erk2,261,SYIGEGAYGMVCSVAIKKIRTLREIKILLRFRENIIGINDIYIVQD...,257.047059,0.0,261.0,0.005231
7,O14757,CHK1,237,QTLGEGAYGEVQLVAVKIVNIKKEICINKMLNENVVKFYGHYLFLE...,233.635294,163.0,237.0,0.092062
57,P15056,BRAF,220,QRIGSGSFGTVYKVAVKMLAFKNEVGVLRKTRVNILLFMGYAIVTQ...,217.917647,166.0,220.0,0.01879
53,P11362,FGFR1,216,KPLGEGAFGQVVLVAVKMLDLISEMEMMKMIGKNIINLLGAYVIVE...,211.082353,136.0,216.0,0.012123
243,Q99986,VRK1,209,LPIGQGGFGCIYLCVVKVEPLFTELKFYQRAALGVPKYWGSFMIMD...,199.529412,0.0,209.0,0.0


In [7]:
def deletion_positions(strings):
    strings = [s for s in strings if isinstance(s,str) and len(s)==85]
    if not strings:
        return []
    arr = np.array([list(s) for s in strings])  # (n,85)
    # at each pos: if all chars are '-' → deletion
    del_pos = [i+1 for i in range(85) if np.all(arr[:,i]=="-")]
    return del_pos

del_rows = []
for unp, g in df.groupby("uniprot_id"):
    del_pos = deletion_positions(g["pocket_85aa"])
    del_rows.append({
        "uniprot_id": unp,
        "kinase_name": g["kinase_name"].iloc[0],
        "n_structures": g["structure_id"].nunique(),
        "n_true_deletions": len(del_pos),
        "true_deletion_positions": ",".join(map(str, del_pos))
    })
deletions_df = pd.DataFrame(del_rows).sort_values(["n_true_deletions","n_structures"], ascending=[False,False])
display(deletions_df.head(20))
deletions_df.to_csv("klifs_true_deletions_per_uniprot.csv", index=False)


Unnamed: 0,uniprot_id,kinase_name,n_structures,n_true_deletions,true_deletion_positions
69,P24941,CDK2,653,0,
135,P68400,CK2a1,593,0,
35,P00533,EGFR,565,0,
185,Q16539,p38a,348,0,
151,Q06187,BTK,264,0,
75,P28482,Erk2,261,0,
7,O14757,CHK1,237,0,
57,P15056,BRAF,220,0,
53,P11362,FGFR1,216,0,
243,Q99986,VRK1,209,0,


In [8]:
def stratified_consensus(g, dfg=("in",), ac=("in",)):
    sel = g.copy()
    if "DFG_state" in sel.columns and dfg is not None:
        sel = sel[sel["DFG_state"].astype(str).str.lower().isin([d.lower() for d in dfg])]
    if "alphaC_state" in sel.columns and ac is not None:
        sel = sel[sel["alphaC_state"].astype(str).str.lower().isin([a.lower() for a in ac])]
    return consensus_from_strings(sel["pocket_85aa"])

# example: DFG-in & aC-in consensus per kinase
rows = []
for unp, g in df.groupby("uniprot_id"):
    cons, cov, ent = stratified_consensus(g, dfg=("in",), ac=("in",))
    rows.append({
        "uniprot_id": unp,
        "kinase_name": g["kinase_name"].iloc[0],
        "n_structures_total": g["structure_id"].nunique(),
        "n_structures_state": g[(g["DFG_state"].astype(str).str.lower()=="in") &
                                (g["alphaC_state"].astype(str).str.lower()=="in")]["structure_id"].nunique(),
        "consensus_85aa_DFGin_aCin": cons
    })
df_state = pd.DataFrame(rows)
display(df_state.head(20))
df_state.to_csv("klifs_consensus_DFGin_aCin.csv", index=False)


Unnamed: 0,uniprot_id,kinase_name,n_structures_total,n_structures_state,consensus_85aa_DFGin_aCin
0,O00141,SGK1,4,0,
1,O00238,BMPR1B,4,4,KQIGKGRYGEVWMVAVKVFSWFRETEIYQTVLENILGFIAAYLITD...
2,O00311,CDC7,12,10,DKIGEGTFSSVYLIALKHLRIAAELQCLTVAGDNVMGVKYCVIAMP...
3,O00329,p110d,17,17,CTFMDSKMKPLWIIIFKNGDLRQDMLTLQMIQLRMTPYGCLTGLIE...
4,O00444,PLK4,6,4,NLLGKGSFAGVYRVAIKMIRVQNEVKIHCQLKPSILELYNYYLVLE...
5,O00506,YSK1,1,0,
6,O14733,MAP2K7,57,25,GEMGSGTCGQVWKIAVKQMRILMDLDVVLKSHPYIVQCFGTFIAME...
7,O14757,CHK1,237,236,QTLGEGAYGEVQLVAVKIVNIKKEICINKMLNENVVKFYGHYLFLE...
8,O14920,IKKb,7,6,RLGTGGFGNVIRWIAIKQCRWCLEIQIMRRLTPNVVAARDVLLAME...
9,O14936,CASK,25,22,EVIGKGPFSVVRRFAVKIVDLKREASICHMLKPHIVELLETYMVFE...


In [9]:
# suggest a baseline QC filter (tune as you like)
qc = df.copy()
qc = qc[(qc["quality_score"] >= 6) & (qc["resolution"] <= 2.8) & (qc["dash_count"] <= 8)]
print("Kept after QC:", len(qc), "rows (of", len(df), ")")

# redo representatives or consensus on 'qc' instead of 'df' if you want stricter inputs


Kept after QC: 8400 rows (of 10411 )


In [10]:
# build a dict of consensus sequences (no dashes)
seqs = {r.uniprot_id: r.consensus_85aa.replace("-","") for r in cons_df.itertuples() if isinstance(r.consensus_85aa,str)}
ids = sorted(seqs.keys())
def pid(a,b):
    la, lb = len(a), len(b)
    L = min(la, lb)
    if L == 0: return np.nan
    return sum(aa==bb for aa,bb in zip(a[:L], b[:L]))/L
mat = pd.DataFrame([[pid(seqs[i],seqs[j]) for j in ids] for i in ids], index=ids, columns=ids)
mat.to_csv("klifs_consensus_pairwise_identity.csv")
display(mat.iloc[:10,:10])


Unnamed: 0,O00141,O00238,O00311,O00329,O00444,O00506,O14733,O14757,O14920,O14936
O00141,1.0,0.341176,0.329412,0.094118,0.364706,0.364706,0.282353,0.388235,0.294118,0.352941
O00238,0.341176,1.0,0.258824,0.105882,0.364706,0.329412,0.223529,0.388235,0.305882,0.258824
O00311,0.329412,0.258824,1.0,0.094118,0.282353,0.317647,0.329412,0.341176,0.270588,0.258824
O00329,0.094118,0.105882,0.094118,1.0,0.094118,0.129412,0.129412,0.105882,0.105882,0.094118
O00444,0.364706,0.364706,0.282353,0.094118,1.0,0.411765,0.247059,0.423529,0.352941,0.352941
O00506,0.364706,0.329412,0.317647,0.129412,0.411765,1.0,0.341176,0.388235,0.305882,0.352941
O14733,0.282353,0.223529,0.329412,0.129412,0.247059,0.341176,1.0,0.270588,0.258824,0.282353
O14757,0.388235,0.388235,0.341176,0.105882,0.423529,0.388235,0.270588,1.0,0.388235,0.376471
O14920,0.294118,0.305882,0.270588,0.105882,0.352941,0.305882,0.258824,0.388235,1.0,0.317647
O14936,0.352941,0.258824,0.258824,0.094118,0.352941,0.352941,0.282353,0.376471,0.317647,1.0
