In [2]:
# =========================
# Cell 0: imports & config
# =========================

import os
import re
from collections import Counter

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    roc_auc_score,
)

# ---- file paths (adjust as needed) ----
POS_FASTA = "Positive_Final.fasta"
NEG_FASTA = "Negative_Final.fasta"

# Optional: directory where you will store PSSM files (one per protein)
# if you decide to fully implement EIPP as in PRBP.
PSSM_DIR = "pssm_files"  # e.g. each protein: PSSM_DIR/<seq_id>.pssm

# Amino acids in a fixed order
AA_ORDER = list("ACDEFGHIKLMNPQRSTVWY")
AA_INDEX = {aa: i for i, aa in enumerate(AA_ORDER)}
N_AA = len(AA_ORDER)


In [3]:
# =========================
# Cell 1: FASTA utilities
# =========================

def read_fasta(path):
    """
    Minimal FASTA reader.
    Returns dict: {sequence_id: sequence_string}
    """
    seqs = {}
    header = None
    parts = []

    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith(">"):
                # save previous
                if header is not None:
                    seqs[header] = "".join(parts).replace(" ", "").upper()
                header = line[1:].split()[0]  # take first token after '>'
                parts = []
            else:
                parts.append(line)

    if header is not None:
        seqs[header] = "".join(parts).replace(" ", "").upper()

    return seqs


# Load positive (RBP) and negative (non-RBP) sequences
pos_seqs = read_fasta(POS_FASTA)
neg_seqs = read_fasta(NEG_FASTA)

print(f"Loaded {len(pos_seqs)} positive (RBP) sequences")
print(f"Loaded {len(neg_seqs)} negative (non-RBP) sequences")


Loaded 3741 positive (RBP) sequences
Loaded 3712 negative (non-RBP) sequences


In [4]:
# ====================================
# Cell 2: build labeled sequence table
# ====================================

def build_labeled_df(pos_dict, neg_dict):
    rows = []
    for sid, seq in pos_dict.items():
        rows.append({"id": sid, "sequence": seq, "label": 1})
    for sid, seq in neg_dict.items():
        rows.append({"id": sid, "sequence": seq, "label": 0})
    return pd.DataFrame(rows)


df = build_labeled_df(pos_seqs, neg_seqs)
print(df.head())
print(df["label"].value_counts())


                          id  \
0  sp|A0A0A7HFE1|CAS10_STRTR   
1  sp|A0A0A7HIX6|CSM6A_STRTR   
2   sp|A0A0B4KGY6|NOVA_DROME   
3   sp|A0A0D1DWZ5|RRM4_MYCMD   
4  sp|A0A0F6B5X4|TACT3_SALT1   

                                            sequence  label  
0  MKKEKIDLFYGALLHDIGKVIQRATGERKKHALVGADWFDEIADNQ...      1  
1  MKILISAVGTTDPISNNHDAALLHIARNYRPDKIVLVYSQEMMVKQ...      1  
2  MESIMKVAMDKAAEQLIQQFGFDYLQQQLQLQHQNQHNSSPQQPQH...      1  
3  MSDSIYAPHNKHKLEAARAADAAADDAATVSALVEPTDSTAQASHA...      1  
4  MMFTDWHEAAIGKTHNRMNFDCGDADLNQFLQRHARQNHEKGTTKT...      1  
label
1    3741
0    3712
Name: count, dtype: int64


In [5]:
# ====================================
# Cell 3: AAC feature implementation
# ====================================

def compute_aac(seq, aa_order=AA_ORDER):
    """
    AAC: frequency of each amino acid in aa_order.
    Returns a 20-dim numpy array.
    """
    counts = np.zeros(len(aa_order), dtype=float)
    L = 0
    for aa in seq:
        if aa in AA_INDEX:
            counts[AA_INDEX[aa]] += 1
            L += 1
    if L > 0:
        counts = counts / L
    return counts


# Example: test on first sequence
test_aac = compute_aac(df["sequence"].iloc[0])
print("AAC length:", len(test_aac))
print("Sum of AAC:", test_aac.sum())


AAC length: 20
Sum of AAC: 1.0


In [6]:
# =========================================================
# Cell 4: placeholders for physico-chemical property table
# =========================================================

# TODO: Fill these 6 dictionaries from PRBP paper / cited sources:
# 1) pKa of amino group
# 2) pKa of carboxyl group
# 3) molecular mass
# 4) EIIP
# 5) number of lone electron pairs
# 6) Wiener index

# Example structure (numbers here are placeholders – REPLACE with real values):
PHYSICOCHEMICAL_PROPERTIES = {
    "pka_amino": {aa: 0.0 for aa in AA_ORDER},
    "pka_carboxyl": {aa: 0.0 for aa in AA_ORDER},
    "mass": {aa: 0.0 for aa in AA_ORDER},
    "eiip": {aa: 0.0 for aa in AA_ORDER},
    "lone_pairs": {aa: 0.0 for aa in AA_ORDER},
    "wiener_index": {aa: 0.0 for aa in AA_ORDER},
}

def normalize_property_vector(values_dict):
    """
    Given dict {aa: raw_value}, return np.array[20] of normalized da(i) in [0,1] as in Eq. (4).
    """
    vals = np.array([values_dict[aa] for aa in AA_ORDER], dtype=float)
    vmin = vals.min()
    vmax = vals.max()
    if vmax == vmin:
        return np.zeros_like(vals)  # avoid division by zero
    return (vals - vmin) / (vmax - vmin)


In [7]:
# ==========================================
# Cell 5: PSSM loading & normalization
# ==========================================

def load_pssm_for_sequence(seq_id, pssm_dir=PSSM_DIR):
    """
    TODO: Adapt this function to the actual format of your PSSM files.
    It should return a numpy array of shape (L, 20) with the raw scores.
    """
    path = os.path.join(pssm_dir, f"{seq_id}.pssm")
    if not os.path.exists(path):
        raise FileNotFoundError(f"PSSM not found for {seq_id}: {path}")

    # --- Example parser for a simple whitespace-separated PSSM with 20 score columns ---
    rows = []
    with open(path, "r") as f:
        for line in f:
            # You MUST adjust this condition and parsing according to your file
            parts = line.strip().split()
            # naive example: expect length >= 22, with 20 scores in positions 2:22
            if len(parts) >= 22 and parts[0].isdigit():
                scores = [float(x) for x in parts[2:22]]
                rows.append(scores)

    if len(rows) == 0:
        raise ValueError(f"No PSSM rows parsed for {seq_id}")
    pssm = np.array(rows, dtype=float)
    return pssm


def normalize_pssm(pssm):
    """
    Apply sigmoid normalization f(x) = 1 / (1 + exp(−x)) to each cell, as in Eq. (3).
    """
    return 1.0 / (1.0 + np.exp(-pssm))


In [8]:
# ==========================================
# Cell 6: EIPP feature computation
# ==========================================

def compute_eipp_features(seq_id, seq, pssm_dir=PSSM_DIR):
    """
    Compute 120-dim EIPP feature for a single sequence as in PRBP.
    """
    pssm_raw = load_pssm_for_sequence(seq_id, pssm_dir=pssm_dir)
    pssm_norm = normalize_pssm(pssm_raw)  # shape (L, 20)

    if pssm_norm.shape[0] != len(seq):
        # You may want to handle gaps or mismatches more gracefully
        raise ValueError(f"PSSM length ({pssm_norm.shape[0]}) != sequence length ({len(seq)}) for {seq_id}")

    # For each amino acid type k, we need normalized PSSM values f_k(i)
    # i indexes the 20 AA types (columns). We'll aggregate over positions where seq[pos] == k.
    # EIPP dimension: 6 properties * 20 AA types = 120
    eipp_vector = []

    # Pre-normalize all 6 property vectors da(i)
    normalized_props = {
        pname: normalize_property_vector(values_dict)
        for pname, values_dict in PHYSICOCHEMICAL_PROPERTIES.items()
    }

    for prop_name, da in normalized_props.items():  # 6 properties
        # For each residue type k in AA_ORDER
        for k, aa_k in enumerate(AA_ORDER):
            # Positions where sequence residue == aa_k
            positions = [pos for pos, aa in enumerate(seq) if aa == aa_k]
            if not positions:
                # No such residue in sequence; feature becomes 0
                eipp_vector.append(0.0)
                continue

            # For all those positions, take PSSM rows and sum sqrt(da(i)) * f_k(i)
            # Here, f_k(i) is the normalized PSSM for AA type i at those positions, averaged over positions.
            sub_pssm = pssm_norm[positions, :]  # shape (#pos, 20)
            f_k = sub_pssm.mean(axis=0)  # average over positions for this k

            contrib = np.sum(np.sqrt(da) * f_k)
            eipp_vector.append(contrib)

    return np.array(eipp_vector, dtype=float)


# Small smoke-test on first sequence (will fail until you provide real PSSMs & property table)
# eipp_test = compute_eipp_features(df["id"].iloc[0], df["sequence"].iloc[0])
# print("EIPP length:", len(eipp_test))


In [9]:
# ==========================================
# Cell 7: feature extraction for all sequences
# ==========================================

def extract_features(
    df,
    use_eipp=False,
    pssm_dir=PSSM_DIR,
):
    """
    Build feature matrix X and label vector y.
    If use_eipp=False: only AAC (20 dims).
    If use_eipp=True: AAC (20) + EIPP (120) = 140 dims.
    """
    X_list = []
    y_list = []
    ids = []

    for idx, row in df.iterrows():
        seq_id = row["id"]
        seq = row["sequence"]
        label = row["label"]

        aac = compute_aac(seq)  # 20 dims
        feats = [aac]

        if use_eipp:
            eipp = compute_eipp_features(seq_id, seq, pssm_dir=pssm_dir)  # 120 dims
            feats.append(eipp)

        X_list.append(np.concatenate(feats))
        y_list.append(label)
        ids.append(seq_id)

    X = np.vstack(X_list)
    y = np.array(y_list, dtype=int)
    ids = np.array(ids)
    return X, y, ids


# Start with AAC-only (works immediately)
X_aac, y, ids = extract_features(df, use_eipp=False)
print("Feature matrix shape (AAC-only):", X_aac.shape)


Feature matrix shape (AAC-only): (7453, 20)


In [10]:
# Once PSSM + property table are ready, you can do:
# X_full, y, ids = extract_features(df, use_eipp=True)
# print("Feature matrix shape (AAC+EIPP):", X_full.shape)


In [11]:
# ==========================================
# Cell 8: single train/test split with RF
# ==========================================

RANDOM_STATE = 42

X = X_aac  # start with AAC-only features; later you can switch to X_full
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1-score:  {f1:.4f}")
print(f"MCC:       {mcc:.4f}")
print(f"AUC:       {auc:.4f}")


Accuracy:  0.7311
Precision: 0.7245
Recall:    0.7487
F1-score:  0.7364
MCC:       0.4623
AUC:       0.8096


In [12]:
# ==========================================
# Cell 9: 5-fold CV (PRBP-style evaluation)
# ==========================================

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

scoring = {
    "accuracy": "accuracy",
    "precision": "precision",
    "recall": "recall",
    "f1": "f1",
    "roc_auc": "roc_auc",
}

cv_results = cross_validate(
    rf,
    X,
    y,
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
    return_estimator=False,
)

for metric, values in cv_results.items():
    if metric.startswith("test_"):
        name = metric.replace("test_", "")
        print(f"{name:10s}: mean={values.mean():.4f}, std={values.std():.4f}")


accuracy  : mean=0.7355, std=0.0042
precision : mean=0.7296, std=0.0092
recall    : mean=0.7522, std=0.0153
f1        : mean=0.7406, std=0.0048
roc_auc   : mean=0.8159, std=0.0053


In [13]:
# ==========================================
# CELL 10: Predict 5 random sequences
# ==========================================

# Ensure you have already trained `rf` on X and y
# (from previous cells)

# Get predictions for the whole dataset
all_probs = rf.predict_proba(X)[:, 1]   # P(RBP | sequence)
all_preds = rf.predict(X)               # 0/1 labels

results = df.copy()
results["pred_label"] = all_preds
results["pred_proba_RBP"] = all_probs

# Sample 5 random proteins
sample5 = results.sample(n=5, random_state=RANDOM_STATE)

for _, row in sample5.iterrows():
    print(f"ID:           {row['id']}")
    print(f"True label:   {row['label']}   (1 = RBP, 0 = non-RBP)")
    print(f"Predicted:    {row['pred_label']}")
    print(f"P(RBP | seq): {row['pred_proba_RBP']:.4f}")
    # Optional: don’t print full huge sequence
    seq = row["sequence"]
    print(f"Seq (first 60 aa): {seq[:60]}{'...' if len(seq) > 60 else ''}")
    print("-" * 60)


ID:           sp|O48398|GP42_BPSP1
True label:   0   (1 = RBP, 0 = non-RBP)
Predicted:    0
P(RBP | seq): 0.2140
Seq (first 60 aa): MRKFVTTLTASPRNKKVGNHRLEISPFVSLRRYYYFNTAICIENPVTREFAIDDSYGSLS...
------------------------------------------------------------
ID:           sp|Q6AQA5|RLMKL_DESPS
True label:   1   (1 = RBP, 0 = non-RBP)
Predicted:    1
P(RBP | seq): 0.7200
Seq (first 60 aa): MCDKKDVSPQKDQYTFLANCALGLEELIEAEIKGFSGVEVELGKGTVQWQGSLETGYRAC...
------------------------------------------------------------
ID:           sp|P64578|HIGB_ECOLI
True label:   1   (1 = RBP, 0 = non-RBP)
Predicted:    1
P(RBP | seq): 0.8480
Seq (first 60 aa): MHLITQKALKDAAEKYPQHKTELVALGNTIAKGYFKKPESLKAVFPSLDNFKYLDKHYVF...
------------------------------------------------------------
ID:           sp|Q976I5|RL40_SULTO
True label:   0   (1 = RBP, 0 = non-RBP)
Predicted:    0
P(RBP | seq): 0.2080
Seq (first 60 aa): MPLTDPVKLQIVQQRIFLKKVCRDCGALNSVRATKCRRCHSKNLRPKKKELPAKKG
---------------------------------------