# Get binders
Script that filters each allele's data and saves files containing only the binders

In [18]:
import os

# HLA alleles to process; every allele has its own input folder and .dat file
hla_list = [
    "A0101", "A0201", "A0202", "A0203", "A0206", "A0301", "A1101", "A2301",
    "A2402", "A2403", "A2601", "A2902", "A3001", "A3002", "A3101", "A3301",
    "A6801", "A6802", "A6901", "B0702", "B0801", "B1501", "B1801", "B2705",
    "B3501", "B4001", "B4002", "B4402", "B4403", "B4501", "B5101", "B5301",
    "B5401", "B5701", "B5801"
]

# Rows whose second column is strictly greater than this value are kept.
# NOTE(review): despite the `_nM` suffix, 0.426 looks like a log-transformed
# score (1 - log(500)/log(50000) is about 0.426) rather than a concentration
# in nM - confirm against the data format.
threshold_nM = 0.426

for hla in hla_list:
    file_path = f"/Users/mathildedue/Library/CloudStorage/OneDrive-DanmarksTekniskeUniversitet/master_bioinformatics/1.semester/22125_algorithms_in_bioinformatics/BioAlgoProject2025/data/AllFiles/{hla}/{hla}.dat"
    output_path = f"/Users/mathildedue/Library/CloudStorage/OneDrive-DanmarksTekniskeUniversitet/master_bioinformatics/1.semester/22125_algorithms_in_bioinformatics/BioAlgoProject2025/data/PSSM/{hla}/{hla}_bind.dat"
    bind_count = 0

    # Create the per-allele output folder before any file is opened
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    try:
        with open(file_path, "r") as infile, open(output_path, "w") as outfile:
            for line in infile:
                fields = line.strip().split()
                # Only rows with exactly three whitespace-separated columns count
                if len(fields) != 3:
                    continue
                try:
                    score = float(fields[1])
                except ValueError:
                    # Second column is not numeric (e.g. a header row): skip it
                    continue
                if score > threshold_nM:
                    # Copy the original line unchanged, including its newline
                    outfile.write(line)
                    bind_count += 1
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    else:
        print(f"{hla}: {bind_count} strong binders saved to {output_path}")


A0101: 103 strong binders saved to /Users/mathildedue/Library/CloudStorage/OneDrive-DanmarksTekniskeUniversitet/master_bioinformatics/1.semester/22125_algorithms_in_bioinformatics/BioAlgoProject2025/data/PSSM/A0101/A0101_bind.dat
A0201: 1181 strong binders saved to /Users/mathildedue/Library/CloudStorage/OneDrive-DanmarksTekniskeUniversitet/master_bioinformatics/1.semester/22125_algorithms_in_bioinformatics/BioAlgoProject2025/data/PSSM/A0201/A0201_bind.dat
A0202: 649 strong binders saved to /Users/mathildedue/Library/CloudStorage/OneDrive-DanmarksTekniskeUniversitet/master_bioinformatics/1.semester/22125_algorithms_in_bioinformatics/BioAlgoProject2025/data/PSSM/A0202/A0202_bind.dat
A0203: 639 strong binders saved to /Users/mathildedue/Library/CloudStorage/OneDrive-DanmarksTekniskeUniversitet/master_bioinformatics/1.semester/22125_algorithms_in_bioinformatics/BioAlgoProject2025/data/PSSM/A0203/A0203_bind.dat
A0206: 513 strong binders saved to /Users/mathildedue/Library/CloudStorage/OneD

## Split data into folds (nested 5-fold cross-validation)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
import os
from typing import List

# ------------------------
# Hobohm 1 Clustering
# ------------------------
def simple_seq_identity(a: str, b: str) -> float:
    """Return a similarity score for two strings based on edit distance.

    The score is ``1 - edit_distance / max(len(a), len(b))``, where the edit
    distance counts single-character insertions, deletions and substitutions
    (each with cost 1). Identical strings score 1.0; strings that share
    nothing score 0.0.

    Args:
        a: First sequence.
        b: Second sequence.

    Returns:
        The similarity as a Python float in the range [0, 1]. Two empty
        strings are treated as identical and score 1.0.
    """
    longest = max(len(a), len(b))
    if longest == 0:
        # Both inputs are empty: avoid 0/0 and report them as identical.
        return 1.0

    # Only the previous DP row is needed to fill the current one, so keep two
    # plain lists instead of a full (len(a)+1) x (len(b)+1) matrix.
    prev_row = list(range(len(b) + 1))
    for i, char_a in enumerate(a, start=1):
        curr_row = [i]  # distance from a[:i] to the empty prefix of b
        for j, char_b in enumerate(b, start=1):
            cost = 0 if char_a == char_b else 1
            curr_row.append(
                min(
                    prev_row[j] + 1,         # deletion
                    curr_row[j - 1] + 1,     # insertion
                    prev_row[j - 1] + cost,  # match / substitution
                )
            )
        prev_row = curr_row

    edit_distance = prev_row[-1]
    return 1 - edit_distance / longest

def hobohm1(seqs: List[str], cutoff: float = 0.3) -> np.ndarray:
    """Greedily assign each sequence to a cluster around a representative.

    Sequences are visited in the order given by ``np.argsort`` on their
    negated lengths (longest first). Each sequence joins the cluster of the
    first existing representative whose ``simple_seq_identity`` with it is at
    least ``cutoff``; if none qualifies, it becomes a new representative and
    opens a new cluster.

    Args:
        seqs: Sequences to cluster.
        cutoff: Minimum identity for a sequence to join a representative.

    Returns:
        Integer array aligned with ``seqs`` holding each sequence's cluster id.
        Ids are assigned 0, 1, 2, ... in the order representatives are found.
    """
    cluster_of = np.full(len(seqs), -1, int)
    representatives = []

    for seq_idx in np.argsort([-len(s) for s in seqs]):
        candidate = seqs[seq_idx]
        # First representative similar enough to the candidate, if any
        matched_rep = next(
            (
                rep
                for rep in representatives
                if simple_seq_identity(candidate, seqs[rep]) >= cutoff
            ),
            None,
        )
        if matched_rep is None:
            # Nothing similar enough: the candidate founds a new cluster
            cluster_of[seq_idx] = len(representatives)
            representatives.append(seq_idx)
        else:
            cluster_of[seq_idx] = cluster_of[matched_rep]

    return cluster_of

# ------------------------
# Main Processing Function
# ------------------------
def process_peptide_file(input_path: str, output_dir: str = "cv_splits", cutoff: float = 0.9):
    """Split one peptide file into cluster-aware cross-validation CSV files.

    The input is read as whitespace-separated rows of three columns
    (sequence, affinity, third column ignored). Rows with a different column
    count, or whose affinity is not numeric (e.g. a header line), are skipped.
    Sequences are clustered with ``hobohm1`` and the cluster ids are used as
    groups so that a cluster never spans two folds.

    Only the first of the five outer folds is materialised:

    * ``e000.csv``: the held-out evaluation part of that outer fold.
    * ``f001.csv`` .. ``f004.csv``: inner training parts.
    * ``c001.csv`` .. ``c004.csv``: the matching inner test parts.

    Each CSV has the columns ``sequence``, ``affinity`` and ``label``.

    NOTE(review): ``label`` is 1 when ``affinity < 0.5``. The binder-filter
    cell in this notebook keeps rows with a value *greater* than its
    threshold, so the direction here may be inverted - confirm. The label is
    written to the CSVs; it does not influence how ``GroupKFold`` splits.

    Args:
        input_path: Path to the peptide ``.dat`` file.
        output_dir: Directory for the CSV files; created if missing.
        cutoff: Identity cutoff forwarded to ``hobohm1``.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        ValueError: If the file contains no usable rows (raised here), or if
            scikit-learn cannot form the requested number of folds from the
            clusters.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Stream the file and keep only well-formed (sequence, affinity, _) rows
    sequences = []
    affinities = []
    with open(input_path, 'r') as f:
        for line in f:
            parts = line.split()
            if len(parts) != 3:
                continue
            seq, affinity_str, _ = parts
            try:
                affinity = float(affinity_str)
            except ValueError:
                # Non-numeric affinity: skip, as the binder filter does
                continue
            sequences.append(seq)
            affinities.append(affinity)

    if not sequences:
        raise ValueError(f"No valid peptide records found in {input_path}")

    df = pd.DataFrame({
        "sequence": sequences,
        "affinity": affinities
    })
    df["label"] = (df["affinity"] < 0.5).astype(int)

    # Cluster ids act as groups so similar peptides stay in the same fold
    groups = hobohm1(df["sequence"].tolist(), cutoff=cutoff)

    # Outer CV: only the first evaluation fold is generated
    outer_cv = GroupKFold(n_splits=5)
    train_val_idx, eval_idx = next(outer_cv.split(df["sequence"], df["label"], groups))
    df.iloc[eval_idx].to_csv(os.path.join(output_dir, "e000.csv"), index=False)

    # Inner CV on the remaining data, reusing the same cluster ids
    train_val_df = df.iloc[train_val_idx]
    inner_groups = groups[train_val_idx]
    inner_cv = GroupKFold(n_splits=4)
    inner_splits = inner_cv.split(train_val_df["sequence"], train_val_df["label"], inner_groups)

    for fold_no, (train_idx, test_idx) in enumerate(inner_splits, start=1):
        train_val_df.iloc[train_idx].to_csv(os.path.join(output_dir, f"f00{fold_no}.csv"), index=False)
        train_val_df.iloc[test_idx].to_csv(os.path.join(output_dir, f"c00{fold_no}.csv"), index=False)

# ------------------------
# Run the split for a single allele file
# ------------------------
# Point input_file at the actual .dat file to split:
input_file = "B0801.dat"
output_dir = "splits"
cutoff = 0.9

process_peptide_file(input_path=input_file, output_dir=output_dir, cutoff=cutoff)
