In [1]:
import pandas as pd
import re

# Input files
metal_site_path = "/home/qkrgangeun/LigMet/code/text/biolip/metal_binding_sites_NOSSE.csv"
val_pdb_path = "/home/qkrgangeun/LigMet/code/text/biolip/filtered/val_pdbs_filtered.txt"

# Three-letter codes of the 20 standard amino acids
standard_aas = {
    'ALA', 'ARG', 'ASN', 'ASP', 'CYS',
    'GLN', 'GLU', 'GLY', 'HIS', 'ILE',
    'LEU', 'LYS', 'MET', 'PHE', 'PRO',
    'SER', 'THR', 'TRP', 'TYR', 'VAL',
}


def is_standard_site(binding_str):
    """Tell whether a site's binding residues are all standard amino acids.

    Every run of word characters wrapped in single quotes inside
    ``binding_str`` is treated as a residue name and looked up in
    ``standard_aas`` (case-sensitive).

    Args:
        binding_str: Text holding single-quoted residue names.

    Returns:
        True when no quoted name falls outside ``standard_aas`` -- which
        includes a string containing no quoted names at all -- else False.
    """
    for residue_name in re.findall(r"'(\w+)'", binding_str):
        if residue_name not in standard_aas:
            return False
    return True

# 1. Validation PDB IDs: one per line, lowercased, blank lines skipped
with open(val_pdb_path, "r") as f:
    val_pdbs = {line.strip().lower() for line in f if line.strip()}

# 2. Metal-site table, restricted to the validation PDBs
df = pd.read_csv(metal_site_path)
df["pdb_id"] = df["PDB ID"].str.lower()
# Take an explicit copy of the filtered rows: assigning a new column to a
# boolean-mask slice is what triggers pandas' SettingWithCopyWarning.
df = df[df["pdb_id"].isin(val_pdbs)].copy()
df["is_standard"] = df["Binding Residues"].apply(is_standard_site)

# 3. Split PDBs into groups according to the composition of their sites.
#    Validation PDBs with no row in the metal-site table end up in no group.
group_a, group_b, group_c = [], [], []

# One list of per-site flags for each PDB
pdb_to_sites = df.groupby("pdb_id")["is_standard"].apply(list)

for pdb_id, flags in pdb_to_sites.items():
    if all(flags):
        # every site of this PDB is flagged standard
        group_a.append(pdb_id)
    elif any(flags):
        # some sites are flagged standard, some are not
        group_b.append(pdb_id)
    else:
        # no site of this PDB is flagged standard
        group_c.append(pdb_id)

# 4. Print the result
print("✔ Group A (standard-only):", group_a)
print("✔ Group B (mixed):", group_b)
print("✔ Group C (ligand-only):", group_c)


✔ Group A (standard-only): ['5xwm', '5ya6', '5z34', '5z6e', '5zns', '5znt', '5zo4', '5zo5', '5zru', '5zt7', '6a2w', '6a48', '6a8z', '6a93', '6a94', '6aef', '6ai0', '6bt9', '6c72', '6ck1', '6cmk', '6cnl', '6cpt', '6cpu', '6cqg', '6cxb', '6d2s', '6d2y', '6d9f', '6ddt', '6ddu', '6dk9', '6dkc', '6dkd', '6dtk', '6dyo', '6e60', '6e61', '6e90', '6edw', '6eet', '6elt', '6elv', '6f98', '6fhg', '6fok', '6fop', '6foy', '6fp5', '6fpy', '6frw', '6gel', '6gez', '6gf7', '6gka', '6gkb', '6gkc', '6goc', '6gos', '6gpx', '6grg', '6grh', '6gri', '6gs8', '6guu', '6gux', '6guy', '6guz', '6h3a', '6h4l', '6h5k', '6h5m', '6h5u', '6h6n', '6h6o', '6h6p', '6h7t', '6h87', '6h98', '6h99', '6hb0', '6hbd', '6hbe', '6hbm', '6hbs', '6hbv', '6hcz', '6hd2', '6hft', '6hk5', '6hk9', '6hkv', '6hpd', '6hq4', '6hq7', '6hqv', '6hr1', '6hr5', '6htj', '6hur', '6hus', '6hwj', '6hyh', '6hzn', '6hzz', '6i01', '6i02', '6i1q', '6i1t', '6i5r', '6i5w', '6i7s', '6i9h', '6iah', '6ibh', '6ibi', '6ibj', '6idy', '6ie3', '6ifc', '6iff', '6if

In [2]:
import re
import numpy as np

# 1. PDB lists (lowercase) for groups A, B, C. They are normally produced by
#    the grouping cell; uncomment to override them by hand.
# group_a = ['6kq1']
# group_b = []
# group_c = ['6y4f']

# 2. Log file path
log_path = "/home/qkrgangeun/LigMet/benchmark/test_chain1_pre3.log"

# 3. Parse per-PDB performance from the log
results = {}  # {pdb_id: {'precision': x, 'recall': y, 'type_accuracy': z}}

with open(log_path, 'r') as f:
    lines = f.readlines()

# Metric lines are attributed to the most recent "=== PDB: [...] ===" header.
pdb_id = None
for line in lines:
    pdb_match = re.match(r"=== PDB: \['(\w+)'\] ===", line)
    if pdb_match:
        pdb_id = pdb_match.group(1).lower()
        results[pdb_id] = {}

    if pdb_id and "threshold 0.5 | precision:" in line:
        # Guard the match: a line can contain the marker text while its values
        # are not plain decimals (e.g. "nan"); indexing findall()[0] would then
        # raise IndexError. Such lines are skipped.
        pr_rc_match = re.search(r"precision: ([0-9.]+) \| recall: ([0-9.]+)", line)
        if pr_rc_match:
            pr, rc = pr_rc_match.groups()
            results[pdb_id]["precision"] = float(pr)
            results[pdb_id]["recall"] = float(rc)

    if pdb_id and "threshold 0.5 | type_accuracy:" in line:
        # Same guard as above for the type-accuracy value.
        acc_match = re.search(r"type_accuracy: ([0-9.]+)", line)
        if acc_match:
            acc = acc_match.group(1)
            results[pdb_id]["type_accuracy"] = float(acc)

# 4. Per-group mean performance
def group_avg(group_ids, group_name, results_by_pdb=None):
    """Print mean precision, recall and type accuracy (threshold 0.5) for a group.

    Only PDBs that carry all three metrics are averaged. A PDB whose metrics
    dict is incomplete (for example, still empty) is left out instead of
    raising KeyError, and the number of such PDBs is reported on an extra line.
    A group with nothing to average prints ``nan`` for each metric.

    Args:
        group_ids: Iterable of PDB IDs, looked up as-is in the metrics mapping.
        group_name: Label printed in the report header.
        results_by_pdb: Optional mapping
            ``{pdb_id: {"precision": p, "recall": r, "type_accuracy": a}}``.
            When omitted, the module-level ``results`` dict is used.

    Returns:
        None. The report is written to stdout.
    """
    metrics_by_pdb = results if results_by_pdb is None else results_by_pdb
    metric_keys = ("precision", "recall", "type_accuracy")

    known = [metrics_by_pdb[pdb] for pdb in group_ids if pdb in metrics_by_pdb]
    complete = [m for m in known if all(key in m for key in metric_keys)]

    def _mean(key):
        # np.mean([]) returns nan but also emits a RuntimeWarning; produce the
        # nan directly so an empty group prints cleanly.
        if not complete:
            return float("nan")
        return float(np.mean([m[key] for m in complete]))

    print(f"\n📊 {group_name} (PDB 수: {len(complete)}):")
    print(f"  - Precision (threshold 0.5): {_mean('precision'):.4f}")
    print(f"  - Recall (threshold 0.5): {_mean('recall'):.4f}")
    print(f"  - Type Accuracy (threshold 0.5): {_mean('type_accuracy'):.4f}")

    skipped = len(known) - len(complete)
    if skipped:
        print(f"  - Skipped {skipped} PDB(s) with incomplete metrics")

# 5. Report each group in turn
for _members, _label in (
    (group_a, "Group A - standard-only"),
    (group_b, "Group B - mixed"),
    (group_c, "Group C - ligand-only"),
):
    group_avg(_members, _label)



📊 Group A - standard-only (PDB 수: 651):
  - Precision (threshold 0.5): 0.3687
  - Recall (threshold 0.5): 0.7051
  - Type Accuracy (threshold 0.5): 0.7260

📊 Group B - mixed (PDB 수: 84):
  - Precision (threshold 0.5): 0.4472
  - Recall (threshold 0.5): 0.8264
  - Type Accuracy (threshold 0.5): 0.6458

📊 Group C - ligand-only (PDB 수: 191):
  - Precision (threshold 0.5): 0.4061
  - Recall (threshold 0.5): 0.8147
  - Type Accuracy (threshold 0.5): 0.7392


In [None]:
import pandas as pd
import re

# Paths
log_path = "/home/qkrgangeun/LigMet/benchmark/test_chain1_pre3.log"
metal_site_path = "/home/qkrgangeun/LigMet/code/text/biolip/metal_binding_sites_NOSSE.csv"

# 🎯 group_a must hold the real Group A PDB IDs (lowercase); it is read by the
#    classification step below. Uncomment to set it by hand.
# group_a = ['6kq1', '6y4f']

# 1. Extract precision / recall from the log
results = {}
with open(log_path, 'r') as f:
    lines = f.readlines()

# Metric lines are attributed to the most recent "=== PDB: [...] ===" header.
pdb_id = None
for line in lines:
    match = re.match(r"=== PDB: \['(\w+)'\] ===", line)
    if match:
        pdb_id = match.group(1).lower()
        results[pdb_id] = {}
    if pdb_id and "threshold 0.5 | precision:" in line:
        # Guard the match: a line can contain the marker text while its values
        # are not plain decimals (e.g. "nan"); indexing findall()[0] would then
        # raise IndexError. Such lines are skipped.
        pr_rc_match = re.search(r"precision: ([0-9.]+) \| recall: ([0-9.]+)", line)
        if pr_rc_match:
            pr, rc = pr_rc_match.groups()
            results[pdb_id]["precision"] = float(pr)
            results[pdb_id]["recall"] = float(rc)

# 2. Split Group A into high_pr / low_pr.
#    high_pr: precision > 0.5 and recall > 0.7
#    low_pr:  precision <= 0.5 and recall <= 0.7
#    PDBs that satisfy only one of the two conditions land in neither list.
high_pr, low_pr = [], []
for pdb in group_a:
    metrics = results.get(pdb, {})
    # A PDB header can be logged without its metric line, leaving an empty
    # dict; skip it rather than raising KeyError.
    if "precision" not in metrics or "recall" not in metrics:
        continue
    pr, rc = metrics["precision"], metrics["recall"]
    if pr > 0.5 and rc > 0.7:
        high_pr.append(pdb)
    elif pr <= 0.5 and rc <= 0.7:
        low_pr.append(pdb)

# 3. Load the metal-site table and count binding residues per site
MIN_BINDING_RESIDUES = 3  # single source of truth for the filter and the messages

df = pd.read_csv(metal_site_path)
df["pdb_id"] = df["PDB ID"].str.lower()
# A residue is any run of word characters wrapped in single quotes.
# Missing cells are counted as zero residues instead of raising TypeError.
df["num_residues"] = (
    df["Binding Residues"]
    .fillna("")
    .apply(lambda s: len(re.findall(r"'(\w+)'", s)))
)

# 4. Keep sites of high_pr / low_pr PDBs with binding residues >= MIN_BINDING_RESIDUES
enough_residues = df["num_residues"] >= MIN_BINDING_RESIDUES
high_df = df[df["pdb_id"].isin(high_pr) & enough_residues]
low_df = df[df["pdb_id"].isin(low_pr) & enough_residues]

# 5. Save (written to the current working directory)
high_df.to_csv("nosse_filtered_high_pr.csv", index=False)
low_df.to_csv("nosse_filtered_low_pr.csv", index=False)

print("✔ 파일 저장 완료:")
print(f" - nosse_filtered_high_pr.csv (binding residues >= {MIN_BINDING_RESIDUES})")
print(f" - nosse_filtered_low_pr.csv (binding residues >= {MIN_BINDING_RESIDUES})")


✔ 파일 저장 완료:
 - nosse_filtered_high_pr.csv (binding residues >= 3)
 - nosse_filtered_low_pr.csv (binding residues >= 3)
