In [4]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm
from scipy.spatial.distance import cdist

# Configuration
pdb_file = Path("/home/qkrgangeun/LigMet/code/text/biolip/filtered/train_pdbs_chain1_10000.txt")
dl_dir = Path("/home/qkrgangeun/LigMet/data/biolip_backup/dl/features")
metal_label_dir = Path("/home/qkrgangeun/LigMet/data/biolip_backup/metal_label")
rf_dir = Path("/home/qkrgangeun/LigMet/data/biolip_backup/rf/grid_prob")
rf_threshold = 0.5  # keep grid points whose RF probability is >= this value
distance_cutoff = 2.0  # grid point counts when its distance to a metal is <= this (presumably Å, per the column name)
target_metals = {"MG", "ZN", "MN", "CA", "FE", "NI", "CO", "CU", "K"}

# Per metal type: running total of retained grid points within the cutoff,
# summed over every metal site of that type in every processed structure.
metal_to_counts = defaultdict(int)

# Load the list of PDB ids (one per line). Blank lines are dropped so they do
# not turn into lookups of a file literally named ".npz".
with open(pdb_file) as f:
    pdb_ids = [stripped for stripped in map(str.strip, f) if stripped]

# Compute statistics
for pdb_id in tqdm(pdb_ids, desc="Processing"):
    try:
        feature_path = dl_dir / f"{pdb_id}.npz"
        rf_path = rf_dir / f"{pdb_id}.npz"
        metal_path = metal_label_dir / f"{pdb_id}.npz"

        # np.load on an .npz returns a lazily-read NpzFile that holds an open
        # file handle; the context manager closes all three per iteration
        # instead of leaking them across the whole id list.
        # NOTE: allow_pickle=True can execute code embedded in the file; only
        # use it on trusted, locally generated data.
        with np.load(feature_path, allow_pickle=True) as data, \
                np.load(rf_path) as rf_data, \
                np.load(metal_path, allow_pickle=True) as metal_data:
            # Indexing materialises each array, so they stay valid after close.
            metal_positions = metal_data["metal_positions"]
            metal_types = metal_data["metal_types"]
            grid_positions = data["grid_positions"]
            grid_probs = rf_data["prob"]

        # Keep only the grid points the RF model scored at or above threshold.
        grid_mask = grid_probs >= rf_threshold
        filtered_grids = grid_positions[grid_mask]

        for metal_pos, metal_type in zip(metal_positions, metal_types):
            if metal_type not in target_metals:
                continue
            # Distances from every retained grid point to this one metal site.
            dists = cdist(filtered_grids, [metal_pos])
            num_within_2A = np.count_nonzero(dists <= distance_cutoff)
            metal_to_counts[metal_type] += num_within_2A
    except Exception as e:
        # Best-effort run: a missing or malformed file skips this structure.
        # The id is included so the failure can be traced back to its files.
        print(f"[{pdb_id}] error: {e}")
        continue

# Save results
df = pd.DataFrame(metal_to_counts.items(), columns=["metal_type", "grid_points_within_2A"])
df.to_csv("metal_rf_overlap_stats.csv", index=False)
print(df)


Processing: 100%|██████████| 15622/15622 [08:14<00:00, 31.56it/s]  

  metal_type  grid_points_within_2A
0         MN                  19655
1         ZN                  62491
2         MG                  46310
3         CA                 117550
4         CU                   7502
5         FE                   6331
6         CO                   4666
7          K                    471
8         NI                    296





In [11]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm
from scipy.spatial.distance import cdist

# NOTE(review): this cell is a verbatim copy of the In [4] cell above (same
# configuration, same output file); consider keeping only one of them.

# Configuration
pdb_file = Path("/home/qkrgangeun/LigMet/code/text/biolip/filtered/train_pdbs_chain1_10000.txt")
dl_dir = Path("/home/qkrgangeun/LigMet/data/biolip_backup/dl/features")
metal_label_dir = Path("/home/qkrgangeun/LigMet/data/biolip_backup/metal_label")
rf_dir = Path("/home/qkrgangeun/LigMet/data/biolip_backup/rf/grid_prob")
rf_threshold = 0.5  # keep grid points whose RF probability is >= this value
distance_cutoff = 2.0  # grid point counts when its distance to a metal is <= this (presumably Å, per the column name)
target_metals = {"MG", "ZN", "MN", "CA", "FE", "NI", "CO", "CU", "K"}

# Per metal type: running total of retained grid points within the cutoff,
# summed over every metal site of that type in every processed structure.
metal_to_counts = defaultdict(int)

# Load the list of PDB ids (one per line). Blank lines are dropped so they do
# not turn into lookups of a file literally named ".npz".
with open(pdb_file) as f:
    pdb_ids = [stripped for stripped in map(str.strip, f) if stripped]

# Compute statistics
for pdb_id in tqdm(pdb_ids, desc="Processing"):
    try:
        feature_path = dl_dir / f"{pdb_id}.npz"
        rf_path = rf_dir / f"{pdb_id}.npz"
        metal_path = metal_label_dir / f"{pdb_id}.npz"

        # np.load on an .npz returns a lazily-read NpzFile that holds an open
        # file handle; the context manager closes all three per iteration
        # instead of leaking them across the whole id list.
        # NOTE: allow_pickle=True can execute code embedded in the file; only
        # use it on trusted, locally generated data.
        with np.load(feature_path, allow_pickle=True) as data, \
                np.load(rf_path) as rf_data, \
                np.load(metal_path, allow_pickle=True) as metal_data:
            # Indexing materialises each array, so they stay valid after close.
            metal_positions = metal_data["metal_positions"]
            metal_types = metal_data["metal_types"]
            grid_positions = data["grid_positions"]
            grid_probs = rf_data["prob"]

        # Keep only the grid points the RF model scored at or above threshold.
        grid_mask = grid_probs >= rf_threshold
        filtered_grids = grid_positions[grid_mask]

        for metal_pos, metal_type in zip(metal_positions, metal_types):
            if metal_type not in target_metals:
                continue
            # Distances from every retained grid point to this one metal site.
            dists = cdist(filtered_grids, [metal_pos])
            num_within_2A = np.count_nonzero(dists <= distance_cutoff)
            metal_to_counts[metal_type] += num_within_2A
    except Exception as e:
        # Best-effort run: a missing or malformed file skips this structure.
        # The id is included so the failure can be traced back to its files.
        print(f"[{pdb_id}] error: {e}")
        continue

# Save results
df = pd.DataFrame(metal_to_counts.items(), columns=["metal_type", "grid_points_within_2A"])
df.to_csv("metal_rf_overlap_stats.csv", index=False)
print(df)


Processing: 100%|██████████| 15622/15622 [00:15<00:00, 1022.95it/s]

  metal_type  grid_points_within_2A
0         MN                  19655
1         ZN                  62491
2         MG                  46310
3         CA                 117550
4         CU                   7502
5         FE                   6331
6         CO                   4666
7          K                    471
8         NI                    296





In [10]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm
from scipy.spatial.distance import cdist

# Configuration
pdb_file = Path("/home/qkrgangeun/LigMet/code/text/biolip/filtered/train_pdbs_chain1_10000.txt")
dl_dir = Path("/home/qkrgangeun/LigMet/data/biolip_backup/dl/features")
metal_label_dir = Path("/home/qkrgangeun/LigMet/data/biolip_backup/metal_label")
rf_dir = Path("/home/qkrgangeun/LigMet/data/biolip_backup/rf/grid_prob")
# NOTE(review): with a threshold of 0.0 the `>=` mask below keeps every grid
# point whose probability is non-negative, so this run is effectively
# unfiltered by the RF score (presumably an intentional baseline — confirm).
rf_threshold = 0.0
distance_cutoff = 2.0  # grid point counts when its distance to a metal is <= this (presumably Å, per the column name)
target_metals = {"MG", "ZN", "MN", "CA", "FE", "NI", "CO", "CU", "K"}

metal_to_counts = defaultdict(int)   # retained grid points within the cutoff, per metal type
metal_to_total = defaultdict(int)    # number of metal sites seen, per metal type

# Load the list of PDB ids (one per line). Blank lines are dropped so they do
# not turn into lookups of a file literally named ".npz".
with open(pdb_file) as f:
    pdb_ids = [stripped for stripped in map(str.strip, f) if stripped]

# Compute statistics
for pdb_id in tqdm(pdb_ids, desc="Processing"):
    try:
        feature_path = dl_dir / f"{pdb_id}.npz"
        rf_path = rf_dir / f"{pdb_id}.npz"
        metal_path = metal_label_dir / f"{pdb_id}.npz"

        # np.load on an .npz returns a lazily-read NpzFile that holds an open
        # file handle; the context manager closes all three per iteration
        # instead of leaking them across the whole id list.
        # NOTE: allow_pickle=True can execute code embedded in the file; only
        # use it on trusted, locally generated data.
        with np.load(feature_path, allow_pickle=True) as data, \
                np.load(rf_path) as rf_data, \
                np.load(metal_path, allow_pickle=True) as metal_data:
            # Indexing materialises each array, so they stay valid after close.
            metal_positions = metal_data["metal_positions"]
            metal_types = metal_data["metal_types"]
            grid_positions = data["grid_positions"]
            grid_probs = rf_data["prob"]

        # Keep only the grid points the RF model scored at or above threshold.
        grid_mask = grid_probs >= rf_threshold
        filtered_grids = grid_positions[grid_mask]

        for metal_pos, metal_type in zip(metal_positions, metal_types):
            if metal_type not in target_metals:
                continue
            # Distances from every retained grid point to this one metal site.
            dists = cdist(filtered_grids, [metal_pos])
            num_within_2A = np.count_nonzero(dists <= distance_cutoff)
            metal_to_counts[metal_type] += num_within_2A
            metal_to_total[metal_type] += 1  # one more metal site of this type
    except Exception as e:
        # Best-effort run: a missing or malformed file skips this structure.
        print(f"[{pdb_id}] error: {e}")
        continue

# Save results: one row per target metal (alphabetical), including metals that
# were never encountered, for which the average is reported as 0.0.
df = pd.DataFrame([
    {
        "metal_type": metal,
        "grid_points_within_2A": metal_to_counts[metal],
        "metal_sites_total": metal_to_total[metal],
        "avg_grids_per_metal": (
            metal_to_counts[metal] / metal_to_total[metal]
            if metal_to_total[metal] > 0 else 0.0
        )
    }
    for metal in sorted(target_metals)
])

# NOTE(review): the other statistics cells in this notebook write this same
# file name with a two-column schema; running this cell overwrites that file.
df.to_csv("metal_rf_overlap_stats.csv", index=False)
print(df)


Processing: 100%|██████████| 15622/15622 [00:23<00:00, 653.59it/s]

  metal_type  grid_points_within_2A  metal_sites_total  avg_grids_per_metal
0         CA                 118241               8292            14.259648
1         CO                   4672                548             8.525547
2         CU                   7579               1042             7.273512
3         FE                   6363                846             7.521277
4          K                    471                 31            15.193548
5         MG                  46490               3819            12.173344
6         MN                  19682               1871            10.519508
7         NI                    296                 38             7.789474
8         ZN                  62780               8650             7.257803



