In [2]:
import os
import re

# DNA residue 파일 경로
dna_residue_dir = "/applic/rosetta/2023.45/database/chemical/residue_type_sets/fa_standard/residue_types/nucleic/rna_legacy"

# 결과 저장 딕셔너리 초기화
residue_atomtype_mapping = {}
residue_charge_mapping = {}

# 수소 원자 필터링을 위한 패턴
hydrogen_pattern = re.compile(r"^H\d*")  # H 또는 H숫자로 시작하는 경우

# DNA 폴더 내 모든 .params 파일 처리
for file_name in os.listdir(dna_residue_dir):
    if file_name.endswith(".params"):
        file_path = os.path.join(dna_residue_dir, file_name)
        
        with open(file_path, "r") as f:
            lines = f.readlines()
        
        res_name = None
        atom_charge_dict = {}

        for line in lines:
            if line.startswith("IO_STRING"):
                res_name = line.split()[1]  # DA, DC, DG, DT 등
            
            if line.startswith("ATOM"):
                parts = line.split()
                if len(parts) >= 5:  # 최소한 5개의 컬럼이 존재해야 함
                    atom_name = parts[1]  # 두 번째 컬럼 (ATOM명)
                    
                    # 수소 원자는 제외
                    if hydrogen_pattern.match(atom_name):
                        continue
                    
                    atom_type = parts[2]  # 세 번째 컬럼 (Atom Type)
                    try:
                        charge = float(parts[4])  # 다섯 번째 컬럼 (전하 값)
                    except ValueError:
                        charge = None  # 전하 값이 없으면 None 처리

                    atom_charge_dict[atom_name] = charge
                    residue_atomtype_mapping[(res_name, atom_name)] = atom_type

        if res_name:
            residue_charge_mapping[res_name] = atom_charge_dict

# 결과 출력 (테스트용)
print("Residue AtomType Mapping:", residue_atomtype_mapping)
print("Residue Charge Mapping:", residue_charge_mapping)


Residue AtomType Mapping: {('U', 'P'): 'Phos', ('U', 'OP2'): 'OOC', ('U', 'OP1'): 'OOC', ('U', "O5'"): 'Oet2', ('U', "C5'"): 'CH2', ('U', "C4'"): 'CH1', ('U', "O4'"): 'Oet3', ('U', "C3'"): 'CH1', ('U', "O3'"): 'Oet2', ('U', "C1'"): 'CH1', ('U', "C2'"): 'CH2', ('U', "O2'"): 'OH', ('U', 'N1'): 'Ntrp', ('U', 'C2'): 'CObb', ('U', 'O2'): 'OCbb', ('U', 'N3'): 'Ntrp', ('U', 'C4'): 'CObb', ('U', 'O4'): 'OCbb', ('U', 'C5'): 'aroC', ('U', 'C6'): 'aroC', ('A', 'P'): 'Phos', ('A', 'OP2'): 'OOC', ('A', 'OP1'): 'OOC', ('A', "O5'"): 'Oet2', ('A', "C5'"): 'CH2', ('A', "C4'"): 'CH1', ('A', "O4'"): 'Oet3', ('A', "C3'"): 'CH1', ('A', "O3'"): 'Oet2', ('A', "C1'"): 'CH1', ('A', "C2'"): 'CH2', ('A', "O2'"): 'OH', ('A', 'N1'): 'Nhis', ('A', 'C2'): 'aroC', ('A', 'N3'): 'Nhis', ('A', 'C4'): 'aroC', ('A', 'C5'): 'aroC', ('A', 'C6'): 'aroC', ('A', 'N6'): 'NH2O', ('A', 'N7'): 'Nhis', ('A', 'C8'): 'aroC', ('A', 'N9'): 'Ntrp', ('G', 'P'): 'Phos', ('G', 'OP2'): 'OOC', ('G', 'OP1'): 'OOC', ('G', "O5'"): 'Oet2', ('G',

In [3]:
import numpy as np
ex = np.zeros((2,3))
mask = np.ones((2,3)) == 1
ex[mask] = 'L'
print(ex)


ValueError: could not convert string to float: 'L'

pdb로부터 features까지 one shot

정답 metal 이 없으면 structure에 none이 들어가서 오류남

In [8]:
from ligmet.utils.pdb import read_pdb, StructureWithGrid
from ligmet.featurizer import Features, make_pdb, process_pdb, secondary_struct,calculate_sasa, q_per_atom, cov_bonds_mask
from ligmet.utils.grid import sasa_grids, filter_by_clashmap
from dataclasses import asdict
from rdkit import Chem
pdb_path = '/home/qkrgangeun/LigMet/code/src/ligmet/utils/examples/1a05_ligand.pdb'
structure = read_pdb(pdb_path)
grids = sasa_grids(structure.atom_positions, structure.atom_elements)
grids = filter_by_clashmap(grids)
structure_dict = asdict(structure)
structure_with_grid = StructureWithGrid(
    grid_positions= grids,
    **structure_dict  # structure_dict의 내용을 추가
)
pdb_io, protein_io, ligand_io = make_pdb(structure_with_grid)
ligand_pdb_str = ligand_io.getvalue()
ligand_mol = None
if ligand_pdb_str.strip():
    ligand_mol = Chem.MolFromPDBBlock(ligand_pdb_str, removeHs=False)
pdb_path_new = process_pdb(pdb_io)
sasa = calculate_sasa(pdb_path_new)  # PDB 파일 대신 StringIO 사용
qs = q_per_atom(ligand_mol, structure_with_grid)
sec_structs = secondary_struct(pdb_path_new, structure_with_grid)  # PDB 파일 대신 StringIO 사용
bond_masks = cov_bonds_mask(structure, ligand_mol)
features= Features(**structure_dict, grid_positions=grids,sasas=sasa, qs=qs, sec_structs=sec_structs,bond_masks=bond_masks)

In [9]:
print(structure)
print(features.sasas)
print(features.qs)
print(features.sec_structs)
print(features.bond_masks)
print(sum(features.is_ligand))
print(len(features.is_ligand))
print(features.atom_residues)

Structure(atom_positions=array([[ 3.006, 21.67 , 74.626],
       [ 2.164, 20.55 , 75.398],
       [ 4.352, 21.981, 74.058],
       ...,
       [-1.341, 24.284, 19.587],
       [-1.512, 21.621, 23.485],
       [-0.298, 21.62 , 21.675]], dtype=float32), atom_names=array(['N', 'O', 'CA', ..., 'O3', 'O4', 'O5'], dtype='<U3'), atom_elements=array(['N', 'O', 'C', ..., 'O', 'O', 'O'], dtype='<U1'), atom_residues=array(['MET', 'MET', 'MET', ..., 'IPM', 'IPM', 'IPM'], dtype='<U3'), residue_idxs=array([  1,   1,   1, ..., 402, 402, 402]), chain_ids=array(['A', 'A', 'A', ..., 'B', 'B', 'B'], dtype='<U1'), is_ligand=array([0, 0, 0, ..., 1, 1, 1]), metal_positions=array([[ 9.923, 25.875, 41.811],
       [-1.037, 25.652, 23.76 ]], dtype=float32), metal_types=array(['MG', 'MG'], dtype='<U2'))
[0.44835146 1.04584954 0.06670686 ... 0.63338535 0.83233149 1.15937971]
[-0.6046255  -0.6884871   0.0900506  ... -0.36595403 -0.36782612
 -0.36782612]
[8 8 8 ... 7 7 7]
[[0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0.

make_sasa 와 freesasa비교

In [17]:
from ligmet.utils.rf.rf_features import RSA
import time
one = time.time()
sasa1 = RSA(features.grid_positions,features.atom_positions,features.atom_elements)
print(features.grid_positions[200])
two = time.time()
print(two-one)
print('0',sasa1[0]*105.9)
print('100',sasa1[200]*105.9)
for a in sasa1*105.9:
    print(a)


[18.98664724 27.26700089 60.66096108]
0.8896842002868652
0 93.19200000000001
100 69.894
93.19200000000001
95.31
82.602
91.074
63.54
82.602
88.956
88.956
80.48400000000001
99.546
99.546
82.602
80.48400000000001
88.956
88.956
95.31
99.546
95.31
95.31
78.366
91.074
97.42800000000001
91.074
95.31
82.602
101.664
74.13
80.48400000000001
46.596000000000004
76.248
93.19200000000001
82.602
69.894
76.248
91.074
78.366
101.664
91.074
84.72000000000001
105.9
86.838
82.602
82.602
78.366
103.78200000000001
101.664
67.77600000000001
82.602
57.18600000000001
80.48400000000001
84.72000000000001
93.19200000000001
101.664
84.72000000000001
93.19200000000001
63.54
78.366
78.366
101.664
55.068000000000005
31.77
38.124
44.478
59.30400000000001
65.658
80.48400000000001
86.838
65.658
91.074
69.894
86.838
97.42800000000001
93.19200000000001
91.074
91.074
101.664
69.894
74.13
95.31
103.78200000000001
86.838
101.664
76.248
76.248
88.956
76.248
80.48400000000001
88.956
93.19200000000001
50.832
88.956
74.13
74.13


In [18]:
from ligmet.featurizer import calculate_sasa
pdb_path = '/home/qkrgangeun/LigMet/code/src/ligmet/utils/examples/1a05_ligand.pdb'
sasa2 = calculate_sasa(pdb_path)
for b in zip(sasa2*50):
    print(b)

(50.65719804091402,)
(32.96998027342346,)
(9.15553123530102,)
(21.590613411997037,)
(3.3217944348137873,)
(0.0,)
(8.15996929280209,)
(21.454441430288433,)
(12.653581264154997,)
(0.0,)
(29.822995257596467,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(10.455670869787596,)
(1.5016461586041416,)
(41.548568852262626,)
(19.935199791341354,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(2.9477752782922875,)
(0.7070316726857818,)
(1.1421985092308435,)
(11.577596982604428,)
(33.245469792367054,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.40668689291139826,)
(0.0,)
(0.0,)
(0.013812354060321882,)
(0.5025847329924157,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(0.0,)
(2.406026115879007,)
(0.0,)
(0.0,)
(0.0,)
(12.726958558653353,)
(4.132925604989888,)
(0.0,)
(0.0,)
(5.257568708453007,)
(0.0,)
(4.4668377622919,)
(0.0,)
(0.0,)
(0.05668516910531696,)
(0.0,)
(0.8954904129632759,)
(0.0,)
(10.245404699183224,)
(6.852967253540212,)

grid 랑 label 비율 알아보기

In [7]:
from ligmet.utils.pdb import read_pdb, StructureWithGrid
from ligmet.featurizer import Features, make_pdb, process_pdb, secondary_struct,calculate_sasa, q_per_atom, cov_bonds_mask
from ligmet.utils.grid import sasa_grids, filter_by_clashmap
from dataclasses import asdict
from rdkit import Chem
import numpy as np
from scipy.spatial import cKDTree

pdb_path = '/home/qkrgangeun/LigMet/code/src/ligmet/utils/examples/1a05_ligand.pdb'
structure = read_pdb(pdb_path)
grids = sasa_grids(structure.atom_positions, structure.atom_elements)
grids = filter_by_clashmap(grids)
structure_dict = asdict(structure)
structure_with_grid = StructureWithGrid(
    grid_positions= grids,
    **structure_dict  # structure_dict의 내용을 추가
)

# metal_positions이 비어있지 않은지 확인
if hasattr(structure_with_grid, "metal_positions") and structure_with_grid.metal_positions is not None:
    metal_positions = np.array(structure_with_grid.metal_positions)
    grid_positions = np.array(structure_with_grid.grid_positions)

    # KD-Tree 생성
    metal_tree = cKDTree(metal_positions)
    grid_tree = cKDTree(grid_positions)

    # metal 위치에서 2.0Å 이내에 있는 grid 찾기
    radius = 2.0  # Angstrom 단위
    neighbor_indices = metal_tree.query_ball_tree(grid_tree, radius)

    # 유니크한 grid index 개수 계산
    unique_grid_indices = set(idx for indices in neighbor_indices for idx in indices)

    # 결과 출력
    print( '전체 grid 개수:', len(grid_positions))
    print("2Å 이내에 있는 유니크한 grid 개수:", len(unique_grid_indices))

else:
    print("metal_positions이 존재하지 않음.")


전체 grid 개수: 18961
2Å 이내에 있는 유니크한 grid 개수: 24


In [2]:
from ligmet.utils.pdb import read_pdb, StructureWithGrid
from ligmet.utils.grid import sasa_grids, filter_by_clashmap
from dataclasses import asdict
import numpy as np
from scipy.spatial import cKDTree

# PDB 경로 및 train_pdbs.txt 파일 경로
pdb_dir = '/home/qkrgangeun/LigMet/data/biolip/pdb'
train_pdbs_file = '/home/qkrgangeun/LigMet/code/text/biolip/train_pdbs.txt'

# 결과 저장 변수
ratios = []
ratios1 = []
# PDB 파일 목록 읽기
with open(train_pdbs_file, 'r') as f:
    pdb_ids = [line.strip() for line in f.readlines()]

# 각 PDB에 대해 계산 수행
for pdb_id in pdb_ids:
    pdb_path = f"{pdb_dir}/{pdb_id}.pdb"

    # PDB 구조 읽기
    structure = read_pdb(pdb_path)
    grids = sasa_grids(structure.atom_positions, structure.atom_elements)
    grids = filter_by_clashmap(grids)
    structure_dict = asdict(structure)

    structure_with_grid = StructureWithGrid(
        grid_positions=grids,
        **structure_dict  # structure_dict의 내용을 추가
    )

    # metal_positions이 비어있지 않은지 확인
    if hasattr(structure_with_grid, "metal_positions") and structure_with_grid.metal_positions is not None:
        metal_positions = np.array(structure_with_grid.metal_positions)
        grid_positions = np.array(structure_with_grid.grid_positions)

        if len(metal_positions) > 0 and len(grid_positions) > 0:
            # KD-Tree 생성
            metal_tree = cKDTree(metal_positions)
            grid_tree = cKDTree(grid_positions)

            # metal 위치에서 2.0Å 이내에 있는 grid 찾기
            radius = 2.0  # Angstrom 단위
            radius1= 1.28847
            neighbor_indices = metal_tree.query_ball_tree(grid_tree, radius)
            neighbor_indices1 = metal_tree.query_ball_tree(grid_tree, radius1)

            # 유니크한 grid index 개수 계산
            unique_grid_indices = set(idx for indices in neighbor_indices for idx in indices)
            unique_grid_indices1 = set(idx for indices in neighbor_indices1 for idx in indices)

            # 비율 계산 및 저장
            ratio = len(unique_grid_indices) / len(grid_positions)
            ratio1 = len(unique_grid_indices1) / len(grid_positions)
            ratios.append(ratio)
            ratios1.append(ratio1)

            print(f"PDB ID: {pdb_id} | 1.288Å 이내 grid 개수:{len(unique_grid_indices)} | 2Å 이내 유니크 grid 개수: {len(unique_grid_indices)} | 전체 grid 개수: {len(grid_positions)} | 비율: {ratio1:.4f}|{ratio:.4f}")

# 전체 평균 비율 출력
if ratios:
    mean_ratio = sum(ratios) / len(ratios)
    print(f"\n전체 평균 비율 (2Å 이내 grid / 전체 grid): {mean_ratio:.4f}")
else:
    print("유효한 데이터가 없습니다.")


PDB ID: 3cna | 1.288Å 이내 grid 개수:14 | 2Å 이내 유니크 grid 개수: 14 | 전체 grid 개수: 7442 | 비율: 0.0005|0.0019
PDB ID: 2cna | 1.288Å 이내 grid 개수:27 | 2Å 이내 유니크 grid 개수: 27 | 전체 grid 개수: 7444 | 비율: 0.0013|0.0036
Insertion code found at pdb /home/qkrgangeun/LigMet/data/biolip/pdb/1tgb.pdb,chain A, residue 184
Insertion code found at pdb /home/qkrgangeun/LigMet/data/biolip/pdb/1tgb.pdb,chain A, residue 188
Insertion code found at pdb /home/qkrgangeun/LigMet/data/biolip/pdb/1tgb.pdb,chain A, residue 221
PDB ID: 1tgb | 1.288Å 이내 grid 개수:19 | 2Å 이내 유니크 grid 개수: 19 | 전체 grid 개수: 6487 | 비율: 0.0011|0.0029
PDB ID: 2sod | 1.288Å 이내 grid 개수:47 | 2Å 이내 유니크 grid 개수: 47 | 전체 grid 개수: 17092 | 비율: 0.0018|0.0027
PDB ID: 1azu | 1.288Å 이내 grid 개수:9 | 2Å 이내 유니크 grid 개수: 9 | 전체 grid 개수: 3832 | 비율: 0.0018|0.0023
PDB ID: 1bp2 | 1.288Å 이내 grid 개수:14 | 2Å 이내 유니크 grid 개수: 14 | 전체 grid 개수: 4075 | 비율: 0.0017|0.0034
Insertion code found at pdb /home/qkrgangeun/LigMet/data/biolip/pdb/1tgc.pdb,chain A, residue 184
Insertion code 

KeyboardInterrupt: 