In [1]:
import pymol
#from pymol import cmd

In [2]:
# 加载 PDB 文件
pymol.cmd.load("6GZR_4.pdb") # 以6gzr的model 4为例

In [3]:
# 选择小分子 FH8（俗称TMR，这里是segment B的A链）
pymol.cmd.select("ligand", "resn FH8 and chain A and resi 101")

# 选择 RNA 链（只有一个A，但也要注意这里RNA链是segment A，ligand是 segment B）
pymol.cmd.select("rna_chain", "chain A and polymer")


1546

In [27]:
# 计算小分子和RNA链之间的极性接触 cutoff 3.5
pymol.cmd.dist("ligand_polar_conts","(ligand)","(rna_chain)",quiet=2,mode=2,label=0,reset=1);cmd.enable("ligand_polar_conts")

这里解决了哪些atom需要计算的问题。

In [9]:
'''
(c) 2011 Thomas Holder, MPI for Developmental Biology
'''

from pymol import cmd

def polarpairs(sel1, sel2, cutoff=4.0, angle=54.0, name='', state=1, quiet=1):
    '''
ARGUMENTS

    sel1, sel2 = string: atom selections

    cutoff = float: distance cutoff

    angle = float: h-bond angle cutoff in degrees. If angle="default", take
    "h_bond_max_angle" setting. If angle=0, do not detect h-bonding.

    name = string: If given, also create a distance object for visual representation

SEE ALSO

    cmd.find_pairs, cmd.distance
    '''
    cutoff = float(cutoff)
    quiet = int(quiet)
    state = int(state)
    if angle == 'default':
        angle = cmd.get('h_bond_max_angle', cmd.get_object_list(sel1)[0])
    angle = float(angle)
    mode = 1 if angle > 0 else 0
    x = cmd.find_pairs('(%s) and donors' % sel1, '(%s) and acceptors' % sel2,
            state, state,
            cutoff=cutoff, mode=mode, angle=angle) + \
        cmd.find_pairs('(%s) and acceptors' % sel1, '(%s) and donors' % sel2,
            state, state,
            cutoff=cutoff, mode=mode, angle=angle)
    x = sorted(set(x))
    if not quiet:
        print('Settings: cutoff=%.1fangstrom angle=%.1fdegree' % (cutoff, angle))
        print('Found %d polar contacts' % (len(x)))
    if len(name) > 0:
        for p in x:
            cmd.distance(name, '(%s`%s)' % p[0], '(%s`%s)' % p[1])
    return x

cmd.extend('polarpairs', polarpairs)

pairs = polarpairs('rna_chain', 'ligand', angle=63, cutoff=3.5)
for p in pairs:
    dist = cmd.get_distance('(%s`%s)' % p[0], '(%s`%s)' % p[1])
    print(p, 'Distance: %.2f' % (dist))

(('6GZR_4', 785), ('6GZR_4', 1551)) Distance: 2.78
(('6GZR_4', 787), ('6GZR_4', 1551)) Distance: 2.74
(('6GZR_4', 819), ('6GZR_4', 1551)) Distance: 3.37
(('6GZR_4', 1267), ('6GZR_4', 1547)) Distance: 2.86
(('6GZR_4', 1271), ('6GZR_4', 1551)) Distance: 2.75
(('6GZR_4', 1272), ('6GZR_4', 1547)) Distance: 3.36


In [6]:
cmd.dist("ligand_polar_conts","(ligand)","(rna_chain)",quiet=2,mode=2,label=0,reset=1)

2.895991086959839

In [7]:
from pymol import cmd

def calculate_polar_contacts(selection1="(ligand)", selection2="(rna_chain)", cutoff=3.5, _self=cmd):
    '''
    This function calculates the polar contacts between two selections (selection1 and selection2) 
    using cmd.dist for each pair of N or O atoms. It only prints out the contacts with distances 
    smaller than the cutoff.
    
    Parameters:
    - selection1 : str : First selection (e.g., "ligand")
    - selection2 : str : Second selection (e.g., "rna_chain")
    - cutoff : float : Distance cutoff (default is 3.5 Å)
    
    Returns:
    - None
    '''
    
    # Ensure only N and O atoms are selected
    selection1 = f"{selection1} and (elem N or elem O)"
    selection2 = f"{selection2} and (elem N or elem O)"
    
    # Get the list of atoms for selection1 (N and O atoms only)
    atoms1 = cmd.get_model(selection1)
    
    # Get the list of atoms for selection2 (N and O atoms only)
    atoms2 = cmd.get_model(selection2)
    
    # Loop through all combinations of atoms in selection1 and selection2
    for atom1 in atoms1.atom:
        for atom2 in atoms2.atom:
            # Compute the distance between atom1 and atom2
            dist_name = f"dist_{atom1.index}_{atom2.index}"  # Give the distance object a unique name
            cmd.dist(dist_name, f"({selection1} and index {atom1.index})", f"({selection2} and index {atom2.index})", cutoff=cutoff, mode=2, quiet=1, label=0, reset=1)
            
            # Check if the distance object exists
            if dist_name in cmd.get_names("objects"):
                # Get the distance directly using cmd.get_distance
                distance = cmd.get_distance(f"({selection1} and index {atom1.index})", f"({selection2} and index {atom2.index})")
                
                # If the distance is smaller than the cutoff, print the result
                if distance < cutoff:
                    print(f"Atom1: {atom1.name} (Residue: {atom1.resn}, Chain: {atom1.chain}), "
                          f"Atom2: {atom2.name} (Residue: {atom2.resn}, Chain: {atom2.chain}), "
                          f"Distance: {distance:.2f} Å")
                
                # Clean up the distance object
                cmd.delete(dist_name)

# Example usage:
calculate_polar_contacts("ligand", "rna_chain", cutoff=3.5)


Atom1: OE (Residue: FH8, Chain: A), Atom2: O2' (Residue: G, Chain: A), Distance: 2.86 Å
Atom1: OE (Residue: FH8, Chain: A), Atom2: N3 (Residue: G, Chain: A), Distance: 3.36 Å
Atom1: N2 (Residue: FH8, Chain: A), Atom2: N9 (Residue: G, Chain: A), Distance: 3.44 Å
Atom1: O2 (Residue: FH8, Chain: A), Atom2: N1 (Residue: G, Chain: A), Distance: 2.78 Å
Atom1: O2 (Residue: FH8, Chain: A), Atom2: N2 (Residue: G, Chain: A), Distance: 2.74 Å
Atom1: O2 (Residue: FH8, Chain: A), Atom2: N1 (Residue: G, Chain: A), Distance: 3.37 Å
Atom1: O2 (Residue: FH8, Chain: A), Atom2: N2 (Residue: G, Chain: A), Distance: 2.97 Å
Atom1: O2 (Residue: FH8, Chain: A), Atom2: N2 (Residue: G, Chain: A), Distance: 2.75 Å
Atom1: N3 (Residue: FH8, Chain: A), Atom2: N1 (Residue: G, Chain: A), Distance: 3.46 Å
Atom1: N3 (Residue: FH8, Chain: A), Atom2: O6 (Residue: G, Chain: A), Distance: 3.41 Å
Atom1: N3 (Residue: FH8, Chain: A), Atom2: N7 (Residue: G, Chain: A), Distance: 3.24 Å
Atom1: O5 (Residue: FH8, Chain: A), Atom2:

# 解决了哪些原子被计算的问题，下面我们考虑所有的model都放在一个坐标系的问题。

## 第一个问题是，每个model是否都有一个自己的空间直角坐标系，以便于我进行坐标转换。

eric的原始假设中，认为ligand都有相同的起始坐标。在此基础上计算contact之间的距离。
举个例子

| 序号 | 实验数据 (x-ray) | 预测数据 (predict) | 匹配状态       |
|:----:|:----------------:|:------------------:|:-------------:|
| contact 1    | (6,4) - (3,4)   | (6,4) - (3,4)     | ✔️ 完全匹配   |
| contact 2    | (6,3) - (3,2)   | (6,3) - (3,5)     | ❌ 部分偏移   |

这需要，获取contasts，转换原子到笛卡尔坐标，print如上| 序号 | 实验数据 (x-ray) | 预测数据 (predict) | 匹配状态       |
|:----:|:----------------:|:------------------:|:-------------:|
| contact 1    | (6,4) - (3,4)   | (6,4) - (3,4)     | ✔️ 完全匹配   |
| contact 2    | (6,3) - (3,2)   | (6,3) - (3,5)     | ❌ 部分偏移   |的结果。
input的部分是“(('6GZR_4', 785), ('6GZR_4', 1551)) Distance: 2.78
(('6GZR_4', 787), ('6GZR_4', 1551)) Distance: 2.74
(('6GZR_4', 819), ('6GZR_4', 1551)) Distance: 3.37
(('6GZR_4', 1267), ('6GZR_4', 1547)) Distance: 2.86
(('6GZR_4', 1271), ('6GZR_4', 1551)) Distance: 2.75
(('6GZR_4', 1272), ('6GZR_4', 1547)) Distance: 3.36”
这里的信息记录了模型id，以及形成contacts的两个原子号，以及计算出的距离，单位是A。
要求建立笛卡儿坐标系，坐标系每2A为一个单位1。程序获取形成contacts的两个原子的笛卡尔坐标。程序获取坐标后进行距离计算返回这对原子的坐标和距离。

In [11]:
from pymol import cmd
import math

# 获取原子坐标的函数
def get_atom_coordinates(model_id, atom_index):
    """
    获取指定模型和原子编号的原子坐标。
    这里的坐标以3位小数精度返回。
    """
    # 通过模型ID和原子编号构造选择条件
    selection = f"{model_id} and index {atom_index}"
    
    # 获取该选择的模型
    model = cmd.get_model(selection)
    
    if len(model.atom) == 0:
        raise ValueError(f"No atom found with index {atom_index} in model {model_id}")
    
    # 获取原子的坐标（返回的坐标是一个元组: (x, y, z)）
    atom_coord = model.atom[0].coord
    # 保证坐标以三位小数精度返回
    return tuple(round(coord, 3) for coord in atom_coord)

# 计算两点之间的欧几里得距离
def calculate_distance(coord1, coord2):
    """
    计算两点之间的欧几里得距离，使用三位小数的精度。
    """
    distance = math.sqrt((coord1[0] - coord2[0])**2 + (coord1[1] - coord2[1])**2 + (coord1[2] - coord2[2])**2)
    # 保证计算的距离以三位小数精度返回
    return round(distance, 3)

# 计算并输出contacts原子对的坐标信息和距离
def process_contacts_and_calculate_distance(contact_data):
    """
    处理contacts数据，计算原子对的坐标信息和距离。
    
    contact_data 格式：[('model_id', atom_index1), ('model_id', atom_index2), distance]
    """
    for contact in contact_data:
        model1, atom1 = contact[0]  # 提取第一个原子的模型ID和编号
        model2, atom2 = contact[1]  # 提取第二个原子的模型ID和编号
        distance = contact[2]  # 提取计算出的距离
        
        # 获取原子的坐标
        coord1 = get_atom_coordinates(model1, atom1)
        coord2 = get_atom_coordinates(model2, atom2)
        
        # 计算原子对之间的距离
        calc_distance = calculate_distance(coord1, coord2)
        
        # 输出原子对的坐标信息和计算的距离
        print(f"Contact: {model1} atom {atom1} - {model2} atom {atom2}")
        print(f"Coordinates: ({coord1[0]}, {coord1[1]}, {coord1[2]}) - "
              f"({coord2[0]}, {coord2[1]}, {coord2[2]})")
        print(f"Calculated Distance: {calc_distance} Å (Original Distance: {distance} Å)")
        print(f"Precision used: 3 decimal places for coordinates and distances.")
        print('-' * 60)

# 示例输入数据：原子编号、模型ID 和 计算出来的原子对距离
contact_data = [
    (('6GZR_4', 785), ('6GZR_4', 1551), 2.78),
    (('6GZR_4', 787), ('6GZR_4', 1551), 2.74),
    (('6GZR_4', 819), ('6GZR_4', 1551), 3.37),
    (('6GZR_4', 1267), ('6GZR_4', 1547), 2.86),
    (('6GZR_4', 1271), ('6GZR_4', 1551), 2.75),
    (('6GZR_4', 1272), ('6GZR_4', 1547), 3.36)
]

# 计算并输出
process_contacts_and_calculate_distance(contact_data)


Contact: 6GZR_4 atom 785 - 6GZR_4 atom 1551
Coordinates: (19.603, 1.835, 6.623) - (18.394, 1.34, 4.172)
Calculated Distance: 2.777 Å (Original Distance: 2.78 Å)
Precision used: 3 decimal places for coordinates and distances.
------------------------------------------------------------
Contact: 6GZR_4 atom 787 - 6GZR_4 atom 1551
Coordinates: (20.232, -0.122, 5.579) - (18.394, 1.34, 4.172)
Calculated Distance: 2.738 Å (Original Distance: 2.74 Å)
Precision used: 3 decimal places for coordinates and distances.
------------------------------------------------------------
Contact: 6GZR_4 atom 819 - 6GZR_4 atom 1551
Coordinates: (20.529, 3.932, 3.95) - (18.394, 1.34, 4.172)
Calculated Distance: 3.365 Å (Original Distance: 3.37 Å)
Precision used: 3 decimal places for coordinates and distances.
------------------------------------------------------------
Contact: 6GZR_4 atom 1267 - 6GZR_4 atom 1547
Coordinates: (15.273, 2.161, 0.133) - (16.998, 1.316, 2.252)
Calculated Distance: 2.86 Å (Origina

合并功能模块

In [5]:
#!/usr/bin/python
#######################
# 读取pdb文件，获取 contacts 信息，计算坐标与距离，返回一个如下的数据格式
# {
    # 'model1': '6GZR_4',
    # 'atom1': 785,
    # 'coord1': (19.603, 1.835, 6.623),
    # 'model2': '6GZR_4',
    # 'atom2': 1551,
    # 'coord2': (18.394, 1.34, 4.172),
    # 'calculated_distance': 2.777, 这个来自于pymol的find_pairs
    # 'original_distance': 2.777 这个来自于从获取到的坐标直接用欧几里得公式计算
    # }
#######################
import pymol
from pymol import cmd
import math

#######################
# 模块一：获取 contacts 信息
#######################

def polarpairs(sel1, sel2, cutoff=4.0, angle=54.0, name='', state=1, quiet=1):
    '''
ARGUMENTS
    sel1, sel2 : string: atom selections
    cutoff     : float: distance cutoff
    angle      : float: h-bond angle cutoff in degrees.
                 If angle="default", use "h_bond_max_angle" setting.
                 If angle=0, do not detect h-bonding.
    name       : string: If given, also create a distance object for visual representation
SEE ALSO
    cmd.find_pairs, cmd.distance
    '''
    cutoff = float(cutoff)
    quiet = int(quiet)
    state = int(state)
    if angle == 'default':
        angle = cmd.get('h_bond_max_angle', cmd.get_object_list(sel1)[0])
    angle = float(angle)
    mode = 1 if angle > 0 else 0
    # 计算两种情况的配对：donor-acceptor和acceptor-donor
    x = cmd.find_pairs('(%s) and donors' % sel1, '(%s) and acceptors' % sel2,
            state, state,
            cutoff=cutoff, mode=mode, angle=angle) + \
        cmd.find_pairs('(%s) and acceptors' % sel1, '(%s) and donors' % sel2,
            state, state,
            cutoff=cutoff, mode=mode, angle=angle)
    x = sorted(set(x))
    if not quiet:
        print('Settings: cutoff=%.1f Å, angle=%.1f°' % (cutoff, angle))
        print('Found %d polar contacts' % (len(x)))
    if len(name) > 0:
        for p in x:
            cmd.distance(name, '(%s`%s)' % p[0], '(%s`%s)' % p[1])
    return x

cmd.extend('polarpairs', polarpairs)

#######################
# 模块二：根据 contacts 信息计算坐标与距离
#######################

def get_atom_coordinates(model_id, atom_index):
    """
    获取指定模型和原子编号的原子坐标。
    返回坐标时保留三位小数精度。
    """
    selection = f"{model_id} and index {atom_index}"
    model = cmd.get_model(selection)
    if len(model.atom) == 0:
        raise ValueError(f"No atom found with index {atom_index} in model {model_id}")
    atom_coord = model.atom[0].coord
    # 保证每个坐标值保留三位小数
    return tuple(round(coord, 3) for coord in atom_coord)

def calculate_distance(coord1, coord2):
    """
    计算两点之间的欧几里得距离，结果保留三位小数。
    """
    distance = math.sqrt((coord1[0] - coord2[0])**2 +
                         (coord1[1] - coord2[1])**2 +
                         (coord1[2] - coord2[2])**2)
    return round(distance, 3)


# 获取原子坐标的函数
def get_atom_coordinates(model_id, atom_index):
    """
    获取指定模型和原子编号的原子坐标。
    这里的坐标以3位小数精度返回。
    """
    selection = f"{model_id} and index {atom_index}"
    model = cmd.get_model(selection)
    if len(model.atom) == 0:
        raise ValueError(f"No atom found with index {atom_index} in model {model_id}")
    atom_coord = model.atom[0].coord
    # 保证坐标保留三位小数
    return tuple(round(coord, 3) for coord in atom_coord)

# 计算两点之间的欧几里得距离
def calculate_distance(coord1, coord2):
    """
    计算两点之间的欧几里得距离，使用三位小数的精度。
    """
    distance = math.sqrt((coord1[0] - coord2[0])**2 + 
                         (coord1[1] - coord2[1])**2 + 
                         (coord1[2] - coord2[2])**2)
    # 保证计算的距离保留三位小数
    return round(distance, 3)

def process_contacts_and_calculate_distance(contact_data):
    """
    处理 contacts 数据，计算每对原子的笛卡尔坐标信息和距离，并将结果存储在字典中。
    
    contact_data 格式：
      [
         (('model_id', atom_index1), ('model_id', atom_index2), recorded_distance),
         ...
      ]
    输出每对原子的坐标和计算得到的距离，并存储为数据结构。
    """
    results = []  # 用于存储所有计算的结果
    for contact in contact_data:
        model1, atom1 = contact[0]  # 第一个原子：模型ID和原子编号
        model2, atom2 = contact[1]  # 第二个原子：模型ID和原子编号
        original_distance = contact[2]  # 原始记录的距离（单位 Å）
        
        # 获取原子的笛卡尔坐标
        coord1 = get_atom_coordinates(model1, atom1)
        coord2 = get_atom_coordinates(model2, atom2)
        
        # 计算两原子之间的距离
        calc_distance = calculate_distance(coord1, coord2)
        
        # 存储当前contact的计算结果
        contact_result = {
            'model1': model1,
            'atom1': atom1,
            'coord1': coord1,
            'model2': model2,
            'atom2': atom2,
            'coord2': coord2,
            'calculated_distance': calc_distance,
            'original_distance': original_distance
        }
        results.append(contact_result)

    # 返回所有计算的结果
    return results


#######################
模块0 正式调用
#######################

# 加载 PDB 文件
pymol.cmd.load("6GZR_4.pdb") # 以6gzr的model 4为例

# 选择小分子 FH8（俗称TMR，这里是segment B的A链）
pymol.cmd.select("ligand", "resn FH8 and chain A and resi 101")

# 选择 RNA 链（只有一个A，但也要注意这里RNA链是segment A，ligand是 segment B）
pymol.cmd.select("rna_chain", "chain A and polymer")

# 1. 使用 polarpairs 计算 contacts 信息（返回的是原子对的元组列表）
contacts = polarpairs('rna_chain', 'ligand', angle=63, cutoff=3.5)

# 2. 将 contacts 转换为包含原子对和原始距离的列表 contact_data
contact_data = []
for p in contacts:
    # 计算原始距离（用 cmd.get_distance 来获得记录的距离，保留三位小数）
    orig_dist = cmd.get_distance('(%s`%s)' % p[0], '(%s`%s)' % p[1])
    orig_dist = round(orig_dist, 3)
    contact_data.append((p[0], p[1], orig_dist))

# 3. 根据 contact_data 计算并输出每对原子的坐标和距离，更新了数据格式
results = process_contacts_and_calculate_distance(contact_data)



# 新的数据格式给出

In [7]:
# 实例
results

[{'model1': '6GZR_4',
  'atom1': 785,
  'coord1': (19.603, 1.835, 6.623),
  'model2': '6GZR_4',
  'atom2': 1551,
  'coord2': (18.394, 1.34, 4.172),
  'calculated_distance': 2.777,
  'original_distance': 2.777},
 {'model1': '6GZR_4',
  'atom1': 787,
  'coord1': (20.232, -0.122, 5.579),
  'model2': '6GZR_4',
  'atom2': 1551,
  'coord2': (18.394, 1.34, 4.172),
  'calculated_distance': 2.738,
  'original_distance': 2.738},
 {'model1': '6GZR_4',
  'atom1': 819,
  'coord1': (20.529, 3.932, 3.95),
  'model2': '6GZR_4',
  'atom2': 1551,
  'coord2': (18.394, 1.34, 4.172),
  'calculated_distance': 3.365,
  'original_distance': 3.365},
 {'model1': '6GZR_4',
  'atom1': 1267,
  'coord1': (15.273, 2.161, 0.133),
  'model2': '6GZR_4',
  'atom2': 1547,
  'coord2': (16.998, 1.316, 2.252),
  'calculated_distance': 2.86,
  'original_distance': 2.86},
 {'model1': '6GZR_4',
  'atom1': 1271,
  'coord1': (17.58, 3.792, 5.107),
  'model2': '6GZR_4',
  'atom2': 1551,
  'coord2': (18.394, 1.34, 4.172),
  'calcu

## 新构成了调用方式

接下来考虑自动化这个过程，并且格式化输出，和stanard进行比较获取评分。
考虑批量调用这个脚本，并收集所有的results。
input是所有pdb的文件夹路径，一个文件夹一个pz的组的提交。这一步会通过这个脚本获取到所有的提交的结果的这个坐标和距离指标。

问题是，我们这里前提是让所有的ligand先align了，这样人类的可读性好一些。因为起始坐标ligand上的原子是一致的。

现在实现一个，调用方式为
python calculate_contacts.py -i /path/to/pdb_folder -c config.txt -o output.json
把配置写到config.txt里面，这个脚本会自动读取这个配置，然后调用calculate_contacts.py脚本，把结果所有的results输出到output.json里面。

In [None]:
# config里的内容，ligand在前
resn FH8 and chain A and resi 101
chain A and polymer


In [None]:
# -*- coding: utf-8 -*-
import os
import json
import pymol
from pymol import cmd
import math
import argparse

#######################
# 模块一：获取 contacts 信息
#######################

def polarpairs(sel1, sel2, cutoff=4.0, angle=54.0, name='', state=1, quiet=1):
    cutoff = float(cutoff)
    quiet = int(quiet)
    state = int(state)
    if angle == 'default':
        angle = cmd.get('h_bond_max_angle', cmd.get_object_list(sel1)[0])
    angle = float(angle)
    mode = 1 if angle > 0 else 0
    x = cmd.find_pairs('(%s) and donors' % sel1, '(%s) and acceptors' % sel2,
            state, state,
            cutoff=cutoff, mode=mode, angle=angle) + \
        cmd.find_pairs('(%s) and acceptors' % sel1, '(%s) and donors' % sel2,
            state, state,
            cutoff=cutoff, mode=mode, angle=angle)
    x = sorted(set(x))
    if not quiet:
        print('Settings: cutoff=%.1f Å, angle=%.1f°' % (cutoff, angle))
        print('Found %d polar contacts' % (len(x)))
    if len(name) > 0:
        for p in x:
            cmd.distance(name, '(%s%s)' % p[0], '(%s%s)' % p[1])
    return x

cmd.extend('polarpairs', polarpairs)

#######################
# 模块二：根据 contacts 信息计算坐标与距离
#######################

def get_atom_coordinates(model_id, atom_index):
    selection = f"{model_id} and index {atom_index}"
    model = cmd.get_model(selection)
    if len(model.atom) == 0:
        raise ValueError(f"No atom found with index {atom_index} in model {model_id}")
    atom_coord = model.atom[0].coord
    return tuple(round(coord, 3) for coord in atom_coord)

def calculate_distance(coord1, coord2):
    distance = math.sqrt((coord1[0] - coord2[0])**2 +
                         (coord1[1] - coord2[1])**2 +
                         (coord1[2] - coord2[2])**2)
    return round(distance, 3)

def process_contacts_and_calculate_distance(contact_data):
    results = []
    for contact in contact_data:
        model1, atom1 = contact[0]
        model2, atom2 = contact[1]
        original_distance = contact[2]
        
        coord1 = get_atom_coordinates(model1, atom1)
        coord2 = get_atom_coordinates(model2, atom2)
        
        calc_distance = calculate_distance(coord1, coord2)
        
        contact_result = {
            'model1': model1,
            'atom1': atom1,
            'coord1': coord1,
            'model2': model2,
            'atom2': atom2,
            'coord2': coord2,
            'calculated_distance': calc_distance,
            'original_distance': original_distance
        }
        results.append(contact_result)

    return results

#######################
# 模块0 批量处理部分
#######################

def process_all_pdb_files(folder_path, select_ligand, select_rna_chain, output_file):
    # 存储所有计算结果
    all_results = []

    # 遍历文件夹中的所有 pdb 文件
    for pdb_file in os.listdir(folder_path):
        if pdb_file.endswith('.pdb'):
            pdb_path = os.path.join(folder_path, pdb_file)
            print(f"Processing {pdb_file}...")

            # 加载 PDB 文件
            cmd.load(pdb_path)

            # 使用用户指定的选择条件
            cmd.select("ligand", select_ligand)
            cmd.select("rna_chain", select_rna_chain)

            # 使用 polarpairs 计算 contacts 信息
            contacts = polarpairs('rna_chain', 'ligand', angle=63, cutoff=3.5)

            # 将 contacts 转换为包含原子对和原始距离的列表 contact_data
            contact_data = []
            for p in contacts:
                orig_dist = cmd.get_distance('(%s%s)' % p[0], '(%s%s)' % p[1])
                orig_dist = round(orig_dist, 3)
                contact_data.append((p[0], p[1], orig_dist))

            # 根据 contact_data 计算并输出每对原子的坐标和距离
            results = process_contacts_and_calculate_distance(contact_data)

            # 将当前 PDB 文件的计算结果添加到 all_results 列表中
            all_results.append({
                'pdb_file': pdb_file,
                'contacts': results
            })

            # 删除当前PDB文件，准备下一个文件
            cmd.delete("all")
    
    # 将结果保存到JSON文件
    with open(output_file, 'w') as f:
        json.dump(all_results, f, indent=4)
    print(f"Results have been saved to {output_file}")

#######################
# 批量处理函数的命令行接口
#######################

def main():
    parser = argparse.ArgumentParser(description="Process PDB files and calculate contacts distances.")
    parser.add_argument("-i", "--input_folder", required=True, help="Path to the folder containing PDB files.")
    parser.add_argument("-c", "--config_file", required=True, help="Path to the configuration file containing select statements.")
    parser.add_argument("-o", "--output_file", required=True, help="Path to the output JSON file.")
    args = parser.parse_args()

    # 从 config 文件读取 select 信息
    with open(args.config_file, 'r') as f:
        lines = f.readlines()
        select_ligand = lines[1].strip()
        select_rna_chain = lines[2].strip()

    # 执行批量处理
    process_all_pdb_files(args.input_folder, select_ligand, select_rna_chain, args.output_file)

if __name__ == "__main__":
    main()


In [3]:
# 测试一下
!python testPZpdb/calculate_contacts.py -i testPZpdb -c testPZpdb/config.txt -o testPZpdb/output.json

Processing 6GZR.1-10_09.pdb...
Processing 6GZR.1-10_08.pdb...
Processing 6GZR.1-10_06.pdb...
Processing 6GZR.1-10_07.pdb...
Processing 6GZR.1-10_05.pdb...
Processing 6GZR.1-10_10.pdb...
Processing 6GZR.1-10_04.pdb...
Processing 6GZR.1-10_01.pdb...
Processing 6GZR.1-10_03.pdb...
Processing 6GZR.1-10_02.pdb...
Results have been saved to testPZpdb/output.json


# 可视化的部分

这里的需求是这样的，首先是对标准的数据进行初始化，制作一个初步的tables
，然后对获得的results的json进行可视化。
在这里可以合并这个操作，我指定json中的某一组数据是x-ray组即可。
在config指定哪一组是x-ray组，在可视化的时候捕获这个信息。

In [None]:
#!/usr/bin/python
import json
import csv
import argparse
import math

def format_contact(contact):
    """
    将单个 contact 格式化为字符串，格式为：
    "model1,atom1-model2,atom2; (x1,y1,z1) - (x2,y2,z2); calculated_distance Å"
    """
    model1 = contact["model1"]
    atom1 = contact["atom1"]
    coord1 = contact["coord1"]  # 应该是一个包含3个数字的列表
    model2 = contact["model2"]
    atom2 = contact["atom2"]
    coord2 = contact["coord2"]
    distance = contact["calculated_distance"]
    coord1_str = f"({coord1[0]:.3f},{coord1[1]:.3f},{coord1[2]:.3f})"
    coord2_str = f"({coord2[0]:.3f},{coord2[1]:.3f},{coord2[2]:.3f})"
    return f"{model1},{atom1}-{model2},{atom2}; {coord1_str} - {coord2_str}; {distance:.3f} Å"

def compare_contacts(xray_contact, predict_contact, tol=0.1):
    """
    比较 x-ray 和预测 contact 的 calculated_distance，
    若两者之差小于 tol，则认为“完全匹配”，否则“部分偏移”。
    """
    diff = abs(xray_contact["calculated_distance"] - predict_contact["calculated_distance"])
    return "✔️ 完全匹配" if diff < tol else "❌ 部分偏移"

def process_json_to_csv(json_file, output_csv, xray_identifier):
    """
    从 JSON 文件读取数据（多个组），每个组包含 "pdb_file" 和 "contacts"，
    根据外部指定的 xray_identifier 确定哪个组为实验数据，其它组为预测数据（这里只取一个预测组）。
    然后生成 CSV 文件，表头：
    序号, 实验数据 (x-ray), 预测数据 (predict), 匹配状态
    每一行对应一个 contact 的数据，contact 信息由 model, atom, 坐标和 calculated_distance 组成。
    """
    with open(json_file, 'r') as f:
        data = json.load(f)
    
    xray_group = None
    predict_group = None
    for group in data:
        pdb_file = group.get("pdb_file", "")
        if xray_identifier in pdb_file:
            xray_group = group
        else:
            # 这里只取第一个非-xray组作为预测组
            if predict_group is None:
                predict_group = group
    
    if xray_group is None:
        raise ValueError(f"未找到包含 '{xray_identifier}' 的 x-ray 组")
    if predict_group is None:
        raise ValueError("未找到预测组")
    
    xray_contacts = xray_group.get("contacts", [])
    predict_contacts = predict_group.get("contacts", [])
    
    # 取两组中最少的 contact 数量
    num_contacts = min(len(xray_contacts), len(predict_contacts))
    
    # 存储所有表格行数据
    table_rows = []
    for i in range(num_contacts):
        xray_contact = xray_contacts[i]
        predict_contact = predict_contacts[i]
        xray_str = format_contact(xray_contact)
        predict_str = format_contact(predict_contact)
        match_status = compare_contacts(xray_contact, predict_contact)
        table_rows.append([f"contact {i+1}", xray_str, predict_str, match_status])
    
    # 写入 CSV 文件
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["序号", "实验数据 (x-ray)", "预测数据 (predict)", "匹配状态"])
        writer.writerows(table_rows)
    
    print(f"CSV file saved to {output_csv}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert contacts JSON to CSV table.")
    parser.add_argument("-j", "--json", required=True, help="Input JSON file containing contacts groups.")
    parser.add_argument("-o", "--output", required=True, help="Output CSV file path.")
    parser.add_argument("-x", "--xray", required=True, help="Identifier string for the x-ray group (e.g., '6GZR.1-10_09').")
    args = parser.parse_args()
    
    process_json_to_csv(args.json, args.output, args.xray)


In [9]:
!python testPZpdb/convert_json_to_csv.py -j testPZpdb/output.json -o testPZpdb/results.csv -x "6GZR.1-10_09"


CSV 文件已保存至 testPZpdb/results.csv


# 把所有的结构按照ligand进行align的部分
借助pymol来实现，读入文件夹里的所有结构，然后把ligand align到指定的x-ray的结构的ligand上，然后保存为pdb文件



In [None]:
import os
from pymol import cmd
import argparse

def load_structures(input_folder):
    """
    加载指定文件夹中的所有PDB文件，并返回结构列表。
    """
    structures = []
    for root, _, files in os.walk(input_folder):
        for file in files:
            if file.endswith(".pdb"):
                structures.append(os.path.join(root, file))
    return structures

def get_reference_structure(structures, config_file):
    """
    从配置文件中读取参考结构名称，并找到它在structures中的位置。
    """
    with open(config_file, 'r') as f:
        reference_name = f.readline().strip()
    
    # 在 structures 中找到匹配的模型文件并返回
    for structure in structures:
        if reference_name in structure:  # 匹配参考结构
            return structure
    raise ValueError(f"Reference structure {reference_name} not found in the input folder.")

def load_and_align_structures(structures, reference):
    """
    对比并对齐所有结构文件，align到参考结构。
    structures : list : PDB 文件的相对路径列表
    reference : str : 参考结构的文件名（不含路径）
    """
    # 加载参考结构
    cmd.load(reference)

    # 遍历其他结构并对齐
    for structure in structures:
        # 去除文件路径，只保留文件名（去掉目录部分）
        structure_name = os.path.basename(structure).replace(".pdb", "")
        reference_name = os.path.basename(reference).replace(".pdb", "")
        if structure_name != reference:
            print(f"Aligning {structure_name} to reference {reference}...")

            # 加载结构并选择其ligand
            cmd.load(structure)

            # 选择 ligand 和参考结构中的 ligand
            cmd.select("ligand1", f"{structure_name} and segment B and chain A")
            cmd.select("ligand2", f"{reference_name} and segment B and chain A")

            # 对齐到参考结构
            cmd.align("ligand1", "ligand2")
            
            # 删除当前加载的结构，为下一个结构准备
            cmd.delete(structure_name)

            print(f"Finished aligning {structure_name} to {reference}.")
        else:
            print(f"Skipping alignment for reference structure {reference}.")

    """
    对比并对齐所有结构文件，align到参考结构。
    """
    # 加载参考结构
    cmd.load(reference)

    # 遍历其他结构并对齐
    for structure in structures:
        if structure != reference:
            #testOut/6GZR.1-10_08.pdb换为6GZR.1-10_08
            structure=structure.replace(".pdb", "/")
            print(structure)
            print(reference)
            # 对齐到参考结构
            print(f"Aligning {structure} to reference {reference}...")
            cmd.select("ligand1", f"{structure} and segment B and chain A")
            cmd.select("ligand2", f"{reference} and segment B and chain A")
            cmd.align("ligand1", "ligand2")

def main():
    """
    主函数，读取命令行参数并执行文件夹中的PDB对齐操作。
    """
    parser = argparse.ArgumentParser(description="Align ligands to x-ray structure.")
    parser.add_argument("-i", "--input_folder", required=True, help="输入文件夹路径，包含PDB文件")
    parser.add_argument("-c", "--config", required=True, help="配置文件路径，包含参考结构名称")
    args = parser.parse_args()

    # 加载结构
    structures = load_structures(args.input_folder)

    # 获取参考结构
    reference = get_reference_structure(structures, args.config)

    # 从结构列表中删除参考模型
    structures.remove(reference)

    # 对齐结构
    load_and_align_structures(structures, reference)

if __name__ == "__main__":
    main()


In [42]:
!python testPZpdb/align_ligand.py -i testOut -c testPZpdb/config.txt

Aligning 6GZR.1-10_08 to reference testOut/6GZR.1-10_09.pdb...
Finished aligning 6GZR.1-10_08 to testOut/6GZR.1-10_09.pdb.
Aligning 6GZR.1-10_06 to reference testOut/6GZR.1-10_09.pdb...
Finished aligning 6GZR.1-10_06 to testOut/6GZR.1-10_09.pdb.
Aligning 6GZR.1-10_07 to reference testOut/6GZR.1-10_09.pdb...
Finished aligning 6GZR.1-10_07 to testOut/6GZR.1-10_09.pdb.
Aligning 6GZR.1-10_05 to reference testOut/6GZR.1-10_09.pdb...
Finished aligning 6GZR.1-10_05 to testOut/6GZR.1-10_09.pdb.
Aligning 6GZR.1-10_10 to reference testOut/6GZR.1-10_09.pdb...
Finished aligning 6GZR.1-10_10 to testOut/6GZR.1-10_09.pdb.
Aligning 6GZR.1-10_04 to reference testOut/6GZR.1-10_09.pdb...
Finished aligning 6GZR.1-10_04 to testOut/6GZR.1-10_09.pdb.
Aligning 6GZR.1-10_01 to reference testOut/6GZR.1-10_09.pdb...
Finished aligning 6GZR.1-10_01 to testOut/6GZR.1-10_09.pdb.
Aligning 6GZR.1-10_03 to reference testOut/6GZR.1-10_09.pdb...
Finished aligning 6GZR.1-10_03 to testOut/6GZR.1-10_09.pdb.
Aligning 6GZR.1-

# 测试单元

In [5]:
!python /Users/hughes/docs/ligandBindingAssessment/ligand_workflow/ligand_workflow/00_align_ligand.py -i /Users/hughes/docs/ligandBindingAssessment/testOut -c /Users/hughes/docs/ligandBindingAssessment/testPZpdb/config.txt

Aligning 6GZR.1-10_08 to reference /Users/hughes/docs/ligandBindingAssessment/testOut/6GZR.1-10_09.pdb...
Finished aligning 6GZR.1-10_08 to /Users/hughes/docs/ligandBindingAssessment/testOut/6GZR.1-10_09.pdb.
Aligning 6GZR.1-10_06 to reference /Users/hughes/docs/ligandBindingAssessment/testOut/6GZR.1-10_09.pdb...
Finished aligning 6GZR.1-10_06 to /Users/hughes/docs/ligandBindingAssessment/testOut/6GZR.1-10_09.pdb.
Aligning 6GZR.1-10_07 to reference /Users/hughes/docs/ligandBindingAssessment/testOut/6GZR.1-10_09.pdb...
Finished aligning 6GZR.1-10_07 to /Users/hughes/docs/ligandBindingAssessment/testOut/6GZR.1-10_09.pdb.
Aligning 6GZR.1-10_05 to reference /Users/hughes/docs/ligandBindingAssessment/testOut/6GZR.1-10_09.pdb...
Finished aligning 6GZR.1-10_05 to /Users/hughes/docs/ligandBindingAssessment/testOut/6GZR.1-10_09.pdb.
Aligning 6GZR.1-10_10 to reference /Users/hughes/docs/ligandBindingAssessment/testOut/6GZR.1-10_09.pdb...
Finished aligning 6GZR.1-10_10 to /Users/hughes/docs/ligan

In [9]:
!python /Users/hughes/docs/ligandBindingAssessment/ligand_workflow/ligand_workflow/10_calculate_contacts.py -i /Users/hughes/docs/ligandBindingAssessment/testPZpdb -c /Users/hughes/docs/ligandBindingAssessment/testPZpdb/config.txt -o /Users/hughes/docs/ligandBindingAssessment/testPZpdb/output2.json

Processing 6GZR.1-10_09.pdb...
Processing 6GZR.1-10_08.pdb...
Processing 6GZR.1-10_06.pdb...
Processing 6GZR.1-10_07.pdb...
Processing 6GZR.1-10_05.pdb...
Processing 6GZR.1-10_10.pdb...
Processing 6GZR.1-10_04.pdb...
Processing 6GZR.1-10_01.pdb...
Processing 6GZR.1-10_03.pdb...
Processing 6GZR.1-10_02.pdb...
Results have been saved to /Users/hughes/docs/ligandBindingAssessment/testPZpdb/output2.json


In [3]:
!python /Users/hughes/docs/ligandBindingAssessment/ligand_workflow/ligand_workflow/_20_convert_json_to_csv.py -j /Users/hughes/docs/ligandBindingAssessment/testPZpdb/output.json -o /Users/hughes/docs/ligandBindingAssessment/testPZpdb/results.csv -c /Users/hughes/docs/ligandBindingAssessment/testPZpdb/config.txt

CSV 文件已保存至 /Users/hughes/docs/ligandBindingAssessment/testPZpdb/results.csv


In [4]:
!python /Users/hughes/docs/ligandBindingAssessment/ligand_workflow/ligand_workflow/_90_ligand_workflow.py  -i /Users/hughes/docs/ligandBindingAssessment/testPZpdb -c /Users/hughes/docs/ligandBindingAssessment/testPZpdb/config.txt -j /Users/hughes/docs/ligandBindingAssessment/testPZpdb/output.json -o /Users/hughes/docs/ligandBindingAssessment/testPZpdb/results.csv 

Traceback (most recent call last):
  File "/Users/hughes/docs/ligandBindingAssessment/ligand_workflow/ligand_workflow/_90_ligand_workflow.py", line 3, in <module>
    from ligand_workflow._00_align_ligand import load_and_align_structures
ModuleNotFoundError: No module named 'ligand_workflow'
