In [1]:
from Bio import PDB

In [17]:
from Bio import PDB
import pandas as pd

def pdb_to_dataframe(pdb_file_path):
    """Parse a PDB file into a DataFrame with one row per atom.

    Parameters
    ----------
    pdb_file_path : str or path-like
        Location of the PDB file, handed to ``PDB.PDBParser.get_structure``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: Record_Type, Atom_Serial_Number, Atom_Name,
        Amino_Acid, Chain_ID, Residue_Sequence_Number, X_Coordinate,
        Y_Coordinate, Z_Coordinate, Occupancy, B_Factor, Element.
        The columns are present even when the structure contains no atoms.

    Notes
    -----
    Rows from every model are concatenated and there is no model column, so
    a multi-model file yields repeated serial numbers.
    """
    columns = [
        "Record_Type",
        "Atom_Serial_Number",
        "Atom_Name",
        "Amino_Acid",
        "Chain_ID",
        "Residue_Sequence_Number",
        "X_Coordinate",
        "Y_Coordinate",
        "Z_Coordinate",
        "Occupancy",
        "B_Factor",
        "Element",
    ]

    # Create a PDB parser (QUIET suppresses construction warnings)
    parser = PDB.PDBParser(QUIET=True)

    # Parse the PDB file; the first argument is only a label for the structure
    structure = parser.get_structure("structure", pdb_file_path)

    # Iterate through the structure and collect one record per atom
    atom_data = []
    for model in structure:
        for chain in model:
            for residue in chain:
                # residue.id is (hetero flag, sequence number, insertion code).
                # The hetero flag is blank for residues read from ATOM records
                # and non-blank for HETATM records. The previous code used
                # atom.get_full_id()[0], which is the structure label and
                # therefore reported "ATOM" for every row.
                hetero_flag = residue.id[0]
                record_type = "HETATM" if str(hetero_flag).strip() else "ATOM"
                for atom in residue:
                    atom_data.append(
                        {
                            "Record_Type": record_type,
                            "Atom_Serial_Number": atom.get_serial_number(),
                            "Atom_Name": atom.get_name(),
                            "Amino_Acid": residue.resname,
                            "Chain_ID": chain.id,
                            "Residue_Sequence_Number": residue.id[1],
                            "X_Coordinate": atom.coord[0],
                            "Y_Coordinate": atom.coord[1],
                            "Z_Coordinate": atom.coord[2],
                            "Occupancy": atom.get_occupancy(),
                            "B_Factor": atom.get_bfactor(),
                            "Element": atom.element,
                        }
                    )

    # Passing the column list keeps the schema stable for an empty structure
    return pd.DataFrame(atom_data, columns=columns)


# Parse comp_before.pdb into a per-atom DataFrame and display it as the cell output.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that directory exists.
pdb_path = "/home/qiliu02/GHDDI/GHDDI-Database/dev_query_pdb/comp_before.pdb"
df = pdb_to_dataframe(pdb_path)
df

Unnamed: 0,Record_Type,Atom_Serial_Number,Atom_Name,Amino_Acid,Chain_ID,Residue_Sequence_Number,X_Coordinate,Y_Coordinate,Z_Coordinate,Occupancy,B_Factor,Element
0,ATOM,1,N,MET,R,44,111.019997,127.860001,171.990997,1.0,48.95,N
1,ATOM,2,CA,MET,R,44,112.453003,128.162994,171.977005,1.0,48.95,C
2,ATOM,3,C,MET,R,44,113.230003,127.164001,171.112000,1.0,48.95,C
3,ATOM,4,O,MET,R,44,114.134003,127.565002,170.380997,1.0,48.95,O
4,ATOM,5,CB,MET,R,44,113.003998,128.220993,173.417999,1.0,48.95,C
...,...,...,...,...,...,...,...,...,...,...,...,...
4505,ATOM,4506,HD12,LEU,R,357,119.571999,127.056000,145.712997,1.0,35.43,H
4506,ATOM,4507,HD13,LEU,R,357,118.335999,126.264999,146.690002,1.0,35.43,H
4507,ATOM,4508,HD21,LEU,R,357,120.117996,124.129997,147.945007,1.0,35.43,H
4508,ATOM,4509,HD22,LEU,R,357,118.628998,123.608002,147.175003,1.0,35.43,H


In [18]:
# Locations of the two structures being compared.
path_before = "/home/qiliu02/GHDDI/GHDDI-Database/dev_query_pdb/comp_before.pdb"
path_after = "/home/qiliu02/GHDDI/GHDDI-Database/dev_query_pdb/comp.pdb"

# Parse each file into a per-atom table and report its (rows, columns) shape.
df_before = pdb_to_dataframe(path_before)
print(f"df_before: {df_before.shape}")

df_after = pdb_to_dataframe(path_after)
print(f"df_after: {df_after.shape}")

df_before: (4510, 12)
df_after: (4617, 12)


In [19]:
# Display the atom table parsed from comp_before.pdb.
df_before

Unnamed: 0,Record_Type,Atom_Serial_Number,Atom_Name,Amino_Acid,Chain_ID,Residue_Sequence_Number,X_Coordinate,Y_Coordinate,Z_Coordinate,Occupancy,B_Factor,Element
0,ATOM,1,N,MET,R,44,111.019997,127.860001,171.990997,1.0,48.95,N
1,ATOM,2,CA,MET,R,44,112.453003,128.162994,171.977005,1.0,48.95,C
2,ATOM,3,C,MET,R,44,113.230003,127.164001,171.112000,1.0,48.95,C
3,ATOM,4,O,MET,R,44,114.134003,127.565002,170.380997,1.0,48.95,O
4,ATOM,5,CB,MET,R,44,113.003998,128.220993,173.417999,1.0,48.95,C
...,...,...,...,...,...,...,...,...,...,...,...,...
4505,ATOM,4506,HD12,LEU,R,357,119.571999,127.056000,145.712997,1.0,35.43,H
4506,ATOM,4507,HD13,LEU,R,357,118.335999,126.264999,146.690002,1.0,35.43,H
4507,ATOM,4508,HD21,LEU,R,357,120.117996,124.129997,147.945007,1.0,35.43,H
4508,ATOM,4509,HD22,LEU,R,357,118.628998,123.608002,147.175003,1.0,35.43,H


In [21]:
# Look up the OD1 atom of ASP 121 on chain R in the "before" table.
# Residue_Sequence_Number is filled from Biopython's residue.id[1], which is
# an integer, so the literal must be the number 121. Comparing an integer
# column with the string '121' matches no rows.
query_str = "Atom_Name == 'OD1' and Amino_Acid == 'ASP' and Chain_ID=='R' and Residue_Sequence_Number == 121"
query_res = df_before.query(query_str)
query_res

Unnamed: 0,Record_Type,Atom_Serial_Number,Atom_Name,Amino_Acid,Chain_ID,Residue_Sequence_Number,X_Coordinate,Y_Coordinate,Z_Coordinate,Occupancy,B_Factor,Element
1204,ATOM,1205,OD1,ASP,R,121,131.416,129.070999,170.716003,1.0,32.11,O


In [25]:
# Serial number(s) of the atom(s) matched in the "before" table, as a NumPy array.
query_res['Atom_Serial_Number'].values

array([1205])

In [22]:
# Preview the first rows of the atom table parsed from comp.pdb.
df_after.head()

Unnamed: 0,Record_Type,Atom_Serial_Number,Atom_Name,Amino_Acid,Chain_ID,Residue_Sequence_Number,X_Coordinate,Y_Coordinate,Z_Coordinate,Occupancy,B_Factor,Element
0,ATOM,1,C,LIG,,1,127.030998,132.970001,177.307999,1.0,0.0,C
1,ATOM,2,O,LIG,,1,128.078995,132.024002,177.481003,1.0,0.0,O
2,ATOM,3,C1,LIG,,1,127.818001,130.927002,178.276001,1.0,0.0,C
3,ATOM,4,C2,LIG,,1,126.782997,130.904999,179.244995,1.0,0.0,C
4,ATOM,5,C3,LIG,,1,126.555,129.757996,180.024002,1.0,0.0,C


In [27]:
# Find the atom in the "after" table that sits at the same position as the
# atom selected in query_res, so its new serial number and residue number can
# be read off.
#
# Coordinates are matched within a small tolerance instead of by exact float
# equality: the values are floats rendered into the query text, and an exact
# comparison depends on that text round-tripping to the stored value.
# PDB coordinates are written with three decimals, so 1e-3 is below the file's
# own resolution while still absorbing rounding noise.
COORD_TOLERANCE = 1e-3

# Fail with a clear message instead of an IndexError when nothing was selected.
if query_res.empty:
    raise ValueError("query_res is empty: no atom was selected in the 'before' structure")

x, y, z = (
    float(query_res[axis].iloc[0])
    for axis in ("X_Coordinate", "Y_Coordinate", "Z_Coordinate")
)

# Build "col >= lo and col <= hi" for each axis and join them with "and".
query_xyz = " and ".join(
    f"{axis} >= {value - COORD_TOLERANCE} and {axis} <= {value + COORD_TOLERANCE}"
    for axis, value in (("X_Coordinate", x), ("Y_Coordinate", y), ("Z_Coordinate", z))
)
query_res_fin = df_after.query(query_xyz)
query_res_fin

Unnamed: 0,Record_Type,Atom_Serial_Number,Atom_Name,Amino_Acid,Chain_ID,Residue_Sequence_Number,X_Coordinate,Y_Coordinate,Z_Coordinate,Occupancy,B_Factor,Element
1265,ATOM,1266,OD1,ASP,,79,131.416,129.070999,170.716003,1.0,0.0,O
