## Convert PDB file to Zarr file using `BioPandas`

In [9]:
import pandas as pd
from biopandas.pdb import PandasPdb
import os
import numpy as np
import zarr

In [5]:
# Use Bio.PDB to check whether two PDB files are identical

from Bio.PDB import PDBParser, Superimposer

# Function to load a PDB file
def load_structure(file_path):
    """Parse the PDB file at ``file_path`` and return the parsed structure.

    A fresh ``PDBParser`` is created with ``QUIET=True`` for every call, and
    the structure is registered under an empty id.
    """
    return PDBParser(QUIET=True).get_structure('', file_path)

# Function to compare two structures
def compare_structures(structure1, structure2, tolerance=1e-3):
    """Check whether two structures hold the same atoms at the same coordinates.

    Atoms are paired by iteration order. Only the atom count, the pairwise
    atom distances and the RMSD are checked; atom names, residues and chains
    are not compared.

    Parameters
    ----------
    structure1, structure2 : path-like or structure object
        Either a path to a PDB file (parsed with ``load_structure``) or an
        already-parsed object exposing ``get_atoms()``.
    tolerance : float, optional
        Largest pairwise distance, and largest RMSD, still treated as equal.
        Defaults to ``1e-3``.

    Returns
    -------
    bool
        ``True`` when both structures have the same number of atoms, every
        paired distance is below ``tolerance`` and the RMSD does not exceed
        it; ``False`` otherwise.

    Notes
    -----
    The RMSD is printed as a side effect when the pairwise checks pass.
    Neither input is modified: the fitted transformation is never applied.
    """
    def _as_structure(source):
        # Anything that already exposes get_atoms() is used as-is; everything
        # else is treated as a file path and parsed.
        return source if hasattr(source, "get_atoms") else load_structure(source)

    atoms1 = list(_as_structure(structure1).get_atoms())
    atoms2 = list(_as_structure(structure2).get_atoms())

    # Check if the number of atoms is the same
    if len(atoms1) != len(atoms2):
        return False

    # Check if atomic coordinates are the same. In Bio.PDB, subtracting two
    # atoms yields the distance between them. Written as `not (d < tol)` so a
    # NaN distance counts as a mismatch.
    for atom1, atom2 in zip(atoms1, atoms2):
        if not (atom1 - atom2) < tolerance:
            return False

    # Fit the structures onto each other to obtain the RMSD. set_atoms()
    # computes the RMSD itself, so the transformation does not need to be
    # applied to the atoms.
    sup = Superimposer()
    sup.set_atoms(atoms1, atoms2)

    # Print the RMSD
    print(f"RMSD: {sup.rms:.4f} Å")
    if sup.rms > tolerance:
        return False

    return True




In [3]:
# Parse the reference PDB file with BioPandas
pdb_reader = PandasPdb()
ppdb_df = pdb_reader.read_pdb('8tav.pdb')

# Write the parsed object straight back out as a new PDB file
ppdb_df.to_pdb('8tav_biopandas.pdb')

In [6]:
# Round-trip check: the original file against the copy written by BioPandas
are_identical = compare_structures('8tav.pdb', '8tav_biopandas.pdb')

# Report the verdict
print("The structures are identical." if are_identical else "The structures are not identical.")

RMSD: 0.0000 Å
The structures are identical.


In [7]:
# List the instance attributes carried by the PandasPdb object
vars(ppdb_df).keys()

dict_keys(['_df', 'pdb_text', 'header', 'code', '_get_dict', 'pdb_path'])

In [8]:
def load_group_as_numpy_arrays(group):
    """Load the datasets of a Zarr group into memory, excluding ``column_names``.

    Parameters
    ----------
    group : zarr.Group
        Group to read. Only its ``items()`` method is used.

    Returns
    -------
    dict
        Maps each array name to its full contents (``item[:]``) and each
        sub-group name to a nested dict built the same way. The
        ``column_names`` entry, and any entry that is neither an array nor a
        group, is left out.
    """
    arrays = {}
    for key, item in group.items():
        if key == "column_names":
            continue
        # Use the public top-level class names. The ``zarr.core`` and
        # ``zarr.hierarchy`` module paths are internal layout and are not
        # stable across zarr releases.
        if isinstance(item, zarr.Array):
            arrays[key] = item[:]
        elif isinstance(item, zarr.Group):
            arrays[key] = load_group_as_numpy_arrays(item)
    return arrays

In [10]:
import zarr
import numpy as np


def create_zarr_from_pdb(pdb_file, zarr_file):
    """Read a PDB file with BioPandas and write its record tables to a Zarr directory store.

    One group is created per record type (``ATOM``, ``HETATM``, ``ANISOU``,
    ``OTHERS``). Each group holds one dataset per DataFrame column, plus a
    ``column_names`` dataset that records the original column order.

    Parameters
    ----------
    pdb_file : str
        Path handed to ``PandasPdb().read_pdb``.
    zarr_file : str
        Path of the Zarr directory store to write.

    Notes
    -----
    Record-type groups already present in the store are replaced, so the
    function can be re-run against the same ``zarr_file``.
    """
    # load structure
    ppdb_df = PandasPdb().read_pdb(pdb_file)

    # Create a Zarr store and its root group
    store = zarr.DirectoryStore(zarr_file)
    root = zarr.group(store=store)

    # Columns whose NumPy dtype is ``object`` are written with the "str" dtype
    dtype_dict = {'object': 'str'}

    for key in ['ATOM', 'HETATM', 'ANISOU', 'OTHERS']:
        frame = ppdb_df.df[key]

        # overwrite=True replaces a group left behind by a previous run
        # instead of failing because the group already exists.
        group = root.create_group(key, overwrite=True)

        # Remember the column order so the table can be rebuilt later
        group.create_dataset("column_names", data=frame.columns.tolist(), dtype="str")

        for col_name, col_val in frame.items():
            values = col_val.values
            dtype = dtype_dict.get(str(values.dtype), values.dtype)
            group.create_dataset(col_name, data=values, dtype=dtype)


In [11]:
# Convert the reference PDB file into a Zarr directory store
create_zarr_from_pdb(pdb_file='8tav.pdb', zarr_file='8tav.zarr')

In [13]:
## load from zarr 
def zarr_to_pdb(zarr_file, pdb_file):
    """Rebuild a PDB file from a Zarr directory store.

    The store is opened read-only. For each record type (``ATOM``,
    ``HETATM``, ``ANISOU``, ``OTHERS``) the group's datasets are loaded into
    a DataFrame whose columns follow the order stored in ``column_names``.
    The resulting ``PandasPdb`` object is then written to ``pdb_file``.
    """
    record_types = ('ATOM', 'HETATM', 'ANISOU', 'OTHERS')

    # 'r' keeps the store read-only
    root = zarr.open_group(store=zarr.DirectoryStore(zarr_file), mode='r')

    ppdb = PandasPdb()
    for record_type in record_types:
        record_group = root[record_type]
        ordered_columns = record_group["column_names"][:]
        table = pd.DataFrame(load_group_as_numpy_arrays(record_group))
        # Restore the original column order
        ppdb.df[record_type] = table[ordered_columns]

    ppdb.to_pdb(pdb_file)

In [14]:
# Rebuild a PDB file from the Zarr store written above
zarr_to_pdb(zarr_file='8tav.zarr', pdb_file='8tav_from_zarr.pdb')

In [15]:
# Round-trip check: the original file against the copy rebuilt from Zarr
are_identical = compare_structures('8tav.pdb', '8tav_from_zarr.pdb')

# Report the verdict
print("The structures are identical." if are_identical else "The structures are not identical.")

RMSD: 0.0000 Å
The structures are identical.
