In [1]:
# Note: Cell is tagged to not show up in the mkdocs build
# Development convenience: with autoreload mode 2, imported modules are reloaded
# before each cell executes, so edits to library code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

<div class="admonition abstract highlight">
    <p class="admonition-title">In short</p>
    <p>This tutorial shows how to create a dataset from PDB files, with the structures stored in the .zarr format.</p>
</div>

### Dummy PDB example

In [10]:
import zarr
import platformdirs

import numpy as np
import datamol as dm
import pandas as pd

from polaris.dataset import DatasetFactory
from polaris.dataset.converters import SDFConverter, PDBConverter

# Root folder for every artifact this tutorial writes (the PDB file and the Zarr archive).
SAVE_DIR = dm.fs.join(platformdirs.user_cache_dir(appname="polaris-tutorials"), "002")

# `user_cache_dir` only computes the path; it does not create it by default.
# Create the folder up front so the plain `open(..., "w")` used later in the
# notebook does not raise FileNotFoundError on a machine without the cache dir.
dm.fs.mkdir(SAVE_DIR, exist_ok=True)

In [11]:
# Minimal example structure: five atoms (N, CA, C, O, CB) of a single ASN residue
# on chain A. PDB is a fixed-column format, so the spacing inside the string
# (including trailing blanks) is significant and must be kept as is.
pdb_content = """\
ATOM      1  N   ASN A   1      38.267  13.340  12.748  1.00 18.15           N  
ATOM      2  CA  ASN A   1      37.251  14.218  12.226  1.00 16.56           C  
ATOM      3  C   ASN A   1      36.022  13.500  11.637  1.00 16.50           C  
ATOM      4  O   ASN A   1      35.023  14.079  11.216  1.00 16.60           O  
ATOM      5  CB  ASN A   1      37.767  15.426  11.473  1.00 16.60           C  
TER
END
"""

# Target path of the example structure inside the tutorial's save directory
pdb_filename = dm.fs.join(SAVE_DIR, "tutorial.pdb")

# Dump the PDB text to disk; the dataset factory is later fed this file path
with open(pdb_filename, mode="w") as handle:
    handle.write(pdb_content)

print(f"PDB file '{pdb_filename}' created successfully.")

PDB file '/Users/lu.zhu/Library/Caches/polaris-tutorials/002/tutorial.pdb' created successfully.


### Create dataset from PDB file

In [15]:
# Zarr archive that backs the dataset, stored next to the PDB file
save_dst = dm.fs.join(SAVE_DIR, "tutorial_pdb.zarr")

# Create the factory on that archive and reset it before adding any data
factory = DatasetFactory(zarr_root_path=save_dst)
factory.reset(save_dst)

# Register the PDB converter under the "pdb" key; its output goes to the "pdb" column
pdb_converter = PDBConverter(pdb_column="pdb")
factory.register_converter("pdb", pdb_converter)

# NOTE(review): the path is passed wrapped in a one-element list — confirm this
# matches the signature of `add_from_file` in the installed polaris version.
factory.add_from_file([pdb_filename])

# Assemble the final dataset object from everything added to the factory
dataset = factory.build()

### Check the dataset

In [17]:
# Bare last expression: the notebook renders the dataset's metadata summary
dataset

0,1
name,
description,
tags,
user_attributes,
owner,
polaris_version,0.7.10.dev7+gb61dfdd.d20240809
default_adapters,pdbPDB_TO_ARRAY
zarr_root_path,/Users/lu.zhu/Library/Caches/polaris-tutorials/002/tutorial_pdb.zarr
readme,
annotations,pdbis_pointerTruemodalityPROTEIN_3DdescriptionNoneuser_attributesdtypeobject

0,1
pdb,PDB_TO_ARRAY

0,1
pdb,is_pointerTruemodalityPROTEIN_3DdescriptionNoneuser_attributesdtypeobject

0,1
is_pointer,True
modality,PROTEIN_3D
description,
user_attributes,
dtype,object


### Check data table

In [18]:
# Tabular view of the dataset; the `pdb` column is flagged as a pointer column in
# the annotations, so the cell shows a reference (e.g. `pdb#tutorial`) rather
# than the parsed structure.
dataset.table

Unnamed: 0,pdb
0,pdb#tutorial


### Get PDB data from specific row
An array of `biotite.Atom` objects is returned.
See more details at [fastpdb](https://github.com/biotite-dev/fastpdb) and [Atom](https://github.com/biotite-dev/biotite/blob/main/src/biotite/structure/atoms.py).

In [19]:
# Fetch the entry at row 0 of the "pdb" column; the result is displayed as the cell output
dataset.get_data(0, "pdb")

array([
	Atom(np.array([38.267, 13.34 , 12.748], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="ASN", hetero=False, atom_name="N", element="N", b_factor=18.15, charge=0, occupancy=1.0),
	Atom(np.array([37.251, 14.218, 12.226], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="ASN", hetero=False, atom_name="CA", element="C", b_factor=16.56, charge=0, occupancy=1.0),
	Atom(np.array([36.022, 13.5  , 11.637], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="ASN", hetero=False, atom_name="C", element="C", b_factor=16.5, charge=0, occupancy=1.0),
	Atom(np.array([35.023, 14.079, 11.216], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="ASN", hetero=False, atom_name="O", element="O", b_factor=16.6, charge=0, occupancy=1.0),
	Atom(np.array([37.767, 15.426, 11.473], dtype=float32), chain_id="A", res_id=1, ins_code="", res_name="ASN", hetero=False, atom_name="CB", element="C", b_factor=16.6, charge=0, occupancy=1.0)
])

The process of completing the dataset's metadata and uploading it to the hub follows the same steps as outlined in the tutorial [dataset_zarr.ipynb](docs/tutorials/dataset_zarr.ipynb).

The End. 